[cig-commits] r19622 - in seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER: . DATA EXAMPLES/global_PREM_kernels/amplitude/DATA EXAMPLES/global_PREM_kernels/traveltime/DATA EXAMPLES/global_s362ani/DATA EXAMPLES/regional_Greece_small/DATA EXAMPLES/regional_MiddleEast/DATA setup src src/auxiliaries src/cuda src/meshfem3D src/shared src/specfem3D

danielpeter at geodynamics.org danielpeter at geodynamics.org
Tue Feb 14 07:11:09 PST 2012


Author: danielpeter
Date: 2012-02-14 07:11:07 -0800 (Tue, 14 Feb 2012)
New Revision: 19622

Added:
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_scalar_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_vector_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_acoustic_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_constants_cuda.h
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_acoustic.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/finalize_simulation.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/iterate_time.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_topography_bathymetry.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/setup_GLL_points.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D_par.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_movie_output.f90
Modified:
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/DATA/Par_file
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/global_PREM_kernels/amplitude/DATA/Par_file
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/global_PREM_kernels/traveltime/DATA/Par_file
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/global_s362ani/DATA/Par_file
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_Greece_small/DATA/Par_file
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_MiddleEast/DATA/Par_file
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/configure
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/configure.ac
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/flags.guess
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/setup/constants.h.in
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/auxiliaries/create_movie_AVS_DX.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/calc_jacobian.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/model_attenuation.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/model_s362ani.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/broadcast_compute_parameters.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/exit_mpi.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/memory_eval.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/read_parameter_file.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/save_header_file.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/Makefile.in
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_central_cube.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_central_cube_block.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_scalar.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_scalar_block.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/check_simulation_stability.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_add_sources.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_arrays_source.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_element.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_crust_mantle.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_crust_mantle_Dev.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core_Dev.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_outer_core.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_outer_core_Dev.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_kernels.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_seismograms.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_crust_mantle.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/fix_non_blocking_flags.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/initialize_simulation.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/locate_receivers.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_arrays_buffers_solver.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_forward_arrays.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/save_forward_arrays.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/setup_sources_receivers.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_movie_volume.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_seismograms.f90
Log:
Adds modules and CUDA routines

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/DATA/Par_file
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/DATA/Par_file	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/DATA/Par_file	2012-02-14 15:11:07 UTC (rev 19622)
@@ -5,23 +5,23 @@
 SAVE_FORWARD                    = .false.  # save last frame of forward simulation or not
 
 # number of chunks (1,2,3 or 6)
-NCHUNKS                         = 6
+NCHUNKS                         = 1
 
 # angular width of the first chunk (not used if full sphere with six chunks)
-ANGULAR_WIDTH_XI_IN_DEGREES   = 90.d0      # angular size of a chunk
-ANGULAR_WIDTH_ETA_IN_DEGREES  = 90.d0
+ANGULAR_WIDTH_XI_IN_DEGREES   = 20.d0      # angular size of a chunk
+ANGULAR_WIDTH_ETA_IN_DEGREES  = 20.d0
 CENTER_LATITUDE_IN_DEGREES    = 40.d0
-CENTER_LONGITUDE_IN_DEGREES   = 10.d0
-GAMMA_ROTATION_AZIMUTH        = 20.d0
+CENTER_LONGITUDE_IN_DEGREES   = 25.d0
+GAMMA_ROTATION_AZIMUTH        = 0.d0
 
 # number of elements at the surface along the two sides of the first chunk
 # (must be multiple of 16 and 8 * multiple of NPROC below)
-NEX_XI                          = 240
-NEX_ETA                         = 240
+NEX_XI                          = 64
+NEX_ETA                         = 64
 
 # number of MPI processors along the two sides of the first chunk
-NPROC_XI                        = 5
-NPROC_ETA                       = 5
+NPROC_XI                        = 2
+NPROC_ETA                       = 2
 
 # 1D models with real structure:
 # 1D_isotropic_prem, 1D_transversely_isotropic_prem, 1D_iasp91, 1D_1066a, 1D_ak135, 1D_ref, 1D_ref_iso, 1D_jp3d,1D_sea99
@@ -49,10 +49,10 @@
 ATTENUATION                     = .true.
 
 # absorbing boundary conditions for a regional simulation
-ABSORBING_CONDITIONS            = .false.
+ABSORBING_CONDITIONS            = .true.
 
 # record length in minutes
-RECORD_LENGTH_IN_MINUTES        = 15.0d0
+RECORD_LENGTH_IN_MINUTES        = 2.5d0
 
 # save AVS or OpenDX movies
 #MOVIE_COARSE saves movie only at corners of elements (SURFACE OR VOLUME) 
@@ -90,7 +90,7 @@
 LOCAL_PATH                      = ./DATABASES_MPI
 
 # interval at which we output time step info and max of norm of displacement
-NTSTEP_BETWEEN_OUTPUT_INFO      = 1000
+NTSTEP_BETWEEN_OUTPUT_INFO      = 100
 
 # interval in time steps for temporary writing of seismograms
 NTSTEP_BETWEEN_OUTPUT_SEISMOS   = 5000000
@@ -118,3 +118,5 @@
 # print source time function
 PRINT_SOURCE_TIME_FUNCTION      = .false.
 
+# set to true to use GPUs 
+GPU_MODE                        = .false.

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/global_PREM_kernels/amplitude/DATA/Par_file
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/global_PREM_kernels/amplitude/DATA/Par_file	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/global_PREM_kernels/amplitude/DATA/Par_file	2012-02-14 15:11:07 UTC (rev 19622)
@@ -118,3 +118,5 @@
 # print source time function
 PRINT_SOURCE_TIME_FUNCTION      = .false.
 
+# set to true to use GPUs 
+GPU_MODE                        = .false.

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/global_PREM_kernels/traveltime/DATA/Par_file
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/global_PREM_kernels/traveltime/DATA/Par_file	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/global_PREM_kernels/traveltime/DATA/Par_file	2012-02-14 15:11:07 UTC (rev 19622)
@@ -118,3 +118,5 @@
 # print source time function
 PRINT_SOURCE_TIME_FUNCTION      = .false.
 
+# set to true to use GPUs 
+GPU_MODE                        = .false.

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/global_s362ani/DATA/Par_file
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/global_s362ani/DATA/Par_file	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/global_s362ani/DATA/Par_file	2012-02-14 15:11:07 UTC (rev 19622)
@@ -118,3 +118,5 @@
 # print source time function
 PRINT_SOURCE_TIME_FUNCTION      = .false.
 
+# set to true to use GPUs 
+GPU_MODE                        = .false.

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_Greece_small/DATA/Par_file
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_Greece_small/DATA/Par_file	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_Greece_small/DATA/Par_file	2012-02-14 15:11:07 UTC (rev 19622)
@@ -112,3 +112,6 @@
 # print source time function
 PRINT_SOURCE_TIME_FUNCTION      = .false.
 
+# set to true to use GPUs 
+GPU_MODE                        = .false.
+

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_MiddleEast/DATA/Par_file
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_MiddleEast/DATA/Par_file	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_MiddleEast/DATA/Par_file	2012-02-14 15:11:07 UTC (rev 19622)
@@ -112,3 +112,5 @@
 # print source time function
 PRINT_SOURCE_TIME_FUNCTION      = .false.
 
+# set to true to use GPUs 
+GPU_MODE                        = .false.

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/configure
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/configure	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/configure	2012-02-14 15:11:07 UTC (rev 19622)
@@ -1,24 +1,20 @@
 #! /bin/sh
 # Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.67 for Specfem3D Globe 5.1.2.
+# Generated by GNU Autoconf 2.63 for Specfem3D Globe 5.1.2.
 #
 # Report bugs to <jtromp AT princeton.edu>.
 #
-#
 # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
-# 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free Software
-# Foundation, Inc.
-#
-#
+# 2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
 # This configure script is free software; the Free Software Foundation
 # gives unlimited permission to copy, distribute and modify it.
-## -------------------- ##
-## M4sh Initialization. ##
-## -------------------- ##
+## --------------------- ##
+## M4sh Initialization.  ##
+## --------------------- ##
 
 # Be more Bourne compatible
 DUALCASE=1; export DUALCASE # for MKS sh
-if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then :
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
   emulate sh
   NULLCMD=:
   # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
@@ -26,15 +22,23 @@
   alias -g '${1+"$@"}'='"$@"'
   setopt NO_GLOB_SUBST
 else
-  case `(set -o) 2>/dev/null` in #(
-  *posix*) :
-    set -o posix ;; #(
-  *) :
-     ;;
+  case `(set -o) 2>/dev/null` in
+  *posix*) set -o posix ;;
 esac
+
 fi
 
 
+
+
+# PATH needs CR
+# Avoid depending upon Character Ranges.
+as_cr_letters='abcdefghijklmnopqrstuvwxyz'
+as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+as_cr_Letters=$as_cr_letters$as_cr_LETTERS
+as_cr_digits='0123456789'
+as_cr_alnum=$as_cr_Letters$as_cr_digits
+
 as_nl='
 '
 export as_nl
@@ -42,13 +46,7 @@
 as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
 as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo
 as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo
-# Prefer a ksh shell builtin over an external printf program on Solaris,
-# but without wasting forks for bash or zsh.
-if test -z "$BASH_VERSION$ZSH_VERSION" \
-    && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then
-  as_echo='print -r --'
-  as_echo_n='print -rn --'
-elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then
+if (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then
   as_echo='printf %s\n'
   as_echo_n='printf %s'
 else
@@ -59,7 +57,7 @@
     as_echo_body='eval expr "X$1" : "X\\(.*\\)"'
     as_echo_n_body='eval
       arg=$1;
-      case $arg in #(
+      case $arg in
       *"$as_nl"*)
 	expr "X$arg" : "X\\(.*\\)$as_nl";
 	arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;;
@@ -82,7 +80,14 @@
   }
 fi
 
+# Support unset when possible.
+if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then
+  as_unset=unset
+else
+  as_unset=false
+fi
 
+
 # IFS
 # We need space, tab and new line, in precisely that order.  Quoting is
 # there to prevent editors from complaining about space-tab.
@@ -91,15 +96,15 @@
 IFS=" ""	$as_nl"
 
 # Find who we are.  Look in the path if we contain no directory separator.
-case $0 in #((
+case $0 in
   *[\\/]* ) as_myself=$0 ;;
   *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
 for as_dir in $PATH
 do
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
-    test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break
-  done
+  test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break
+done
 IFS=$as_save_IFS
 
      ;;
@@ -111,16 +116,12 @@
 fi
 if test ! -f "$as_myself"; then
   $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2
-  exit 1
+  { (exit 1); exit 1; }
 fi
 
-# Unset variables that we do not need and which cause bugs (e.g. in
-# pre-3.0 UWIN ksh).  But do not cause bugs in bash 2.01; the "|| exit 1"
-# suppresses any "Segmentation fault" message there.  '((' could
-# trigger a bug in pdksh 5.2.14.
-for as_var in BASH_ENV ENV MAIL MAILPATH
-do eval test x\${$as_var+set} = xset \
-  && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || :
+# Work around bugs in pre-3.0 UWIN ksh.
+for as_var in ENV MAIL MAILPATH
+do ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var
 done
 PS1='$ '
 PS2='> '
@@ -132,300 +133,330 @@
 LANGUAGE=C
 export LANGUAGE
 
+# Required to use basename.
+if expr a : '\(a\)' >/dev/null 2>&1 &&
+   test "X`expr 00001 : '.*\(...\)'`" = X001; then
+  as_expr=expr
+else
+  as_expr=false
+fi
+
+if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then
+  as_basename=basename
+else
+  as_basename=false
+fi
+
+
+# Name of the executable.
+as_me=`$as_basename -- "$0" ||
+$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \
+	 X"$0" : 'X\(//\)$' \| \
+	 X"$0" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X/"$0" |
+    sed '/^.*\/\([^/][^/]*\)\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\/\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\/\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+
 # CDPATH.
-(unset CDPATH) >/dev/null 2>&1 && unset CDPATH
+$as_unset CDPATH
 
+
 if test "x$CONFIG_SHELL" = x; then
-  as_bourne_compatible="if test -n \"\${ZSH_VERSION+set}\" && (emulate sh) >/dev/null 2>&1; then :
-  emulate sh
-  NULLCMD=:
-  # Pre-4.2 versions of Zsh do word splitting on \${1+\"\$@\"}, which
-  # is contrary to our usage.  Disable this feature.
-  alias -g '\${1+\"\$@\"}'='\"\$@\"'
-  setopt NO_GLOB_SUBST
+  if (eval ":") 2>/dev/null; then
+  as_have_required=yes
 else
-  case \`(set -o) 2>/dev/null\` in #(
-  *posix*) :
-    set -o posix ;; #(
-  *) :
-     ;;
-esac
+  as_have_required=no
 fi
-"
-  as_required="as_fn_return () { (exit \$1); }
-as_fn_success () { as_fn_return 0; }
-as_fn_failure () { as_fn_return 1; }
-as_fn_ret_success () { return 0; }
-as_fn_ret_failure () { return 1; }
 
+  if test $as_have_required = yes &&	 (eval ":
+(as_func_return () {
+  (exit \$1)
+}
+as_func_success () {
+  as_func_return 0
+}
+as_func_failure () {
+  as_func_return 1
+}
+as_func_ret_success () {
+  return 0
+}
+as_func_ret_failure () {
+  return 1
+}
+
 exitcode=0
-as_fn_success || { exitcode=1; echo as_fn_success failed.; }
-as_fn_failure && { exitcode=1; echo as_fn_failure succeeded.; }
-as_fn_ret_success || { exitcode=1; echo as_fn_ret_success failed.; }
-as_fn_ret_failure && { exitcode=1; echo as_fn_ret_failure succeeded.; }
-if ( set x; as_fn_ret_success y && test x = \"\$1\" ); then :
+if as_func_success; then
+  :
+else
+  exitcode=1
+  echo as_func_success failed.
+fi
 
+if as_func_failure; then
+  exitcode=1
+  echo as_func_failure succeeded.
+fi
+
+if as_func_ret_success; then
+  :
 else
-  exitcode=1; echo positional parameters were not saved.
+  exitcode=1
+  echo as_func_ret_success failed.
 fi
-test x\$exitcode = x0 || exit 1"
-  as_suggested="  as_lineno_1=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_1a=\$LINENO
-  as_lineno_2=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_2a=\$LINENO
-  eval 'test \"x\$as_lineno_1'\$as_run'\" != \"x\$as_lineno_2'\$as_run'\" &&
-  test \"x\`expr \$as_lineno_1'\$as_run' + 1\`\" = \"x\$as_lineno_2'\$as_run'\"' || exit 1
-test \$(( 1 + 1 )) = 2 || exit 1"
-  if (eval "$as_required") 2>/dev/null; then :
-  as_have_required=yes
+
+if as_func_ret_failure; then
+  exitcode=1
+  echo as_func_ret_failure succeeded.
+fi
+
+if ( set x; as_func_ret_success y && test x = \"\$1\" ); then
+  :
 else
-  as_have_required=no
+  exitcode=1
+  echo positional parameters were not saved.
 fi
-  if test x$as_have_required = xyes && (eval "$as_suggested") 2>/dev/null; then :
 
+test \$exitcode = 0) || { (exit 1); exit 1; }
+
+(
+  as_lineno_1=\$LINENO
+  as_lineno_2=\$LINENO
+  test \"x\$as_lineno_1\" != \"x\$as_lineno_2\" &&
+  test \"x\`expr \$as_lineno_1 + 1\`\" = \"x\$as_lineno_2\") || { (exit 1); exit 1; }
+") 2> /dev/null; then
+  :
 else
-  as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
-as_found=false
+  as_candidate_shells=
+    as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
 for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH
 do
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
-  as_found=:
-  case $as_dir in #(
+  case $as_dir in
 	 /*)
 	   for as_base in sh bash ksh sh5; do
-	     # Try only shells that exist, to save several forks.
-	     as_shell=$as_dir/$as_base
-	     if { test -f "$as_shell" || test -f "$as_shell.exe"; } &&
-		    { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$as_shell"; } 2>/dev/null; then :
-  CONFIG_SHELL=$as_shell as_have_required=yes
-		   if { $as_echo "$as_bourne_compatible""$as_suggested" | as_run=a "$as_shell"; } 2>/dev/null; then :
-  break 2
-fi
-fi
+	     as_candidate_shells="$as_candidate_shells $as_dir/$as_base"
 	   done;;
        esac
-  as_found=false
 done
-$as_found || { if { test -f "$SHELL" || test -f "$SHELL.exe"; } &&
-	      { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$SHELL"; } 2>/dev/null; then :
-  CONFIG_SHELL=$SHELL as_have_required=yes
-fi; }
 IFS=$as_save_IFS
 
 
-      if test "x$CONFIG_SHELL" != x; then :
-  # We cannot yet assume a decent shell, so we have to provide a
-	# neutralization value for shells without unset; and this also
-	# works around shells that cannot unset nonexistent variables.
-	BASH_ENV=/dev/null
-	ENV=/dev/null
-	(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV
-	export CONFIG_SHELL
-	exec "$CONFIG_SHELL" "$as_myself" ${1+"$@"}
+      for as_shell in $as_candidate_shells $SHELL; do
+	 # Try only shells that exist, to save several forks.
+	 if { test -f "$as_shell" || test -f "$as_shell.exe"; } &&
+		{ ("$as_shell") 2> /dev/null <<\_ASEOF
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
+  emulate sh
+  NULLCMD=:
+  # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
+  # is contrary to our usage.  Disable this feature.
+  alias -g '${1+"$@"}'='"$@"'
+  setopt NO_GLOB_SUBST
+else
+  case `(set -o) 2>/dev/null` in
+  *posix*) set -o posix ;;
+esac
+
 fi
 
-    if test x$as_have_required = xno; then :
-  $as_echo "$0: This script requires a shell more modern than all"
-  $as_echo "$0: the shells that I found on your system."
-  if test x${ZSH_VERSION+set} = xset ; then
-    $as_echo "$0: In particular, zsh $ZSH_VERSION has bugs and should"
-    $as_echo "$0: be upgraded to zsh 4.3.4 or later."
-  else
-    $as_echo "$0: Please tell bug-autoconf at gnu.org and jtromp AT
-$0: princeton.edu about your system, including any error
-$0: possibly output before this message. Then install a
-$0: modern shell, or manually run the script under such a
-$0: shell if you do have one."
-  fi
-  exit 1
+
+:
+_ASEOF
+}; then
+  CONFIG_SHELL=$as_shell
+	       as_have_required=yes
+	       if { "$as_shell" 2> /dev/null <<\_ASEOF
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
+  emulate sh
+  NULLCMD=:
+  # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
+  # is contrary to our usage.  Disable this feature.
+  alias -g '${1+"$@"}'='"$@"'
+  setopt NO_GLOB_SUBST
+else
+  case `(set -o) 2>/dev/null` in
+  *posix*) set -o posix ;;
+esac
+
 fi
+
+
+:
+(as_func_return () {
+  (exit $1)
+}
+as_func_success () {
+  as_func_return 0
+}
+as_func_failure () {
+  as_func_return 1
+}
+as_func_ret_success () {
+  return 0
+}
+as_func_ret_failure () {
+  return 1
+}
+
+exitcode=0
+if as_func_success; then
+  :
+else
+  exitcode=1
+  echo as_func_success failed.
 fi
+
+if as_func_failure; then
+  exitcode=1
+  echo as_func_failure succeeded.
 fi
-SHELL=${CONFIG_SHELL-/bin/sh}
-export SHELL
-# Unset more variables known to interfere with behavior of common tools.
-CLICOLOR_FORCE= GREP_OPTIONS=
-unset CLICOLOR_FORCE GREP_OPTIONS
 
-## --------------------- ##
-## M4sh Shell Functions. ##
-## --------------------- ##
-# as_fn_unset VAR
-# ---------------
-# Portably unset VAR.
-as_fn_unset ()
-{
-  { eval $1=; unset $1;}
-}
-as_unset=as_fn_unset
+if as_func_ret_success; then
+  :
+else
+  exitcode=1
+  echo as_func_ret_success failed.
+fi
 
-# as_fn_set_status STATUS
-# -----------------------
-# Set $? to STATUS, without forking.
-as_fn_set_status ()
-{
-  return $1
-} # as_fn_set_status
+if as_func_ret_failure; then
+  exitcode=1
+  echo as_func_ret_failure succeeded.
+fi
 
-# as_fn_exit STATUS
-# -----------------
-# Exit the shell with STATUS, even in a "trap 0" or "set -e" context.
-as_fn_exit ()
-{
-  set +e
-  as_fn_set_status $1
-  exit $1
-} # as_fn_exit
+if ( set x; as_func_ret_success y && test x = "$1" ); then
+  :
+else
+  exitcode=1
+  echo positional parameters were not saved.
+fi
 
-# as_fn_mkdir_p
-# -------------
-# Create "$as_dir" as a directory, including parents if necessary.
-as_fn_mkdir_p ()
-{
+test $exitcode = 0) || { (exit 1); exit 1; }
 
-  case $as_dir in #(
-  -*) as_dir=./$as_dir;;
-  esac
-  test -d "$as_dir" || eval $as_mkdir_p || {
-    as_dirs=
-    while :; do
-      case $as_dir in #(
-      *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'(
-      *) as_qdir=$as_dir;;
-      esac
-      as_dirs="'$as_qdir' $as_dirs"
-      as_dir=`$as_dirname -- "$as_dir" ||
-$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
-	 X"$as_dir" : 'X\(//\)[^/]' \| \
-	 X"$as_dir" : 'X\(//\)$' \| \
-	 X"$as_dir" : 'X\(/\)' \| . 2>/dev/null ||
-$as_echo X"$as_dir" |
-    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
-	    s//\1/
-	    q
-	  }
-	  /^X\(\/\/\)[^/].*/{
-	    s//\1/
-	    q
-	  }
-	  /^X\(\/\/\)$/{
-	    s//\1/
-	    q
-	  }
-	  /^X\(\/\).*/{
-	    s//\1/
-	    q
-	  }
-	  s/.*/./; q'`
-      test -d "$as_dir" && break
-    done
-    test -z "$as_dirs" || eval "mkdir $as_dirs"
-  } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir"
+(
+  as_lineno_1=$LINENO
+  as_lineno_2=$LINENO
+  test "x$as_lineno_1" != "x$as_lineno_2" &&
+  test "x`expr $as_lineno_1 + 1`" = "x$as_lineno_2") || { (exit 1); exit 1; }
 
+_ASEOF
+}; then
+  break
+fi
 
-} # as_fn_mkdir_p
-# as_fn_append VAR VALUE
-# ----------------------
-# Append the text in VALUE to the end of the definition contained in VAR. Take
-# advantage of any shell optimizations that allow amortized linear growth over
-# repeated appends, instead of the typical quadratic growth present in naive
-# implementations.
-if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then :
-  eval 'as_fn_append ()
-  {
-    eval $1+=\$2
-  }'
-else
-  as_fn_append ()
-  {
-    eval $1=\$$1\$2
-  }
-fi # as_fn_append
+fi
 
-# as_fn_arith ARG...
-# ------------------
-# Perform arithmetic evaluation on the ARGs, and store the result in the
-# global $as_val. Take advantage of shells that can avoid forks. The arguments
-# must be portable across $(()) and expr.
-if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then :
-  eval 'as_fn_arith ()
-  {
-    as_val=$(( $* ))
-  }'
-else
-  as_fn_arith ()
-  {
-    as_val=`expr "$@" || test $? -eq 1`
-  }
-fi # as_fn_arith
+      done
 
+      if test "x$CONFIG_SHELL" != x; then
+  for as_var in BASH_ENV ENV
+	do ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var
+	done
+	export CONFIG_SHELL
+	exec "$CONFIG_SHELL" "$as_myself" ${1+"$@"}
+fi
 
-# as_fn_error STATUS ERROR [LINENO LOG_FD]
-# ----------------------------------------
-# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are
-# provided, also output the error to LOG_FD, referencing LINENO. Then exit the
-# script with STATUS, using 1 if that was 0.
-as_fn_error ()
-{
-  as_status=$1; test $as_status -eq 0 && as_status=1
-  if test "$4"; then
-    as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
-    $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4
-  fi
-  $as_echo "$as_me: error: $2" >&2
-  as_fn_exit $as_status
-} # as_fn_error
 
-if expr a : '\(a\)' >/dev/null 2>&1 &&
-   test "X`expr 00001 : '.*\(...\)'`" = X001; then
-  as_expr=expr
+    if test $as_have_required = no; then
+  echo This script requires a shell more modern than all the
+      echo shells that I found on your system.  Please install a
+      echo modern shell, or manually run the script under such a
+      echo shell if you do have one.
+      { (exit 1); exit 1; }
+fi
+
+
+fi
+
+fi
+
+
+
+(eval "as_func_return () {
+  (exit \$1)
+}
+as_func_success () {
+  as_func_return 0
+}
+as_func_failure () {
+  as_func_return 1
+}
+as_func_ret_success () {
+  return 0
+}
+as_func_ret_failure () {
+  return 1
+}
+
+exitcode=0
+if as_func_success; then
+  :
 else
-  as_expr=false
+  exitcode=1
+  echo as_func_success failed.
 fi
 
-if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then
-  as_basename=basename
+if as_func_failure; then
+  exitcode=1
+  echo as_func_failure succeeded.
+fi
+
+if as_func_ret_success; then
+  :
 else
-  as_basename=false
+  exitcode=1
+  echo as_func_ret_success failed.
 fi
 
-if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then
-  as_dirname=dirname
+if as_func_ret_failure; then
+  exitcode=1
+  echo as_func_ret_failure succeeded.
+fi
+
+if ( set x; as_func_ret_success y && test x = \"\$1\" ); then
+  :
 else
-  as_dirname=false
+  exitcode=1
+  echo positional parameters were not saved.
 fi
 
-as_me=`$as_basename -- "$0" ||
-$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \
-	 X"$0" : 'X\(//\)$' \| \
-	 X"$0" : 'X\(/\)' \| . 2>/dev/null ||
-$as_echo X/"$0" |
-    sed '/^.*\/\([^/][^/]*\)\/*$/{
-	    s//\1/
-	    q
-	  }
-	  /^X\/\(\/\/\)$/{
-	    s//\1/
-	    q
-	  }
-	  /^X\/\(\/\).*/{
-	    s//\1/
-	    q
-	  }
-	  s/.*/./; q'`
+test \$exitcode = 0") || {
+  echo No shell found that supports shell functions.
+  echo Please tell bug-autoconf at gnu.org about your system,
+  echo including any error possibly output before this message.
+  echo This can help us improve future autoconf versions.
+  echo Configuration will now proceed without shell functions.
+}
 
-# Avoid depending upon Character Ranges.
-as_cr_letters='abcdefghijklmnopqrstuvwxyz'
-as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
-as_cr_Letters=$as_cr_letters$as_cr_LETTERS
-as_cr_digits='0123456789'
-as_cr_alnum=$as_cr_Letters$as_cr_digits
 
 
-  as_lineno_1=$LINENO as_lineno_1a=$LINENO
-  as_lineno_2=$LINENO as_lineno_2a=$LINENO
-  eval 'test "x$as_lineno_1'$as_run'" != "x$as_lineno_2'$as_run'" &&
-  test "x`expr $as_lineno_1'$as_run' + 1`" = "x$as_lineno_2'$as_run'"' || {
-  # Blame Lee E. McMahon (1931-1989) for sed's syntax.  :-)
+  as_lineno_1=$LINENO
+  as_lineno_2=$LINENO
+  test "x$as_lineno_1" != "x$as_lineno_2" &&
+  test "x`expr $as_lineno_1 + 1`" = "x$as_lineno_2" || {
+
+  # Create $as_me.lineno as a copy of $as_myself, but with $LINENO
+  # uniformly replaced by the line number.  The first 'sed' inserts a
+  # line-number line after each line using $LINENO; the second 'sed'
+  # does the real work.  The second script uses 'N' to pair each
+  # line-number line with the line containing $LINENO, and appends
+  # trailing '-' during substitution so that $LINENO is not a special
+  # case at line end.
+  # (Raja R Harinath suggested sed '=', and Paul Eggert wrote the
+  # scripts with optimization help from Paolo Bonzini.  Blame Lee
+  # E. McMahon (1931-1989) for sed's syntax.  :-)
   sed -n '
     p
     /[$]LINENO/=
@@ -442,7 +473,8 @@
       s/-\n.*//
     ' >$as_me.lineno &&
   chmod +x "$as_me.lineno" ||
-    { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2; as_fn_exit 1; }
+    { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2
+   { (exit 1); exit 1; }; }
 
   # Don't try to exec as it changes $[0], causing all sort of problems
   # (the dirname of $[0] is not the place where we might find the
@@ -452,18 +484,29 @@
   exit
 }
 
+
+if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then
+  as_dirname=dirname
+else
+  as_dirname=false
+fi
+
 ECHO_C= ECHO_N= ECHO_T=
-case `echo -n x` in #(((((
+case `echo -n x` in
 -n*)
-  case `echo 'xy\c'` in
+  case `echo 'x\c'` in
   *c*) ECHO_T='	';;	# ECHO_T is single tab character.
-  xy)  ECHO_C='\c';;
-  *)   echo `echo ksh88 bug on AIX 6.1` > /dev/null
-       ECHO_T='	';;
+  *)   ECHO_C='\c';;
   esac;;
 *)
   ECHO_N='-n';;
 esac
+if expr a : '\(a\)' >/dev/null 2>&1 &&
+   test "X`expr 00001 : '.*\(...\)'`" = X001; then
+  as_expr=expr
+else
+  as_expr=false
+fi
 
 rm -f conf$$ conf$$.exe conf$$.file
 if test -d conf$$.dir; then
@@ -493,7 +536,7 @@
 rmdir conf$$.dir 2>/dev/null
 
 if mkdir -p . 2>/dev/null; then
-  as_mkdir_p='mkdir -p "$as_dir"'
+  as_mkdir_p=:
 else
   test -d ./-p && rmdir ./-p
   as_mkdir_p=false
@@ -512,10 +555,10 @@
       if test -d "$1"; then
 	test -d "$1/.";
       else
-	case $1 in #(
+	case $1 in
 	-*)set "./$1";;
 	esac;
-	case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in #((
+	case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in
 	???[sx]*):;;*)false;;esac;fi
     '\'' sh
   '
@@ -529,11 +572,11 @@
 as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'"
 
 
-test -n "$DJDIR" || exec 7<&0 </dev/null
-exec 6>&1
 
+exec 7<&0 </dev/null 6>&1
+
 # Name of the host.
-# hostname on some systems (SVR3.2, old GNU/Linux) returns a bogus exit status,
+# hostname on some systems (SVR3.2, Linux) returns a bogus exit status,
 # so uname gets run too.
 ac_hostname=`(hostname || uname -n) 2>/dev/null | sed 1q`
 
@@ -548,6 +591,7 @@
 subdirs=
 MFLAGS=
 MAKEFLAGS=
+SHELL=${CONFIG_SHELL-/bin/sh}
 
 # Identity of this package.
 PACKAGE_NAME='Specfem3D Globe'
@@ -555,7 +599,6 @@
 PACKAGE_VERSION='5.1.2'
 PACKAGE_STRING='Specfem3D Globe 5.1.2'
 PACKAGE_BUGREPORT='jtromp AT princeton.edu'
-PACKAGE_URL=''
 
 ac_unique_file="src/specfem3D/specfem3D.F90"
 # Factoring default headers for most tests.
@@ -597,13 +640,16 @@
 ac_subst_vars='LTLIBOBJS
 LIBOBJS
 LOCAL_PATH_IS_ALSO_GLOBAL
+RANLIB
+ARFLAGS
+AR
+MPI_INC
+CUDA_INC
+CUDA_LIB
 FLAGS_NO_CHECK
 FLAGS_CHECK
 MPILIBS
 MPIFC
-AR
-ARFLAGS
-RANLIB
 EGREP
 GREP
 CPP
@@ -620,6 +666,8 @@
 LDFLAGS
 FCFLAGS
 FC
+COND_CUDA_FALSE
+COND_CUDA_TRUE
 CUSTOM_MPI_TYPE
 CUSTOM_REAL
 target_alias
@@ -652,7 +700,6 @@
 program_transform_name
 prefix
 exec_prefix
-PACKAGE_URL
 PACKAGE_BUGREPORT
 PACKAGE_STRING
 PACKAGE_VERSION
@@ -664,6 +711,7 @@
 ac_user_opts='
 enable_option_checking
 enable_double_precision
+with_cuda
 '
       ac_precious_vars='build_alias
 host_alias
@@ -680,6 +728,12 @@
 MPILIBS
 FLAGS_CHECK
 FLAGS_NO_CHECK
+CUDA_LIB
+CUDA_INC
+MPI_INC
+AR
+ARFLAGS
+RANLIB
 LOCAL_PATH_IS_ALSO_GLOBAL'
 
 
@@ -743,9 +797,8 @@
   fi
 
   case $ac_option in
-  *=?*) ac_optarg=`expr "X$ac_option" : '[^=]*=\(.*\)'` ;;
-  *=)   ac_optarg= ;;
-  *)    ac_optarg=yes ;;
+  *=*)	ac_optarg=`expr "X$ac_option" : '[^=]*=\(.*\)'` ;;
+  *)	ac_optarg=yes ;;
   esac
 
   # Accept the important Cygnus configure options, so we can diagnose typos.
@@ -790,7 +843,8 @@
     ac_useropt=`expr "x$ac_option" : 'x-*disable-\(.*\)'`
     # Reject names that are not valid shell variable names.
     expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
-      as_fn_error $? "invalid feature name: $ac_useropt"
+      { $as_echo "$as_me: error: invalid feature name: $ac_useropt" >&2
+   { (exit 1); exit 1; }; }
     ac_useropt_orig=$ac_useropt
     ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
     case $ac_user_opts in
@@ -816,7 +870,8 @@
     ac_useropt=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'`
     # Reject names that are not valid shell variable names.
     expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
-      as_fn_error $? "invalid feature name: $ac_useropt"
+      { $as_echo "$as_me: error: invalid feature name: $ac_useropt" >&2
+   { (exit 1); exit 1; }; }
     ac_useropt_orig=$ac_useropt
     ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
     case $ac_user_opts in
@@ -1020,7 +1075,8 @@
     ac_useropt=`expr "x$ac_option" : 'x-*with-\([^=]*\)'`
     # Reject names that are not valid shell variable names.
     expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
-      as_fn_error $? "invalid package name: $ac_useropt"
+      { $as_echo "$as_me: error: invalid package name: $ac_useropt" >&2
+   { (exit 1); exit 1; }; }
     ac_useropt_orig=$ac_useropt
     ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
     case $ac_user_opts in
@@ -1036,7 +1092,8 @@
     ac_useropt=`expr "x$ac_option" : 'x-*without-\(.*\)'`
     # Reject names that are not valid shell variable names.
     expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
-      as_fn_error $? "invalid package name: $ac_useropt"
+      { $as_echo "$as_me: error: invalid package name: $ac_useropt" >&2
+   { (exit 1); exit 1; }; }
     ac_useropt_orig=$ac_useropt
     ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'`
     case $ac_user_opts in
@@ -1066,17 +1123,17 @@
   | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*)
     x_libraries=$ac_optarg ;;
 
-  -*) as_fn_error $? "unrecognized option: \`$ac_option'
-Try \`$0 --help' for more information"
+  -*) { $as_echo "$as_me: error: unrecognized option: $ac_option
+Try \`$0 --help' for more information." >&2
+   { (exit 1); exit 1; }; }
     ;;
 
   *=*)
     ac_envvar=`expr "x$ac_option" : 'x\([^=]*\)='`
     # Reject names that are not valid shell variable names.
-    case $ac_envvar in #(
-      '' | [0-9]* | *[!_$as_cr_alnum]* )
-      as_fn_error $? "invalid variable name: \`$ac_envvar'" ;;
-    esac
+    expr "x$ac_envvar" : ".*[^_$as_cr_alnum]" >/dev/null &&
+      { $as_echo "$as_me: error: invalid variable name: $ac_envvar" >&2
+   { (exit 1); exit 1; }; }
     eval $ac_envvar=\$ac_optarg
     export $ac_envvar ;;
 
@@ -1093,13 +1150,15 @@
 
 if test -n "$ac_prev"; then
   ac_option=--`echo $ac_prev | sed 's/_/-/g'`
-  as_fn_error $? "missing argument to $ac_option"
+  { $as_echo "$as_me: error: missing argument to $ac_option" >&2
+   { (exit 1); exit 1; }; }
 fi
 
 if test -n "$ac_unrecognized_opts"; then
   case $enable_option_checking in
     no) ;;
-    fatal) as_fn_error $? "unrecognized options: $ac_unrecognized_opts" ;;
+    fatal) { $as_echo "$as_me: error: unrecognized options: $ac_unrecognized_opts" >&2
+   { (exit 1); exit 1; }; } ;;
     *)     $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2 ;;
   esac
 fi
@@ -1122,7 +1181,8 @@
     [\\/$]* | ?:[\\/]* )  continue;;
     NONE | '' ) case $ac_var in *prefix ) continue;; esac;;
   esac
-  as_fn_error $? "expected an absolute directory name for --$ac_var: $ac_val"
+  { $as_echo "$as_me: error: expected an absolute directory name for --$ac_var: $ac_val" >&2
+   { (exit 1); exit 1; }; }
 done
 
 # There might be people who depend on the old broken behavior: `$host'
@@ -1136,8 +1196,8 @@
 if test "x$host_alias" != x; then
   if test "x$build_alias" = x; then
     cross_compiling=maybe
-    $as_echo "$as_me: WARNING: if you wanted to set the --build type, don't use --host.
-    If a cross compiler is detected then cross compile mode will be used" >&2
+    $as_echo "$as_me: WARNING: If you wanted to set the --build type, don't use --host.
+    If a cross compiler is detected then cross compile mode will be used." >&2
   elif test "x$build_alias" != "x$host_alias"; then
     cross_compiling=yes
   fi
@@ -1152,9 +1212,11 @@
 ac_pwd=`pwd` && test -n "$ac_pwd" &&
 ac_ls_di=`ls -di .` &&
 ac_pwd_ls_di=`cd "$ac_pwd" && ls -di .` ||
-  as_fn_error $? "working directory cannot be determined"
+  { $as_echo "$as_me: error: working directory cannot be determined" >&2
+   { (exit 1); exit 1; }; }
 test "X$ac_ls_di" = "X$ac_pwd_ls_di" ||
-  as_fn_error $? "pwd does not report name of working directory"
+  { $as_echo "$as_me: error: pwd does not report name of working directory" >&2
+   { (exit 1); exit 1; }; }
 
 
 # Find the source files, if location was not specified.
@@ -1193,11 +1255,13 @@
 fi
 if test ! -r "$srcdir/$ac_unique_file"; then
   test "$ac_srcdir_defaulted" = yes && srcdir="$ac_confdir or .."
-  as_fn_error $? "cannot find sources ($ac_unique_file) in $srcdir"
+  { $as_echo "$as_me: error: cannot find sources ($ac_unique_file) in $srcdir" >&2
+   { (exit 1); exit 1; }; }
 fi
 ac_msg="sources are in $srcdir, but \`cd $srcdir' does not work"
 ac_abs_confdir=`(
-	cd "$srcdir" && test -r "./$ac_unique_file" || as_fn_error $? "$ac_msg"
+	cd "$srcdir" && test -r "./$ac_unique_file" || { $as_echo "$as_me: error: $ac_msg" >&2
+   { (exit 1); exit 1; }; }
 	pwd)`
 # When building in place, set srcdir=.
 if test "$ac_abs_confdir" = "$ac_pwd"; then
@@ -1237,7 +1301,7 @@
       --help=short        display options specific to this package
       --help=recursive    display the short help of all the included packages
   -V, --version           display version information and exit
-  -q, --quiet, --silent   do not print \`checking ...' messages
+  -q, --quiet, --silent   do not print \`checking...' messages
       --cache-file=FILE   cache test results in FILE [disabled]
   -C, --config-cache      alias for \`--cache-file=config.cache'
   -n, --no-create         do not create output files
@@ -1295,6 +1359,11 @@
   --enable-double-precision
                           solver in double precision [default=no]
 
+Optional Packages:
+  --with-PACKAGE[=ARG]    use PACKAGE [ARG=yes]
+  --without-PACKAGE       do not use PACKAGE (same as --with-PACKAGE=no)
+  --with-cuda             build cuda GPU enabled version [default=no]
+
 Some influential environment variables:
   FC          Fortran compiler command
   FCFLAGS     Fortran compiler flags
@@ -1303,7 +1372,7 @@
   LIBS        libraries to pass to the linker, e.g. -l<library>
   CC          C compiler command
   CFLAGS      C compiler flags
-  CPPFLAGS    (Objective) C/C++ preprocessor flags, e.g. -I<include dir> if
+  CPPFLAGS    C/C++/Objective C preprocessor flags, e.g. -I<include dir> if
               you have headers in a nonstandard directory <include dir>
   CPP         C preprocessor
   MPIFC       MPI Fortran compiler command
@@ -1312,6 +1381,13 @@
   FLAGS_NO_CHECK
               Fortran compiler flags for creating fast, production-run code
               for critical subroutines
+  CUDA_LIB    Location of CUDA library libcudart
+  CUDA_INC    Location of CUDA include files
+  MPI_INC     Location of MPI include mpi.h, which is needed by nvcc when
+              compiling cuda files
+  AR          ar library creation
+  ARFLAGS     ar flag library creation
+  RANLIB      ranlib library creation
   LOCAL_PATH_IS_ALSO_GLOBAL
               files on a local path on each node are also seen as global with
               same path [default=true]
@@ -1383,347 +1459,21 @@
 if $ac_init_version; then
   cat <<\_ACEOF
 Specfem3D Globe configure 5.1.2
-generated by GNU Autoconf 2.67
+generated by GNU Autoconf 2.63
 
-Copyright (C) 2010 Free Software Foundation, Inc.
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
+2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
 This configure script is free software; the Free Software Foundation
 gives unlimited permission to copy, distribute and modify it.
 _ACEOF
   exit
 fi
-
-## ------------------------ ##
-## Autoconf initialization. ##
-## ------------------------ ##
-
-# ac_fn_fc_try_compile LINENO
-# ---------------------------
-# Try to compile conftest.$ac_ext, and return whether this succeeded.
-ac_fn_fc_try_compile ()
-{
-  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
-  rm -f conftest.$ac_objext
-  if { { ac_try="$ac_compile"
-case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_compile") 2>conftest.err
-  ac_status=$?
-  if test -s conftest.err; then
-    grep -v '^ *+' conftest.err >conftest.er1
-    cat conftest.er1 >&5
-    mv -f conftest.er1 conftest.err
-  fi
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; } && {
-	 test -z "$ac_fc_werror_flag" ||
-	 test ! -s conftest.err
-       } && test -s conftest.$ac_objext; then :
-  ac_retval=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-	ac_retval=1
-fi
-  eval $as_lineno_stack; test "x$as_lineno_stack" = x && { as_lineno=; unset as_lineno;}
-  as_fn_set_status $ac_retval
-
-} # ac_fn_fc_try_compile
-
-# ac_fn_c_try_compile LINENO
-# --------------------------
-# Try to compile conftest.$ac_ext, and return whether this succeeded.
-ac_fn_c_try_compile ()
-{
-  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
-  rm -f conftest.$ac_objext
-  if { { ac_try="$ac_compile"
-case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_compile") 2>conftest.err
-  ac_status=$?
-  if test -s conftest.err; then
-    grep -v '^ *+' conftest.err >conftest.er1
-    cat conftest.er1 >&5
-    mv -f conftest.er1 conftest.err
-  fi
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; } && {
-	 test -z "$ac_c_werror_flag" ||
-	 test ! -s conftest.err
-       } && test -s conftest.$ac_objext; then :
-  ac_retval=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-	ac_retval=1
-fi
-  eval $as_lineno_stack; test "x$as_lineno_stack" = x && { as_lineno=; unset as_lineno;}
-  as_fn_set_status $ac_retval
-
-} # ac_fn_c_try_compile
-
-# ac_fn_c_try_link LINENO
-# -----------------------
-# Try to link conftest.$ac_ext, and return whether this succeeded.
-ac_fn_c_try_link ()
-{
-  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
-  rm -f conftest.$ac_objext conftest$ac_exeext
-  if { { ac_try="$ac_link"
-case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_link") 2>conftest.err
-  ac_status=$?
-  if test -s conftest.err; then
-    grep -v '^ *+' conftest.err >conftest.er1
-    cat conftest.er1 >&5
-    mv -f conftest.er1 conftest.err
-  fi
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; } && {
-	 test -z "$ac_c_werror_flag" ||
-	 test ! -s conftest.err
-       } && test -s conftest$ac_exeext && {
-	 test "$cross_compiling" = yes ||
-	 $as_test_x conftest$ac_exeext
-       }; then :
-  ac_retval=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-	ac_retval=1
-fi
-  # Delete the IPA/IPO (Inter Procedural Analysis/Optimization) information
-  # created by the PGI compiler (conftest_ipa8_conftest.oo), as it would
-  # interfere with the next link command; also delete a directory that is
-  # left behind by Apple's compiler.  We do this before executing the actions.
-  rm -rf conftest.dSYM conftest_ipa8_conftest.oo
-  eval $as_lineno_stack; test "x$as_lineno_stack" = x && { as_lineno=; unset as_lineno;}
-  as_fn_set_status $ac_retval
-
-} # ac_fn_c_try_link
-
-# ac_fn_c_try_cpp LINENO
-# ----------------------
-# Try to preprocess conftest.$ac_ext, and return whether this succeeded.
-ac_fn_c_try_cpp ()
-{
-  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
-  if { { ac_try="$ac_cpp conftest.$ac_ext"
-case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.err
-  ac_status=$?
-  if test -s conftest.err; then
-    grep -v '^ *+' conftest.err >conftest.er1
-    cat conftest.er1 >&5
-    mv -f conftest.er1 conftest.err
-  fi
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; } > conftest.i && {
-	 test -z "$ac_c_preproc_warn_flag$ac_c_werror_flag" ||
-	 test ! -s conftest.err
-       }; then :
-  ac_retval=0
-else
-  $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-    ac_retval=1
-fi
-  eval $as_lineno_stack; test "x$as_lineno_stack" = x && { as_lineno=; unset as_lineno;}
-  as_fn_set_status $ac_retval
-
-} # ac_fn_c_try_cpp
-
-# ac_fn_c_check_header_mongrel LINENO HEADER VAR INCLUDES
-# -------------------------------------------------------
-# Tests whether HEADER exists, giving a warning if it cannot be compiled using
-# the include files in INCLUDES and setting the cache variable VAR
-# accordingly.
-ac_fn_c_check_header_mongrel ()
-{
-  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
-  if eval "test \"\${$3+set}\"" = set; then :
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
-$as_echo_n "checking for $2... " >&6; }
-if eval "test \"\${$3+set}\"" = set; then :
-  $as_echo_n "(cached) " >&6
-fi
-eval ac_res=\$$3
-	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
-$as_echo "$ac_res" >&6; }
-else
-  # Is the header compilable?
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking $2 usability" >&5
-$as_echo_n "checking $2 usability... " >&6; }
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-$4
-#include <$2>
-_ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
-  ac_header_compiler=yes
-else
-  ac_header_compiler=no
-fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_header_compiler" >&5
-$as_echo "$ac_header_compiler" >&6; }
-
-# Is the header present?
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking $2 presence" >&5
-$as_echo_n "checking $2 presence... " >&6; }
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-#include <$2>
-_ACEOF
-if ac_fn_c_try_cpp "$LINENO"; then :
-  ac_header_preproc=yes
-else
-  ac_header_preproc=no
-fi
-rm -f conftest.err conftest.i conftest.$ac_ext
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_header_preproc" >&5
-$as_echo "$ac_header_preproc" >&6; }
-
-# So?  What about this header?
-case $ac_header_compiler:$ac_header_preproc:$ac_c_preproc_warn_flag in #((
-  yes:no: )
-    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: accepted by the compiler, rejected by the preprocessor!" >&5
-$as_echo "$as_me: WARNING: $2: accepted by the compiler, rejected by the preprocessor!" >&2;}
-    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: proceeding with the compiler's result" >&5
-$as_echo "$as_me: WARNING: $2: proceeding with the compiler's result" >&2;}
-    ;;
-  no:yes:* )
-    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: present but cannot be compiled" >&5
-$as_echo "$as_me: WARNING: $2: present but cannot be compiled" >&2;}
-    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2:     check for missing prerequisite headers?" >&5
-$as_echo "$as_me: WARNING: $2:     check for missing prerequisite headers?" >&2;}
-    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: see the Autoconf documentation" >&5
-$as_echo "$as_me: WARNING: $2: see the Autoconf documentation" >&2;}
-    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2:     section \"Present But Cannot Be Compiled\"" >&5
-$as_echo "$as_me: WARNING: $2:     section \"Present But Cannot Be Compiled\"" >&2;}
-    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: proceeding with the compiler's result" >&5
-$as_echo "$as_me: WARNING: $2: proceeding with the compiler's result" >&2;}
-( $as_echo "## -------------------------------------- ##
-## Report this to jtromp AT princeton.edu ##
-## -------------------------------------- ##"
-     ) | sed "s/^/$as_me: WARNING:     /" >&2
-    ;;
-esac
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
-$as_echo_n "checking for $2... " >&6; }
-if eval "test \"\${$3+set}\"" = set; then :
-  $as_echo_n "(cached) " >&6
-else
-  eval "$3=\$ac_header_compiler"
-fi
-eval ac_res=\$$3
-	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
-$as_echo "$ac_res" >&6; }
-fi
-  eval $as_lineno_stack; test "x$as_lineno_stack" = x && { as_lineno=; unset as_lineno;}
-
-} # ac_fn_c_check_header_mongrel
-
-# ac_fn_c_try_run LINENO
-# ----------------------
-# Try to link conftest.$ac_ext, and return whether this succeeded. Assumes
-# that executables *can* be run.
-ac_fn_c_try_run ()
-{
-  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
-  if { { ac_try="$ac_link"
-case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_link") 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; } && { ac_try='./conftest$ac_exeext'
-  { { case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_try") 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then :
-  ac_retval=0
-else
-  $as_echo "$as_me: program exited with status $ac_status" >&5
-       $as_echo "$as_me: failed program was:" >&5
-sed 's/^/| /' conftest.$ac_ext >&5
-
-       ac_retval=$ac_status
-fi
-  rm -rf conftest.dSYM conftest_ipa8_conftest.oo
-  eval $as_lineno_stack; test "x$as_lineno_stack" = x && { as_lineno=; unset as_lineno;}
-  as_fn_set_status $ac_retval
-
-} # ac_fn_c_try_run
-
-# ac_fn_c_check_header_compile LINENO HEADER VAR INCLUDES
-# -------------------------------------------------------
-# Tests whether HEADER exists and can be compiled using the include files in
-# INCLUDES, setting the cache variable VAR accordingly.
-ac_fn_c_check_header_compile ()
-{
-  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
-  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5
-$as_echo_n "checking for $2... " >&6; }
-if eval "test \"\${$3+set}\"" = set; then :
-  $as_echo_n "(cached) " >&6
-else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
-/* end confdefs.h.  */
-$4
-#include <$2>
-_ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
-  eval "$3=yes"
-else
-  eval "$3=no"
-fi
-rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
-fi
-eval ac_res=\$$3
-	       { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5
-$as_echo "$ac_res" >&6; }
-  eval $as_lineno_stack; test "x$as_lineno_stack" = x && { as_lineno=; unset as_lineno;}
-
-} # ac_fn_c_check_header_compile
 cat >config.log <<_ACEOF
 This file contains any messages produced by compilers while
 running configure, to aid debugging if configure makes a mistake.
 
 It was created by Specfem3D Globe $as_me 5.1.2, which was
-generated by GNU Autoconf 2.67.  Invocation command line was
+generated by GNU Autoconf 2.63.  Invocation command line was
 
   $ $0 $@
 
@@ -1759,8 +1509,8 @@
 do
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
-    $as_echo "PATH: $as_dir"
-  done
+  $as_echo "PATH: $as_dir"
+done
 IFS=$as_save_IFS
 
 } >&5
@@ -1797,9 +1547,9 @@
       ac_arg=`$as_echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;;
     esac
     case $ac_pass in
-    1) as_fn_append ac_configure_args0 " '$ac_arg'" ;;
+    1) ac_configure_args0="$ac_configure_args0 '$ac_arg'" ;;
     2)
-      as_fn_append ac_configure_args1 " '$ac_arg'"
+      ac_configure_args1="$ac_configure_args1 '$ac_arg'"
       if test $ac_must_keep_next = true; then
 	ac_must_keep_next=false # Got value, back to normal.
       else
@@ -1815,13 +1565,13 @@
 	  -* ) ac_must_keep_next=true ;;
 	esac
       fi
-      as_fn_append ac_configure_args " '$ac_arg'"
+      ac_configure_args="$ac_configure_args '$ac_arg'"
       ;;
     esac
   done
 done
-{ ac_configure_args0=; unset ac_configure_args0;}
-{ ac_configure_args1=; unset ac_configure_args1;}
+$as_unset ac_configure_args0 || test "${ac_configure_args0+set}" != set || { ac_configure_args0=; export ac_configure_args0; }
+$as_unset ac_configure_args1 || test "${ac_configure_args1+set}" != set || { ac_configure_args1=; export ac_configure_args1; }
 
 # When interrupted or exit'd, cleanup temporary files, and complete
 # config.log.  We remove comments because anyway the quotes in there
@@ -1833,9 +1583,11 @@
   {
     echo
 
-    $as_echo "## ---------------- ##
+    cat <<\_ASBOX
+## ---------------- ##
 ## Cache variables. ##
-## ---------------- ##"
+## ---------------- ##
+_ASBOX
     echo
     # The following way of writing the cache mishandles newlines in values,
 (
@@ -1844,13 +1596,13 @@
     case $ac_val in #(
     *${as_nl}*)
       case $ac_var in #(
-      *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5
+      *_cv_*) { $as_echo "$as_me:$LINENO: WARNING: cache variable $ac_var contains a newline" >&5
 $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
       esac
       case $ac_var in #(
       _ | IFS | as_nl) ;; #(
       BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #(
-      *) { eval $ac_var=; unset $ac_var;} ;;
+      *) $as_unset $ac_var ;;
       esac ;;
     esac
   done
@@ -1869,9 +1621,11 @@
 )
     echo
 
-    $as_echo "## ----------------- ##
+    cat <<\_ASBOX
+## ----------------- ##
 ## Output variables. ##
-## ----------------- ##"
+## ----------------- ##
+_ASBOX
     echo
     for ac_var in $ac_subst_vars
     do
@@ -1884,9 +1638,11 @@
     echo
 
     if test -n "$ac_subst_files"; then
-      $as_echo "## ------------------- ##
+      cat <<\_ASBOX
+## ------------------- ##
 ## File substitutions. ##
-## ------------------- ##"
+## ------------------- ##
+_ASBOX
       echo
       for ac_var in $ac_subst_files
       do
@@ -1900,9 +1656,11 @@
     fi
 
     if test -s confdefs.h; then
-      $as_echo "## ----------- ##
+      cat <<\_ASBOX
+## ----------- ##
 ## confdefs.h. ##
-## ----------- ##"
+## ----------- ##
+_ASBOX
       echo
       cat confdefs.h
       echo
@@ -1916,53 +1674,46 @@
     exit $exit_status
 ' 0
 for ac_signal in 1 2 13 15; do
-  trap 'ac_signal='$ac_signal'; as_fn_exit 1' $ac_signal
+  trap 'ac_signal='$ac_signal'; { (exit 1); exit 1; }' $ac_signal
 done
 ac_signal=0
 
 # confdefs.h avoids OS command line length limits that DEFS can exceed.
 rm -f -r conftest* confdefs.h
 
-$as_echo "/* confdefs.h */" > confdefs.h
-
 # Predefined preprocessor variables.
 
 cat >>confdefs.h <<_ACEOF
 #define PACKAGE_NAME "$PACKAGE_NAME"
 _ACEOF
 
+
 cat >>confdefs.h <<_ACEOF
 #define PACKAGE_TARNAME "$PACKAGE_TARNAME"
 _ACEOF
 
+
 cat >>confdefs.h <<_ACEOF
 #define PACKAGE_VERSION "$PACKAGE_VERSION"
 _ACEOF
 
+
 cat >>confdefs.h <<_ACEOF
 #define PACKAGE_STRING "$PACKAGE_STRING"
 _ACEOF
 
+
 cat >>confdefs.h <<_ACEOF
 #define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT"
 _ACEOF
 
-cat >>confdefs.h <<_ACEOF
-#define PACKAGE_URL "$PACKAGE_URL"
-_ACEOF
 
-
 # Let the site file select an alternate cache file if it wants to.
 # Prefer an explicitly selected file to automatically selected ones.
 ac_site_file1=NONE
 ac_site_file2=NONE
 if test -n "$CONFIG_SITE"; then
-  # We do not want a PATH search for config.site.
-  case $CONFIG_SITE in #((
-    -*)  ac_site_file1=./$CONFIG_SITE;;
-    */*) ac_site_file1=$CONFIG_SITE;;
-    *)   ac_site_file1=./$CONFIG_SITE;;
-  esac
+  ac_site_file1=$CONFIG_SITE
 elif test "x$prefix" != xNONE; then
   ac_site_file1=$prefix/share/config.site
   ac_site_file2=$prefix/etc/config.site
@@ -1973,23 +1724,19 @@
 for ac_site_file in "$ac_site_file1" "$ac_site_file2"
 do
   test "x$ac_site_file" = xNONE && continue
-  if test /dev/null != "$ac_site_file" && test -r "$ac_site_file"; then
-    { $as_echo "$as_me:${as_lineno-$LINENO}: loading site script $ac_site_file" >&5
+  if test -r "$ac_site_file"; then
+    { $as_echo "$as_me:$LINENO: loading site script $ac_site_file" >&5
 $as_echo "$as_me: loading site script $ac_site_file" >&6;}
     sed 's/^/| /' "$ac_site_file" >&5
-    . "$ac_site_file" \
-      || { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "failed to load site script $ac_site_file
-See \`config.log' for more details" "$LINENO" 5 ; }
+    . "$ac_site_file"
   fi
 done
 
 if test -r "$cache_file"; then
-  # Some versions of bash will fail to source /dev/null (special files
-  # actually), so we avoid doing that.  DJGPP emulates it as a regular file.
-  if test /dev/null != "$cache_file" && test -f "$cache_file"; then
-    { $as_echo "$as_me:${as_lineno-$LINENO}: loading cache $cache_file" >&5
+  # Some versions of bash will fail to source /dev/null (special
+  # files actually), so we avoid doing that.
+  if test -f "$cache_file"; then
+    { $as_echo "$as_me:$LINENO: loading cache $cache_file" >&5
 $as_echo "$as_me: loading cache $cache_file" >&6;}
     case $cache_file in
       [\\/]* | ?:[\\/]* ) . "$cache_file";;
@@ -1997,7 +1744,7 @@
     esac
   fi
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: creating cache $cache_file" >&5
+  { $as_echo "$as_me:$LINENO: creating cache $cache_file" >&5
 $as_echo "$as_me: creating cache $cache_file" >&6;}
   >$cache_file
 fi
@@ -2012,11 +1759,11 @@
   eval ac_new_val=\$ac_env_${ac_var}_value
   case $ac_old_set,$ac_new_set in
     set,)
-      { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5
+      { $as_echo "$as_me:$LINENO: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5
 $as_echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;}
       ac_cache_corrupted=: ;;
     ,set)
-      { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was not set in the previous run" >&5
+      { $as_echo "$as_me:$LINENO: error: \`$ac_var' was not set in the previous run" >&5
 $as_echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;}
       ac_cache_corrupted=: ;;
     ,);;
@@ -2026,17 +1773,17 @@
 	ac_old_val_w=`echo x $ac_old_val`
 	ac_new_val_w=`echo x $ac_new_val`
 	if test "$ac_old_val_w" != "$ac_new_val_w"; then
-	  { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' has changed since the previous run:" >&5
+	  { $as_echo "$as_me:$LINENO: error: \`$ac_var' has changed since the previous run:" >&5
 $as_echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;}
 	  ac_cache_corrupted=:
 	else
-	  { $as_echo "$as_me:${as_lineno-$LINENO}: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&5
+	  { $as_echo "$as_me:$LINENO: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&5
 $as_echo "$as_me: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&2;}
 	  eval $ac_var=\$ac_old_val
 	fi
-	{ $as_echo "$as_me:${as_lineno-$LINENO}:   former value:  \`$ac_old_val'" >&5
+	{ $as_echo "$as_me:$LINENO:   former value:  \`$ac_old_val'" >&5
 $as_echo "$as_me:   former value:  \`$ac_old_val'" >&2;}
-	{ $as_echo "$as_me:${as_lineno-$LINENO}:   current value: \`$ac_new_val'" >&5
+	{ $as_echo "$as_me:$LINENO:   current value: \`$ac_new_val'" >&5
 $as_echo "$as_me:   current value: \`$ac_new_val'" >&2;}
       fi;;
   esac
@@ -2048,21 +1795,44 @@
     esac
     case " $ac_configure_args " in
       *" '$ac_arg' "*) ;; # Avoid dups.  Use of quotes ensures accuracy.
-      *) as_fn_append ac_configure_args " '$ac_arg'" ;;
+      *) ac_configure_args="$ac_configure_args '$ac_arg'" ;;
     esac
   fi
 done
 if $ac_cache_corrupted; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+  { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
 $as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-  { $as_echo "$as_me:${as_lineno-$LINENO}: error: changes in the environment can compromise the build" >&5
+  { $as_echo "$as_me:$LINENO: error: changes in the environment can compromise the build" >&5
 $as_echo "$as_me: error: changes in the environment can compromise the build" >&2;}
-  as_fn_error $? "run \`make distclean' and/or \`rm $cache_file' and start over" "$LINENO" 5
+  { { $as_echo "$as_me:$LINENO: error: run \`make distclean' and/or \`rm $cache_file' and start over" >&5
+$as_echo "$as_me: error: run \`make distclean' and/or \`rm $cache_file' and start over" >&2;}
+   { (exit 1); exit 1; }; }
 fi
-## -------------------- ##
-## Main body of script. ##
-## -------------------- ##
 
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 ac_ext=c
 ac_cpp='$CPP $CPPFLAGS'
 ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
@@ -2079,7 +1849,7 @@
 # 'configure' options
 
 # Check whether --enable-double-precision was given.
-if test "${enable_double_precision+set}" = set; then :
+if test "${enable_double_precision+set}" = set; then
   enableval=$enable_double_precision; want_double_precision="$enableval"
 else
   want_double_precision=no
@@ -2096,6 +1866,23 @@
 
 
 
+# Check whether --with-cuda was given.
+if test "${with_cuda+set}" = set; then
+  withval=$with_cuda; want_cuda="$withval"
+else
+  want_cuda=no
+fi
+
+ if test "$want_cuda" = yes; then
+  COND_CUDA_TRUE=
+  COND_CUDA_FALSE='#'
+else
+  COND_CUDA_TRUE='#'
+  COND_CUDA_FALSE=
+fi
+
+
+
 # Checks for programs.
 
 # a courtesy to the installed base of users
@@ -2111,13 +1898,13 @@
 ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5'
 ac_compiler_gnu=$ac_cv_fc_compiler_gnu
 if test -n "$ac_tool_prefix"; then
-  for ac_prog in gfortran g95 xlf95 f95 fort ifort ifc efc pgfortran pgf95 lf95 ftn xlf90 f90 pgf90 pghpf epcf90 g77 xlf f77 frt pgf77 cf77 fort77 fl32 af77
+  for ac_prog in gfortran g95 xlf95 f95 fort ifort ifc efc pgf95 lf95 ftn xlf90 f90 pgf90 pghpf epcf90 g77 xlf f77 frt pgf77 cf77 fort77 fl32 af77
   do
     # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
 set dummy $ac_tool_prefix$ac_prog; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
 $as_echo_n "checking for $ac_word... " >&6; }
-if test "${ac_cv_prog_FC+set}" = set; then :
+if test "${ac_cv_prog_FC+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
   if test -n "$FC"; then
@@ -2128,24 +1915,24 @@
 do
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
+  for ac_exec_ext in '' $ac_executable_extensions; do
   if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
     ac_cv_prog_FC="$ac_tool_prefix$ac_prog"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
   fi
 done
-  done
+done
 IFS=$as_save_IFS
 
 fi
 fi
 FC=$ac_cv_prog_FC
 if test -n "$FC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $FC" >&5
+  { $as_echo "$as_me:$LINENO: result: $FC" >&5
 $as_echo "$FC" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+  { $as_echo "$as_me:$LINENO: result: no" >&5
 $as_echo "no" >&6; }
 fi
 
@@ -2155,13 +1942,13 @@
 fi
 if test -z "$FC"; then
   ac_ct_FC=$FC
-  for ac_prog in gfortran g95 xlf95 f95 fort ifort ifc efc pgfortran pgf95 lf95 ftn xlf90 f90 pgf90 pghpf epcf90 g77 xlf f77 frt pgf77 cf77 fort77 fl32 af77
+  for ac_prog in gfortran g95 xlf95 f95 fort ifort ifc efc pgf95 lf95 ftn xlf90 f90 pgf90 pghpf epcf90 g77 xlf f77 frt pgf77 cf77 fort77 fl32 af77
 do
   # Extract the first word of "$ac_prog", so it can be a program name with args.
 set dummy $ac_prog; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
 $as_echo_n "checking for $ac_word... " >&6; }
-if test "${ac_cv_prog_ac_ct_FC+set}" = set; then :
+if test "${ac_cv_prog_ac_ct_FC+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
   if test -n "$ac_ct_FC"; then
@@ -2172,24 +1959,24 @@
 do
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
+  for ac_exec_ext in '' $ac_executable_extensions; do
   if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
     ac_cv_prog_ac_ct_FC="$ac_prog"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
   fi
 done
-  done
+done
 IFS=$as_save_IFS
 
 fi
 fi
 ac_ct_FC=$ac_cv_prog_ac_ct_FC
 if test -n "$ac_ct_FC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_FC" >&5
+  { $as_echo "$as_me:$LINENO: result: $ac_ct_FC" >&5
 $as_echo "$ac_ct_FC" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+  { $as_echo "$as_me:$LINENO: result: no" >&5
 $as_echo "no" >&6; }
 fi
 
@@ -2202,7 +1989,7 @@
   else
     case $cross_compiling:$ac_tool_warned in
 yes:)
-{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+{ $as_echo "$as_me:$LINENO: WARNING: using cross tools not prefixed with host triplet" >&5
 $as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
 ac_tool_warned=yes ;;
 esac
@@ -2212,32 +1999,45 @@
 
 
 # Provide some information about the compiler.
-$as_echo "$as_me:${as_lineno-$LINENO}: checking for Fortran compiler version" >&5
+$as_echo "$as_me:$LINENO: checking for Fortran compiler version" >&5
 set X $ac_compile
 ac_compiler=$2
-for ac_option in --version -v -V -qversion; do
-  { { ac_try="$ac_compiler $ac_option >&5"
+{ (ac_try="$ac_compiler --version >&5"
 case "(($ac_try" in
   *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
   *) ac_try_echo=$ac_try;;
 esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_compiler $ac_option >&5") 2>conftest.err
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compiler --version >&5") 2>&5
   ac_status=$?
-  if test -s conftest.err; then
-    sed '10a\
-... rest of stderr output deleted ...
-         10q' conftest.err >conftest.er1
-    cat conftest.er1 >&5
-  fi
-  rm -f conftest.er1 conftest.err
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }
-done
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }
+{ (ac_try="$ac_compiler -v >&5"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compiler -v >&5") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }
+{ (ac_try="$ac_compiler -V >&5"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compiler -V >&5") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }
 rm -f a.out
 
-cat > conftest.$ac_ext <<_ACEOF
+cat >conftest.$ac_ext <<_ACEOF
       program main
 
       end
@@ -2247,8 +2047,8 @@
 # Try to create an executable without -o first, disregard a.out.
 # It will help us diagnose broken compilers, and finding out an intuition
 # of exeext.
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the Fortran compiler works" >&5
-$as_echo_n "checking whether the Fortran compiler works... " >&6; }
+{ $as_echo "$as_me:$LINENO: checking for Fortran compiler default output file name" >&5
+$as_echo_n "checking for Fortran compiler default output file name... " >&6; }
 ac_link_default=`$as_echo "$ac_link" | sed 's/ -o *conftest[^ ]*//'`
 
 # The possible output files:
@@ -2264,17 +2064,17 @@
 done
 rm -f $ac_rmfiles
 
-if { { ac_try="$ac_link_default"
+if { (ac_try="$ac_link_default"
 case "(($ac_try" in
   *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
   *) ac_try_echo=$ac_try;;
 esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
   (eval "$ac_link_default") 2>&5
   ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; then :
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; then
   # Autoconf-2.13 could set the ac_cv_exeext variable to `no'.
 # So ignore a value of `no', otherwise this would lead to `EXEEXT = no'
 # in a Makefile.  We should not override ac_cv_exeext if it was cached,
@@ -2291,7 +2091,7 @@
 	# certainly right.
 	break;;
     *.* )
-	if test "${ac_cv_exeext+set}" = set && test "$ac_cv_exeext" != no;
+        if test "${ac_cv_exeext+set}" = set && test "$ac_cv_exeext" != no;
 	then :; else
 	   ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'`
 	fi
@@ -2310,41 +2110,84 @@
 else
   ac_file=''
 fi
-if test -z "$ac_file"; then :
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
-$as_echo "no" >&6; }
-$as_echo "$as_me: failed program was:" >&5
+
+{ $as_echo "$as_me:$LINENO: result: $ac_file" >&5
+$as_echo "$ac_file" >&6; }
+if test -z "$ac_file"; then
+  $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+{ { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
 $as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error 77 "Fortran compiler cannot create executables
-See \`config.log' for more details" "$LINENO" 5 ; }
-else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
-$as_echo "yes" >&6; }
+{ { $as_echo "$as_me:$LINENO: error: Fortran compiler cannot create executables
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: Fortran compiler cannot create executables
+See \`config.log' for more details." >&2;}
+   { (exit 77); exit 77; }; }; }
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Fortran compiler default output file name" >&5
-$as_echo_n "checking for Fortran compiler default output file name... " >&6; }
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_file" >&5
-$as_echo "$ac_file" >&6; }
+
 ac_exeext=$ac_cv_exeext
 
+# Check that the compiler produces executables we can run.  If not, either
+# the compiler is broken, or we cross compile.
+{ $as_echo "$as_me:$LINENO: checking whether the Fortran compiler works" >&5
+$as_echo_n "checking whether the Fortran compiler works... " >&6; }
+# FIXME: These cross compiler hacks should be removed for Autoconf 3.0
+# If not cross compiling, check that we can run a simple program.
+if test "$cross_compiling" != yes; then
+  if { ac_try='./$ac_file'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+    cross_compiling=no
+  else
+    if test "$cross_compiling" = maybe; then
+	cross_compiling=yes
+    else
+	{ { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
+$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
+{ { $as_echo "$as_me:$LINENO: error: cannot run Fortran compiled programs.
+If you meant to cross compile, use \`--host'.
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: cannot run Fortran compiled programs.
+If you meant to cross compile, use \`--host'.
+See \`config.log' for more details." >&2;}
+   { (exit 1); exit 1; }; }; }
+    fi
+  fi
+fi
+{ $as_echo "$as_me:$LINENO: result: yes" >&5
+$as_echo "yes" >&6; }
+
 rm -f -r a.out a.out.dSYM a.exe conftest$ac_cv_exeext b.out
 ac_clean_files=$ac_clean_files_save
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of executables" >&5
+# Check that the compiler produces executables we can run.  If not, either
+# the compiler is broken, or we cross compile.
+{ $as_echo "$as_me:$LINENO: checking whether we are cross compiling" >&5
+$as_echo_n "checking whether we are cross compiling... " >&6; }
+{ $as_echo "$as_me:$LINENO: result: $cross_compiling" >&5
+$as_echo "$cross_compiling" >&6; }
+
+{ $as_echo "$as_me:$LINENO: checking for suffix of executables" >&5
 $as_echo_n "checking for suffix of executables... " >&6; }
-if { { ac_try="$ac_link"
+if { (ac_try="$ac_link"
 case "(($ac_try" in
   *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
   *) ac_try_echo=$ac_try;;
 esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
   (eval "$ac_link") 2>&5
   ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; then :
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; then
   # If both `conftest.exe' and `conftest' are `present' (well, observable)
 # catch `conftest.exe'.  For instance with Cygwin, `ls conftest' will
 # work properly (i.e., refer to `conftest.exe'), while it won't with
@@ -2359,93 +2202,44 @@
   esac
 done
 else
-  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+  { { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
 $as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "cannot compute suffix of executables: cannot compile and link
-See \`config.log' for more details" "$LINENO" 5 ; }
+{ { $as_echo "$as_me:$LINENO: error: cannot compute suffix of executables: cannot compile and link
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: cannot compute suffix of executables: cannot compile and link
+See \`config.log' for more details." >&2;}
+   { (exit 1); exit 1; }; }; }
 fi
-rm -f conftest conftest$ac_cv_exeext
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_exeext" >&5
+
+rm -f conftest$ac_cv_exeext
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_exeext" >&5
 $as_echo "$ac_cv_exeext" >&6; }
 
 rm -f conftest.$ac_ext
 EXEEXT=$ac_cv_exeext
 ac_exeext=$EXEEXT
-cat > conftest.$ac_ext <<_ACEOF
-      program main
-      open(unit=9,file='conftest.out')
-      close(unit=9)
-
-      end
-_ACEOF
-ac_clean_files="$ac_clean_files conftest.out"
-# Check that the compiler produces executables we can run.  If not, either
-# the compiler is broken, or we cross compile.
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are cross compiling" >&5
-$as_echo_n "checking whether we are cross compiling... " >&6; }
-if test "$cross_compiling" != yes; then
-  { { ac_try="$ac_link"
-case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_link") 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }
-  if { ac_try='./conftest$ac_cv_exeext'
-  { { case "(($ac_try" in
-  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
-  *) ac_try_echo=$ac_try;;
-esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_try") 2>&5
-  ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; }; then
-    cross_compiling=no
-  else
-    if test "$cross_compiling" = maybe; then
-	cross_compiling=yes
-    else
-	{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
-$as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "cannot run Fortran compiled programs.
-If you meant to cross compile, use \`--host'.
-See \`config.log' for more details" "$LINENO" 5 ; }
-    fi
-  fi
-fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $cross_compiling" >&5
-$as_echo "$cross_compiling" >&6; }
-
-rm -f conftest.$ac_ext conftest$ac_cv_exeext conftest.out
-ac_clean_files=$ac_clean_files_save
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of object files" >&5
+{ $as_echo "$as_me:$LINENO: checking for suffix of object files" >&5
 $as_echo_n "checking for suffix of object files... " >&6; }
-if test "${ac_cv_objext+set}" = set; then :
+if test "${ac_cv_objext+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
-  cat > conftest.$ac_ext <<_ACEOF
+  cat >conftest.$ac_ext <<_ACEOF
       program main
 
       end
 _ACEOF
 rm -f conftest.o conftest.obj
-if { { ac_try="$ac_compile"
+if { (ac_try="$ac_compile"
 case "(($ac_try" in
   *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
   *) ac_try_echo=$ac_try;;
 esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
   (eval "$ac_compile") 2>&5
   ac_status=$?
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }; then :
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; then
   for ac_file in conftest.o conftest.obj conftest.*; do
   test -f "$ac_file" || continue;
   case $ac_file in
@@ -2458,14 +2252,18 @@
   $as_echo "$as_me: failed program was:" >&5
 sed 's/^/| /' conftest.$ac_ext >&5
 
-{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+{ { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
 $as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "cannot compute suffix of object files: cannot compile
-See \`config.log' for more details" "$LINENO" 5 ; }
+{ { $as_echo "$as_me:$LINENO: error: cannot compute suffix of object files: cannot compile
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: cannot compute suffix of object files: cannot compile
+See \`config.log' for more details." >&2;}
+   { (exit 1); exit 1; }; }; }
 fi
+
 rm -f conftest.$ac_cv_objext conftest.$ac_ext
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_objext" >&5
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_objext" >&5
 $as_echo "$ac_cv_objext" >&6; }
 OBJEXT=$ac_cv_objext
 ac_objext=$OBJEXT
@@ -2473,12 +2271,12 @@
 # input file.  (Note that this only needs to work for GNU compilers.)
 ac_save_ext=$ac_ext
 ac_ext=F
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU Fortran compiler" >&5
+{ $as_echo "$as_me:$LINENO: checking whether we are using the GNU Fortran compiler" >&5
 $as_echo_n "checking whether we are using the GNU Fortran compiler... " >&6; }
-if test "${ac_cv_fc_compiler_gnu+set}" = set; then :
+if test "${ac_cv_fc_compiler_gnu+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
-  cat > conftest.$ac_ext <<_ACEOF
+  cat >conftest.$ac_ext <<_ACEOF
       program main
 #ifndef __GNUC__
        choke me
@@ -2486,44 +2284,86 @@
 
       end
 _ACEOF
-if ac_fn_fc_try_compile "$LINENO"; then :
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_fc_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
   ac_compiler_gnu=yes
 else
-  ac_compiler_gnu=no
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_compiler_gnu=no
 fi
+
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 ac_cv_fc_compiler_gnu=$ac_compiler_gnu
 
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_fc_compiler_gnu" >&5
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_fc_compiler_gnu" >&5
 $as_echo "$ac_cv_fc_compiler_gnu" >&6; }
 ac_ext=$ac_save_ext
-ac_test_FCFLAGS=${FCFLAGS+set}
-ac_save_FCFLAGS=$FCFLAGS
+ac_test_FFLAGS=${FCFLAGS+set}
+ac_save_FFLAGS=$FCFLAGS
 FCFLAGS=
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $FC accepts -g" >&5
+{ $as_echo "$as_me:$LINENO: checking whether $FC accepts -g" >&5
 $as_echo_n "checking whether $FC accepts -g... " >&6; }
-if test "${ac_cv_prog_fc_g+set}" = set; then :
+if test "${ac_cv_prog_fc_g+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
   FCFLAGS=-g
-cat > conftest.$ac_ext <<_ACEOF
+cat >conftest.$ac_ext <<_ACEOF
       program main
 
       end
 _ACEOF
-if ac_fn_fc_try_compile "$LINENO"; then :
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_fc_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
   ac_cv_prog_fc_g=yes
 else
-  ac_cv_prog_fc_g=no
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_cv_prog_fc_g=no
 fi
+
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_fc_g" >&5
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_prog_fc_g" >&5
 $as_echo "$ac_cv_prog_fc_g" >&6; }
-if test "$ac_test_FCFLAGS" = set; then
-  FCFLAGS=$ac_save_FCFLAGS
+if test "$ac_test_FFLAGS" = set; then
+  FCFLAGS=$ac_save_FFLAGS
 elif test $ac_cv_prog_fc_g = yes; then
   if test "x$ac_cv_fc_compiler_gnu" = xyes; then
     FCFLAGS="-g -O2"
@@ -2552,10 +2392,12 @@
 
 
 flags_guess="$SHELL $srcdir/flags.guess"
-{ $as_echo "$as_me:${as_lineno-$LINENO}: running $flags_guess" >&5
+{ $as_echo "$as_me:$LINENO: running $flags_guess" >&5
 $as_echo "$as_me: running $flags_guess" >&6;}
 flags=`$flags_guess` ||
-    as_fn_error $? "$flags_guess failed" "$LINENO" 5
+    { { $as_echo "$as_me:$LINENO: error: $flags_guess failed" >&5
+$as_echo "$as_me: error: $flags_guess failed" >&2;}
+   { (exit 1); exit 1; }; }
 eval $flags
 
 
@@ -2563,21 +2405,38 @@
 ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5'
 ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5'
 ac_compiler_gnu=$ac_cv_fc_compiler_gnu
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to get verbose linking output from $FC" >&5
+{ $as_echo "$as_me:$LINENO: checking how to get verbose linking output from $FC" >&5
 $as_echo_n "checking how to get verbose linking output from $FC... " >&6; }
-if test "${ac_cv_prog_fc_v+set}" = set; then :
+if test "${ac_cv_prog_fc_v+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
-  cat > conftest.$ac_ext <<_ACEOF
+  cat >conftest.$ac_ext <<_ACEOF
       program main
 
       end
 _ACEOF
-if ac_fn_fc_try_compile "$LINENO"; then :
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_fc_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
   ac_cv_prog_fc_v=
 # Try some options frequently used verbose output
 for ac_verb in -v -verbose --verbose -V -\#\#\#; do
-  cat > conftest.$ac_ext <<_ACEOF
+  cat >conftest.$ac_ext <<_ACEOF
       program main
 
       end
@@ -2587,17 +2446,17 @@
 # 1 to this macro) to the Fortran compiler in order to get
 # "verbose" output that we can then parse for the Fortran linker
 # flags.
-ac_save_FCFLAGS=$FCFLAGS
+ac_save_FFLAGS=$FCFLAGS
 FCFLAGS="$FCFLAGS $ac_verb"
 eval "set x $ac_link"
 shift
-$as_echo "$as_me:${as_lineno-$LINENO}: $*" >&5
+$as_echo "$as_me:$LINENO: $*" >&5
 # gfortran 4.3 outputs lines setting COLLECT_GCC_OPTIONS, COMPILER_PATH,
 # LIBRARY_PATH; skip all such settings.
 ac_fc_v_output=`eval $ac_link 5>&1 2>&1 |
   grep -v 'Driving:' | grep -v "^[_$as_cr_Letters][_$as_cr_alnum]*="`
 $as_echo "$ac_fc_v_output" >&5
-FCFLAGS=$ac_save_FCFLAGS
+FCFLAGS=$ac_save_FFLAGS
 
 rm -rf conftest*
 
@@ -2626,9 +2485,9 @@
   # Doubly-quoted arguments were reported for "PGF90/x86 Linux/x86 5.0-2".
   *-cmdline\ * | *-ignore\ * | *-def\ *)
     ac_fc_v_output=`echo $ac_fc_v_output | sed "\
-	s/-cmdline  *'[^']*'/ /g; s/-cmdline  *\"[^\"]*\"/ /g
-	s/-ignore  *'[^']*'/ /g; s/-ignore  *\"[^\"]*\"/ /g
-	s/-def  *'[^']*'/ /g; s/-def  *\"[^\"]*\"/ /g"` ;;
+        s/-cmdline  *'[^']*'/ /g; s/-cmdline  *\"[^\"]*\"/ /g
+        s/-ignore  *'[^']*'/ /g; s/-ignore  *\"[^\"]*\"/ /g
+        s/-def  *'[^']*'/ /g; s/-def  *\"[^\"]*\"/ /g"` ;;
 
   # If we are using Cray Fortran then delete quotes.
   *cft90*)
@@ -2639,35 +2498,39 @@
   # look for -l* and *.a constructs in the output
   for ac_arg in $ac_fc_v_output; do
      case $ac_arg in
-	[\\/]*.a | ?:[\\/]*.a | -[lLRu]*)
-	  ac_cv_prog_fc_v=$ac_verb
-	  break 2 ;;
+        [\\/]*.a | ?:[\\/]*.a | -[lLRu]*)
+          ac_cv_prog_fc_v=$ac_verb
+          break 2 ;;
      esac
   done
 done
 if test -z "$ac_cv_prog_fc_v"; then
-   { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cannot determine how to obtain linking information from $FC" >&5
+   { $as_echo "$as_me:$LINENO: WARNING: cannot determine how to obtain linking information from $FC" >&5
 $as_echo "$as_me: WARNING: cannot determine how to obtain linking information from $FC" >&2;}
 fi
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: compilation failed" >&5
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	{ $as_echo "$as_me:$LINENO: WARNING: compilation failed" >&5
 $as_echo "$as_me: WARNING: compilation failed" >&2;}
 fi
+
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_fc_v" >&5
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_prog_fc_v" >&5
 $as_echo "$ac_cv_prog_fc_v" >&6; }
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Fortran libraries of $FC" >&5
+{ $as_echo "$as_me:$LINENO: checking for Fortran libraries of $FC" >&5
 $as_echo_n "checking for Fortran libraries of $FC... " >&6; }
-if test "${ac_cv_fc_libs+set}" = set; then :
+if test "${ac_cv_fc_libs+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
   if test "x$FCLIBS" != "x"; then
   ac_cv_fc_libs="$FCLIBS" # Let the user override the test.
 else
 
-cat > conftest.$ac_ext <<_ACEOF
+cat >conftest.$ac_ext <<_ACEOF
       program main
 
       end
@@ -2677,17 +2540,17 @@
 # 1 to this macro) to the Fortran compiler in order to get
 # "verbose" output that we can then parse for the Fortran linker
 # flags.
-ac_save_FCFLAGS=$FCFLAGS
+ac_save_FFLAGS=$FCFLAGS
 FCFLAGS="$FCFLAGS $ac_cv_prog_fc_v"
 eval "set x $ac_link"
 shift
-$as_echo "$as_me:${as_lineno-$LINENO}: $*" >&5
+$as_echo "$as_me:$LINENO: $*" >&5
 # gfortran 4.3 outputs lines setting COLLECT_GCC_OPTIONS, COMPILER_PATH,
 # LIBRARY_PATH; skip all such settings.
 ac_fc_v_output=`eval $ac_link 5>&1 2>&1 |
   grep -v 'Driving:' | grep -v "^[_$as_cr_Letters][_$as_cr_alnum]*="`
 $as_echo "$ac_fc_v_output" >&5
-FCFLAGS=$ac_save_FCFLAGS
+FCFLAGS=$ac_save_FFLAGS
 
 rm -rf conftest*
 
@@ -2716,9 +2579,9 @@
   # Doubly-quoted arguments were reported for "PGF90/x86 Linux/x86 5.0-2".
   *-cmdline\ * | *-ignore\ * | *-def\ *)
     ac_fc_v_output=`echo $ac_fc_v_output | sed "\
-	s/-cmdline  *'[^']*'/ /g; s/-cmdline  *\"[^\"]*\"/ /g
-	s/-ignore  *'[^']*'/ /g; s/-ignore  *\"[^\"]*\"/ /g
-	s/-def  *'[^']*'/ /g; s/-def  *\"[^\"]*\"/ /g"` ;;
+        s/-cmdline  *'[^']*'/ /g; s/-cmdline  *\"[^\"]*\"/ /g
+        s/-ignore  *'[^']*'/ /g; s/-ignore  *\"[^\"]*\"/ /g
+        s/-def  *'[^']*'/ /g; s/-def  *\"[^\"]*\"/ /g"` ;;
 
   # If we are using Cray Fortran then delete quotes.
   *cft90*)
@@ -2737,8 +2600,8 @@
   shift
   ac_arg=$1
   case $ac_arg in
-	[\\/]*.a | ?:[\\/]*.a)
-	    ac_exists=false
+        [\\/]*.a | ?:[\\/]*.a)
+            ac_exists=false
   for ac_i in $ac_cv_fc_libs; do
     if test x"$ac_arg" = x"$ac_i"; then
       ac_exists=true
@@ -2746,14 +2609,15 @@
     fi
   done
 
-  if test x"$ac_exists" = xtrue; then :
-
+  if test x"$ac_exists" = xtrue; then
+  :
 else
   ac_cv_fc_libs="$ac_cv_fc_libs $ac_arg"
 fi
-	  ;;
-	-bI:*)
-	    ac_exists=false
+
+          ;;
+        -bI:*)
+            ac_exists=false
   for ac_i in $ac_cv_fc_libs; do
     if test x"$ac_arg" = x"$ac_i"; then
       ac_exists=true
@@ -2761,8 +2625,8 @@
     fi
   done
 
-  if test x"$ac_exists" = xtrue; then :
-
+  if test x"$ac_exists" = xtrue; then
+  :
 else
   if test "$ac_compiler_gnu" = yes; then
   for ac_link_opt in $ac_arg; do
@@ -2772,18 +2636,18 @@
   ac_cv_fc_libs="$ac_cv_fc_libs $ac_arg"
 fi
 fi
-	  ;;
-	  # Ignore these flags.
-	-lang* | -lcrt*.o | -lc | -lgcc* | -lSystem | -libmil | -little \
-	  |-LANG:=* | -LIST:* | -LNO:* | -link)
-	  ;;
-	-lkernel32)
-	  test x"$CYGWIN" != xyes && ac_cv_fc_libs="$ac_cv_fc_libs $ac_arg"
-	  ;;
-	-[LRuYz])
-	  # These flags, when seen by themselves, take an argument.
-	  # We remove the space between option and argument and re-iterate
-	  # unless we find an empty arg or a new option (starting with -)
+
+          ;;
+          # Ignore these flags.
+        -lang* | -lcrt*.o | -lc | -lgcc* | -lSystem | -libmil | -LANG:=* | -LIST:* | -LNO:*)
+          ;;
+        -lkernel32)
+          test x"$CYGWIN" != xyes && ac_cv_fc_libs="$ac_cv_fc_libs $ac_arg"
+          ;;
+        -[LRuYz])
+          # These flags, when seen by themselves, take an argument.
+          # We remove the space between option and argument and re-iterate
+          # unless we find an empty arg or a new option (starting with -)
 	  case $2 in
 	     "" | -*);;
 	     *)
@@ -2792,10 +2656,10 @@
 		set X $ac_arg "$@"
 		;;
 	  esac
-	  ;;
-	-YP,*)
-	  for ac_j in `$as_echo "$ac_arg" | sed -e 's/-YP,/-L/;s/:/ -L/g'`; do
-	      ac_exists=false
+          ;;
+        -YP,*)
+          for ac_j in `$as_echo "$ac_arg" | sed -e 's/-YP,/-L/;s/:/ -L/g'`; do
+              ac_exists=false
   for ac_i in $ac_cv_fc_libs; do
     if test x"$ac_j" = x"$ac_i"; then
       ac_exists=true
@@ -2803,16 +2667,17 @@
     fi
   done
 
-  if test x"$ac_exists" = xtrue; then :
-
+  if test x"$ac_exists" = xtrue; then
+  :
 else
   ac_arg="$ac_arg $ac_j"
-			       ac_cv_fc_libs="$ac_cv_fc_libs $ac_j"
+                               ac_cv_fc_libs="$ac_cv_fc_libs $ac_j"
 fi
-	  done
-	  ;;
-	-[lLR]*)
-	    ac_exists=false
+
+          done
+          ;;
+        -[lLR]*)
+            ac_exists=false
   for ac_i in $ac_cv_fc_libs; do
     if test x"$ac_arg" = x"$ac_i"; then
       ac_exists=true
@@ -2820,16 +2685,17 @@
     fi
   done
 
-  if test x"$ac_exists" = xtrue; then :
-
+  if test x"$ac_exists" = xtrue; then
+  :
 else
   ac_cv_fc_libs="$ac_cv_fc_libs $ac_arg"
 fi
-	  ;;
+
+          ;;
 	-zallextract*| -zdefaultextract)
 	  ac_cv_fc_libs="$ac_cv_fc_libs $ac_arg"
 	  ;;
-	  # Ignore everything else.
+          # Ignore everything else.
   esac
 done
 # restore positional arguments
@@ -2841,9 +2707,9 @@
 case `(uname -sr) 2>/dev/null` in
    "SunOS 5"*)
       ac_ld_run_path=`$as_echo "$ac_fc_v_output" |
-			sed -n 's,^.*LD_RUN_PATH *= *\(/[^ ]*\).*$,-R\1,p'`
+                        sed -n 's,^.*LD_RUN_PATH *= *\(/[^ ]*\).*$,-R\1,p'`
       test "x$ac_ld_run_path" != x &&
-	if test "$ac_compiler_gnu" = yes; then
+        if test "$ac_compiler_gnu" = yes; then
   for ac_link_opt in $ac_ld_run_path; do
     ac_cv_fc_libs="$ac_cv_fc_libs -Xlinker $ac_link_opt"
   done
@@ -2855,7 +2721,7 @@
 fi # test "x$[]_AC_LANG_PREFIX[]LIBS" = "x"
 
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_fc_libs" >&5
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_fc_libs" >&5
 $as_echo "$ac_cv_fc_libs" >&6; }
 FCLIBS="$ac_cv_fc_libs"
 
@@ -2874,9 +2740,9 @@
 if test -n "$ac_tool_prefix"; then
   # Extract the first word of "${ac_tool_prefix}gcc", so it can be a program name with args.
 set dummy ${ac_tool_prefix}gcc; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
 $as_echo_n "checking for $ac_word... " >&6; }
-if test "${ac_cv_prog_CC+set}" = set; then :
+if test "${ac_cv_prog_CC+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
   if test -n "$CC"; then
@@ -2887,24 +2753,24 @@
 do
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
+  for ac_exec_ext in '' $ac_executable_extensions; do
   if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
     ac_cv_prog_CC="${ac_tool_prefix}gcc"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
   fi
 done
-  done
+done
 IFS=$as_save_IFS
 
 fi
 fi
 CC=$ac_cv_prog_CC
 if test -n "$CC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+  { $as_echo "$as_me:$LINENO: result: $CC" >&5
 $as_echo "$CC" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+  { $as_echo "$as_me:$LINENO: result: no" >&5
 $as_echo "no" >&6; }
 fi
 
@@ -2914,9 +2780,9 @@
   ac_ct_CC=$CC
   # Extract the first word of "gcc", so it can be a program name with args.
 set dummy gcc; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
 $as_echo_n "checking for $ac_word... " >&6; }
-if test "${ac_cv_prog_ac_ct_CC+set}" = set; then :
+if test "${ac_cv_prog_ac_ct_CC+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
   if test -n "$ac_ct_CC"; then
@@ -2927,24 +2793,24 @@
 do
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
+  for ac_exec_ext in '' $ac_executable_extensions; do
   if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
     ac_cv_prog_ac_ct_CC="gcc"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
   fi
 done
-  done
+done
 IFS=$as_save_IFS
 
 fi
 fi
 ac_ct_CC=$ac_cv_prog_ac_ct_CC
 if test -n "$ac_ct_CC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5
+  { $as_echo "$as_me:$LINENO: result: $ac_ct_CC" >&5
 $as_echo "$ac_ct_CC" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+  { $as_echo "$as_me:$LINENO: result: no" >&5
 $as_echo "no" >&6; }
 fi
 
@@ -2953,7 +2819,7 @@
   else
     case $cross_compiling:$ac_tool_warned in
 yes:)
-{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+{ $as_echo "$as_me:$LINENO: WARNING: using cross tools not prefixed with host triplet" >&5
 $as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
 ac_tool_warned=yes ;;
 esac
@@ -2967,9 +2833,9 @@
           if test -n "$ac_tool_prefix"; then
     # Extract the first word of "${ac_tool_prefix}cc", so it can be a program name with args.
 set dummy ${ac_tool_prefix}cc; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
 $as_echo_n "checking for $ac_word... " >&6; }
-if test "${ac_cv_prog_CC+set}" = set; then :
+if test "${ac_cv_prog_CC+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
   if test -n "$CC"; then
@@ -2980,24 +2846,24 @@
 do
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
+  for ac_exec_ext in '' $ac_executable_extensions; do
   if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
     ac_cv_prog_CC="${ac_tool_prefix}cc"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
   fi
 done
-  done
+done
 IFS=$as_save_IFS
 
 fi
 fi
 CC=$ac_cv_prog_CC
 if test -n "$CC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+  { $as_echo "$as_me:$LINENO: result: $CC" >&5
 $as_echo "$CC" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+  { $as_echo "$as_me:$LINENO: result: no" >&5
 $as_echo "no" >&6; }
 fi
 
@@ -3007,9 +2873,9 @@
 if test -z "$CC"; then
   # Extract the first word of "cc", so it can be a program name with args.
 set dummy cc; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
 $as_echo_n "checking for $ac_word... " >&6; }
-if test "${ac_cv_prog_CC+set}" = set; then :
+if test "${ac_cv_prog_CC+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
   if test -n "$CC"; then
@@ -3021,18 +2887,18 @@
 do
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
+  for ac_exec_ext in '' $ac_executable_extensions; do
   if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
     if test "$as_dir/$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then
        ac_prog_rejected=yes
        continue
      fi
     ac_cv_prog_CC="cc"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
   fi
 done
-  done
+done
 IFS=$as_save_IFS
 
 if test $ac_prog_rejected = yes; then
@@ -3051,10 +2917,10 @@
 fi
 CC=$ac_cv_prog_CC
 if test -n "$CC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+  { $as_echo "$as_me:$LINENO: result: $CC" >&5
 $as_echo "$CC" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+  { $as_echo "$as_me:$LINENO: result: no" >&5
 $as_echo "no" >&6; }
 fi
 
@@ -3066,9 +2932,9 @@
   do
     # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
 set dummy $ac_tool_prefix$ac_prog; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
 $as_echo_n "checking for $ac_word... " >&6; }
-if test "${ac_cv_prog_CC+set}" = set; then :
+if test "${ac_cv_prog_CC+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
   if test -n "$CC"; then
@@ -3079,24 +2945,24 @@
 do
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
+  for ac_exec_ext in '' $ac_executable_extensions; do
   if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
     ac_cv_prog_CC="$ac_tool_prefix$ac_prog"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
   fi
 done
-  done
+done
 IFS=$as_save_IFS
 
 fi
 fi
 CC=$ac_cv_prog_CC
 if test -n "$CC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+  { $as_echo "$as_me:$LINENO: result: $CC" >&5
 $as_echo "$CC" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+  { $as_echo "$as_me:$LINENO: result: no" >&5
 $as_echo "no" >&6; }
 fi
 
@@ -3110,9 +2976,9 @@
 do
   # Extract the first word of "$ac_prog", so it can be a program name with args.
 set dummy $ac_prog; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
 $as_echo_n "checking for $ac_word... " >&6; }
-if test "${ac_cv_prog_ac_ct_CC+set}" = set; then :
+if test "${ac_cv_prog_ac_ct_CC+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
   if test -n "$ac_ct_CC"; then
@@ -3123,24 +2989,24 @@
 do
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
+  for ac_exec_ext in '' $ac_executable_extensions; do
   if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
     ac_cv_prog_ac_ct_CC="$ac_prog"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
   fi
 done
-  done
+done
 IFS=$as_save_IFS
 
 fi
 fi
 ac_ct_CC=$ac_cv_prog_ac_ct_CC
 if test -n "$ac_ct_CC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5
+  { $as_echo "$as_me:$LINENO: result: $ac_ct_CC" >&5
 $as_echo "$ac_ct_CC" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+  { $as_echo "$as_me:$LINENO: result: no" >&5
 $as_echo "no" >&6; }
 fi
 
@@ -3153,7 +3019,7 @@
   else
     case $cross_compiling:$ac_tool_warned in
 yes:)
-{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+{ $as_echo "$as_me:$LINENO: WARNING: using cross tools not prefixed with host triplet" >&5
 $as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
 ac_tool_warned=yes ;;
 esac
@@ -3164,42 +3030,62 @@
 fi
 
 
-test -z "$CC" && { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+test -z "$CC" && { { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
 $as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "no acceptable C compiler found in \$PATH
-See \`config.log' for more details" "$LINENO" 5 ; }
+{ { $as_echo "$as_me:$LINENO: error: no acceptable C compiler found in \$PATH
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: no acceptable C compiler found in \$PATH
+See \`config.log' for more details." >&2;}
+   { (exit 1); exit 1; }; }; }
 
 # Provide some information about the compiler.
-$as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler version" >&5
+$as_echo "$as_me:$LINENO: checking for C compiler version" >&5
 set X $ac_compile
 ac_compiler=$2
-for ac_option in --version -v -V -qversion; do
-  { { ac_try="$ac_compiler $ac_option >&5"
+{ (ac_try="$ac_compiler --version >&5"
 case "(($ac_try" in
   *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
   *) ac_try_echo=$ac_try;;
 esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_compiler $ac_option >&5") 2>conftest.err
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compiler --version >&5") 2>&5
   ac_status=$?
-  if test -s conftest.err; then
-    sed '10a\
-... rest of stderr output deleted ...
-         10q' conftest.err >conftest.er1
-    cat conftest.er1 >&5
-  fi
-  rm -f conftest.er1 conftest.err
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }
-done
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }
+{ (ac_try="$ac_compiler -v >&5"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compiler -v >&5") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }
+{ (ac_try="$ac_compiler -V >&5"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compiler -V >&5") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C compiler" >&5
+{ $as_echo "$as_me:$LINENO: checking whether we are using the GNU C compiler" >&5
 $as_echo_n "checking whether we are using the GNU C compiler... " >&6; }
-if test "${ac_cv_c_compiler_gnu+set}" = set; then :
+if test "${ac_cv_c_compiler_gnu+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
 /* end confdefs.h.  */
 
 #ifdef FC_DUMMY_MAIN
@@ -3221,16 +3107,37 @@
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
   ac_compiler_gnu=yes
 else
-  ac_compiler_gnu=no
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_compiler_gnu=no
 fi
+
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 ac_cv_c_compiler_gnu=$ac_compiler_gnu
 
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_compiler_gnu" >&5
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_c_compiler_gnu" >&5
 $as_echo "$ac_cv_c_compiler_gnu" >&6; }
 if test $ac_compiler_gnu = yes; then
   GCC=yes
@@ -3239,16 +3146,20 @@
 fi
 ac_test_CFLAGS=${CFLAGS+set}
 ac_save_CFLAGS=$CFLAGS
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC accepts -g" >&5
+{ $as_echo "$as_me:$LINENO: checking whether $CC accepts -g" >&5
 $as_echo_n "checking whether $CC accepts -g... " >&6; }
-if test "${ac_cv_prog_cc_g+set}" = set; then :
+if test "${ac_cv_prog_cc_g+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
   ac_save_c_werror_flag=$ac_c_werror_flag
    ac_c_werror_flag=yes
    ac_cv_prog_cc_g=no
    CFLAGS="-g"
-   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+   cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
 /* end confdefs.h.  */
 
 #ifdef FC_DUMMY_MAIN
@@ -3267,11 +3178,35 @@
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
   ac_cv_prog_cc_g=yes
 else
-  CFLAGS=""
-      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	CFLAGS=""
+      cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
 /* end confdefs.h.  */
 
 #ifdef FC_DUMMY_MAIN
@@ -3290,12 +3225,36 @@
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  :
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
 
-else
-  ac_c_werror_flag=$ac_save_c_werror_flag
+	ac_c_werror_flag=$ac_save_c_werror_flag
 	 CFLAGS="-g"
-	 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+	 cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
 /* end confdefs.h.  */
 
 #ifdef FC_DUMMY_MAIN
@@ -3314,17 +3273,42 @@
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
   ac_cv_prog_cc_g=yes
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
 fi
+
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 fi
+
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 fi
+
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
    ac_c_werror_flag=$ac_save_c_werror_flag
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_g" >&5
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_prog_cc_g" >&5
 $as_echo "$ac_cv_prog_cc_g" >&6; }
 if test "$ac_test_CFLAGS" = set; then
   CFLAGS=$ac_save_CFLAGS
@@ -3341,14 +3325,18 @@
     CFLAGS=
   fi
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C89" >&5
+{ $as_echo "$as_me:$LINENO: checking for $CC option to accept ISO C89" >&5
 $as_echo_n "checking for $CC option to accept ISO C89... " >&6; }
-if test "${ac_cv_prog_cc_c89+set}" = set; then :
+if test "${ac_cv_prog_cc_c89+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
   ac_cv_prog_cc_c89=no
 ac_save_CC=$CC
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
 /* end confdefs.h.  */
 #include <stdarg.h>
 #include <stdio.h>
@@ -3413,9 +3401,32 @@
 	-Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__"
 do
   CC="$ac_save_CC $ac_arg"
-  if ac_fn_c_try_compile "$LINENO"; then :
+  rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
   ac_cv_prog_cc_c89=$ac_arg
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
 fi
+
 rm -f core conftest.err conftest.$ac_objext
   test "x$ac_cv_prog_cc_c89" != "xno" && break
 done
@@ -3426,19 +3437,17 @@
 # AC_CACHE_VAL
 case "x$ac_cv_prog_cc_c89" in
   x)
-    { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5
+    { $as_echo "$as_me:$LINENO: result: none needed" >&5
 $as_echo "none needed" >&6; } ;;
   xno)
-    { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5
+    { $as_echo "$as_me:$LINENO: result: unsupported" >&5
 $as_echo "unsupported" >&6; } ;;
   *)
     CC="$CC $ac_cv_prog_cc_c89"
-    { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c89" >&5
+    { $as_echo "$as_me:$LINENO: result: $ac_cv_prog_cc_c89" >&5
 $as_echo "$ac_cv_prog_cc_c89" >&6; } ;;
 esac
-if test "x$ac_cv_prog_cc_c89" != xno; then :
 
-fi
 
 ac_ext=c
 ac_cpp='$CPP $CPPFLAGS'
@@ -3452,9 +3461,9 @@
 ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5'
 ac_compiler_gnu=$ac_cv_fc_compiler_gnu
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for dummy main to link with Fortran libraries" >&5
+{ $as_echo "$as_me:$LINENO: checking for dummy main to link with Fortran libraries" >&5
 $as_echo_n "checking for dummy main to link with Fortran libraries... " >&6; }
-if test "${ac_cv_fc_dummy_main+set}" = set; then :
+if test "${ac_cv_fc_dummy_main+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
   ac_fc_dm_save_LIBS=$LIBS
@@ -3467,7 +3476,11 @@
 ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
  # First, try linking without a dummy main:
- cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+ cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
 /* end confdefs.h.  */
 
 #ifdef FC_DUMMY_MAIN
@@ -3486,17 +3499,46 @@
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_link") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest$ac_exeext && {
+	 test "$cross_compiling" = yes ||
+	 $as_test_x conftest$ac_exeext
+       }; then
   ac_cv_fortran_dummy_main=none
 else
-  ac_cv_fortran_dummy_main=unknown
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_cv_fortran_dummy_main=unknown
 fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
 
+rm -rf conftest.dSYM
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
+      conftest$ac_exeext conftest.$ac_ext
+
  if test $ac_cv_fortran_dummy_main = unknown; then
    for ac_func in MAIN__ MAIN_ __main MAIN _MAIN __MAIN main_ main__ _main; do
-     cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+     cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
 /* end confdefs.h.  */
 #define $ac_fortran_dm_var $ac_func
 #ifdef FC_DUMMY_MAIN
@@ -3515,11 +3557,38 @@
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_link") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest$ac_exeext && {
+	 test "$cross_compiling" = yes ||
+	 $as_test_x conftest$ac_exeext
+       }; then
   ac_cv_fortran_dummy_main=$ac_func; break
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
 fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
+
+rm -rf conftest.dSYM
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
+      conftest$ac_exeext conftest.$ac_ext
    done
  fi
  ac_ext=${ac_fc_srcext-f}
@@ -3531,10 +3600,10 @@
  LIBS=$ac_fc_dm_save_LIBS
 
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_fc_dummy_main" >&5
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_fc_dummy_main" >&5
 $as_echo "$ac_cv_fc_dummy_main" >&6; }
 FC_DUMMY_MAIN=$ac_cv_fc_dummy_main
-if test "$FC_DUMMY_MAIN" != unknown; then :
+if test "$FC_DUMMY_MAIN" != unknown; then
   if test $FC_DUMMY_MAIN != none; then
 
 cat >>confdefs.h <<_ACEOF
@@ -3543,17 +3612,23 @@
 
   if test "x$ac_cv_fc_dummy_main" = "x$ac_cv_f77_dummy_main"; then
 
-$as_echo "#define FC_DUMMY_MAIN_EQ_F77 1" >>confdefs.h
+cat >>confdefs.h <<\_ACEOF
+#define FC_DUMMY_MAIN_EQ_F77 1
+_ACEOF
 
   fi
 fi
 else
-  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+  { { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
 $as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "linking to Fortran libraries from C fails
-See \`config.log' for more details" "$LINENO" 5 ; }
+{ { $as_echo "$as_me:$LINENO: error: linking to Fortran libraries from C fails
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: linking to Fortran libraries from C fails
+See \`config.log' for more details." >&2;}
+   { (exit 1); exit 1; }; }; }
 fi
 
+
 ac_ext=c
 ac_cpp='$CPP $CPPFLAGS'
 ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
@@ -3564,12 +3639,12 @@
 ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5'
 ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5'
 ac_compiler_gnu=$ac_cv_fc_compiler_gnu
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Fortran name-mangling scheme" >&5
+{ $as_echo "$as_me:$LINENO: checking for Fortran name-mangling scheme" >&5
 $as_echo_n "checking for Fortran name-mangling scheme... " >&6; }
-if test "${ac_cv_fc_mangling+set}" = set; then :
+if test "${ac_cv_fc_mangling+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
-  cat > conftest.$ac_ext <<_ACEOF
+  cat >conftest.$ac_ext <<_ACEOF
       subroutine foobar()
       return
       end
@@ -3577,7 +3652,24 @@
       return
       end
 _ACEOF
-if ac_fn_fc_try_compile "$LINENO"; then :
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_fc_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
   mv conftest.$ac_objext cfortran_test.$ac_objext
 
   ac_save_LIBS=$LIBS
@@ -3592,7 +3684,11 @@
   for ac_foobar in foobar FOOBAR; do
     for ac_underscore in "" "_"; do
       ac_func="$ac_foobar$ac_underscore"
-      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+      cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
 /* end confdefs.h.  */
 
 /* Override any GCC internal prototype to avoid an error.
@@ -3618,11 +3714,38 @@
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_link") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest$ac_exeext && {
+	 test "$cross_compiling" = yes ||
+	 $as_test_x conftest$ac_exeext
+       }; then
   ac_success=yes; break 2
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
 fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
+
+rm -rf conftest.dSYM
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
+      conftest$ac_exeext conftest.$ac_ext
     done
   done
   ac_ext=${ac_fc_srcext-f}
@@ -3650,7 +3773,11 @@
      ac_success_extra=no
      for ac_extra in "" "_"; do
 	ac_func="$ac_foo_bar$ac_underscore$ac_extra"
-	cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+	cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
 /* end confdefs.h.  */
 
 /* Override any GCC internal prototype to avoid an error.
@@ -3676,11 +3803,38 @@
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_link "$LINENO"; then :
+rm -f conftest.$ac_objext conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_link") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest$ac_exeext && {
+	 test "$cross_compiling" = yes ||
+	 $as_test_x conftest$ac_exeext
+       }; then
   ac_success_extra=yes; break
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
 fi
-rm -f core conftest.err conftest.$ac_objext \
-    conftest$ac_exeext conftest.$ac_ext
+
+rm -rf conftest.dSYM
+rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
+      conftest$ac_exeext conftest.$ac_ext
      done
      ac_ext=${ac_fc_srcext-f}
 ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5'
@@ -3689,16 +3843,16 @@
 
      if test "$ac_success_extra" = "yes"; then
 	ac_cv_fc_mangling="$ac_case case"
-	if test -z "$ac_underscore"; then
-	   ac_cv_fc_mangling="$ac_cv_fc_mangling, no underscore"
+        if test -z "$ac_underscore"; then
+           ac_cv_fc_mangling="$ac_cv_fc_mangling, no underscore"
 	else
-	   ac_cv_fc_mangling="$ac_cv_fc_mangling, underscore"
-	fi
-	if test -z "$ac_extra"; then
-	   ac_cv_fc_mangling="$ac_cv_fc_mangling, no extra underscore"
+           ac_cv_fc_mangling="$ac_cv_fc_mangling, underscore"
+        fi
+        if test -z "$ac_extra"; then
+           ac_cv_fc_mangling="$ac_cv_fc_mangling, no extra underscore"
 	else
-	   ac_cv_fc_mangling="$ac_cv_fc_mangling, extra underscore"
-	fi
+           ac_cv_fc_mangling="$ac_cv_fc_mangling, extra underscore"
+        fi
       else
 	ac_cv_fc_mangling="unknown"
       fi
@@ -3710,15 +3864,22 @@
   rm -rf conftest*
   rm -f cfortran_test*
 else
-  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	{ { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
 $as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "cannot compile a simple Fortran program
-See \`config.log' for more details" "$LINENO" 5 ; }
+{ { $as_echo "$as_me:$LINENO: error: cannot compile a simple Fortran program
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: cannot compile a simple Fortran program
+See \`config.log' for more details." >&2;}
+   { (exit 1); exit 1; }; }; }
 fi
+
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_fc_mangling" >&5
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_fc_mangling" >&5
 $as_echo "$ac_cv_fc_mangling" >&6; }
 
 ac_ext=c
@@ -3731,51 +3892,85 @@
 ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5'
 ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5'
 ac_compiler_gnu=$ac_cv_fc_compiler_gnu
+
+
 case $ac_cv_fc_mangling in
   "lower case, no underscore, no extra underscore")
-	  $as_echo "#define FC_FUNC(name,NAME) name" >>confdefs.h
+          cat >>confdefs.h <<\_ACEOF
+#define FC_FUNC(name,NAME) name
+_ACEOF
 
-	  $as_echo "#define FC_FUNC_(name,NAME) name" >>confdefs.h
+          cat >>confdefs.h <<\_ACEOF
+#define FC_FUNC_(name,NAME) name
+_ACEOF
  ;;
   "lower case, no underscore, extra underscore")
-	  $as_echo "#define FC_FUNC(name,NAME) name" >>confdefs.h
+          cat >>confdefs.h <<\_ACEOF
+#define FC_FUNC(name,NAME) name
+_ACEOF
 
-	  $as_echo "#define FC_FUNC_(name,NAME) name ## _" >>confdefs.h
+          cat >>confdefs.h <<\_ACEOF
+#define FC_FUNC_(name,NAME) name ## _
+_ACEOF
  ;;
   "lower case, underscore, no extra underscore")
-	  $as_echo "#define FC_FUNC(name,NAME) name ## _" >>confdefs.h
+          cat >>confdefs.h <<\_ACEOF
+#define FC_FUNC(name,NAME) name ## _
+_ACEOF
 
-	  $as_echo "#define FC_FUNC_(name,NAME) name ## _" >>confdefs.h
+          cat >>confdefs.h <<\_ACEOF
+#define FC_FUNC_(name,NAME) name ## _
+_ACEOF
  ;;
   "lower case, underscore, extra underscore")
-	  $as_echo "#define FC_FUNC(name,NAME) name ## _" >>confdefs.h
+          cat >>confdefs.h <<\_ACEOF
+#define FC_FUNC(name,NAME) name ## _
+_ACEOF
 
-	  $as_echo "#define FC_FUNC_(name,NAME) name ## __" >>confdefs.h
+          cat >>confdefs.h <<\_ACEOF
+#define FC_FUNC_(name,NAME) name ## __
+_ACEOF
  ;;
   "upper case, no underscore, no extra underscore")
-	  $as_echo "#define FC_FUNC(name,NAME) NAME" >>confdefs.h
+          cat >>confdefs.h <<\_ACEOF
+#define FC_FUNC(name,NAME) NAME
+_ACEOF
 
-	  $as_echo "#define FC_FUNC_(name,NAME) NAME" >>confdefs.h
+          cat >>confdefs.h <<\_ACEOF
+#define FC_FUNC_(name,NAME) NAME
+_ACEOF
  ;;
   "upper case, no underscore, extra underscore")
-	  $as_echo "#define FC_FUNC(name,NAME) NAME" >>confdefs.h
+          cat >>confdefs.h <<\_ACEOF
+#define FC_FUNC(name,NAME) NAME
+_ACEOF
 
-	  $as_echo "#define FC_FUNC_(name,NAME) NAME ## _" >>confdefs.h
+          cat >>confdefs.h <<\_ACEOF
+#define FC_FUNC_(name,NAME) NAME ## _
+_ACEOF
  ;;
   "upper case, underscore, no extra underscore")
-	  $as_echo "#define FC_FUNC(name,NAME) NAME ## _" >>confdefs.h
+          cat >>confdefs.h <<\_ACEOF
+#define FC_FUNC(name,NAME) NAME ## _
+_ACEOF
 
-	  $as_echo "#define FC_FUNC_(name,NAME) NAME ## _" >>confdefs.h
+          cat >>confdefs.h <<\_ACEOF
+#define FC_FUNC_(name,NAME) NAME ## _
+_ACEOF
  ;;
   "upper case, underscore, extra underscore")
-	  $as_echo "#define FC_FUNC(name,NAME) NAME ## _" >>confdefs.h
+          cat >>confdefs.h <<\_ACEOF
+#define FC_FUNC(name,NAME) NAME ## _
+_ACEOF
 
-	  $as_echo "#define FC_FUNC_(name,NAME) NAME ## __" >>confdefs.h
+          cat >>confdefs.h <<\_ACEOF
+#define FC_FUNC_(name,NAME) NAME ## __
+_ACEOF
  ;;
   *)
-	  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unknown Fortran name-mangling scheme" >&5
+          { $as_echo "$as_me:$LINENO: WARNING: unknown Fortran name-mangling scheme" >&5
 $as_echo "$as_me: WARNING: unknown Fortran name-mangling scheme" >&2;}
-	  ;;
+          ;;
 esac
 
 ac_ext=c
@@ -3793,9 +3988,9 @@
 ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5'
 ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5'
 ac_compiler_gnu=$ac_cv_fc_compiler_gnu
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Fortran flag to compile .f90 files" >&5
+{ $as_echo "$as_me:$LINENO: checking for Fortran flag to compile .f90 files" >&5
 $as_echo_n "checking for Fortran flag to compile .f90 files... " >&6; }
-if test "${ac_cv_fc_srcext_f90+set}" = set; then :
+if test "${ac_cv_fc_srcext_f90+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
   ac_ext=f90
@@ -3804,24 +3999,49 @@
 ac_cv_fc_srcext_f90=unknown
 for ac_flag in none -qsuffix=f=f90 -Tf; do
   test "x$ac_flag" != xnone && ac_fcflags_srcext="$ac_flag"
-  cat > conftest.$ac_ext <<_ACEOF
+  cat >conftest.$ac_ext <<_ACEOF
       program main
 
       end
 _ACEOF
-if ac_fn_fc_try_compile "$LINENO"; then :
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_fc_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
   ac_cv_fc_srcext_f90=$ac_flag; break
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
 fi
+
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 done
 rm -f conftest.$ac_objext conftest.f90
 ac_fcflags_srcext=$ac_fcflags_srcext_save
 
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_fc_srcext_f90" >&5
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_fc_srcext_f90" >&5
 $as_echo "$ac_cv_fc_srcext_f90" >&6; }
 if test "x$ac_cv_fc_srcext_f90" = xunknown; then
-  as_fn_error $? "Fortran could not compile .f90 files" "$LINENO" 5
+  { { $as_echo "$as_me:$LINENO: error: Fortran could not compile .f90 files" >&5
+$as_echo "$as_me: error: Fortran could not compile .f90 files" >&2;}
+   { (exit 1); exit 1; }; }
 else
   ac_fc_srcext=f90
   if test "x$ac_cv_fc_srcext_f90" = xnone; then
@@ -3843,38 +4063,63 @@
 ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5'
 ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5'
 ac_compiler_gnu=$ac_cv_fc_compiler_gnu
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Fortran flag needed to accept free-form source" >&5
-$as_echo_n "checking for Fortran flag needed to accept free-form source... " >&6; }
-if test "${ac_cv_fc_freeform+set}" = set; then :
+{ $as_echo "$as_me:$LINENO: checking for Fortran flag needed to allow free-form source" >&5
+$as_echo_n "checking for Fortran flag needed to allow free-form source... " >&6; }
+if test "${ac_cv_fc_freeform+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
   ac_cv_fc_freeform=unknown
 ac_fc_freeform_FCFLAGS_save=$FCFLAGS
 for ac_flag in none -ffree-form -FR -free -qfree -Mfree -Mfreeform \
-	       -freeform "-f free" +source=free -nfix
+               -freeform "-f free"
 do
   test "x$ac_flag" != xnone && FCFLAGS="$ac_fc_freeform_FCFLAGS_save $ac_flag"
-  cat > conftest.$ac_ext <<_ACEOF
+  cat >conftest.$ac_ext <<_ACEOF
 
   program freeform
        ! FIXME: how to best confuse non-freeform compilers?
        print *, 'Hello ', &
-          'world.'
+           'world.'
        end
 _ACEOF
-if ac_fn_fc_try_compile "$LINENO"; then :
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_fc_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
   ac_cv_fc_freeform=$ac_flag; break
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
 fi
+
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 done
 rm -f conftest.err conftest.$ac_objext conftest.$ac_ext
 FCFLAGS=$ac_fc_freeform_FCFLAGS_save
 
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_fc_freeform" >&5
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_fc_freeform" >&5
 $as_echo "$ac_cv_fc_freeform" >&6; }
 if test "x$ac_cv_fc_freeform" = xunknown; then
-  as_fn_error 77 "Fortran does not accept free-form source" "$LINENO" 5
+  { { $as_echo "$as_me:$LINENO: error: Fortran does not accept free-form source" >&5
+$as_echo "$as_me: error: Fortran does not accept free-form source" >&2;}
+   { (exit 77); exit 77; }; }
 else
   if test "x$ac_cv_fc_freeform" != xnone; then
     FCFLAGS="$FCFLAGS $ac_cv_fc_freeform"
@@ -3895,9 +4140,9 @@
 if test -n "$ac_tool_prefix"; then
   # Extract the first word of "${ac_tool_prefix}gcc", so it can be a program name with args.
 set dummy ${ac_tool_prefix}gcc; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
 $as_echo_n "checking for $ac_word... " >&6; }
-if test "${ac_cv_prog_CC+set}" = set; then :
+if test "${ac_cv_prog_CC+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
   if test -n "$CC"; then
@@ -3908,24 +4153,24 @@
 do
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
+  for ac_exec_ext in '' $ac_executable_extensions; do
   if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
     ac_cv_prog_CC="${ac_tool_prefix}gcc"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
   fi
 done
-  done
+done
 IFS=$as_save_IFS
 
 fi
 fi
 CC=$ac_cv_prog_CC
 if test -n "$CC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+  { $as_echo "$as_me:$LINENO: result: $CC" >&5
 $as_echo "$CC" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+  { $as_echo "$as_me:$LINENO: result: no" >&5
 $as_echo "no" >&6; }
 fi
 
@@ -3935,9 +4180,9 @@
   ac_ct_CC=$CC
   # Extract the first word of "gcc", so it can be a program name with args.
 set dummy gcc; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
 $as_echo_n "checking for $ac_word... " >&6; }
-if test "${ac_cv_prog_ac_ct_CC+set}" = set; then :
+if test "${ac_cv_prog_ac_ct_CC+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
   if test -n "$ac_ct_CC"; then
@@ -3948,24 +4193,24 @@
 do
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
+  for ac_exec_ext in '' $ac_executable_extensions; do
   if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
     ac_cv_prog_ac_ct_CC="gcc"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
   fi
 done
-  done
+done
 IFS=$as_save_IFS
 
 fi
 fi
 ac_ct_CC=$ac_cv_prog_ac_ct_CC
 if test -n "$ac_ct_CC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5
+  { $as_echo "$as_me:$LINENO: result: $ac_ct_CC" >&5
 $as_echo "$ac_ct_CC" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+  { $as_echo "$as_me:$LINENO: result: no" >&5
 $as_echo "no" >&6; }
 fi
 
@@ -3974,7 +4219,7 @@
   else
     case $cross_compiling:$ac_tool_warned in
 yes:)
-{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+{ $as_echo "$as_me:$LINENO: WARNING: using cross tools not prefixed with host triplet" >&5
 $as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
 ac_tool_warned=yes ;;
 esac
@@ -3988,9 +4233,9 @@
           if test -n "$ac_tool_prefix"; then
     # Extract the first word of "${ac_tool_prefix}cc", so it can be a program name with args.
 set dummy ${ac_tool_prefix}cc; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
 $as_echo_n "checking for $ac_word... " >&6; }
-if test "${ac_cv_prog_CC+set}" = set; then :
+if test "${ac_cv_prog_CC+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
   if test -n "$CC"; then
@@ -4001,24 +4246,24 @@
 do
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
+  for ac_exec_ext in '' $ac_executable_extensions; do
   if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
     ac_cv_prog_CC="${ac_tool_prefix}cc"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
   fi
 done
-  done
+done
 IFS=$as_save_IFS
 
 fi
 fi
 CC=$ac_cv_prog_CC
 if test -n "$CC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+  { $as_echo "$as_me:$LINENO: result: $CC" >&5
 $as_echo "$CC" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+  { $as_echo "$as_me:$LINENO: result: no" >&5
 $as_echo "no" >&6; }
 fi
 
@@ -4028,9 +4273,9 @@
 if test -z "$CC"; then
   # Extract the first word of "cc", so it can be a program name with args.
 set dummy cc; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
 $as_echo_n "checking for $ac_word... " >&6; }
-if test "${ac_cv_prog_CC+set}" = set; then :
+if test "${ac_cv_prog_CC+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
   if test -n "$CC"; then
@@ -4042,18 +4287,18 @@
 do
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
+  for ac_exec_ext in '' $ac_executable_extensions; do
   if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
     if test "$as_dir/$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then
        ac_prog_rejected=yes
        continue
      fi
     ac_cv_prog_CC="cc"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
   fi
 done
-  done
+done
 IFS=$as_save_IFS
 
 if test $ac_prog_rejected = yes; then
@@ -4072,10 +4317,10 @@
 fi
 CC=$ac_cv_prog_CC
 if test -n "$CC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+  { $as_echo "$as_me:$LINENO: result: $CC" >&5
 $as_echo "$CC" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+  { $as_echo "$as_me:$LINENO: result: no" >&5
 $as_echo "no" >&6; }
 fi
 
@@ -4087,9 +4332,9 @@
   do
     # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
 set dummy $ac_tool_prefix$ac_prog; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
 $as_echo_n "checking for $ac_word... " >&6; }
-if test "${ac_cv_prog_CC+set}" = set; then :
+if test "${ac_cv_prog_CC+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
   if test -n "$CC"; then
@@ -4100,24 +4345,24 @@
 do
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
+  for ac_exec_ext in '' $ac_executable_extensions; do
   if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
     ac_cv_prog_CC="$ac_tool_prefix$ac_prog"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
   fi
 done
-  done
+done
 IFS=$as_save_IFS
 
 fi
 fi
 CC=$ac_cv_prog_CC
 if test -n "$CC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5
+  { $as_echo "$as_me:$LINENO: result: $CC" >&5
 $as_echo "$CC" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+  { $as_echo "$as_me:$LINENO: result: no" >&5
 $as_echo "no" >&6; }
 fi
 
@@ -4131,9 +4376,9 @@
 do
   # Extract the first word of "$ac_prog", so it can be a program name with args.
 set dummy $ac_prog; ac_word=$2
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+{ $as_echo "$as_me:$LINENO: checking for $ac_word" >&5
 $as_echo_n "checking for $ac_word... " >&6; }
-if test "${ac_cv_prog_ac_ct_CC+set}" = set; then :
+if test "${ac_cv_prog_ac_ct_CC+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
   if test -n "$ac_ct_CC"; then
@@ -4144,24 +4389,24 @@
 do
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
-    for ac_exec_ext in '' $ac_executable_extensions; do
+  for ac_exec_ext in '' $ac_executable_extensions; do
   if { test -f "$as_dir/$ac_word$ac_exec_ext" && $as_test_x "$as_dir/$ac_word$ac_exec_ext"; }; then
     ac_cv_prog_ac_ct_CC="$ac_prog"
-    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    $as_echo "$as_me:$LINENO: found $as_dir/$ac_word$ac_exec_ext" >&5
     break 2
   fi
 done
-  done
+done
 IFS=$as_save_IFS
 
 fi
 fi
 ac_ct_CC=$ac_cv_prog_ac_ct_CC
 if test -n "$ac_ct_CC"; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5
+  { $as_echo "$as_me:$LINENO: result: $ac_ct_CC" >&5
 $as_echo "$ac_ct_CC" >&6; }
 else
-  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+  { $as_echo "$as_me:$LINENO: result: no" >&5
 $as_echo "no" >&6; }
 fi
 
@@ -4174,7 +4419,7 @@
   else
     case $cross_compiling:$ac_tool_warned in
 yes:)
-{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+{ $as_echo "$as_me:$LINENO: WARNING: using cross tools not prefixed with host triplet" >&5
 $as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
 ac_tool_warned=yes ;;
 esac
@@ -4185,42 +4430,62 @@
 fi
 
 
-test -z "$CC" && { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+test -z "$CC" && { { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
 $as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "no acceptable C compiler found in \$PATH
-See \`config.log' for more details" "$LINENO" 5 ; }
+{ { $as_echo "$as_me:$LINENO: error: no acceptable C compiler found in \$PATH
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: no acceptable C compiler found in \$PATH
+See \`config.log' for more details." >&2;}
+   { (exit 1); exit 1; }; }; }
 
 # Provide some information about the compiler.
-$as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler version" >&5
+$as_echo "$as_me:$LINENO: checking for C compiler version" >&5
 set X $ac_compile
 ac_compiler=$2
-for ac_option in --version -v -V -qversion; do
-  { { ac_try="$ac_compiler $ac_option >&5"
+{ (ac_try="$ac_compiler --version >&5"
 case "(($ac_try" in
   *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
   *) ac_try_echo=$ac_try;;
 esac
-eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
-$as_echo "$ac_try_echo"; } >&5
-  (eval "$ac_compiler $ac_option >&5") 2>conftest.err
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compiler --version >&5") 2>&5
   ac_status=$?
-  if test -s conftest.err; then
-    sed '10a\
-... rest of stderr output deleted ...
-         10q' conftest.err >conftest.er1
-    cat conftest.er1 >&5
-  fi
-  rm -f conftest.er1 conftest.err
-  $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
-  test $ac_status = 0; }
-done
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }
+{ (ac_try="$ac_compiler -v >&5"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compiler -v >&5") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }
+{ (ac_try="$ac_compiler -V >&5"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compiler -V >&5") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C compiler" >&5
+{ $as_echo "$as_me:$LINENO: checking whether we are using the GNU C compiler" >&5
 $as_echo_n "checking whether we are using the GNU C compiler... " >&6; }
-if test "${ac_cv_c_compiler_gnu+set}" = set; then :
+if test "${ac_cv_c_compiler_gnu+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
 /* end confdefs.h.  */
 
 #ifdef FC_DUMMY_MAIN
@@ -4242,16 +4507,37 @@
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
   ac_compiler_gnu=yes
 else
-  ac_compiler_gnu=no
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_compiler_gnu=no
 fi
+
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 ac_cv_c_compiler_gnu=$ac_compiler_gnu
 
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_compiler_gnu" >&5
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_c_compiler_gnu" >&5
 $as_echo "$ac_cv_c_compiler_gnu" >&6; }
 if test $ac_compiler_gnu = yes; then
   GCC=yes
@@ -4260,16 +4546,20 @@
 fi
 ac_test_CFLAGS=${CFLAGS+set}
 ac_save_CFLAGS=$CFLAGS
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC accepts -g" >&5
+{ $as_echo "$as_me:$LINENO: checking whether $CC accepts -g" >&5
 $as_echo_n "checking whether $CC accepts -g... " >&6; }
-if test "${ac_cv_prog_cc_g+set}" = set; then :
+if test "${ac_cv_prog_cc_g+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
   ac_save_c_werror_flag=$ac_c_werror_flag
    ac_c_werror_flag=yes
    ac_cv_prog_cc_g=no
    CFLAGS="-g"
-   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+   cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
 /* end confdefs.h.  */
 
 #ifdef FC_DUMMY_MAIN
@@ -4288,11 +4578,35 @@
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
   ac_cv_prog_cc_g=yes
 else
-  CFLAGS=""
-      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	CFLAGS=""
+      cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
 /* end confdefs.h.  */
 
 #ifdef FC_DUMMY_MAIN
@@ -4311,12 +4625,36 @@
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  :
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
 
-else
-  ac_c_werror_flag=$ac_save_c_werror_flag
+	ac_c_werror_flag=$ac_save_c_werror_flag
 	 CFLAGS="-g"
-	 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+	 cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
 /* end confdefs.h.  */
 
 #ifdef FC_DUMMY_MAIN
@@ -4335,17 +4673,42 @@
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
   ac_cv_prog_cc_g=yes
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
 fi
+
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 fi
+
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 fi
+
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
    ac_c_werror_flag=$ac_save_c_werror_flag
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_g" >&5
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_prog_cc_g" >&5
 $as_echo "$ac_cv_prog_cc_g" >&6; }
 if test "$ac_test_CFLAGS" = set; then
   CFLAGS=$ac_save_CFLAGS
@@ -4362,14 +4725,18 @@
     CFLAGS=
   fi
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C89" >&5
+{ $as_echo "$as_me:$LINENO: checking for $CC option to accept ISO C89" >&5
 $as_echo_n "checking for $CC option to accept ISO C89... " >&6; }
-if test "${ac_cv_prog_cc_c89+set}" = set; then :
+if test "${ac_cv_prog_cc_c89+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
   ac_cv_prog_cc_c89=no
 ac_save_CC=$CC
-cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
 /* end confdefs.h.  */
 #include <stdarg.h>
 #include <stdio.h>
@@ -4434,9 +4801,32 @@
 	-Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__"
 do
   CC="$ac_save_CC $ac_arg"
-  if ac_fn_c_try_compile "$LINENO"; then :
+  rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
   ac_cv_prog_cc_c89=$ac_arg
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+
 fi
+
 rm -f core conftest.err conftest.$ac_objext
   test "x$ac_cv_prog_cc_c89" != "xno" && break
 done
@@ -4447,19 +4837,17 @@
 # AC_CACHE_VAL
 case "x$ac_cv_prog_cc_c89" in
   x)
-    { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5
+    { $as_echo "$as_me:$LINENO: result: none needed" >&5
 $as_echo "none needed" >&6; } ;;
   xno)
-    { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5
+    { $as_echo "$as_me:$LINENO: result: unsupported" >&5
 $as_echo "unsupported" >&6; } ;;
   *)
     CC="$CC $ac_cv_prog_cc_c89"
-    { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c89" >&5
+    { $as_echo "$as_me:$LINENO: result: $ac_cv_prog_cc_c89" >&5
 $as_echo "$ac_cv_prog_cc_c89" >&6; } ;;
 esac
-if test "x$ac_cv_prog_cc_c89" != xno; then :
 
-fi
 
 ac_ext=${ac_fc_srcext-f}
 ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5'
@@ -4478,14 +4866,14 @@
 ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
 ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
 ac_compiler_gnu=$ac_cv_c_compiler_gnu
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to run the C preprocessor" >&5
+{ $as_echo "$as_me:$LINENO: checking how to run the C preprocessor" >&5
 $as_echo_n "checking how to run the C preprocessor... " >&6; }
 # On Suns, sometimes $CPP names a directory.
 if test -n "$CPP" && test -d "$CPP"; then
   CPP=
 fi
 if test -z "$CPP"; then
-  if test "${ac_cv_prog_CPP+set}" = set; then :
+  if test "${ac_cv_prog_CPP+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
       # Double quotes because CPP needs to be expanded
@@ -4500,7 +4888,11 @@
   # <limits.h> exists even on freestanding compilers.
   # On the NeXT, cc -E runs the code through the compiler's parser,
   # not just through cpp. "Syntax error" is here to catch this case.
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
 /* end confdefs.h.  */
 #ifdef __STDC__
 # include <limits.h>
@@ -4509,34 +4901,78 @@
 #endif
 		     Syntax error
 _ACEOF
-if ac_fn_c_try_cpp "$LINENO"; then :
+if { (ac_try="$ac_cpp conftest.$ac_ext"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } >/dev/null && {
+	 test -z "$ac_c_preproc_warn_flag$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       }; then
+  :
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
 
-else
   # Broken: fails on valid input.
 continue
 fi
-rm -f conftest.err conftest.i conftest.$ac_ext
 
+rm -f conftest.err conftest.$ac_ext
+
   # OK, works on sane cases.  Now check whether nonexistent headers
   # can be detected and how.
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
 /* end confdefs.h.  */
 #include <ac_nonexistent.h>
 _ACEOF
-if ac_fn_c_try_cpp "$LINENO"; then :
+if { (ac_try="$ac_cpp conftest.$ac_ext"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } >/dev/null && {
+	 test -z "$ac_c_preproc_warn_flag$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       }; then
   # Broken: success on invalid input.
 continue
 else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
   # Passes both tests.
 ac_preproc_ok=:
 break
 fi
-rm -f conftest.err conftest.i conftest.$ac_ext
 
+rm -f conftest.err conftest.$ac_ext
+
 done
 # Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped.
-rm -f conftest.i conftest.err conftest.$ac_ext
-if $ac_preproc_ok; then :
+rm -f conftest.err conftest.$ac_ext
+if $ac_preproc_ok; then
   break
 fi
 
@@ -4548,7 +4984,7 @@
 else
   ac_cv_prog_CPP=$CPP
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $CPP" >&5
+{ $as_echo "$as_me:$LINENO: result: $CPP" >&5
 $as_echo "$CPP" >&6; }
 ac_preproc_ok=false
 for ac_c_preproc_warn_flag in '' yes
@@ -4559,7 +4995,11 @@
   # <limits.h> exists even on freestanding compilers.
   # On the NeXT, cc -E runs the code through the compiler's parser,
   # not just through cpp. "Syntax error" is here to catch this case.
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
 /* end confdefs.h.  */
 #ifdef __STDC__
 # include <limits.h>
@@ -4568,40 +5008,87 @@
 #endif
 		     Syntax error
 _ACEOF
-if ac_fn_c_try_cpp "$LINENO"; then :
+if { (ac_try="$ac_cpp conftest.$ac_ext"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } >/dev/null && {
+	 test -z "$ac_c_preproc_warn_flag$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       }; then
+  :
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
 
-else
   # Broken: fails on valid input.
 continue
 fi
-rm -f conftest.err conftest.i conftest.$ac_ext
 
+rm -f conftest.err conftest.$ac_ext
+
   # OK, works on sane cases.  Now check whether nonexistent headers
   # can be detected and how.
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
 /* end confdefs.h.  */
 #include <ac_nonexistent.h>
 _ACEOF
-if ac_fn_c_try_cpp "$LINENO"; then :
+if { (ac_try="$ac_cpp conftest.$ac_ext"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } >/dev/null && {
+	 test -z "$ac_c_preproc_warn_flag$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       }; then
   # Broken: success on invalid input.
 continue
 else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
   # Passes both tests.
 ac_preproc_ok=:
 break
 fi
-rm -f conftest.err conftest.i conftest.$ac_ext
 
+rm -f conftest.err conftest.$ac_ext
+
 done
 # Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped.
-rm -f conftest.i conftest.err conftest.$ac_ext
-if $ac_preproc_ok; then :
-
+rm -f conftest.err conftest.$ac_ext
+if $ac_preproc_ok; then
+  :
 else
-  { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+  { { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
 $as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "C preprocessor \"$CPP\" fails sanity check
-See \`config.log' for more details" "$LINENO" 5 ; }
+{ { $as_echo "$as_me:$LINENO: error: C preprocessor \"$CPP\" fails sanity check
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: C preprocessor \"$CPP\" fails sanity check
+See \`config.log' for more details." >&2;}
+   { (exit 1); exit 1; }; }; }
 fi
 
 ac_ext=c
@@ -4611,9 +5098,9 @@
 ac_compiler_gnu=$ac_cv_c_compiler_gnu
 
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for grep that handles long lines and -e" >&5
+{ $as_echo "$as_me:$LINENO: checking for grep that handles long lines and -e" >&5
 $as_echo_n "checking for grep that handles long lines and -e... " >&6; }
-if test "${ac_cv_path_GREP+set}" = set; then :
+if test "${ac_cv_path_GREP+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
   if test -z "$GREP"; then
@@ -4624,7 +5111,7 @@
 do
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
-    for ac_prog in grep ggrep; do
+  for ac_prog in grep ggrep; do
     for ac_exec_ext in '' $ac_executable_extensions; do
       ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext"
       { test -f "$ac_path_GREP" && $as_test_x "$ac_path_GREP"; } || continue
@@ -4644,7 +5131,7 @@
     $as_echo 'GREP' >> "conftest.nl"
     "$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break
     diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
-    as_fn_arith $ac_count + 1 && ac_count=$as_val
+    ac_count=`expr $ac_count + 1`
     if test $ac_count -gt ${ac_path_GREP_max-0}; then
       # Best one so far, save it but keep looking for a better one
       ac_cv_path_GREP="$ac_path_GREP"
@@ -4659,24 +5146,26 @@
       $ac_path_GREP_found && break 3
     done
   done
-  done
+done
 IFS=$as_save_IFS
   if test -z "$ac_cv_path_GREP"; then
-    as_fn_error $? "no acceptable grep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5
+    { { $as_echo "$as_me:$LINENO: error: no acceptable grep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" >&5
+$as_echo "$as_me: error: no acceptable grep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" >&2;}
+   { (exit 1); exit 1; }; }
   fi
 else
   ac_cv_path_GREP=$GREP
 fi
 
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_GREP" >&5
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_path_GREP" >&5
 $as_echo "$ac_cv_path_GREP" >&6; }
  GREP="$ac_cv_path_GREP"
 
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for egrep" >&5
+{ $as_echo "$as_me:$LINENO: checking for egrep" >&5
 $as_echo_n "checking for egrep... " >&6; }
-if test "${ac_cv_path_EGREP+set}" = set; then :
+if test "${ac_cv_path_EGREP+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
   if echo a | $GREP -E '(a|b)' >/dev/null 2>&1
@@ -4690,7 +5179,7 @@
 do
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
-    for ac_prog in egrep; do
+  for ac_prog in egrep; do
     for ac_exec_ext in '' $ac_executable_extensions; do
       ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext"
       { test -f "$ac_path_EGREP" && $as_test_x "$ac_path_EGREP"; } || continue
@@ -4710,7 +5199,7 @@
     $as_echo 'EGREP' >> "conftest.nl"
     "$ac_path_EGREP" 'EGREP$' < "conftest.nl" >"conftest.out" 2>/dev/null || break
     diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break
-    as_fn_arith $ac_count + 1 && ac_count=$as_val
+    ac_count=`expr $ac_count + 1`
     if test $ac_count -gt ${ac_path_EGREP_max-0}; then
       # Best one so far, save it but keep looking for a better one
       ac_cv_path_EGREP="$ac_path_EGREP"
@@ -4725,10 +5214,12 @@
       $ac_path_EGREP_found && break 3
     done
   done
-  done
+done
 IFS=$as_save_IFS
   if test -z "$ac_cv_path_EGREP"; then
-    as_fn_error $? "no acceptable egrep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5
+    { { $as_echo "$as_me:$LINENO: error: no acceptable egrep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" >&5
+$as_echo "$as_me: error: no acceptable egrep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" >&2;}
+   { (exit 1); exit 1; }; }
   fi
 else
   ac_cv_path_EGREP=$EGREP
@@ -4736,17 +5227,21 @@
 
    fi
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_EGREP" >&5
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_path_EGREP" >&5
 $as_echo "$ac_cv_path_EGREP" >&6; }
  EGREP="$ac_cv_path_EGREP"
 
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for ANSI C header files" >&5
+{ $as_echo "$as_me:$LINENO: checking for ANSI C header files" >&5
 $as_echo_n "checking for ANSI C header files... " >&6; }
-if test "${ac_cv_header_stdc+set}" = set; then :
+if test "${ac_cv_header_stdc+set}" = set; then
   $as_echo_n "(cached) " >&6
 else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
 /* end confdefs.h.  */
 #include <stdlib.h>
 #include <stdarg.h>
@@ -4769,23 +5264,48 @@
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_compile "$LINENO"; then :
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
   ac_cv_header_stdc=yes
 else
-  ac_cv_header_stdc=no
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_cv_header_stdc=no
 fi
+
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 
 if test $ac_cv_header_stdc = yes; then
   # SunOS 4.x string.h does not declare mem*, contrary to ANSI.
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
 /* end confdefs.h.  */
 #include <string.h>
 
 _ACEOF
 if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
-  $EGREP "memchr" >/dev/null 2>&1; then :
-
+  $EGREP "memchr" >/dev/null 2>&1; then
+  :
 else
   ac_cv_header_stdc=no
 fi
@@ -4795,14 +5315,18 @@
 
 if test $ac_cv_header_stdc = yes; then
   # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI.
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
 /* end confdefs.h.  */
 #include <stdlib.h>
 
 _ACEOF
 if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
-  $EGREP "free" >/dev/null 2>&1; then :
-
+  $EGREP "free" >/dev/null 2>&1; then
+  :
 else
   ac_cv_header_stdc=no
 fi
@@ -4812,10 +5336,14 @@
 
 if test $ac_cv_header_stdc = yes; then
   # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi.
-  if test "$cross_compiling" = yes; then :
+  if test "$cross_compiling" = yes; then
   :
 else
-  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
 /* end confdefs.h.  */
 #include <ctype.h>
 #include <stdlib.h>
@@ -4842,33 +5370,118 @@
   return 0;
 }
 _ACEOF
-if ac_fn_c_try_run "$LINENO"; then :
+rm -f conftest$ac_exeext
+if { (ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_link") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && { ac_try='./conftest$ac_exeext'
+  { (case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); }; }; then
+  :
+else
+  $as_echo "$as_me: program exited with status $ac_status" >&5
+$as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
 
-else
-  ac_cv_header_stdc=no
+( exit $ac_status )
+ac_cv_header_stdc=no
 fi
-rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
-  conftest.$ac_objext conftest.beam conftest.$ac_ext
+rm -rf conftest.dSYM
+rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext conftest.$ac_objext conftest.$ac_ext
 fi
 
+
 fi
 fi
-{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_stdc" >&5
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_header_stdc" >&5
 $as_echo "$ac_cv_header_stdc" >&6; }
 if test $ac_cv_header_stdc = yes; then
 
-$as_echo "#define STDC_HEADERS 1" >>confdefs.h
+cat >>confdefs.h <<\_ACEOF
+#define STDC_HEADERS 1
+_ACEOF
 
 fi
 
 # On IRIX 5.3, sys/types and inttypes.h are conflicting.
+
+
+
+
+
+
+
+
+
 for ac_header in sys/types.h sys/stat.h stdlib.h string.h memory.h strings.h \
 		  inttypes.h stdint.h unistd.h
-do :
-  as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
-ac_fn_c_check_header_compile "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default
-"
-if eval test \"x\$"$as_ac_Header"\" = x"yes"; then :
+do
+as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh`
+{ $as_echo "$as_me:$LINENO: checking for $ac_header" >&5
+$as_echo_n "checking for $ac_header... " >&6; }
+if { as_var=$as_ac_Header; eval "test \"\${$as_var+set}\" = set"; }; then
+  $as_echo_n "(cached) " >&6
+else
+  cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+$ac_includes_default
+
+#include <$ac_header>
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  eval "$as_ac_Header=yes"
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	eval "$as_ac_Header=no"
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+fi
+ac_res=`eval 'as_val=${'$as_ac_Header'}
+		 $as_echo "$as_val"'`
+	       { $as_echo "$as_me:$LINENO: result: $ac_res" >&5
+$as_echo "$ac_res" >&6; }
+as_val=`eval 'as_val=${'$as_ac_Header'}
+		 $as_echo "$as_val"'`
+   if test "x$as_val" = x""yes; then
   cat >>confdefs.h <<_ACEOF
 #define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1
 _ACEOF
@@ -4878,22 +5491,288 @@
 done
 
 
-ac_fn_c_check_header_mongrel "$LINENO" "emmintrin.h" "ac_cv_header_emmintrin_h" "$ac_includes_default"
-if test "x$ac_cv_header_emmintrin_h" = x""yes; then :
+if test "${ac_cv_header_emmintrin_h+set}" = set; then
+  { $as_echo "$as_me:$LINENO: checking for emmintrin.h" >&5
+$as_echo_n "checking for emmintrin.h... " >&6; }
+if test "${ac_cv_header_emmintrin_h+set}" = set; then
+  $as_echo_n "(cached) " >&6
+fi
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_header_emmintrin_h" >&5
+$as_echo "$ac_cv_header_emmintrin_h" >&6; }
+else
+  # Is the header compilable?
+{ $as_echo "$as_me:$LINENO: checking emmintrin.h usability" >&5
+$as_echo_n "checking emmintrin.h usability... " >&6; }
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+$ac_includes_default
+#include <emmintrin.h>
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  ac_header_compiler=yes
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
 
-$as_echo "#define HAVE_EMMINTRIN 1" >>confdefs.h
+	ac_header_compiler=no
+fi
 
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+{ $as_echo "$as_me:$LINENO: result: $ac_header_compiler" >&5
+$as_echo "$ac_header_compiler" >&6; }
+
+# Is the header present?
+{ $as_echo "$as_me:$LINENO: checking emmintrin.h presence" >&5
+$as_echo_n "checking emmintrin.h presence... " >&6; }
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <emmintrin.h>
+_ACEOF
+if { (ac_try="$ac_cpp conftest.$ac_ext"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } >/dev/null && {
+	 test -z "$ac_c_preproc_warn_flag$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       }; then
+  ac_header_preproc=yes
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+  ac_header_preproc=no
 fi
 
+rm -f conftest.err conftest.$ac_ext
+{ $as_echo "$as_me:$LINENO: result: $ac_header_preproc" >&5
+$as_echo "$ac_header_preproc" >&6; }
 
-ac_fn_c_check_header_mongrel "$LINENO" "xmmintrin.h" "ac_cv_header_xmmintrin_h" "$ac_includes_default"
-if test "x$ac_cv_header_xmmintrin_h" = x""yes; then :
+# So?  What about this header?
+case $ac_header_compiler:$ac_header_preproc:$ac_c_preproc_warn_flag in
+  yes:no: )
+    { $as_echo "$as_me:$LINENO: WARNING: emmintrin.h: accepted by the compiler, rejected by the preprocessor!" >&5
+$as_echo "$as_me: WARNING: emmintrin.h: accepted by the compiler, rejected by the preprocessor!" >&2;}
+    { $as_echo "$as_me:$LINENO: WARNING: emmintrin.h: proceeding with the compiler's result" >&5
+$as_echo "$as_me: WARNING: emmintrin.h: proceeding with the compiler's result" >&2;}
+    ac_header_preproc=yes
+    ;;
+  no:yes:* )
+    { $as_echo "$as_me:$LINENO: WARNING: emmintrin.h: present but cannot be compiled" >&5
+$as_echo "$as_me: WARNING: emmintrin.h: present but cannot be compiled" >&2;}
+    { $as_echo "$as_me:$LINENO: WARNING: emmintrin.h:     check for missing prerequisite headers?" >&5
+$as_echo "$as_me: WARNING: emmintrin.h:     check for missing prerequisite headers?" >&2;}
+    { $as_echo "$as_me:$LINENO: WARNING: emmintrin.h: see the Autoconf documentation" >&5
+$as_echo "$as_me: WARNING: emmintrin.h: see the Autoconf documentation" >&2;}
+    { $as_echo "$as_me:$LINENO: WARNING: emmintrin.h:     section \"Present But Cannot Be Compiled\"" >&5
+$as_echo "$as_me: WARNING: emmintrin.h:     section \"Present But Cannot Be Compiled\"" >&2;}
+    { $as_echo "$as_me:$LINENO: WARNING: emmintrin.h: proceeding with the preprocessor's result" >&5
+$as_echo "$as_me: WARNING: emmintrin.h: proceeding with the preprocessor's result" >&2;}
+    { $as_echo "$as_me:$LINENO: WARNING: emmintrin.h: in the future, the compiler will take precedence" >&5
+$as_echo "$as_me: WARNING: emmintrin.h: in the future, the compiler will take precedence" >&2;}
+    ( cat <<\_ASBOX
+## -------------------------------------- ##
+## Report this to jtromp AT princeton.edu ##
+## -------------------------------------- ##
+_ASBOX
+     ) | sed "s/^/$as_me: WARNING:     /" >&2
+    ;;
+esac
+{ $as_echo "$as_me:$LINENO: checking for emmintrin.h" >&5
+$as_echo_n "checking for emmintrin.h... " >&6; }
+if test "${ac_cv_header_emmintrin_h+set}" = set; then
+  $as_echo_n "(cached) " >&6
+else
+  ac_cv_header_emmintrin_h=$ac_header_preproc
+fi
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_header_emmintrin_h" >&5
+$as_echo "$ac_cv_header_emmintrin_h" >&6; }
 
-$as_echo "#define HAVE_XMMINTRIN 1" >>confdefs.h
+fi
+if test "x$ac_cv_header_emmintrin_h" = x""yes; then
 
+cat >>confdefs.h <<\_ACEOF
+#define HAVE_EMMINTRIN 1
+_ACEOF
+
 fi
 
 
+if test "${ac_cv_header_xmmintrin_h+set}" = set; then
+  { $as_echo "$as_me:$LINENO: checking for xmmintrin.h" >&5
+$as_echo_n "checking for xmmintrin.h... " >&6; }
+if test "${ac_cv_header_xmmintrin_h+set}" = set; then
+  $as_echo_n "(cached) " >&6
+fi
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_header_xmmintrin_h" >&5
+$as_echo "$ac_cv_header_xmmintrin_h" >&6; }
+else
+  # Is the header compilable?
+{ $as_echo "$as_me:$LINENO: checking xmmintrin.h usability" >&5
+$as_echo_n "checking xmmintrin.h usability... " >&6; }
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+$ac_includes_default
+#include <xmmintrin.h>
+_ACEOF
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
+  ac_header_compiler=yes
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_header_compiler=no
+fi
+
+rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+{ $as_echo "$as_me:$LINENO: result: $ac_header_compiler" >&5
+$as_echo "$ac_header_compiler" >&6; }
+
+# Is the header present?
+{ $as_echo "$as_me:$LINENO: checking xmmintrin.h presence" >&5
+$as_echo_n "checking xmmintrin.h presence... " >&6; }
+cat >conftest.$ac_ext <<_ACEOF
+/* confdefs.h.  */
+_ACEOF
+cat confdefs.h >>conftest.$ac_ext
+cat >>conftest.$ac_ext <<_ACEOF
+/* end confdefs.h.  */
+#include <xmmintrin.h>
+_ACEOF
+if { (ac_try="$ac_cpp conftest.$ac_ext"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } >/dev/null && {
+	 test -z "$ac_c_preproc_warn_flag$ac_c_werror_flag" ||
+	 test ! -s conftest.err
+       }; then
+  ac_header_preproc=yes
+else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+  ac_header_preproc=no
+fi
+
+rm -f conftest.err conftest.$ac_ext
+{ $as_echo "$as_me:$LINENO: result: $ac_header_preproc" >&5
+$as_echo "$ac_header_preproc" >&6; }
+
+# So?  What about this header?
+case $ac_header_compiler:$ac_header_preproc:$ac_c_preproc_warn_flag in
+  yes:no: )
+    { $as_echo "$as_me:$LINENO: WARNING: xmmintrin.h: accepted by the compiler, rejected by the preprocessor!" >&5
+$as_echo "$as_me: WARNING: xmmintrin.h: accepted by the compiler, rejected by the preprocessor!" >&2;}
+    { $as_echo "$as_me:$LINENO: WARNING: xmmintrin.h: proceeding with the compiler's result" >&5
+$as_echo "$as_me: WARNING: xmmintrin.h: proceeding with the compiler's result" >&2;}
+    ac_header_preproc=yes
+    ;;
+  no:yes:* )
+    { $as_echo "$as_me:$LINENO: WARNING: xmmintrin.h: present but cannot be compiled" >&5
+$as_echo "$as_me: WARNING: xmmintrin.h: present but cannot be compiled" >&2;}
+    { $as_echo "$as_me:$LINENO: WARNING: xmmintrin.h:     check for missing prerequisite headers?" >&5
+$as_echo "$as_me: WARNING: xmmintrin.h:     check for missing prerequisite headers?" >&2;}
+    { $as_echo "$as_me:$LINENO: WARNING: xmmintrin.h: see the Autoconf documentation" >&5
+$as_echo "$as_me: WARNING: xmmintrin.h: see the Autoconf documentation" >&2;}
+    { $as_echo "$as_me:$LINENO: WARNING: xmmintrin.h:     section \"Present But Cannot Be Compiled\"" >&5
+$as_echo "$as_me: WARNING: xmmintrin.h:     section \"Present But Cannot Be Compiled\"" >&2;}
+    { $as_echo "$as_me:$LINENO: WARNING: xmmintrin.h: proceeding with the preprocessor's result" >&5
+$as_echo "$as_me: WARNING: xmmintrin.h: proceeding with the preprocessor's result" >&2;}
+    { $as_echo "$as_me:$LINENO: WARNING: xmmintrin.h: in the future, the compiler will take precedence" >&5
+$as_echo "$as_me: WARNING: xmmintrin.h: in the future, the compiler will take precedence" >&2;}
+    ( cat <<\_ASBOX
+## -------------------------------------- ##
+## Report this to jtromp AT princeton.edu ##
+## -------------------------------------- ##
+_ASBOX
+     ) | sed "s/^/$as_me: WARNING:     /" >&2
+    ;;
+esac
+{ $as_echo "$as_me:$LINENO: checking for xmmintrin.h" >&5
+$as_echo_n "checking for xmmintrin.h... " >&6; }
+if test "${ac_cv_header_xmmintrin_h+set}" = set; then
+  $as_echo_n "(cached) " >&6
+else
+  ac_cv_header_xmmintrin_h=$ac_header_preproc
+fi
+{ $as_echo "$as_me:$LINENO: result: $ac_cv_header_xmmintrin_h" >&5
+$as_echo "$ac_cv_header_xmmintrin_h" >&6; }
+
+fi
+if test "x$ac_cv_header_xmmintrin_h" = x""yes; then
+
+cat >>confdefs.h <<\_ACEOF
+#define HAVE_XMMINTRIN 1
+_ACEOF
+
+fi
+
+
 ac_ext=${ac_fc_srcext-f}
 ac_compile='$FC -c $FCFLAGS $ac_fcflags_srcext conftest.$ac_ext >&5'
 ac_link='$FC -o conftest$ac_exeext $FCFLAGS $LDFLAGS $ac_fcflags_srcext conftest.$ac_ext $LIBS >&5'
@@ -4906,6 +5785,13 @@
 
 
 
+
+
+
+
+
+
+
 if test x"$MPIFC" = x; then
     MPIFC=mpif90
 fi
@@ -4940,9 +5826,9 @@
 FC=$MPIFC
 FCFLAGS="$FCFLAGS $FLAGS_NO_CHECK"
 
-{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether mpif.h works" >&5
+{ $as_echo "$as_me:$LINENO: checking whether mpif.h works" >&5
 $as_echo_n "checking whether mpif.h works... " >&6; }
-cat > conftest.$ac_ext <<_ACEOF
+cat >conftest.$ac_ext <<_ACEOF
 
       program main
 
@@ -4956,14 +5842,34 @@
       end
 
 _ACEOF
-if ac_fn_fc_try_compile "$LINENO"; then :
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_fc_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
 
-    { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+    { $as_echo "$as_me:$LINENO: result: yes" >&5
 $as_echo "yes" >&6; }
 
 else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
 
-    { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+
+    { $as_echo "$as_me:$LINENO: result: no" >&5
 $as_echo "no" >&6; }
     cit_mpif_h=unknown
     cit_mpifc_info=`$FC -compile_info 2>/dev/null`
@@ -4973,13 +5879,16 @@
         esac
     done
     if test "$cit_mpif_h" == "unknown"; then
-        { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+        { { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
 $as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "cannot compile a trivial MPI program using $MPIFC
-See \`config.log' for more details" "$LINENO" 5 ; }
+{ { $as_echo "$as_me:$LINENO: error: cannot compile a trivial MPI program using $MPIFC
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: cannot compile a trivial MPI program using $MPIFC
+See \`config.log' for more details." >&2;}
+   { (exit 1); exit 1; }; }; }
     fi
 
-    { $as_echo "$as_me:${as_lineno-$LINENO}: creating $ofile" >&5
+    { $as_echo "$as_me:$LINENO: creating $ofile" >&5
 $as_echo "$as_me: creating $ofile" >&6;}
     cat >"$cfgfile" <<END_OF_HEADER
 ! $ofile.  Generated from $cit_mpif_h by configure.
@@ -4989,9 +5898,9 @@
     mv -f "$cfgfile" "$ofile" || \
         (rm -f "$ofile" && cp "$cfgfile" "$ofile" && rm -f "$cfgfile")
 
-    { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether generated mpif.h works" >&5
+    { $as_echo "$as_me:$LINENO: checking whether generated mpif.h works" >&5
 $as_echo_n "checking whether generated mpif.h works... " >&6; }
-    cat > conftest.$ac_ext <<_ACEOF
+    cat >conftest.$ac_ext <<_ACEOF
 
       program main
 
@@ -5005,25 +5914,50 @@
       end
 
 _ACEOF
-if ac_fn_fc_try_compile "$LINENO"; then :
+rm -f conftest.$ac_objext
+if { (ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:$LINENO: $ac_try_echo\""
+$as_echo "$ac_try_echo") >&5
+  (eval "$ac_compile") 2>conftest.er1
+  ac_status=$?
+  grep -v '^ *+' conftest.er1 >conftest.err
+  rm -f conftest.er1
+  cat conftest.err >&5
+  $as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
+  (exit $ac_status); } && {
+	 test -z "$ac_fc_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext; then
 
-        { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+        { $as_echo "$as_me:$LINENO: result: yes" >&5
 $as_echo "yes" >&6; }
 
 else
+  $as_echo "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
 
-        { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+
+        { $as_echo "$as_me:$LINENO: result: no" >&5
 $as_echo "no" >&6; }
-        { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5
+        { { $as_echo "$as_me:$LINENO: error: in \`$ac_pwd':" >&5
 $as_echo "$as_me: error: in \`$ac_pwd':" >&2;}
-as_fn_error $? "cannot compile a trivial MPI program using $MPIFC
-See \`config.log' for more details" "$LINENO" 5 ; }
+{ { $as_echo "$as_me:$LINENO: error: cannot compile a trivial MPI program using $MPIFC
+See \`config.log' for more details." >&5
+$as_echo "$as_me: error: cannot compile a trivial MPI program using $MPIFC
+See \`config.log' for more details." >&2;}
+   { (exit 1); exit 1; }; }; }
 
 fi
+
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 
 
 fi
+
 rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
 
 FC=$cit_fc_save_fc
@@ -5072,13 +6006,13 @@
     case $ac_val in #(
     *${as_nl}*)
       case $ac_var in #(
-      *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5
+      *_cv_*) { $as_echo "$as_me:$LINENO: WARNING: cache variable $ac_var contains a newline" >&5
 $as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
       esac
       case $ac_var in #(
       _ | IFS | as_nl) ;; #(
       BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #(
-      *) { eval $ac_var=; unset $ac_var;} ;;
+      *) $as_unset $ac_var ;;
       esac ;;
     esac
   done
@@ -5086,8 +6020,8 @@
   (set) 2>&1 |
     case $as_nl`(ac_space=' '; set) 2>&1` in #(
     *${as_nl}ac_space=\ *)
-      # `set' does not quote correctly, so add quotes: double-quote
-      # substitution turns \\\\ into \\, and sed turns \\ into \.
+      # `set' does not quote correctly, so add quotes (double-quote
+      # substitution turns \\\\ into \\, and sed turns \\ into \).
       sed -n \
 	"s/'/'\\\\''/g;
 	  s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\\2'/p"
@@ -5110,11 +6044,11 @@
 if diff "$cache_file" confcache >/dev/null 2>&1; then :; else
   if test -w "$cache_file"; then
     test "x$cache_file" != "x/dev/null" &&
-      { $as_echo "$as_me:${as_lineno-$LINENO}: updating cache $cache_file" >&5
+      { $as_echo "$as_me:$LINENO: updating cache $cache_file" >&5
 $as_echo "$as_me: updating cache $cache_file" >&6;}
     cat confcache >$cache_file
   else
-    { $as_echo "$as_me:${as_lineno-$LINENO}: not updating unwritable cache $cache_file" >&5
+    { $as_echo "$as_me:$LINENO: not updating unwritable cache $cache_file" >&5
 $as_echo "$as_me: not updating unwritable cache $cache_file" >&6;}
   fi
 fi
@@ -5128,30 +6062,35 @@
 
 ac_libobjs=
 ac_ltlibobjs=
-U=
 for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue
   # 1. Remove the extension, and $U if already installed.
   ac_script='s/\$U\././;s/\.o$//;s/\.obj$//'
   ac_i=`$as_echo "$ac_i" | sed "$ac_script"`
   # 2. Prepend LIBOBJDIR.  When used with automake>=1.10 LIBOBJDIR
   #    will be set to the directory where LIBOBJS objects are built.
-  as_fn_append ac_libobjs " \${LIBOBJDIR}$ac_i\$U.$ac_objext"
-  as_fn_append ac_ltlibobjs " \${LIBOBJDIR}$ac_i"'$U.lo'
+  ac_libobjs="$ac_libobjs \${LIBOBJDIR}$ac_i\$U.$ac_objext"
+  ac_ltlibobjs="$ac_ltlibobjs \${LIBOBJDIR}$ac_i"'$U.lo'
 done
 LIBOBJS=$ac_libobjs
 
 LTLIBOBJS=$ac_ltlibobjs
 
 
+if test -z "${COND_CUDA_TRUE}" && test -z "${COND_CUDA_FALSE}"; then
+  { { $as_echo "$as_me:$LINENO: error: conditional \"COND_CUDA\" was never defined.
+Usually this means the macro was only invoked conditionally." >&5
+$as_echo "$as_me: error: conditional \"COND_CUDA\" was never defined.
+Usually this means the macro was only invoked conditionally." >&2;}
+   { (exit 1); exit 1; }; }
+fi
 
 : ${CONFIG_STATUS=./config.status}
 ac_write_fail=0
 ac_clean_files_save=$ac_clean_files
 ac_clean_files="$ac_clean_files $CONFIG_STATUS"
-{ $as_echo "$as_me:${as_lineno-$LINENO}: creating $CONFIG_STATUS" >&5
+{ $as_echo "$as_me:$LINENO: creating $CONFIG_STATUS" >&5
 $as_echo "$as_me: creating $CONFIG_STATUS" >&6;}
-as_write_fail=0
-cat >$CONFIG_STATUS <<_ASEOF || as_write_fail=1
+cat >$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
 #! $SHELL
 # Generated by $as_me.
 # Run this file to recreate the current configuration.
@@ -5161,18 +6100,17 @@
 debug=false
 ac_cs_recheck=false
 ac_cs_silent=false
-
 SHELL=\${CONFIG_SHELL-$SHELL}
-export SHELL
-_ASEOF
-cat >>$CONFIG_STATUS <<\_ASEOF || as_write_fail=1
-## -------------------- ##
-## M4sh Initialization. ##
-## -------------------- ##
+_ACEOF
 
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+## --------------------- ##
+## M4sh Initialization.  ##
+## --------------------- ##
+
 # Be more Bourne compatible
 DUALCASE=1; export DUALCASE # for MKS sh
-if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then :
+if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then
   emulate sh
   NULLCMD=:
   # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
@@ -5180,15 +6118,23 @@
   alias -g '${1+"$@"}'='"$@"'
   setopt NO_GLOB_SUBST
 else
-  case `(set -o) 2>/dev/null` in #(
-  *posix*) :
-    set -o posix ;; #(
-  *) :
-     ;;
+  case `(set -o) 2>/dev/null` in
+  *posix*) set -o posix ;;
 esac
+
 fi
 
 
+
+
+# PATH needs CR
+# Avoid depending upon Character Ranges.
+as_cr_letters='abcdefghijklmnopqrstuvwxyz'
+as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+as_cr_Letters=$as_cr_letters$as_cr_LETTERS
+as_cr_digits='0123456789'
+as_cr_alnum=$as_cr_Letters$as_cr_digits
+
 as_nl='
 '
 export as_nl
@@ -5196,13 +6142,7 @@
 as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
 as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo
 as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo
-# Prefer a ksh shell builtin over an external printf program on Solaris,
-# but without wasting forks for bash or zsh.
-if test -z "$BASH_VERSION$ZSH_VERSION" \
-    && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then
-  as_echo='print -r --'
-  as_echo_n='print -rn --'
-elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then
+if (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then
   as_echo='printf %s\n'
   as_echo_n='printf %s'
 else
@@ -5213,7 +6153,7 @@
     as_echo_body='eval expr "X$1" : "X\\(.*\\)"'
     as_echo_n_body='eval
       arg=$1;
-      case $arg in #(
+      case $arg in
       *"$as_nl"*)
 	expr "X$arg" : "X\\(.*\\)$as_nl";
 	arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;;
@@ -5236,7 +6176,14 @@
   }
 fi
 
+# Support unset when possible.
+if ( (MAIL=60; unset MAIL) || exit) >/dev/null 2>&1; then
+  as_unset=unset
+else
+  as_unset=false
+fi
 
+
 # IFS
 # We need space, tab and new line, in precisely that order.  Quoting is
 # there to prevent editors from complaining about space-tab.
@@ -5245,15 +6192,15 @@
 IFS=" ""	$as_nl"
 
 # Find who we are.  Look in the path if we contain no directory separator.
-case $0 in #((
+case $0 in
   *[\\/]* ) as_myself=$0 ;;
   *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
 for as_dir in $PATH
 do
   IFS=$as_save_IFS
   test -z "$as_dir" && as_dir=.
-    test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break
-  done
+  test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break
+done
 IFS=$as_save_IFS
 
      ;;
@@ -5265,16 +6212,12 @@
 fi
 if test ! -f "$as_myself"; then
   $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2
-  exit 1
+  { (exit 1); exit 1; }
 fi
 
-# Unset variables that we do not need and which cause bugs (e.g. in
-# pre-3.0 UWIN ksh).  But do not cause bugs in bash 2.01; the "|| exit 1"
-# suppresses any "Segmentation fault" message there.  '((' could
-# trigger a bug in pdksh 5.2.14.
-for as_var in BASH_ENV ENV MAIL MAILPATH
-do eval test x\${$as_var+set} = xset \
-  && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || :
+# Work around bugs in pre-3.0 UWIN ksh.
+for as_var in ENV MAIL MAILPATH
+do ($as_unset $as_var) >/dev/null 2>&1 && $as_unset $as_var
 done
 PS1='$ '
 PS2='> '
@@ -5286,89 +6229,7 @@
 LANGUAGE=C
 export LANGUAGE
 
-# CDPATH.
-(unset CDPATH) >/dev/null 2>&1 && unset CDPATH
-
-
-# as_fn_error STATUS ERROR [LINENO LOG_FD]
-# ----------------------------------------
-# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are
-# provided, also output the error to LOG_FD, referencing LINENO. Then exit the
-# script with STATUS, using 1 if that was 0.
-as_fn_error ()
-{
-  as_status=$1; test $as_status -eq 0 && as_status=1
-  if test "$4"; then
-    as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
-    $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4
-  fi
-  $as_echo "$as_me: error: $2" >&2
-  as_fn_exit $as_status
-} # as_fn_error
-
-
-# as_fn_set_status STATUS
-# -----------------------
-# Set $? to STATUS, without forking.
-as_fn_set_status ()
-{
-  return $1
-} # as_fn_set_status
-
-# as_fn_exit STATUS
-# -----------------
-# Exit the shell with STATUS, even in a "trap 0" or "set -e" context.
-as_fn_exit ()
-{
-  set +e
-  as_fn_set_status $1
-  exit $1
-} # as_fn_exit
-
-# as_fn_unset VAR
-# ---------------
-# Portably unset VAR.
-as_fn_unset ()
-{
-  { eval $1=; unset $1;}
-}
-as_unset=as_fn_unset
-# as_fn_append VAR VALUE
-# ----------------------
-# Append the text in VALUE to the end of the definition contained in VAR. Take
-# advantage of any shell optimizations that allow amortized linear growth over
-# repeated appends, instead of the typical quadratic growth present in naive
-# implementations.
-if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then :
-  eval 'as_fn_append ()
-  {
-    eval $1+=\$2
-  }'
-else
-  as_fn_append ()
-  {
-    eval $1=\$$1\$2
-  }
-fi # as_fn_append
-
-# as_fn_arith ARG...
-# ------------------
-# Perform arithmetic evaluation on the ARGs, and store the result in the
-# global $as_val. Take advantage of shells that can avoid forks. The arguments
-# must be portable across $(()) and expr.
-if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then :
-  eval 'as_fn_arith ()
-  {
-    as_val=$(( $* ))
-  }'
-else
-  as_fn_arith ()
-  {
-    as_val=`expr "$@" || test $? -eq 1`
-  }
-fi # as_fn_arith
-
-
+# Required to use basename.
 if expr a : '\(a\)' >/dev/null 2>&1 &&
    test "X`expr 00001 : '.*\(...\)'`" = X001; then
   as_expr=expr
@@ -5382,12 +6243,8 @@
   as_basename=false
 fi
 
-if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then
-  as_dirname=dirname
-else
-  as_dirname=false
-fi
 
+# Name of the executable.
 as_me=`$as_basename -- "$0" ||
 $as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \
 	 X"$0" : 'X\(//\)$' \| \
@@ -5407,25 +6264,76 @@
 	  }
 	  s/.*/./; q'`
 
-# Avoid depending upon Character Ranges.
-as_cr_letters='abcdefghijklmnopqrstuvwxyz'
-as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
-as_cr_Letters=$as_cr_letters$as_cr_LETTERS
-as_cr_digits='0123456789'
-as_cr_alnum=$as_cr_Letters$as_cr_digits
+# CDPATH.
+$as_unset CDPATH
 
+
+
+  as_lineno_1=$LINENO
+  as_lineno_2=$LINENO
+  test "x$as_lineno_1" != "x$as_lineno_2" &&
+  test "x`expr $as_lineno_1 + 1`" = "x$as_lineno_2" || {
+
+  # Create $as_me.lineno as a copy of $as_myself, but with $LINENO
+  # uniformly replaced by the line number.  The first 'sed' inserts a
+  # line-number line after each line using $LINENO; the second 'sed'
+  # does the real work.  The second script uses 'N' to pair each
+  # line-number line with the line containing $LINENO, and appends
+  # trailing '-' during substitution so that $LINENO is not a special
+  # case at line end.
+  # (Raja R Harinath suggested sed '=', and Paul Eggert wrote the
+  # scripts with optimization help from Paolo Bonzini.  Blame Lee
+  # E. McMahon (1931-1989) for sed's syntax.  :-)
+  sed -n '
+    p
+    /[$]LINENO/=
+  ' <$as_myself |
+    sed '
+      s/[$]LINENO.*/&-/
+      t lineno
+      b
+      :lineno
+      N
+      :loop
+      s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/
+      t loop
+      s/-\n.*//
+    ' >$as_me.lineno &&
+  chmod +x "$as_me.lineno" ||
+    { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2
+   { (exit 1); exit 1; }; }
+
+  # Don't try to exec as it changes $[0], causing all sort of problems
+  # (the dirname of $[0] is not the place where we might find the
+  # original and so on.  Autoconf is especially sensitive to this).
+  . "./$as_me.lineno"
+  # Exit status is that of the last command.
+  exit
+}
+
+
+if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then
+  as_dirname=dirname
+else
+  as_dirname=false
+fi
+
 ECHO_C= ECHO_N= ECHO_T=
-case `echo -n x` in #(((((
+case `echo -n x` in
 -n*)
-  case `echo 'xy\c'` in
+  case `echo 'x\c'` in
   *c*) ECHO_T='	';;	# ECHO_T is single tab character.
-  xy)  ECHO_C='\c';;
-  *)   echo `echo ksh88 bug on AIX 6.1` > /dev/null
-       ECHO_T='	';;
+  *)   ECHO_C='\c';;
   esac;;
 *)
   ECHO_N='-n';;
 esac
+if expr a : '\(a\)' >/dev/null 2>&1 &&
+   test "X`expr 00001 : '.*\(...\)'`" = X001; then
+  as_expr=expr
+else
+  as_expr=false
+fi
 
 rm -f conf$$ conf$$.exe conf$$.file
 if test -d conf$$.dir; then
@@ -5454,56 +6362,8 @@
 rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file
 rmdir conf$$.dir 2>/dev/null
 
-
-# as_fn_mkdir_p
-# -------------
-# Create "$as_dir" as a directory, including parents if necessary.
-as_fn_mkdir_p ()
-{
-
-  case $as_dir in #(
-  -*) as_dir=./$as_dir;;
-  esac
-  test -d "$as_dir" || eval $as_mkdir_p || {
-    as_dirs=
-    while :; do
-      case $as_dir in #(
-      *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'(
-      *) as_qdir=$as_dir;;
-      esac
-      as_dirs="'$as_qdir' $as_dirs"
-      as_dir=`$as_dirname -- "$as_dir" ||
-$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
-	 X"$as_dir" : 'X\(//\)[^/]' \| \
-	 X"$as_dir" : 'X\(//\)$' \| \
-	 X"$as_dir" : 'X\(/\)' \| . 2>/dev/null ||
-$as_echo X"$as_dir" |
-    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
-	    s//\1/
-	    q
-	  }
-	  /^X\(\/\/\)[^/].*/{
-	    s//\1/
-	    q
-	  }
-	  /^X\(\/\/\)$/{
-	    s//\1/
-	    q
-	  }
-	  /^X\(\/\).*/{
-	    s//\1/
-	    q
-	  }
-	  s/.*/./; q'`
-      test -d "$as_dir" && break
-    done
-    test -z "$as_dirs" || eval "mkdir $as_dirs"
-  } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir"
-
-
-} # as_fn_mkdir_p
 if mkdir -p . 2>/dev/null; then
-  as_mkdir_p='mkdir -p "$as_dir"'
+  as_mkdir_p=:
 else
   test -d ./-p && rmdir ./-p
   as_mkdir_p=false
@@ -5522,10 +6382,10 @@
       if test -d "$1"; then
 	test -d "$1/.";
       else
-	case $1 in #(
+	case $1 in
 	-*)set "./$1";;
 	esac;
-	case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in #((
+	case `ls -ld'$as_ls_L_option' "$1" 2>/dev/null` in
 	???[sx]*):;;*)false;;esac;fi
     '\'' sh
   '
@@ -5540,19 +6400,13 @@
 
 
 exec 6>&1
-## ----------------------------------- ##
-## Main body of $CONFIG_STATUS script. ##
-## ----------------------------------- ##
-_ASEOF
-test $as_write_fail = 0 && chmod +x $CONFIG_STATUS || ac_write_fail=1
 
-cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
-# Save the log message, to keep $0 and so on meaningful, and to
+# Save the log message, to keep $[0] and so on meaningful, and to
 # report actual input values of CONFIG_FILES etc. instead of their
 # values after options handling.
 ac_log="
 This file was extended by Specfem3D Globe $as_me 5.1.2, which was
-generated by GNU Autoconf 2.67.  Invocation command line was
+generated by GNU Autoconf 2.63.  Invocation command line was
 
   CONFIG_FILES    = $CONFIG_FILES
   CONFIG_HEADERS  = $CONFIG_HEADERS
@@ -5583,15 +6437,13 @@
 
 cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
 ac_cs_usage="\
-\`$as_me' instantiates files and other configuration actions
-from templates according to the current configuration.  Unless the files
-and actions are specified as TAGs, all are instantiated by default.
+\`$as_me' instantiates files from templates according to the
+current configuration.
 
-Usage: $0 [OPTION]... [TAG]...
+Usage: $0 [OPTION]... [FILE]...
 
   -h, --help       print this help, then exit
   -V, --version    print version number and configuration settings, then exit
-      --config     print configuration, then exit
   -q, --quiet, --silent
                    do not print progress messages
   -d, --debug      don't remove temporary files
@@ -5607,17 +6459,16 @@
 Configuration headers:
 $config_headers
 
-Report bugs to <jtromp AT princeton.edu>."
+Report bugs to <bug-autoconf at gnu.org>."
 
 _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
-ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
 ac_cs_version="\\
 Specfem3D Globe config.status 5.1.2
-configured by $0, generated by GNU Autoconf 2.67,
-  with options \\"\$ac_cs_config\\"
+configured by $0, generated by GNU Autoconf 2.63,
+  with options \\"`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"
 
-Copyright (C) 2010 Free Software Foundation, Inc.
+Copyright (C) 2008 Free Software Foundation, Inc.
 This config.status script is free software; the Free Software Foundation
 gives unlimited permission to copy, distribute and modify it."
 
@@ -5632,16 +6483,11 @@
 while test $# != 0
 do
   case $1 in
-  --*=?*)
+  --*=*)
     ac_option=`expr "X$1" : 'X\([^=]*\)='`
     ac_optarg=`expr "X$1" : 'X[^=]*=\(.*\)'`
     ac_shift=:
     ;;
-  --*=)
-    ac_option=`expr "X$1" : 'X\([^=]*\)='`
-    ac_optarg=
-    ac_shift=:
-    ;;
   *)
     ac_option=$1
     ac_optarg=$2
@@ -5655,29 +6501,27 @@
     ac_cs_recheck=: ;;
   --version | --versio | --versi | --vers | --ver | --ve | --v | -V )
     $as_echo "$ac_cs_version"; exit ;;
-  --config | --confi | --conf | --con | --co | --c )
-    $as_echo "$ac_cs_config"; exit ;;
   --debug | --debu | --deb | --de | --d | -d )
     debug=: ;;
   --file | --fil | --fi | --f )
     $ac_shift
     case $ac_optarg in
     *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;;
-    '') as_fn_error $? "missing file argument" ;;
     esac
-    as_fn_append CONFIG_FILES " '$ac_optarg'"
+    CONFIG_FILES="$CONFIG_FILES '$ac_optarg'"
     ac_need_defaults=false;;
   --header | --heade | --head | --hea )
     $ac_shift
     case $ac_optarg in
     *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;;
     esac
-    as_fn_append CONFIG_HEADERS " '$ac_optarg'"
+    CONFIG_HEADERS="$CONFIG_HEADERS '$ac_optarg'"
     ac_need_defaults=false;;
   --he | --h)
     # Conflict between --help and --header
-    as_fn_error $? "ambiguous option: \`$1'
-Try \`$0 --help' for more information.";;
+    { $as_echo "$as_me: error: ambiguous option: $1
+Try \`$0 --help' for more information." >&2
+   { (exit 1); exit 1; }; };;
   --help | --hel | -h )
     $as_echo "$ac_cs_usage"; exit ;;
   -q | -quiet | --quiet | --quie | --qui | --qu | --q \
@@ -5685,10 +6529,11 @@
     ac_cs_silent=: ;;
 
   # This is an error.
-  -*) as_fn_error $? "unrecognized option: \`$1'
-Try \`$0 --help' for more information." ;;
+  -*) { $as_echo "$as_me: error: unrecognized option: $1
+Try \`$0 --help' for more information." >&2
+   { (exit 1); exit 1; }; } ;;
 
-  *) as_fn_append ac_config_targets " $1"
+  *) ac_config_targets="$ac_config_targets $1"
      ac_need_defaults=false ;;
 
   esac
@@ -5743,7 +6588,9 @@
     "setup/constants.h") CONFIG_FILES="$CONFIG_FILES setup/constants.h" ;;
     "setup/precision.h") CONFIG_FILES="$CONFIG_FILES setup/precision.h" ;;
 
-  *) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5 ;;
+  *) { { $as_echo "$as_me:$LINENO: error: invalid argument: $ac_config_target" >&5
+$as_echo "$as_me: error: invalid argument: $ac_config_target" >&2;}
+   { (exit 1); exit 1; }; };;
   esac
 done
 
@@ -5769,7 +6616,7 @@
   trap 'exit_status=$?
   { test -z "$tmp" || test ! -d "$tmp" || rm -fr "$tmp"; } && exit $exit_status
 ' 0
-  trap 'as_fn_exit 1' 1 2 13 15
+  trap '{ (exit 1); exit 1; }' 1 2 13 15
 }
 # Create a (secure) tmp directory for tmp files.
 
@@ -5780,7 +6627,11 @@
 {
   tmp=./conf$$-$RANDOM
   (umask 077 && mkdir "$tmp")
-} || as_fn_error $? "cannot create a temporary directory in ." "$LINENO" 5
+} ||
+{
+   $as_echo "$as_me: cannot create a temporary directory in ." >&2
+   { (exit 1); exit 1; }
+}
 
 # Set up the scripts for CONFIG_FILES section.
 # No need to generate them if there are no CONFIG_FILES.
@@ -5788,13 +6639,8 @@
 if test -n "$CONFIG_FILES"; then
 
 
-ac_cr=`echo X | tr X '\015'`
-# On cygwin, bash can eat \r inside `` if the user requested igncr.
-# But we know of no other shell where ac_cr would be empty at this
-# point, so we can use a bashism as a fallback.
-if test "x$ac_cr" = x; then
-  eval ac_cr=\$\'\\r\'
-fi
+ac_cr='
+'
 ac_cs_awk_cr=`$AWK 'BEGIN { print "a\rb" }' </dev/null 2>/dev/null`
 if test "$ac_cs_awk_cr" = "a${ac_cr}b"; then
   ac_cs_awk_cr='\\r'
@@ -5811,18 +6657,24 @@
   echo "$ac_subst_vars" | sed 's/.*/&!$&$ac_delim/' &&
   echo "_ACEOF"
 } >conf$$subs.sh ||
-  as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5
-ac_delim_num=`echo "$ac_subst_vars" | grep -c '^'`
+  { { $as_echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5
+$as_echo "$as_me: error: could not make $CONFIG_STATUS" >&2;}
+   { (exit 1); exit 1; }; }
+ac_delim_num=`echo "$ac_subst_vars" | grep -c '$'`
 ac_delim='%!_!# '
 for ac_last_try in false false false false false :; do
   . ./conf$$subs.sh ||
-    as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5
+    { { $as_echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5
+$as_echo "$as_me: error: could not make $CONFIG_STATUS" >&2;}
+   { (exit 1); exit 1; }; }
 
   ac_delim_n=`sed -n "s/.*$ac_delim\$/X/p" conf$$subs.awk | grep -c X`
   if test $ac_delim_n = $ac_delim_num; then
     break
   elif $ac_last_try; then
-    as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5
+    { { $as_echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5
+$as_echo "$as_me: error: could not make $CONFIG_STATUS" >&2;}
+   { (exit 1); exit 1; }; }
   else
     ac_delim="$ac_delim!$ac_delim _$ac_delim!! "
   fi
@@ -5844,7 +6696,7 @@
 t delim
 :nl
 h
-s/\(.\{148\}\)..*/\1/
+s/\(.\{148\}\).*/\1/
 t more1
 s/["\\]/\\&/g; s/^/"/; s/$/\\n"\\/
 p
@@ -5858,7 +6710,7 @@
 t nl
 :delim
 h
-s/\(.\{148\}\)..*/\1/
+s/\(.\{148\}\).*/\1/
 t more2
 s/["\\]/\\&/g; s/^/"/; s/$/"/
 p
@@ -5911,28 +6763,22 @@
 else
   cat
 fi < "$tmp/subs1.awk" > "$tmp/subs.awk" \
-  || as_fn_error $? "could not setup config files machinery" "$LINENO" 5
+  || { { $as_echo "$as_me:$LINENO: error: could not setup config files machinery" >&5
+$as_echo "$as_me: error: could not setup config files machinery" >&2;}
+   { (exit 1); exit 1; }; }
 _ACEOF
 
-# VPATH may cause trouble with some makes, so we remove sole $(srcdir),
-# ${srcdir} and @srcdir@ entries from VPATH if srcdir is ".", strip leading and
+# VPATH may cause trouble with some makes, so we remove $(srcdir),
+# ${srcdir} and @srcdir@ from VPATH if srcdir is ".", strip leading and
 # trailing colons and then remove the whole line if VPATH becomes empty
 # (actually we leave an empty line to preserve line numbers).
 if test "x$srcdir" = x.; then
-  ac_vpsub='/^[	 ]*VPATH[	 ]*=[	 ]*/{
-h
-s///
-s/^/:/
-s/[	 ]*$/:/
-s/:\$(srcdir):/:/g
-s/:\${srcdir}:/:/g
-s/:@srcdir@:/:/g
-s/^:*//
+  ac_vpsub='/^[	 ]*VPATH[	 ]*=/{
+s/:*\$(srcdir):*/:/
+s/:*\${srcdir}:*/:/
+s/:*@srcdir@:*/:/
+s/^\([^=]*=[	 ]*\):*/\1/
 s/:*$//
-x
-s/\(=[	 ]*\).*/\1/
-G
-s/\n//
 s/^[^=]*=[	 ]*$//
 }'
 fi
@@ -5960,7 +6806,9 @@
   if test -z "$ac_t"; then
     break
   elif $ac_last_try; then
-    as_fn_error $? "could not make $CONFIG_HEADERS" "$LINENO" 5
+    { { $as_echo "$as_me:$LINENO: error: could not make $CONFIG_HEADERS" >&5
+$as_echo "$as_me: error: could not make $CONFIG_HEADERS" >&2;}
+   { (exit 1); exit 1; }; }
   else
     ac_delim="$ac_delim!$ac_delim _$ac_delim!! "
   fi
@@ -6045,7 +6893,9 @@
 _ACAWK
 _ACEOF
 cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
-  as_fn_error $? "could not setup config headers machinery" "$LINENO" 5
+  { { $as_echo "$as_me:$LINENO: error: could not setup config headers machinery" >&5
+$as_echo "$as_me: error: could not setup config headers machinery" >&2;}
+   { (exit 1); exit 1; }; }
 fi # test -n "$CONFIG_HEADERS"
 
 
@@ -6058,7 +6908,9 @@
   esac
   case $ac_mode$ac_tag in
   :[FHL]*:*);;
-  :L* | :C*:*) as_fn_error $? "invalid tag \`$ac_tag'" "$LINENO" 5 ;;
+  :L* | :C*:*) { { $as_echo "$as_me:$LINENO: error: invalid tag $ac_tag" >&5
+$as_echo "$as_me: error: invalid tag $ac_tag" >&2;}
+   { (exit 1); exit 1; }; };;
   :[FH]-) ac_tag=-:-;;
   :[FH]*) ac_tag=$ac_tag:$ac_tag.in;;
   esac
@@ -6086,10 +6938,12 @@
 	   [\\/$]*) false;;
 	   *) test -f "$srcdir/$ac_f" && ac_f="$srcdir/$ac_f";;
 	   esac ||
-	   as_fn_error 1 "cannot find input file: \`$ac_f'" "$LINENO" 5 ;;
+	   { { $as_echo "$as_me:$LINENO: error: cannot find input file: $ac_f" >&5
+$as_echo "$as_me: error: cannot find input file: $ac_f" >&2;}
+   { (exit 1); exit 1; }; };;
       esac
       case $ac_f in *\'*) ac_f=`$as_echo "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac
-      as_fn_append ac_file_inputs " '$ac_f'"
+      ac_file_inputs="$ac_file_inputs '$ac_f'"
     done
 
     # Let's still pretend it is `configure' which instantiates (i.e., don't
@@ -6100,7 +6954,7 @@
 	`' by configure.'
     if test x"$ac_file" != x-; then
       configure_input="$ac_file.  $configure_input"
-      { $as_echo "$as_me:${as_lineno-$LINENO}: creating $ac_file" >&5
+      { $as_echo "$as_me:$LINENO: creating $ac_file" >&5
 $as_echo "$as_me: creating $ac_file" >&6;}
     fi
     # Neutralize special characters interpreted by sed in replacement strings.
@@ -6113,7 +6967,9 @@
 
     case $ac_tag in
     *:-:* | *:-) cat >"$tmp/stdin" \
-      || as_fn_error $? "could not create $ac_file" "$LINENO" 5  ;;
+      || { { $as_echo "$as_me:$LINENO: error: could not create $ac_file" >&5
+$as_echo "$as_me: error: could not create $ac_file" >&2;}
+   { (exit 1); exit 1; }; } ;;
     esac
     ;;
   esac
@@ -6141,7 +6997,47 @@
 	    q
 	  }
 	  s/.*/./; q'`
-  as_dir="$ac_dir"; as_fn_mkdir_p
+  { as_dir="$ac_dir"
+  case $as_dir in #(
+  -*) as_dir=./$as_dir;;
+  esac
+  test -d "$as_dir" || { $as_mkdir_p && mkdir -p "$as_dir"; } || {
+    as_dirs=
+    while :; do
+      case $as_dir in #(
+      *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'(
+      *) as_qdir=$as_dir;;
+      esac
+      as_dirs="'$as_qdir' $as_dirs"
+      as_dir=`$as_dirname -- "$as_dir" ||
+$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	 X"$as_dir" : 'X\(//\)[^/]' \| \
+	 X"$as_dir" : 'X\(//\)$' \| \
+	 X"$as_dir" : 'X\(/\)' \| . 2>/dev/null ||
+$as_echo X"$as_dir" |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)[^/].*/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+      test -d "$as_dir" && break
+    done
+    test -z "$as_dirs" || eval "mkdir $as_dirs"
+  } || test -d "$as_dir" || { { $as_echo "$as_me:$LINENO: error: cannot create directory $as_dir" >&5
+$as_echo "$as_me: error: cannot create directory $as_dir" >&2;}
+   { (exit 1); exit 1; }; }; }
   ac_builddir=.
 
 case "$ac_dir" in
@@ -6189,6 +7085,7 @@
 # If the template does not know about datarootdir, expand it.
 # FIXME: This hack should be removed a few years after 2.60.
 ac_datarootdir_hack=; ac_datarootdir_seen=
+
 ac_sed_dataroot='
 /datarootdir/ {
   p
@@ -6198,11 +7095,12 @@
 /@docdir@/p
 /@infodir@/p
 /@localedir@/p
-/@mandir@/p'
+/@mandir@/p
+'
 case `eval "sed -n \"\$ac_sed_dataroot\" $ac_file_inputs"` in
 *datarootdir*) ac_datarootdir_seen=yes;;
 *@datadir@*|*@docdir@*|*@infodir@*|*@localedir@*|*@mandir@*)
-  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5
+  { $as_echo "$as_me:$LINENO: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5
 $as_echo "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;}
 _ACEOF
 cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
@@ -6212,7 +7110,7 @@
   s&@infodir@&$infodir&g
   s&@localedir@&$localedir&g
   s&@mandir@&$mandir&g
-  s&\\\${datarootdir}&$datarootdir&g' ;;
+    s&\\\${datarootdir}&$datarootdir&g' ;;
 esac
 _ACEOF
 
@@ -6239,22 +7137,26 @@
 $ac_datarootdir_hack
 "
 eval sed \"\$ac_sed_extra\" "$ac_file_inputs" | $AWK -f "$tmp/subs.awk" >$tmp/out \
-  || as_fn_error $? "could not create $ac_file" "$LINENO" 5
+  || { { $as_echo "$as_me:$LINENO: error: could not create $ac_file" >&5
+$as_echo "$as_me: error: could not create $ac_file" >&2;}
+   { (exit 1); exit 1; }; }
 
 test -z "$ac_datarootdir_hack$ac_datarootdir_seen" &&
   { ac_out=`sed -n '/\${datarootdir}/p' "$tmp/out"`; test -n "$ac_out"; } &&
   { ac_out=`sed -n '/^[	 ]*datarootdir[	 ]*:*=/p' "$tmp/out"`; test -z "$ac_out"; } &&
-  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file contains a reference to the variable \`datarootdir'
-which seems to be undefined.  Please make sure it is defined" >&5
+  { $as_echo "$as_me:$LINENO: WARNING: $ac_file contains a reference to the variable \`datarootdir'
+which seems to be undefined.  Please make sure it is defined." >&5
 $as_echo "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir'
-which seems to be undefined.  Please make sure it is defined" >&2;}
+which seems to be undefined.  Please make sure it is defined." >&2;}
 
   rm -f "$tmp/stdin"
   case $ac_file in
   -) cat "$tmp/out" && rm -f "$tmp/out";;
   *) rm -f "$ac_file" && mv "$tmp/out" "$ac_file";;
   esac \
-  || as_fn_error $? "could not create $ac_file" "$LINENO" 5
+  || { { $as_echo "$as_me:$LINENO: error: could not create $ac_file" >&5
+$as_echo "$as_me: error: could not create $ac_file" >&2;}
+   { (exit 1); exit 1; }; }
  ;;
   :H)
   #
@@ -6265,19 +7167,25 @@
       $as_echo "/* $configure_input  */" \
       && eval '$AWK -f "$tmp/defines.awk"' "$ac_file_inputs"
     } >"$tmp/config.h" \
-      || as_fn_error $? "could not create $ac_file" "$LINENO" 5
+      || { { $as_echo "$as_me:$LINENO: error: could not create $ac_file" >&5
+$as_echo "$as_me: error: could not create $ac_file" >&2;}
+   { (exit 1); exit 1; }; }
     if diff "$ac_file" "$tmp/config.h" >/dev/null 2>&1; then
-      { $as_echo "$as_me:${as_lineno-$LINENO}: $ac_file is unchanged" >&5
+      { $as_echo "$as_me:$LINENO: $ac_file is unchanged" >&5
 $as_echo "$as_me: $ac_file is unchanged" >&6;}
     else
       rm -f "$ac_file"
       mv "$tmp/config.h" "$ac_file" \
-	|| as_fn_error $? "could not create $ac_file" "$LINENO" 5
+	|| { { $as_echo "$as_me:$LINENO: error: could not create $ac_file" >&5
+$as_echo "$as_me: error: could not create $ac_file" >&2;}
+   { (exit 1); exit 1; }; }
     fi
   else
     $as_echo "/* $configure_input  */" \
       && eval '$AWK -f "$tmp/defines.awk"' "$ac_file_inputs" \
-      || as_fn_error $? "could not create -" "$LINENO" 5
+      || { { $as_echo "$as_me:$LINENO: error: could not create -" >&5
+$as_echo "$as_me: error: could not create -" >&2;}
+   { (exit 1); exit 1; }; }
   fi
  ;;
 
@@ -6287,12 +7195,15 @@
 done # for ac_tag
 
 
-as_fn_exit 0
+{ (exit 0); exit 0; }
 _ACEOF
+chmod +x $CONFIG_STATUS
 ac_clean_files=$ac_clean_files_save
 
 test $ac_write_fail = 0 ||
-  as_fn_error $? "write failure creating $CONFIG_STATUS" "$LINENO" 5
+  { { $as_echo "$as_me:$LINENO: error: write failure creating $CONFIG_STATUS" >&5
+$as_echo "$as_me: error: write failure creating $CONFIG_STATUS" >&2;}
+   { (exit 1); exit 1; }; }
 
 
 # configure is writing to config.log, and then calls config.status.
@@ -6313,10 +7224,10 @@
   exec 5>>config.log
   # Use ||, not &&, to avoid exiting from the if with $? = 1, which
   # would make configure fail if this is the last instruction.
-  $ac_cs_success || as_fn_exit 1
+  $ac_cs_success || { (exit 1); exit 1; }
 fi
 if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then
-  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unrecognized options: $ac_unrecognized_opts" >&5
+  { $as_echo "$as_me:$LINENO: WARNING: unrecognized options: $ac_unrecognized_opts" >&5
 $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;}
 fi
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/configure.ac
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/configure.ac	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/configure.ac	2012-02-14 15:11:07 UTC (rev 19622)
@@ -33,7 +33,14 @@
 AC_SUBST([CUSTOM_REAL])
 AC_SUBST([CUSTOM_MPI_TYPE])
 
+AC_ARG_WITH([cuda],
+    [AC_HELP_STRING([--with-cuda],
+        [build cuda GPU enabled version @<:@default=no@:>@])],
+    [want_cuda="$withval"],
+    [want_cuda=no])
+AM_CONDITIONAL([COND_CUDA], [test "$want_cuda" = yes])
 
+
 # Checks for programs.
 
 # a courtesy to the installed base of users
@@ -76,6 +83,13 @@
 AC_ARG_VAR(MPILIBS, [extra libraries for linking MPI programs])
 AC_ARG_VAR(FLAGS_CHECK, [Fortran compiler flags for non-critical subroutines])
 AC_ARG_VAR(FLAGS_NO_CHECK, [Fortran compiler flags for creating fast, production-run code for critical subroutines])
+AC_ARG_VAR(CUDA_LIB,[Location of CUDA library libcudart])
+AC_ARG_VAR(CUDA_INC,[Location of CUDA include files])
+AC_ARG_VAR(MPI_INC,[Location of MPI include mpi.h, which is needed by nvcc when compiling cuda files])
+AC_ARG_VAR(AR, [ar library creation])
+AC_ARG_VAR(ARFLAGS, [ar flag library creation])
+AC_ARG_VAR(RANLIB, [ranlib library creation])
+
 if test x"$MPIFC" = x; then
     MPIFC=mpif90
 fi
@@ -101,7 +115,7 @@
 # Checks for library functions.
 
 
-AC_CONFIG_FILES([Makefile src/auxiliaries/Makefile src/meshfem3D/Makefile src/specfem3D/Makefile src/create_header_file/Makefile setup/constants.h setup/precision.h DATA/Par_file DATA/CMTSOLUTION DATA/STATIONS])
+AC_CONFIG_FILES([Makefile src/auxiliaries/Makefile src/meshfem3D/Makefile src/specfem3D/Makefile src/create_header_file/Makefile setup/constants.h setup/precision.h])
 AC_OUTPUT
 
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/flags.guess
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/flags.guess	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/flags.guess	2012-02-14 15:11:07 UTC (rev 19622)
@@ -27,28 +27,28 @@
         # check: http://software.intel.com/sites/products/documentation/hpc/compilerpro/en-us/fortran/lin/compiler_f/index.htm
         #
         if test x"$FLAGS_CHECK" = x; then
-            FLAGS_CHECK="-O3 -xSSE4.2 -ftz -funroll-loops -unroll5 -ftz -align sequence -vec-report0 -std95 -implicitnone -warn truncated_source -warn argument_checking -warn unused -warn declarations -warn alignments -warn ignore_loc -warn usage -check nobounds" # -mcmodel=medium
+            FLAGS_CHECK="-O3 -xSSE4.2 -ftz -funroll-loops -unroll5 -ftz -align sequence -assume byterecl -vec-report0 -std95 -implicitnone -warn truncated_source -warn argument_checking -warn unused -warn declarations -warn alignments -warn ignore_loc -warn usage -check nobounds" # -mcmodel=medium
         fi
         if test x"$FLAGS_NO_CHECK" = x; then
-            FLAGS_NO_CHECK="-O3 -xSSE4.2 -ftz -funroll-loops -unroll5 -ftz -align sequence -vec-report0 -std95 -implicitnone -warn truncated_source -warn argument_checking -warn unused -warn declarations -warn alignments -warn ignore_loc -warn usage -check nobounds" # -mcmodel=medium
+            FLAGS_NO_CHECK="-O3 -xSSE4.2 -ftz -funroll-loops -unroll5 -ftz -align sequence -assume byterecl -vec-report0 -std95 -implicitnone -warn truncated_source -warn argument_checking -warn unused -warn declarations -warn alignments -warn ignore_loc -warn usage -check nobounds" # -mcmodel=medium
         fi
         # useful for debugging...
             # for debugging: change -check nobounds to -check all -debug -g -O0 -fp-stack-check -traceback -ftrapuv
         #if test x"$FLAGS_CHECK" = x; then
         #    # without -e95
-        #    FLAGS_CHECK="-O3 -check nobounds -ftz -traceback -ftrapuv -align sequence -vec-report0 -std95 -implicitnone -warn truncated_source -warn argument_checking -warn unused -warn declarations -warn alignments -warn ignore_loc -warn usage" # -mcmodel=medium
+        #    FLAGS_CHECK="-O3 -check nobounds -ftz -traceback -ftrapuv -align sequence -assume byterecl -vec-report0 -std95 -implicitnone -warn truncated_source -warn argument_checking -warn unused -warn declarations -warn alignments -warn ignore_loc -warn usage" # -mcmodel=medium
         #fi
         #if test x"$FLAGS_NO_CHECK" = x; then
         #    # standard options (leave option -ftz, which is *critical* for performance)
-        #    FLAGS_NO_CHECK="-O3 -xP -ftz -align sequence -vec-report0 -std95 -implicitnone -warn truncated_source -warn argument_checking -warn unused -warn declarations -warn alignments -warn ignore_loc -warn usage" # -mcmodel=medium
+        #    FLAGS_NO_CHECK="-O3 -xP -ftz -align sequence -assume byterecl -vec-report0 -std95 -implicitnone -warn truncated_source -warn argument_checking -warn unused -warn declarations -warn alignments -warn ignore_loc -warn usage" # -mcmodel=medium
         #fi
         #
         # Intel Nehalem processor architecture, Intel compiler version 10.1
         #if test x"$FLAGS_CHECK" = x; then
-        #    FLAGS_CHECK="-O2 -ftz -xT -static-intel -implicitnone"
+        #    FLAGS_CHECK="-O2 -ftz -xT -assume byterecl -static-intel -implicitnone"
         #fi
         #if test x"$FLAGS_NO_CHECK" = x; then
-        #    FLAGS_NO_CHECK="-O2 -ftz -xT -static-intel -implicitnone"
+        #    FLAGS_NO_CHECK="-O2 -ftz -xT -assume byterecl -static-intel -implicitnone"
         #fi
         #
         # Intel Nehalem processor architecture, Intel compiler version 11.1
@@ -154,9 +154,9 @@
         ;;
     *xlf*|*/*xlf*)
         #
-        # on some (but not all) IBM machines one might need to add -qsave otherwise the IBM compiler allocates the
+        # do NOT remove option -qsave otherwise the IBM compiler allocates the
         # arrays in the stack and the code crashes if the stack size is too
-        # small (which is sometimes the case, but less often these days on large machines)
+        # small (which is often the case)
         #
         # on IBM with xlf one should also set
         #
@@ -177,11 +177,11 @@
         #
         if test x"$FLAGS_NO_CHECK" = x; then
 # deleted -qxflag=dvz because it requires handler function __xl_dzx and thus linking will fail 
-            FLAGS_NO_CHECK="-O3 -qstrict -q64 -qtune=auto -qarch=auto -qcache=auto -qfree=f90 -qsuffix=f=f90 -qhalt=w -qlanglvl=90pure -qsuppress=1518-317 -qflttrap=overflow:zerodivide:invalid:enable -qfloat=nans -qinitauto=7FBFFFFF -Q -Q+rank,swap_all -Wl,-relax -qunroll=yes"
+            FLAGS_NO_CHECK="-O3 -qsave -qstrict -q64 -qtune=auto -qarch=auto -qcache=auto -qfree=f90 -qsuffix=f=f90 -qhalt=w -qlanglvl=90pure -qsuppress=1518-317 -qflttrap=overflow:zerodivide:invalid:enable -qfloat=nans -qinitauto=7FBFFFFF -Q -Q+rank,swap_all -Wl,-relax -qunroll=yes"
         # on "MareNostrum" IBM at the Barcelona SuperComputing Center (Spain) use:
-        # -qtune=ppc970 -qarch=ppc64v -qsave     instead of -qtune=auto -qarch=auto
+        # -qtune=ppc970 -qarch=ppc64v instead of -qtune=auto -qarch=auto
         # on "Babel" IBM BlueGene at IDRIS (France) use:
-        # -qtune=auto -qarch=450d -qsave     instead of -qtune=auto -qarch=auto
+        # -qtune=auto -qarch=450d instead of -qtune=auto -qarch=auto
         fi
         if test x"$FLAGS_CHECK" = x; then
         #

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/setup/constants.h.in
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/setup/constants.h.in	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/setup/constants.h.in	2012-02-14 15:11:07 UTC (rev 19622)
@@ -47,7 +47,6 @@
   integer, parameter :: CUSTOM_REAL = @CUSTOM_REAL@
 
 ! this for non blocking assembly
-  logical, parameter :: USE_NONBLOCKING_COMMS = .true.
   integer, parameter :: ELEMENTS_NONBLOCKING_CM_IC = 1500
   integer, parameter :: ELEMENTS_NONBLOCKING_OC = 3000
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/auxiliaries/create_movie_AVS_DX.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/auxiliaries/create_movie_AVS_DX.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/auxiliaries/create_movie_AVS_DX.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -758,48 +758,49 @@
 !
 !=====================================================================
 !
-
-  subroutine read_params_and_create_movie
-
 !
-! This routine is called by the Pyrized version.
+!  subroutine read_params_and_create_movie
 !
+!!
+!! This routine is called by the Pyrized version.
+!!
+!
+!  implicit none
+!
+!  integer it1,it2
+!  integer iformat
+!
+!! parameters read from parameter file
+!  integer NEX_XI,NEX_ETA
+!  integer NSTEP,NTSTEP_BETWEEN_FRAMES,NCHUNKS
+!  integer NPROCTOT,NEX_PER_PROC_XI,NEX_PER_PROC_ETA
+!  logical MOVIE_SURFACE
+!
+!  integer, external :: err_occurred
+!
+!  call read_AVS_DX_parameters(NEX_XI,NEX_ETA, &
+!           NSTEP,NTSTEP_BETWEEN_FRAMES, &
+!           NCHUNKS,MOVIE_SURFACE, &
+!           NPROCTOT,NEX_PER_PROC_XI,NEX_PER_PROC_ETA)
+!
+!! read additional parameters for making movies
+!  call read_value_integer(iformat, 'format')
+!  if(err_occurred() /= 0) stop 'an error occurred while reading the parameter file'
+!  call read_value_integer(it1, 'beginning')
+!  if(err_occurred() /= 0) stop 'an error occurred while reading the parameter file'
+!  call read_value_integer(it2, 'end')
+!  if(err_occurred() /= 0) stop 'an error occurred while reading the parameter file'
+!
+!! run the main program
+!  call create_movie_AVS_DX(iformat,it1,it2, &
+!           NEX_XI,NEX_ETA, &
+!           NSTEP,NTSTEP_BETWEEN_FRAMES, &
+!           NCHUNKS, &
+!           NPROCTOT,NEX_PER_PROC_XI,NEX_PER_PROC_ETA, &
+!           USE_COMPONENT)
+!
+!  end subroutine read_params_and_create_movie
 
-  implicit none
-
-  integer it1,it2
-  integer iformat
-
-! parameters read from parameter file
-  integer NEX_XI,NEX_ETA
-  integer NSTEP,NTSTEP_BETWEEN_FRAMES,NCHUNKS
-  integer NPROCTOT,NEX_PER_PROC_XI,NEX_PER_PROC_ETA
-  logical MOVIE_SURFACE
-
-  integer, external :: err_occurred
-
-  call read_AVS_DX_parameters(NEX_XI,NEX_ETA, &
-           NSTEP,NTSTEP_BETWEEN_FRAMES, &
-           NCHUNKS,MOVIE_SURFACE, &
-           NPROCTOT,NEX_PER_PROC_XI,NEX_PER_PROC_ETA)
-
-! read additional parameters for making movies
-  call read_value_integer(iformat, 'format')
-  if(err_occurred() /= 0) stop 'an error occurred while reading the parameter file'
-  call read_value_integer(it1, 'beginning')
-  if(err_occurred() /= 0) stop 'an error occurred while reading the parameter file'
-  call read_value_integer(it2, 'end')
-  if(err_occurred() /= 0) stop 'an error occurred while reading the parameter file'
-
-! run the main program
-  call create_movie_AVS_DX(iformat,it1,it2, &
-           NEX_XI,NEX_ETA, &
-           NSTEP,NTSTEP_BETWEEN_FRAMES, &
-           NCHUNKS, &
-           NPROCTOT,NEX_PER_PROC_XI,NEX_PER_PROC_ETA)
-
-  end subroutine read_params_and_create_movie
-
 ! ------------------------------------------------------------------
 
   subroutine read_AVS_DX_parameters(NEX_XI,NEX_ETA, &

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_scalar_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_scalar_cuda.cu	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_scalar_cuda.cu	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,200 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// prepares a device array with all inter-element edge-nodes -- this
+// is followed by a memcpy and MPI operations
+__global__ void prepare_boundary_potential_on_device(realw* d_potential_dot_dot_acoustic,
+                                                     realw* d_send_potential_dot_dot_buffer,
+                                                     int num_interfaces_ext_mesh,
+                                                     int max_nibool_interfaces_ext_mesh,
+                                                     int* d_nibool_interfaces_ext_mesh,
+                                                     int* d_ibool_interfaces_ext_mesh) {
+
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+  int iinterface=0;
+
+  for( iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
+    if(id<d_nibool_interfaces_ext_mesh[iinterface]) {
+      d_send_potential_dot_dot_buffer[(id + max_nibool_interfaces_ext_mesh*iinterface)] =
+      d_potential_dot_dot_acoustic[(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)];
+    }
+  }
+
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// prepares and transfers the inter-element edge-nodes to the host to be MPI'd
+extern "C"
+void FC_FUNC_(transfer_boun_pot_from_device,
+              TRANSFER_BOUN_POT_FROM_DEVICE)(long* Mesh_pointer_f,
+                                             realw* send_potential_dot_dot_buffer,
+                                             int* FORWARD_OR_ADJOINT){
+
+  TRACE("transfer_boun_pot_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  // checks if anything to do
+  if( mp->num_interfaces_outer_core == 0 ) return;
+
+  int blocksize = BLOCKSIZE_TRANSFER;
+  int size_padded = ((int)ceil(((double)(mp->max_nibool_interfaces_outer_core))/((double)blocksize)))*blocksize;
+  int num_blocks_x = size_padded/blocksize;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  if(*FORWARD_OR_ADJOINT == 1) {
+    prepare_boundary_potential_on_device<<<grid,threads>>>(mp->d_accel_outer_core,
+                                                           mp->d_send_accel_buffer_outer_core,
+                                                           mp->num_interfaces_outer_core,
+                                                           mp->max_nibool_interfaces_outer_core,
+                                                           mp->d_nibool_interfaces_outer_core,
+                                                           mp->d_ibool_interfaces_outer_core);
+  }
+  else if(*FORWARD_OR_ADJOINT == 3) {
+    prepare_boundary_potential_on_device<<<grid,threads>>>(mp->d_b_accel_outer_core,
+                                                           mp->d_send_accel_buffer_outer_core,
+                                                           mp->num_interfaces_outer_core,
+                                                           mp->max_nibool_interfaces_outer_core,
+                                                           mp->d_nibool_interfaces_outer_core,
+                                                           mp->d_ibool_interfaces_outer_core);
+  }
+
+  print_CUDA_error_if_any(cudaMemcpy(send_potential_dot_dot_buffer,mp->d_send_accel_buffer_outer_core,
+                                     (mp->max_nibool_interfaces_outer_core)*(mp->num_interfaces_outer_core)*sizeof(realw),
+                                     cudaMemcpyDeviceToHost),98000);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("prepare_boundary_kernel");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+__global__ void assemble_boundary_potential_on_device(realw* d_potential_dot_dot_acoustic,
+                                                      realw* d_send_potential_dot_dot_buffer,
+                                                      int num_interfaces_ext_mesh,
+                                                      int max_nibool_interfaces_ext_mesh,
+                                                      int* d_nibool_interfaces_ext_mesh,
+                                                      int* d_ibool_interfaces_ext_mesh) {
+
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+  int iinterface=0;
+
+  for( iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
+    if(id<d_nibool_interfaces_ext_mesh[iinterface]) {
+
+      atomicAdd(&d_potential_dot_dot_acoustic[(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)],
+                d_send_potential_dot_dot_buffer[(id + max_nibool_interfaces_ext_mesh*iinterface)]);
+    }
+  }
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_asmbl_pot_to_device,
+              TRANSFER_ASMBL_POT_TO_DEVICE)(long* Mesh_pointer,
+                                            realw* buffer_recv_scalar,
+                                            int* FORWARD_OR_ADJOINT) {
+
+  TRACE("transfer_asmbl_pot_to_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+  //double start_time = get_time();
+
+  // checks if anything to do
+  if( mp->num_interfaces_outer_core == 0 ) return;
+
+  // copies scalar buffer onto GPU
+  cudaMemcpy(mp->d_send_accel_buffer_outer_core, buffer_recv_scalar,
+             (mp->max_nibool_interfaces_outer_core)*(mp->num_interfaces_outer_core)*sizeof(realw), 
+             cudaMemcpyHostToDevice);
+
+  // assembles on GPU
+  int blocksize = BLOCKSIZE_TRANSFER;
+  int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_outer_core)/((double)blocksize)))*blocksize;
+  int num_blocks_x = size_padded/blocksize;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  if(*FORWARD_OR_ADJOINT == 1) {
+    //assemble forward field
+    assemble_boundary_potential_on_device<<<grid,threads>>>(mp->d_accel_outer_core,
+                                                            mp->d_send_accel_buffer_outer_core,
+                                                            mp->num_interfaces_outer_core,
+                                                            mp->max_nibool_interfaces_outer_core,
+                                                            mp->d_nibool_interfaces_outer_core,
+                                                            mp->d_ibool_interfaces_outer_core);
+  }
+  else if(*FORWARD_OR_ADJOINT == 3) {
+    //assemble reconstructed/backward field
+    assemble_boundary_potential_on_device<<<grid,threads>>>(mp->d_b_accel_outer_core,
+                                                            mp->d_send_accel_buffer_outer_core,
+                                                            mp->num_interfaces_outer_core,
+                                                            mp->max_nibool_interfaces_outer_core,
+                                                            mp->d_nibool_interfaces_outer_core,
+                                                            mp->d_ibool_interfaces_outer_core);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  exit_on_cuda_error("transfer_asmbl_pot_to_device");
+#endif
+}
+

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_vector_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_vector_cuda.cu	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_vector_cuda.cu	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,287 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// prepares a device array with all inter-element edge-nodes -- this
+// is followed by a memcpy and MPI operations
+__global__ void prepare_boundary_accel_on_device(realw* d_accel, realw* d_send_accel_buffer,
+                                                 int num_interfaces_ext_mesh,
+                                                 int max_nibool_interfaces_ext_mesh,
+                                                 int* d_nibool_interfaces_ext_mesh,
+                                                 int* d_ibool_interfaces_ext_mesh) {
+
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+  int iinterface=0;
+
+  for( iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
+    if(id<d_nibool_interfaces_ext_mesh[iinterface]) {
+      d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)] =
+      d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)];
+      d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+1] =
+      d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+1];
+      d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+2] =
+      d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+2];
+    }
+  }
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// prepares and transfers the inter-element edge-nodes to the host to be MPI'd
+// (elements on boundary)
+extern "C"
+void FC_FUNC_(transfer_boun_accel_from_device,
+              TRANSFER_BOUN_ACCEL_FROM_DEVICE)(long* Mesh_pointer_f,
+                                                  realw* send_accel_buffer,
+                                                  int* IREGION,
+                                                  int* FORWARD_OR_ADJOINT){
+  TRACE("transfer_boun_accel_from_device");
+  int blocksize,size_padded,num_blocks_x,num_blocks_y;
+  
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  // crust/mantle region
+  if( *IREGION == IREGION_CRUST_MANTLE ){
+    if( mp->num_interfaces_crust_mantle > 0 ){
+
+      blocksize = BLOCKSIZE_TRANSFER;
+      size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_crust_mantle)/((double)blocksize)))*blocksize;
+      num_blocks_x = size_padded/blocksize;
+      num_blocks_y = 1;
+      while(num_blocks_x > 65535) {
+        num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+        num_blocks_y = num_blocks_y*2;
+      }
+      dim3 grid(num_blocks_x,num_blocks_y);
+      dim3 threads(blocksize,1,1);
+
+      if(*FORWARD_OR_ADJOINT == 1) {
+        prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel_crust_mantle,
+                                                           mp->d_send_accel_buffer_crust_mantle,
+                                                           mp->num_interfaces_crust_mantle,
+                                                           mp->max_nibool_interfaces_crust_mantle,
+                                                           mp->d_nibool_interfaces_crust_mantle,
+                                                           mp->d_ibool_interfaces_crust_mantle);
+      }
+      else if(*FORWARD_OR_ADJOINT == 3) {
+        prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
+                                                           mp->d_send_accel_buffer_crust_mantle,
+                                                           mp->num_interfaces_crust_mantle,
+                                                           mp->max_nibool_interfaces_crust_mantle,
+                                                           mp->d_nibool_interfaces_crust_mantle,
+                                                           mp->d_ibool_interfaces_crust_mantle);
+      }
+
+      // copies buffer to CPU
+      cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer_crust_mantle,
+                 3*mp->max_nibool_interfaces_crust_mantle*mp->num_interfaces_crust_mantle*sizeof(realw),
+                 cudaMemcpyDeviceToHost);
+               
+    }
+  }
+  
+  // inner core region
+  if( *IREGION == IREGION_INNER_CORE ){
+    if( mp->num_interfaces_inner_core > 0 ){
+      
+      blocksize = BLOCKSIZE_TRANSFER;
+      size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_inner_core)/((double)blocksize)))*blocksize;
+      num_blocks_x = size_padded/blocksize;
+      num_blocks_y = 1;
+      while(num_blocks_x > 65535) {
+        num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+        num_blocks_y = num_blocks_y*2;
+      }    
+      dim3 grid(num_blocks_x,num_blocks_y);
+      dim3 threads(blocksize,1,1);
+      
+      if(*FORWARD_OR_ADJOINT == 1) {
+        prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel_inner_core,
+                                                           mp->d_send_accel_buffer_inner_core,
+                                                           mp->num_interfaces_inner_core,
+                                                           mp->max_nibool_interfaces_inner_core,
+                                                           mp->d_nibool_interfaces_inner_core,
+                                                           mp->d_ibool_interfaces_inner_core);
+      }
+      else if(*FORWARD_OR_ADJOINT == 3) {
+        prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel_inner_core,
+                                                           mp->d_send_accel_buffer_inner_core,
+                                                           mp->num_interfaces_inner_core,
+                                                           mp->max_nibool_interfaces_inner_core,
+                                                           mp->d_nibool_interfaces_inner_core,
+                                                           mp->d_ibool_interfaces_inner_core);
+      }
+      
+      // copies buffer to CPU
+      cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer_inner_core,
+                 3*mp->max_nibool_interfaces_inner_core*mp->num_interfaces_inner_core*sizeof(realw),
+                 cudaMemcpyDeviceToHost);
+      
+    }
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("transfer_boun_accel_from_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void assemble_boundary_accel_on_device(realw* d_accel, realw* d_send_accel_buffer,
+                                                  int num_interfaces_ext_mesh,
+                                                  int max_nibool_interfaces_ext_mesh,
+                                                  int* d_nibool_interfaces_ext_mesh,
+                                                  int* d_ibool_interfaces_ext_mesh) {
+
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+  int iinterface=0;
+  
+  for( iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
+    if(id < d_nibool_interfaces_ext_mesh[iinterface]) {
+      atomicAdd(&d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)],
+                d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)]);
+      atomicAdd(&d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+1],
+                d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+1]);
+      atomicAdd(&d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+2],
+                d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+2]);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// FORWARD_OR_ADJOINT == 1 for accel, and == 3 for b_accel
+extern "C"
+void FC_FUNC_(transfer_asmbl_accel_to_device,
+              TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer,
+                                              realw* buffer_recv_vector,
+                                              int* IREGION,
+                                              int* FORWARD_OR_ADJOINT) {
+  TRACE("transfer_asmbl_accel_to_device");
+  int blocksize,size_padded,num_blocks_x,num_blocks_y;
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  // crust/mantle region
+  if( *IREGION == IREGION_CRUST_MANTLE ){
+    if( mp->num_interfaces_crust_mantle > 0 ){
+
+      // copies vector buffer values to GPU
+      cudaMemcpy(mp->d_send_accel_buffer_crust_mantle, buffer_recv_vector,
+                 3*(mp->max_nibool_interfaces_crust_mantle)*(mp->num_interfaces_crust_mantle)*sizeof(realw),
+                 cudaMemcpyHostToDevice);
+
+      // assembles values
+      blocksize = BLOCKSIZE_TRANSFER;
+      size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_crust_mantle)/((double)blocksize)))*blocksize;
+      num_blocks_x = size_padded/blocksize;
+      num_blocks_y = 1;
+      while(num_blocks_x > 65535) {
+        num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+        num_blocks_y = num_blocks_y*2;
+      }
+
+      dim3 grid(num_blocks_x,num_blocks_y);
+      dim3 threads(blocksize,1,1);
+
+      if(*FORWARD_OR_ADJOINT == 1) { //assemble forward accel
+        assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel_crust_mantle, 
+                                                            mp->d_send_accel_buffer_crust_mantle,
+                                                            mp->num_interfaces_crust_mantle,
+                                                            mp->max_nibool_interfaces_crust_mantle,
+                                                            mp->d_nibool_interfaces_crust_mantle,
+                                                            mp->d_ibool_interfaces_crust_mantle);
+      }
+      else if(*FORWARD_OR_ADJOINT == 3) { //assemble adjoint accel
+        assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel_crust_mantle, 
+                                                            mp->d_send_accel_buffer_crust_mantle,
+                                                            mp->num_interfaces_crust_mantle,
+                                                            mp->max_nibool_interfaces_crust_mantle,
+                                                            mp->d_nibool_interfaces_crust_mantle,
+                                                            mp->d_ibool_interfaces_crust_mantle);
+      }
+    }
+  }
+
+  // inner core region
+  if( *IREGION == IREGION_INNER_CORE ){
+    if( mp->num_interfaces_inner_core > 0 ){
+      // copies buffer values to GPU
+      cudaMemcpy(mp->d_send_accel_buffer_inner_core, buffer_recv_vector,
+                 3*(mp->max_nibool_interfaces_inner_core)*(mp->num_interfaces_inner_core)*sizeof(realw),
+                 cudaMemcpyHostToDevice);
+      
+      // assembles values
+      blocksize = BLOCKSIZE_TRANSFER;
+      size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_inner_core)/((double)blocksize)))*blocksize;
+      num_blocks_x = size_padded/blocksize;
+      num_blocks_y = 1;
+      while(num_blocks_x > 65535) {
+        num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+        num_blocks_y = num_blocks_y*2;
+      }
+      
+      dim3 grid(num_blocks_x,num_blocks_y);
+      dim3 threads(blocksize,1,1);
+      
+      if(*FORWARD_OR_ADJOINT == 1) { //assemble forward accel
+        assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel_inner_core, 
+                                                            mp->d_send_accel_buffer_inner_core,
+                                                            mp->num_interfaces_inner_core,
+                                                            mp->max_nibool_interfaces_inner_core,
+                                                            mp->d_nibool_interfaces_inner_core,
+                                                            mp->d_ibool_interfaces_inner_core);
+      }
+      else if(*FORWARD_OR_ADJOINT == 3) { //assemble adjoint accel
+        assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel_inner_core, 
+                                                            mp->d_send_accel_buffer_inner_core,
+                                                            mp->num_interfaces_inner_core,
+                                                            mp->max_nibool_interfaces_inner_core,
+                                                            mp->d_nibool_interfaces_inner_core,
+                                                            mp->d_ibool_interfaces_inner_core);
+      }
+    }
+  }
+  
+  
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("transfer_asmbl_accel_to_device");
+#endif
+}

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,928 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#ifdef WITH_MPI
+#include <mpi.h>
+#endif
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+#include "prepare_constants_cuda.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Helper functions
+
+/* ----------------------------------------------------------------------------------------------- */
+
+double get_time()
+{
+  // wall-clock time in seconds (microsecond resolution), for coarse timing
+  struct timeval tv;
+  struct timezone tz;
+  gettimeofday(&tv, &tz);
+  return (double) tv.tv_sec + (double) tv.tv_usec * 1e-6;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Fortran-callable entry point: unconditionally blocks this rank so a
+// debugger can be attached (see pause_for_debugger below).
+extern "C"
+void FC_FUNC_(pause_for_debug,
+              PAUSE_FOR_DEBUG)() {
+  TRACE("pause_for_debug");
+
+  pause_for_debugger(1);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// If 'pause' is non-zero, prints rank/hostname/PID (and mirrors them to a
+// scratch file) and then spins forever so a debugger can attach.
+// NOTE(review): the attach-file path below is hard-coded to one cluster's
+// scratch directory -- harmless elsewhere, since the fopen simply fails.
+void pause_for_debugger(int pause) {
+  if(pause) {
+    int myrank;
+#ifdef WITH_MPI
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+#else
+    myrank = 0;
+#endif
+    printf("I'm rank %d\n",myrank);
+    int i = 0;
+    char hostname[256];
+    gethostname(hostname, sizeof(hostname));
+    printf("PID %d on %s:%d ready for attach\n", getpid(), hostname,myrank);
+    FILE *file = fopen("/scratch/eiger/rietmann/attach_gdb.txt","w+");
+    if( file != NULL ){
+      fprintf(file,"PID %d on %s:%d ready for attach\n", getpid(), hostname,myrank);
+      fclose(file);
+    }
+    fflush(stdout);
+    // i is never modified here: loop until a debugger attaches and sets
+    // i != 0 by hand (or the process is killed)
+    while (0 == i)
+      sleep(5);
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void exit_on_cuda_error(char* kernel_name) {
+  // synchronize first so that failures from earlier asynchronous launches
+  // surface here, attributed to 'kernel_name', rather than at some later
+  // unrelated call site
+  cudaThreadSynchronize();
+
+  cudaError_t err = cudaGetLastError();
+  if( err != cudaSuccess ){
+    fprintf(stderr,"Error after: %s --- %s\n", kernel_name, cudaGetErrorString(err));
+    pause_for_debugger(0);
+    exit(EXIT_FAILURE);
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void exit_on_error(char* info)
+{
+  // prints the message, tears down all MPI ranks (when built with MPI),
+  // then terminates this process
+  printf("\nERROR: %s\n",info);
+  fflush(stdout);
+#ifdef WITH_MPI
+  MPI_Abort(MPI_COMM_WORLD,1);
+#endif
+  exit(EXIT_FAILURE);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void print_CUDA_error_if_any(cudaError_t err, int num)
+{
+  // checks a CUDA return code; 'num' is a hand-assigned id that identifies
+  // the failing call site in the source
+  if( err != cudaSuccess ){
+    printf("\nCUDA error !!!!! <%s> !!!!! \nat CUDA call error code: # %d\n",cudaGetErrorString(err),num);
+    fflush(stdout);
+#ifdef WITH_MPI
+    MPI_Abort(MPI_COMM_WORLD,1);
+#endif
+    exit(EXIT_FAILURE);
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void get_free_memory(double* free_db, double* used_db, double* total_db) {
+
+  // queries the CUDA runtime for device memory usage (returned in bytes)
+  size_t bytes_free;
+  size_t bytes_total;
+
+  cudaError_t cuda_status = cudaMemGetInfo( &bytes_free, &bytes_total );
+  if( cuda_status != cudaSuccess ){
+    printf("Error: cudaMemGetInfo fails, %s \n", cudaGetErrorString(cuda_status) );
+    exit(EXIT_FAILURE);
+  }
+
+  *free_db  = (double) bytes_free;
+  *total_db = (double) bytes_total;
+  *used_db  = *total_db - *free_db;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Saves GPU memory usage to file
+void output_free_memory(int myrank,char* info_str) {
+
+  // appends current GPU memory usage (in MB) to a per-rank log file
+  double free_db,used_db,total_db;
+  get_free_memory(&free_db,&used_db,&total_db);
+
+  char filename[BUFSIZ];
+  sprintf(filename,"OUTPUT_FILES/gpu_device_mem_usage_proc_%06d.txt",myrank);
+
+  FILE* fp = fopen(filename,"a+");
+  if( fp != NULL ){
+    fprintf(fp,"%d: @%s GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", myrank, info_str,
+            used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
+    fclose(fp);
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Fortran-callable version of above method
+extern "C"
+void FC_FUNC_(output_free_device_memory,
+              OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {
+
+  TRACE("output_free_device_memory");
+
+  // bug fix: the original buffer was char info[6], which sprintf overruns
+  // for ranks >= 100 ("f %d:" needs 5 + #digits bytes including the NUL);
+  // use a comfortably sized buffer and bounded formatting instead
+  char info[64];
+  snprintf(info,sizeof(info),"f %d:",*myrank);
+  output_free_memory(*myrank,info);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+extern "C"
+void FC_FUNC_(get_free_device_memory,
+              GET_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {
+  TRACE("get_free_device_memory");
+
+  // returns current device memory usage to the Fortran caller, in MB.
+  // bug fix: the second FC_FUNC_ argument must be the all-uppercase name
+  // (was "get_FREE_DEVICE_MEMORY"); on Fortran compilers that mangle
+  // symbols to uppercase, the mixed-case spelling emits the wrong symbol.
+  double free_db,used_db,total_db;
+
+  get_free_memory(&free_db,&used_db,&total_db);
+
+  // converts bytes to MB
+  *free  = (realw) (free_db  / 1024.0 / 1024.0);
+  *used  = (realw) (used_db  / 1024.0 / 1024.0);
+  *total = (realw) (total_db / 1024.0 / 1024.0);
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+//daniel: helper function
+/*
+ __global__ void check_phase_ispec_kernel(int num_phase_ispec,
+ int* phase_ispec,
+ int NSPEC_AB,
+ int* ier) {
+
+ int i,ispec,iphase,count0,count1;
+ *ier = 0;
+
+ for(iphase=0; iphase < 2; iphase++){
+ count0 = 0;
+ count1 = 0;
+
+ for(i=0; i < num_phase_ispec; i++){
+ ispec = phase_ispec[iphase*num_phase_ispec + i] - 1;
+ if( ispec < -1 || ispec >= NSPEC_AB ){
+ printf("Error in d_phase_ispec_inner_elastic %d %d\n",i,ispec);
+ *ier = 1;
+ return;
+ }
+ if( ispec >= 0 ){ count0++;}
+ if( ispec < 0 ){ count1++;}
+ }
+
+ printf("check_phase_ispec done: phase %d, count = %d %d \n",iphase,count0,count1);
+
+ }
+ }
+
+ void check_phase_ispec(long* Mesh_pointer_f,int type){
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+ printf("check phase_ispec for type=%d\n",type);
+
+ dim3 grid(1,1);
+ dim3 threads(1,1,1);
+
+ int* h_debug = (int*) calloc(1,sizeof(int));
+ int* d_debug;
+ cudaMalloc((void**)&d_debug,sizeof(int));
+
+ if( type == 1 ){
+ check_phase_ispec_kernel<<<grid,threads>>>(mp->num_phase_ispec_elastic,
+ mp->d_phase_ispec_inner_elastic,
+ mp->NSPEC_AB,
+ d_debug);
+ }else if( type == 2 ){
+ check_phase_ispec_kernel<<<grid,threads>>>(mp->num_phase_ispec_acoustic,
+ mp->d_phase_ispec_inner_acoustic,
+ mp->NSPEC_AB,
+ d_debug);
+ }
+
+ cudaMemcpy(h_debug,d_debug,1*sizeof(int),cudaMemcpyDeviceToHost);
+ cudaFree(d_debug);
+ if( *h_debug != 0 ){printf("error for type=%d\n",type); exit(1);}
+ free(h_debug);
+ fflush(stdout);
+
+ #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("check_phase_ispec");
+ #endif
+
+ }
+ */
+
+/* ----------------------------------------------------------------------------------------------- */
+//daniel: helper function
+/*
+ __global__ void check_ispec_is_kernel(int NSPEC_AB,
+ int* ispec_is,
+ int* ier) {
+
+ int ispec,count0,count1;
+
+ *ier = 0;
+ count0 = 0;
+ count1 = 0;
+ for(ispec=0; ispec < NSPEC_AB; ispec++){
+ if( ispec_is[ispec] < -1 || ispec_is[ispec] > 1 ){
+ printf("Error in ispec_is %d %d\n",ispec,ispec_is[ispec]);
+ *ier = 1;
+ return;
+ //exit(1);
+ }
+ if( ispec_is[ispec] == 0 ){count0++;}
+ if( ispec_is[ispec] != 0 ){count1++;}
+ }
+ printf("check_ispec_is done: count = %d %d\n",count0,count1);
+ }
+
+ void check_ispec_is(long* Mesh_pointer_f,int type){
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+ printf("check ispec_is for type=%d\n",type);
+
+ dim3 grid(1,1);
+ dim3 threads(1,1,1);
+
+ int* h_debug = (int*) calloc(1,sizeof(int));
+ int* d_debug;
+ cudaMalloc((void**)&d_debug,sizeof(int));
+
+ if( type == 0 ){
+ check_ispec_is_kernel<<<grid,threads>>>(mp->NSPEC_AB,
+ mp->d_ispec_is_inner,
+ d_debug);
+ }else if( type == 1 ){
+ check_ispec_is_kernel<<<grid,threads>>>(mp->NSPEC_AB,
+ mp->d_ispec_is_elastic,
+ d_debug);
+ }else if( type == 2 ){
+ check_ispec_is_kernel<<<grid,threads>>>(mp->NSPEC_AB,
+ mp->d_ispec_is_acoustic,
+ d_debug);
+ }
+
+ cudaMemcpy(h_debug,d_debug,1*sizeof(int),cudaMemcpyDeviceToHost);
+ cudaFree(d_debug);
+ if( *h_debug != 0 ){printf("error for type=%d\n",type); exit(1);}
+ free(h_debug);
+ fflush(stdout);
+
+ #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("check_ispec_is");
+ #endif
+ }
+ */
+/* ----------------------------------------------------------------------------------------------- */
+//daniel: helper function
+/*
+ __global__ void check_array_ispec_kernel(int num_array_ispec,
+ int* array_ispec,
+ int NSPEC_AB,
+ int* ier) {
+
+ int i,ispec,count0,count1;
+
+ *ier = 0;
+ count0 = 0;
+ count1 = 0;
+
+ for(i=0; i < num_array_ispec; i++){
+ ispec = array_ispec[i] - 1;
+ if( ispec < -1 || ispec >= NSPEC_AB ){
+ printf("Error in d_array_ispec %d %d\n",i,ispec);
+ *ier = 1;
+ return;
+ }
+ if( ispec >= 0 ){ count0++;}
+ if( ispec < 0 ){ count1++;}
+ }
+
+ printf("check_array_ispec done: count = %d %d \n",count0,count1);
+ }
+
+ void check_array_ispec(long* Mesh_pointer_f,int type){
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+ printf("check array_ispec for type=%d\n",type);
+
+ dim3 grid(1,1);
+ dim3 threads(1,1,1);
+
+ int* h_debug = (int*) calloc(1,sizeof(int));
+ int* d_debug;
+ cudaMalloc((void**)&d_debug,sizeof(int));
+
+ if( type == 1 ){
+ check_array_ispec_kernel<<<grid,threads>>>(mp->d_num_abs_boundary_faces,
+ mp->d_abs_boundary_ispec,
+ mp->NSPEC_AB,
+ d_debug);
+ }
+
+ cudaMemcpy(h_debug,d_debug,1*sizeof(int),cudaMemcpyDeviceToHost);
+ cudaFree(d_debug);
+ if( *h_debug != 0 ){printf("error for type=%d\n",type); exit(1);}
+ free(h_debug);
+ fflush(stdout);
+
+ #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("check_array_ispec");
+ #endif
+
+ }
+ */
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Check functions
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(check_max_norm_displ_gpu,
+              CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {
+
+  TRACE("check_max_norm_displ_gpu");
+
+  // copies the forward displacement field off the device into 'displ'
+  // and prints its largest magnitude
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get mesh pointer out of fortran integer container
+
+  cudaMemcpy(displ, mp->d_displ,*size*sizeof(realw),cudaMemcpyDeviceToHost);
+
+  realw maxnorm = 0;
+  for(int i=0; i < *size; i++) {
+    if( fabsf(displ[i]) > maxnorm ) maxnorm = fabsf(displ[i]);
+  }
+  printf("%d: maxnorm of forward displ = %e\n",*announceID,maxnorm);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(check_max_norm_vector,
+              CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {
+
+  TRACE("check_max_norm_vector");
+
+  // reports the largest absolute entry of vector1 and its index
+  int procid;
+#ifdef WITH_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD,&procid);
+#else
+  procid = 0;
+#endif
+
+  realw maxnorm = 0;
+  int maxloc = 0; // bug fix: was uninitialized (read below even when *size == 0)
+  for(int i=0; i < *size; i++) {
+    if( maxnorm < fabsf(vector1[i]) ) {
+      // bug fix: store the magnitude itself, not the signed value, so the
+      // printed "maxnorm" is consistent with the fabsf() comparison above
+      maxnorm = fabsf(vector1[i]);
+      maxloc = i;
+    }
+  }
+  printf("%d:maxnorm of vector %d [%d] = %e\n",procid,*announceID,maxloc,maxnorm);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(check_max_norm_displ,
+              CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {
+
+  TRACE("check_max_norm_displ");
+
+  // scans a host-side displacement array and prints its largest magnitude
+  realw maxnorm = 0;
+  int n = *size;
+  for(int i=0; i < n; i++) {
+    if( fabsf(displ[i]) > maxnorm ) maxnorm = fabsf(displ[i]);
+  }
+  printf("%d: maxnorm of forward displ = %e\n",*announceID,maxnorm);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(check_max_norm_b_displ_gpu,
+              CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {
+
+  TRACE("check_max_norm_b_displ_gpu");
+
+  // copies the backward displacement and acceleration fields off the
+  // device and prints both of their max norms
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get mesh pointer out of fortran integer container
+
+  realw* b_accel = (realw*)malloc(*size*sizeof(realw));
+
+  cudaMemcpy(b_displ, mp->d_b_displ,*size*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_accel, mp->d_b_accel,*size*sizeof(realw),cudaMemcpyDeviceToHost);
+
+  realw maxnorm = 0;
+  realw maxnorm_accel = 0;
+
+  for(int i=0; i < *size; i++) {
+    maxnorm = MAX(maxnorm,fabsf(b_displ[i]));
+    // bug fix: the accel maximum accumulated into 'maxnorm' (the displ
+    // maximum) instead of 'maxnorm_accel', so the printed accel norm only
+    // reflected the last element
+    maxnorm_accel = MAX(maxnorm_accel,fabsf(b_accel[i]));
+  }
+  free(b_accel);
+
+  printf("%d: maxnorm of backward displ = %e\n",*announceID,maxnorm);
+  printf("%d: maxnorm of backward accel = %e\n",*announceID,maxnorm_accel);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(check_max_norm_b_accel_gpu,
+              CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {
+
+  TRACE("check_max_norm_b_accel_gpu");
+
+  // copies the backward acceleration field off the device into 'b_accel'
+  // and prints its largest magnitude
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get mesh pointer out of fortran integer container
+
+  cudaMemcpy(b_accel, mp->d_b_accel,*size*sizeof(realw),cudaMemcpyDeviceToHost);
+
+  realw maxnorm = 0;
+  for(int i=0; i < *size; i++) {
+    if( fabsf(b_accel[i]) > maxnorm ) maxnorm = fabsf(b_accel[i]);
+  }
+  printf("%d: maxnorm of backward accel = %e\n",*announceID,maxnorm);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(check_max_norm_b_veloc_gpu,
+              CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {
+
+  TRACE("check_max_norm_b_veloc_gpu");
+
+  // copies the backward velocity field off the device into 'b_veloc'
+  // and prints its largest magnitude
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get mesh pointer out of fortran integer container
+
+  cudaMemcpy(b_veloc, mp->d_b_veloc,*size*sizeof(realw),cudaMemcpyDeviceToHost);
+
+  realw maxnorm = 0;
+  for(int i=0; i < *size; i++) {
+    if( fabsf(b_veloc[i]) > maxnorm ) maxnorm = fabsf(b_veloc[i]);
+  }
+  printf("%d: maxnorm of backward veloc = %e\n",*announceID,maxnorm);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(check_max_norm_b_displ,
+              CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {
+
+  TRACE("check_max_norm_b_displ");
+
+  // prints the largest magnitude of a host-side backward displacement array
+  realw maxnorm = 0;
+  int n = *size;
+  for(int i=0; i < n; i++) {
+    if( fabsf(b_displ[i]) > maxnorm ) maxnorm = fabsf(b_displ[i]);
+  }
+  printf("%d:maxnorm of backward displ = %e\n",*announceID,maxnorm);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(check_max_norm_b_accel,
+              CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {
+
+  TRACE("check_max_norm_b_accel");
+
+  // prints the largest magnitude of a host-side backward acceleration array
+  realw maxnorm = 0;
+  int n = *size;
+  for(int i=0; i < n; i++) {
+    if( fabsf(b_accel[i]) > maxnorm ) maxnorm = fabsf(b_accel[i]);
+  }
+  printf("%d:maxnorm of backward accel = %e\n",*announceID,maxnorm);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(check_error_vectors,
+              CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {
+
+  TRACE("check_error_vectors");
+
+  // compares two vectors: prints the relative squared L2 error and the
+  // largest pointwise difference together with its location
+  int size = *sizef;
+
+  double diff2 = 0;
+  double sum = 0;
+  double temp;
+  double maxerr = 0;
+  int maxerrorloc = 0; // bug fix: was uninitialized (read below when size == 0 or vectors identical)
+
+  for(int i=0;i<size;++i) {
+    temp = vector1[i]-vector2[i];
+    diff2 += temp*temp;
+    sum += vector1[i]*vector1[i];
+    // bug fix: use fabs() on the double difference throughout; the original
+    // compared with fabsf() (float) but updated with abs(), which resolves
+    // to the integer overload on some hosts and truncates the error
+    if(maxerr < fabs(temp)) {
+      maxerr = fabs(temp);
+      maxerrorloc = i;
+    }
+  }
+
+  printf("rel error = %f, maxerr = %e @ %d\n",diff2/sum,maxerr,maxerrorloc);
+
+  int myrank;
+#ifdef WITH_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+#else
+  myrank = 0;
+#endif
+  if(myrank == 0) {
+    // prints the neighborhood of the worst entry;
+    // bug fix: clamp at 0 so we never index below the start of the arrays
+    for(int i=maxerrorloc; i > maxerrorloc-5 && i >= 0; i--) {
+      printf("[%d]: %e vs. %e\n",i,vector1[i],vector2[i]);
+    }
+  }
+
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Auxiliary functions
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(get_max_accel,
+              GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {
+
+  TRACE("get_max_accel");
+
+  // copies the acceleration field off the device and prints its maximum
+  // (signed) entry, tagged with the time step and MPI rank
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
+
+  int procid;
+#ifdef WITH_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD,&procid);
+#else
+  procid = 0;
+#endif
+
+  int size = *sizef;
+  int it = *itf;
+
+  realw* accel_cpy = (realw*)malloc(size*sizeof(realw));
+  cudaMemcpy(accel_cpy,mp->d_accel,size*sizeof(realw),cudaMemcpyDeviceToHost);
+
+  realw maxval = 0;
+  for(int i=0; i < size; ++i) {
+    if( accel_cpy[i] > maxval ) maxval = accel_cpy[i];
+  }
+  printf("%d/%d: max=%e\n",it,procid,maxval);
+
+  free(accel_cpy);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ACOUSTIC simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Block-wise maximum reduction: each block writes the maximum absolute
+// value of its slice of 'array' into d_max[blockIdx.x]; the host then
+// reduces the per-block results to a single value.
+// NOTE: the shared array is sized for a fixed blocksize of 256 threads;
+// launches must use a block size of at most 256.
+__global__ void get_maximum_kernel(realw* array, int size, realw* d_max){
+
+  /* simplest version: uses only 1 thread
+   realw max;
+   max = 0;
+   // finds maximum value in array
+   if( size > 0 ){
+   max = abs(array[0]);
+   for( int i=1; i < size; i++){
+   if( abs(array[i]) > max ) max = abs(array[i]);
+   }
+   }
+   *d_max = max;
+   */
+
+  // reduction example:
+  __shared__ realw sdata[256] ;
+
+  // load shared mem
+  unsigned int tid = threadIdx.x;
+  unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
+
+  // loads absolute values into shared memory
+  // (out-of-range threads contribute 0, neutral for a maximum of magnitudes)
+  sdata[tid] = (i < size) ? fabs(array[i]) : 0.0 ;
+
+  __syncthreads();
+
+  // do reduction in shared mem
+  // tree-based: halve the active threads each step; __syncthreads() sits
+  // outside the 'if' so every thread in the block reaches the barrier
+  for(unsigned int s=blockDim.x/2; s>0; s>>=1)
+  {
+    if (tid < s){
+      // summation:
+      //sdata[tid] += sdata[tid + s];
+      // maximum:
+      if( sdata[tid] < sdata[tid + s] ) sdata[tid] = sdata[tid + s];
+    }
+    __syncthreads();
+  }
+
+  // write result for this block to global mem
+  if (tid == 0) d_max[blockIdx.x] = sdata[0];
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(get_norm_acoustic_from_device,
+              GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
+                                                  long* Mesh_pointer_f,
+                                                  int* SIMULATION_TYPE) {
+
+TRACE("get_norm_acoustic_from_device");
+
+  // returns in *norm the maximum absolute value of the forward
+  // (SIMULATION_TYPE == 1) or backward (SIMULATION_TYPE == 3) acoustic
+  // potential, using a block-wise reduction kernel on the device followed
+  // by a final reduction over the per-block results on the host.
+  // (earlier experiments -- single-thread kernel, full host copy,
+  //  cublasIsamax -- were removed; see revision history for timings)
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  realw max;
+  realw *d_max;
+
+  max = 0;
+
+  // blocksize must match the shared-memory array size in get_maximum_kernel
+  int blocksize = 256;
+
+  // bug fix: ceil(mp->NGLOB_AB/blocksize) performed integer division
+  // before the ceil, truncating the block count and leaving up to
+  // blocksize-1 trailing entries out of the reduction; use integer ceiling
+  int num_blocks_x = (mp->NGLOB_AB + blocksize - 1)/blocksize;
+
+  realw* h_max = (realw*) calloc(num_blocks_x,sizeof(realw));
+  cudaMalloc((void**)&d_max,num_blocks_x*sizeof(realw));
+
+  dim3 grid(num_blocks_x,1);
+  dim3 threads(blocksize,1,1);
+
+  if(*SIMULATION_TYPE == 1 ){
+    get_maximum_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
+                                         mp->NGLOB_AB,
+                                         d_max);
+  }
+
+  if(*SIMULATION_TYPE == 3 ){
+    get_maximum_kernel<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
+                                         mp->NGLOB_AB,
+                                         d_max);
+  }
+
+  print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(realw),cudaMemcpyDeviceToHost),222);
+
+  // determines max over all block results
+  max = h_max[0];
+  for(int i=1;i<num_blocks_x;i++) {
+    if( max < h_max[i]) max = h_max[i];
+  }
+
+  cudaFree(d_max);
+  free(h_max);
+
+  // return result
+  *norm = max;
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after get_norm_acoustic_from_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ELASTIC simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Block-wise maximum reduction over a 3-component vector field stored
+// interleaved as (x,y,z) triples: each block writes the maximum Euclidean
+// norm of its slice into d_max[blockIdx.x]; the host reduces the rest.
+// NOTE: the shared array is sized for a fixed blocksize of 256 threads.
+__global__ void get_maximum_vector_kernel(realw* array, int size, realw* d_max){
+
+  // reduction example:
+  __shared__ realw sdata[256] ;
+
+  // load shared mem
+  unsigned int tid = threadIdx.x;
+  unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
+
+  // loads values into shared memory: assume array is a vector array
+  // (out-of-range threads contribute 0, neutral for a maximum of norms)
+  sdata[tid] = (i < size) ? sqrt(array[i*3]*array[i*3]
+                                 + array[i*3+1]*array[i*3+1]
+                                 + array[i*3+2]*array[i*3+2]) : 0.0 ;
+
+  __syncthreads();
+
+  // do reduction in shared mem
+  // tree-based: halve the active threads each step; __syncthreads() sits
+  // outside the 'if' so every thread in the block reaches the barrier
+  for(unsigned int s=blockDim.x/2; s>0; s>>=1)
+  {
+    if (tid < s){
+      // summation:
+      //sdata[tid] += sdata[tid + s];
+      // maximum:
+      if( sdata[tid] < sdata[tid + s] ) sdata[tid] = sdata[tid + s];
+    }
+    __syncthreads();
+  }
+
+  // write result for this block to global mem
+  if (tid == 0) d_max[blockIdx.x] = sdata[0];
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(get_norm_elastic_from_device,
+              GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
+                                                 long* Mesh_pointer_f,
+                                                 int* SIMULATION_TYPE) {
+
+  TRACE("get_norm_elastic_from_device");
+
+  // returns in *norm the maximum Euclidean norm of the forward
+  // (SIMULATION_TYPE == 1) or backward (SIMULATION_TYPE == 3) displacement
+  // field, via a block-wise vector-norm reduction kernel on the device
+  // and a final reduction over the per-block results on the host
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  realw max;
+  realw *d_max;
+
+  max = 0;
+
+  // blocksize must match the shared-memory array size in get_maximum_vector_kernel
+  int blocksize = 256;
+
+  // bug fix: ceil(mp->NGLOB_AB/blocksize) performed integer division
+  // before the ceil, truncating the block count and leaving up to
+  // blocksize-1 trailing nodes out of the reduction; use integer ceiling
+  int num_blocks_x = (mp->NGLOB_AB + blocksize - 1)/blocksize;
+
+  realw* h_max = (realw*) calloc(num_blocks_x,sizeof(realw));
+  cudaMalloc((void**)&d_max,num_blocks_x*sizeof(realw));
+
+  dim3 grid(num_blocks_x,1);
+  dim3 threads(blocksize,1,1);
+
+  if(*SIMULATION_TYPE == 1 ){
+    get_maximum_vector_kernel<<<grid,threads>>>(mp->d_displ,
+                                                mp->NGLOB_AB,
+                                                d_max);
+  }
+
+  if(*SIMULATION_TYPE == 3 ){
+    get_maximum_vector_kernel<<<grid,threads>>>(mp->d_b_displ,
+                                                mp->NGLOB_AB,
+                                                d_max);
+  }
+
+  print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(realw),cudaMemcpyDeviceToHost),222);
+
+  // determines max over all block results
+  max = h_max[0];
+  for(int i=1;i<num_blocks_x;i++) {
+    if( max < h_max[i]) max = h_max[i];
+  }
+
+  cudaFree(d_max);
+  free(h_max);
+
+  // return result
+  *norm = max;
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after get_norm_elastic_from_device");
+#endif
+}
+
+

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_acoustic_cuda.cu	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_acoustic_cuda.cu	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,369 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// acoustic sources
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Adds acoustic point-source contributions to the pressure potential.
+//
+// Launch layout: one (possibly x/y-folded) block per source, with a 5x5x5
+// thread block covering the GLL points of the source element.
+// ibool and ispec_selected_source hold 1-based Fortran indices; the -1
+// converts them to C indexing.
+__global__ void compute_add_sources_acoustic_kernel(realw* potential_dot_dot_acoustic,
+                                                    int* ibool,
+                                                    int* ispec_is_inner,
+                                                    int phase_is_inner,
+                                                    realw* sourcearrays,
+                                                    double* stf_pre_compute,
+                                                    int myrank,
+                                                    int* islice_selected_source,
+                                                    int* ispec_selected_source,
+                                                    int* ispec_is_acoustic,
+                                                    realw* kappastore,
+                                                    int NSOURCES) {
+  // local GLL point indices (i,j,k) within the source element
+  int i = threadIdx.x;
+  int j = threadIdx.y;
+  int k = threadIdx.z;
+
+  int isource  = blockIdx.x + gridDim.x*blockIdx.y; // bx
+
+  int ispec;
+  int iglob;
+  realw stf;
+  realw kappal;
+
+  // guards against the extra blocks introduced when the grid is folded into 2-D
+  if( isource < NSOURCES ){
+
+    // only the MPI slice that owns this source adds it
+    if(myrank == islice_selected_source[isource]) {
+
+      ispec = ispec_selected_source[isource]-1;
+
+      // element must belong to the current (inner/outer) phase and be acoustic
+      if(ispec_is_inner[ispec] == phase_is_inner && ispec_is_acoustic[ispec] ) {
+
+        // source time function value, pre-computed on the host in double precision
+        stf = (realw) stf_pre_compute[isource];
+        iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+        kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
+
+        // pressure-potential formulation: negated and divided by kappa;
+        // only component 0 of sourcearrays is used for the acoustic source.
+        // atomic update in case several sources touch the same global point.
+        atomicAdd(&potential_dot_dot_acoustic[iglob],
+                  -sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf/kappal);
+
+        //      potential_dot_dot_acoustic[iglob] +=
+        //                -sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf/kappal;
+      }
+    }
+  }
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Host-side launcher: adds acoustic point sources for the forward wavefield.
+// Called from Fortran, hence the pointer arguments and FC_FUNC_ name mangling.
+extern "C"
+void FC_FUNC_(compute_add_sources_ac_cuda,
+              COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f,
+                                                 int* phase_is_innerf,
+                                                 int* NSOURCESf,
+                                                 int* SIMULATION_TYPEf,
+                                                 double* h_stf_pre_compute,
+                                                 int* myrankf) {
+
+TRACE("compute_add_sources_ac_cuda");
+
+  // retrieves mesh structure from the fortran-side integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  // nothing to do on ranks without local sources
+  if( mp->nsources_local == 0 ) return;
+
+  int NSOURCES = *NSOURCESf;
+
+  // folds the 1-D source index into an (x,y) grid, since a single grid
+  // dimension is limited to 65535 blocks
+  int nbx = NSOURCES;
+  int nby = 1;
+  while(nbx > 65535) {
+    nbx = (int) ceil(nbx*0.5f);
+    nby = 2*nby;
+  }
+
+  // copies pre-computed source time factors onto GPU
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
+                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
+
+  // one block per source, one thread per GLL point of the element (5x5x5)
+  dim3 grid(nbx,nby);
+  dim3 threads(5,5,5);
+
+  compute_add_sources_acoustic_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
+                                                        mp->d_ibool,
+                                                        mp->d_ispec_is_inner,
+                                                        *phase_is_innerf,
+                                                        mp->d_sourcearrays,
+                                                        mp->d_stf_pre_compute,
+                                                        *myrankf,
+                                                        mp->d_islice_selected_source,
+                                                        mp->d_ispec_selected_source,
+                                                        mp->d_ispec_is_acoustic,
+                                                        mp->d_kappastore,
+                                                        NSOURCES);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_add_sources_ac_cuda");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Host-side launcher: adds acoustic point sources for the backward/reconstructed
+// wavefield (kernel simulations), i.e. targets d_b_potential_dot_dot_acoustic.
+extern "C"
+void FC_FUNC_(compute_add_sources_ac_s3_cuda,
+              COMPUTE_ADD_SOURCES_AC_s3_CUDA)(long* Mesh_pointer_f,
+                                                      int* phase_is_innerf,
+                                                      int* NSOURCESf,
+                                                      int* SIMULATION_TYPEf,
+                                                      double* h_stf_pre_compute,
+                                                      int* myrankf) {
+
+TRACE("compute_add_sources_ac_s3_cuda");
+
+  // retrieves mesh structure from the fortran-side integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  // nothing to do on ranks without local sources
+  if( mp->nsources_local == 0 ) return;
+
+  int NSOURCES = *NSOURCESf;
+
+  // folds the 1-D source index into an (x,y) grid to respect the
+  // 65535-block limit of a single grid dimension
+  int nbx = NSOURCES;
+  int nby = 1;
+  while(nbx > 65535) {
+    nbx = (int) ceil(nbx*0.5f);
+    nby = 2*nby;
+  }
+
+  // copies source time factors onto GPU
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
+                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
+
+  // one block per source, one thread per GLL point of the element (5x5x5)
+  dim3 grid(nbx,nby);
+  dim3 threads(5,5,5);
+
+  compute_add_sources_acoustic_kernel<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
+                                                        mp->d_ibool,
+                                                        mp->d_ispec_is_inner,
+                                                        *phase_is_innerf,
+                                                        mp->d_sourcearrays,
+                                                        mp->d_stf_pre_compute,
+                                                        *myrankf,
+                                                        mp->d_islice_selected_source,
+                                                        mp->d_ispec_selected_source,
+                                                        mp->d_ispec_is_acoustic,
+                                                        mp->d_kappastore,
+                                                        NSOURCES);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_add_sources_ac_s3_cuda");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// acoustic adjoint sources
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Adds acoustic adjoint-source contributions to the pressure potential.
+//
+// Launch layout: one (possibly x/y-folded) block per local adjoint receiver,
+// 5x5x5 threads covering the GLL points of the receiver element.
+// pre_computed_irec maps the local receiver index to the global receiver
+// number; ispec_selected_rec and ibool hold 1-based Fortran indices (-1
+// converts to C indexing).
+__global__ void add_sources_ac_SIM_TYPE_2_OR_3_kernel(realw* potential_dot_dot_acoustic,
+                                                      int nrec,
+                                                      realw* adj_sourcearrays,
+                                                      int* ibool,
+                                                      int* ispec_is_inner,
+                                                      int* ispec_is_acoustic,
+                                                      int* ispec_selected_rec,
+                                                      int phase_is_inner,
+                                                      int* pre_computed_irec,
+                                                      int nadj_rec_local,
+                                                      realw* kappastore) {
+
+  int irec_local = blockIdx.x + gridDim.x*blockIdx.y;
+
+  // because of grid shape, irec_local can be too big
+  if(irec_local < nadj_rec_local) {
+
+    int irec = pre_computed_irec[irec_local];
+
+    int ispec = ispec_selected_rec[irec]-1;
+    // only acoustic receiver elements contribute here
+    if( ispec_is_acoustic[ispec] ){
+
+      // checks if element is in phase_is_inner run
+      if(ispec_is_inner[ispec] == phase_is_inner) {
+        // local GLL point indices within the receiver element
+        int i = threadIdx.x;
+        int j = threadIdx.y;
+        int k = threadIdx.z;
+
+        int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+
+        //kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
+
+        //potential_dot_dot_acoustic[iglob] += adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5,
+        //                                            pre_computed_irec_local_index[irec],
+        //                                            pre_computed_index,
+        //                                            0,
+        //                                            i,j,k)]/kappal;
+
+        // beware, for acoustic medium, a pressure source would be taking the negative
+        // and divide by Kappa of the fluid;
+        // this would have to be done when constructing the adjoint source.
+        //
+        // note: we take the first component of the adj_sourcearrays
+        //          the idea is to have e.g. a pressure source, where all 3 components would be the same
+        realw stf = adj_sourcearrays[INDEX5(5,5,5,3,i,j,k,0,irec_local)]; // / kappal
+
+        // atomic update: global points may be shared between receiver elements
+        atomicAdd(&potential_dot_dot_acoustic[iglob],stf);
+
+                  //+adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5,
+                  //                         pre_computed_irec_local_index[irec],pre_computed_index-1,
+                  //                         0,i,j,k)] // / kappal
+                  //                         );
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+// Host-side launcher: adds acoustic adjoint sources (SIMULATION_TYPE 2 or 3).
+//
+// Extracts the current time-step slice of the (very large) host array
+// h_adj_sourcearrays into mp->h_adj_sourcearrays_slice, copies it to the GPU
+// and launches the adjoint-source kernel.
+extern "C"
+void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
+              ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer,
+                                               realw* h_adj_sourcearrays,
+                                               int* phase_is_inner,
+                                               int* h_ispec_is_inner,
+                                               int* h_ispec_is_acoustic,
+                                               int* h_ispec_selected_rec,
+                                               int* myrank,
+                                               int* nrec,
+                                               int* time_index,
+                                               int* h_islice_selected_rec,
+                                               int* nadj_rec_local,
+                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {
+
+TRACE("add_sources_ac_sim_2_or_3_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  // checks
+  // note: this is a host-side consistency failure, not a CUDA runtime error,
+  //       so exit_on_error() is used (matching the elastic counterpart)
+  if( *nadj_rec_local != mp->nadj_rec_local) exit_on_error("add_sources_ac_sim_type_2_or_3: nadj_rec_local not equal\n");
+
+  // make sure grid dimension is less than 65535 in x dimension
+  int num_blocks_x = mp->nadj_rec_local;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y,1);
+  dim3 threads(5,5,5);
+
+  // build slice of adj_sourcearrays because full array is *very* large.
+  // note: this extracts array values for local adjoint sources at given time step "time_index"
+  //          from large adj_sourcearrays array into h_adj_sourcearrays_slice
+  int ispec,i,j,k;
+  int irec_local = 0;
+  for(int irec = 0; irec < *nrec; irec++) {
+    if(*myrank == h_islice_selected_rec[irec]) {
+      irec_local++;
+
+      // takes only acoustic sources
+      ispec = h_ispec_selected_rec[irec]-1;
+      if( h_ispec_is_acoustic[ispec] ){
+
+        if( h_ispec_is_inner[ispec] == *phase_is_inner) {
+          for(k=0;k<5;k++) {
+            for(j=0;j<5;j++) {
+              for(i=0;i<5;i++) {
+                // copies all 3 components for this GLL point
+                for(int icomp = 0; icomp < 3; icomp++) {
+                  mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,icomp,irec_local-1)]
+                    = h_adj_sourcearrays[INDEX6(mp->nadj_rec_local,
+                                              *NTSTEP_BETWEEN_READ_ADJSRC,
+                                              3,5,5,
+                                              irec_local-1,(*time_index)-1,
+                                              icomp,i,j,k)];
+                }
+              }
+            }
+          }
+        } // phase_is_inner
+      } // h_ispec_is_acoustic
+    }
+  }
+  // check all local sources were added
+  if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
+
+  // copies extracted array values onto GPU
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
+                              (mp->nadj_rec_local)*3*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),99099);
+
+  // launches cuda kernel for acoustic adjoint sources
+  add_sources_ac_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
+                                                          *nrec,
+                                                          mp->d_adj_sourcearrays,
+                                                          mp->d_ibool,
+                                                          mp->d_ispec_is_inner,
+                                                          mp->d_ispec_is_acoustic,
+                                                          mp->d_ispec_selected_rec,
+                                                          *phase_is_inner,
+                                                          mp->d_pre_computed_irec,
+                                                          mp->nadj_rec_local,
+                                                          mp->d_kappastore);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("add_sources_acoustic_SIM_TYPE_2_OR_3_kernel");
+#endif
+}

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,421 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// elastic domain sources
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Adds elastic point-source contributions to the acceleration field.
+//
+// Launch layout: one (possibly x/y-folded) block per source, 5x5x5 threads
+// covering the GLL points of the source element. Arrays ibool and
+// ispec_selected_source hold 1-based Fortran indices (-1 converts to C).
+// accel is interleaved (x,y,z) per global point, hence the iglob*3 strides.
+__global__ void compute_add_sources_kernel(realw* accel,
+                                           int* ibool,
+                                           int* ispec_is_inner,
+                                           int phase_is_inner,
+                                           realw* sourcearrays,
+                                           double* stf_pre_compute,
+                                           int myrank,
+                                           int* islice_selected_source,
+                                           int* ispec_selected_source,
+                                           int* ispec_is_elastic,
+                                           int NSOURCES) {
+  // local GLL point indices (i,j,k) within the source element
+  int i = threadIdx.x;
+  int j = threadIdx.y;
+  int k = threadIdx.z;
+
+  int isource  = blockIdx.x + gridDim.x*blockIdx.y; // bx
+  int ispec;
+  int iglob;
+  realw stf;
+
+  if(isource < NSOURCES) { // when NSOURCES > 65535, but mod(nspec_top,2) > 0, we end up with an extra block.
+
+    // only the MPI slice that owns this source adds it
+    if(myrank == islice_selected_source[isource]) {
+
+      ispec = ispec_selected_source[isource]-1;
+
+      // element must belong to the current (inner/outer) phase and be elastic
+      if(ispec_is_inner[ispec] == phase_is_inner && ispec_is_elastic[ispec] ) {
+
+        // source time function value, pre-computed on the host in double precision
+        stf = (realw) stf_pre_compute[isource];
+        iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+
+        // adds the three force components; atomic in case global points are shared
+        atomicAdd(&accel[iglob*3],
+                  sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf);
+        atomicAdd(&accel[iglob*3+1],
+                  sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 1, i,j,k)]*stf);
+        atomicAdd(&accel[iglob*3+2],
+                  sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 2, i,j,k)]*stf);
+      }
+    }
+  }
+
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Host-side launcher: adds elastic point sources for the forward wavefield.
+// Called from Fortran, hence the pointer arguments and FC_FUNC_ name mangling.
+extern "C"
+void FC_FUNC_(compute_add_sources_el_cuda,
+              COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f,
+                                            int* phase_is_innerf,
+                                            int* NSOURCESf,
+                                            double* h_stf_pre_compute,
+                                            int* myrankf) {
+
+TRACE("compute_add_sources_el_cuda");
+
+  // retrieves mesh structure from the fortran-side integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  // nothing to do on ranks without local sources
+  if( mp->nsources_local == 0 ) return;
+
+  int NSOURCES = *NSOURCESf;
+
+  // folds the 1-D source index into an (x,y) grid to respect the
+  // 65535-block limit of a single grid dimension
+  int nbx = NSOURCES;
+  int nby = 1;
+  while(nbx > 65535) {
+    nbx = (int) ceil(nbx*0.5f);
+    nby = 2*nby;
+  }
+
+  // transfers pre-computed source time function values to the device
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
+                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
+
+  // one block per source, one thread per GLL point of the element (5x5x5)
+  dim3 grid(nbx,nby);
+  dim3 threads(5,5,5);
+
+  compute_add_sources_kernel<<<grid,threads>>>(mp->d_accel,
+                                               mp->d_ibool,
+                                               mp->d_ispec_is_inner,
+                                               *phase_is_innerf,
+                                               mp->d_sourcearrays,
+                                               mp->d_stf_pre_compute,
+                                               *myrankf,
+                                               mp->d_islice_selected_source,
+                                               mp->d_ispec_selected_source,
+                                               mp->d_ispec_is_elastic,
+                                               NSOURCES);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_add_sources_kernel");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Host-side launcher: adds elastic point sources for the backward/reconstructed
+// wavefield (kernel simulations), i.e. targets d_b_accel.
+extern "C"
+void FC_FUNC_(compute_add_sources_el_s3_cuda,
+              COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer,
+                                              double* h_stf_pre_compute,
+                                              int* NSOURCESf,
+                                              int* phase_is_inner,
+                                              int* myrank) {
+  TRACE("compute_add_sources_el_s3_cuda");
+  // EPIK_TRACER("compute_add_sources_el_s3_cuda");
+
+  // retrieves mesh structure from the fortran-side integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
+
+  int NSOURCES = *NSOURCESf;
+
+  // transfers pre-computed source time function values to the device
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
+                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_add_sources_el_s3_cuda");
+#endif
+
+  // folds the 1-D source index into an (x,y) grid to respect the
+  // 65535-block limit of a single grid dimension
+  int nbx = NSOURCES;
+  int nby = 1;
+  while(nbx > 65535) {
+    nbx = (int) ceil(nbx*0.5f);
+    nby = 2*nby;
+  }
+
+  // one block per source, one thread per GLL point of the element (5x5x5)
+  dim3 grid(nbx,nby);
+  dim3 threads(5,5,5);
+
+  compute_add_sources_kernel<<<grid,threads>>>(mp->d_b_accel,mp->d_ibool,
+                                               mp->d_ispec_is_inner, *phase_is_inner,
+                                               mp->d_sourcearrays,
+                                               mp->d_stf_pre_compute,
+                                               *myrank,
+                                               mp->d_islice_selected_source,mp->d_ispec_selected_source,
+                                               mp->d_ispec_is_elastic,
+                                               NSOURCES);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_add_sources_el_s3_cuda");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// NOISE sources
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Injects the noise source time function at the master receiver element.
+//
+// Launched with a single block of NGLL3 threads: tx is the flattened GLL
+// point index within the element; it is the (already 0-based) time step.
+// ibool and ispec_selected_rec hold 1-based Fortran indices (-1 converts to C).
+__global__ void add_source_master_rec_noise_cuda_kernel(int* ibool,
+                                                        int* ispec_selected_rec,
+                                                        int irec_master_noise,
+                                                        realw* accel,
+                                                        realw* noise_sourcearray,
+                                                        int it) {
+  int tx = threadIdx.x;
+  int iglob = ibool[tx + NGLL3*(ispec_selected_rec[irec_master_noise-1]-1)]-1;
+
+  // not sure if we need atomic operations but just in case...
+  // accel[3*iglob] += noise_sourcearray[3*tx + 3*125*it];
+  // accel[1+3*iglob] += noise_sourcearray[1+3*tx + 3*125*it];
+  // accel[2+3*iglob] += noise_sourcearray[2+3*tx + 3*125*it];
+
+  // adds all three acceleration components of the noise source for this time step
+  atomicAdd(&accel[iglob*3],noise_sourcearray[3*tx + 3*NGLL3*it]);
+  atomicAdd(&accel[iglob*3+1],noise_sourcearray[1+3*tx + 3*NGLL3*it]);
+  atomicAdd(&accel[iglob*3+2],noise_sourcearray[2+3*tx + 3*NGLL3*it]);
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Host-side launcher: injects the noise source at the master receiver.
+// Only the MPI slice owning the master receiver launches the kernel.
+extern "C"
+void FC_FUNC_(add_source_master_rec_noise_cu,
+              ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer_f,
+                                                int* myrank_f,
+                                                int* it_f,
+                                                int* irec_master_noise_f,
+                                                int* islice_selected_rec) {
+
+TRACE("add_source_master_rec_noise_cu");
+
+  // retrieves mesh structure from the fortran-side integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  int irec_master_noise = *irec_master_noise_f;
+
+  // skip ranks that do not own the master receiver
+  if(*myrank_f != islice_selected_rec[irec_master_noise-1]) return;
+
+  // single element: one block, one thread per GLL point
+  dim3 grid(1,1,1);
+  dim3 threads(NGLL3,1,1);
+
+  add_source_master_rec_noise_cuda_kernel<<<grid,threads>>>(mp->d_ibool,
+                                                            mp->d_ispec_selected_rec,
+                                                            irec_master_noise,
+                                                            mp->d_accel,
+                                                            mp->d_noise_sourcearray,
+                                                            *it_f-1); // -1 for Fortran -> C indexing differences
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("add_source_master_rec_noise_cuda_kernel");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ADJOINT sources
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Adds elastic adjoint-source contributions to the acceleration field.
+//
+// Launch layout: one (possibly x/y-folded) block per local adjoint receiver,
+// 5x5x5 threads covering the GLL points of the receiver element.
+// pre_computed_irec maps the local receiver index to the global receiver
+// number; ispec_selected_rec and ibool hold 1-based Fortran indices (-1
+// converts to C indexing).
+__global__ void add_sources_el_SIM_TYPE_2_OR_3_kernel(realw* accel,
+                                                     int nrec,
+                                                     realw* adj_sourcearrays,
+                                                     int* ibool,
+                                                     int* ispec_is_inner,
+                                                     int* ispec_is_elastic,
+                                                     int* ispec_selected_rec,
+                                                     int phase_is_inner,
+                                                     int* pre_computed_irec,
+                                                     int nadj_rec_local) {
+
+  int irec_local = blockIdx.x + gridDim.x*blockIdx.y;
+
+  if(irec_local < nadj_rec_local) { // when nrec > 65535, but mod(nspec_top,2) > 0, we end up with an extra block.
+
+    int irec = pre_computed_irec[irec_local];
+
+    int ispec = ispec_selected_rec[irec]-1;
+    // only elastic receiver elements contribute here
+    if( ispec_is_elastic[ispec] ){
+
+      // checks if element is in the current (inner/outer) phase
+      if(ispec_is_inner[ispec] == phase_is_inner) {
+        // local GLL point indices within the receiver element
+        int i = threadIdx.x;
+        int j = threadIdx.y;
+        int k = threadIdx.z;
+        int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+
+        // atomic operations are absolutely necessary for correctness!
+        // adds x, y and z components of the adjoint source (accel is interleaved per point)
+        atomicAdd(&accel[3*iglob],adj_sourcearrays[INDEX5(5,5,5,3,
+                                                              i,j,k,
+                                                              0,
+                                                              irec_local)]);
+
+        atomicAdd(&accel[1+3*iglob], adj_sourcearrays[INDEX5(5,5,5,3,
+                                                             i,j,k,
+                                                             1,
+                                                             irec_local)]);
+
+        atomicAdd(&accel[2+3*iglob],adj_sourcearrays[INDEX5(5,5,5,3,
+                                                            i,j,k,
+                                                            2,
+                                                            irec_local)]);
+      }
+    } // ispec_is_elastic
+  }
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Host-side launcher: adds elastic adjoint sources (SIMULATION_TYPE 2 or 3).
+//
+// Extracts the current time-step slice of the (very large) host array
+// h_adj_sourcearrays into mp->h_adj_sourcearrays_slice, copies it to the GPU
+// and launches the adjoint-source kernel.
+extern "C"
+void FC_FUNC_(add_sources_el_sim_type_2_or_3,
+              ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
+                                               realw* h_adj_sourcearrays,
+                                               int* phase_is_inner,
+                                               int* h_ispec_is_inner,
+                                               int* h_ispec_is_elastic,
+                                               int* h_ispec_selected_rec,
+                                               int* myrank,
+                                               int* nrec,
+                                               int* time_index,
+                                               int* h_islice_selected_rec,
+                                               int* nadj_rec_local,
+                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {
+
+TRACE("add_sources_el_sim_type_2_or_3");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  // checks
+  if( *nadj_rec_local != mp->nadj_rec_local) exit_on_error("add_sources_el_sim_type_2_or_3: nadj_rec_local not equal\n");
+
+  // make sure grid dimension is less than 65535 in x dimension
+  int num_blocks_x = mp->nadj_rec_local;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y,1);
+  dim3 threads(5,5,5);
+
+  // build slice of adj_sourcearrays because full array is *very* large.
+  // note: this extracts array values for local adjoint sources at given time step "time_index"
+  //          from large adj_sourcearrays array into h_adj_sourcearrays_slice
+  int ispec,i,j,k;
+  int irec_local = 0;
+  for(int irec = 0; irec < *nrec; irec++) {
+    if(*myrank == h_islice_selected_rec[irec]) {
+      irec_local++;
+
+      // takes only elastic sources
+      ispec = h_ispec_selected_rec[irec]-1;
+      if( h_ispec_is_elastic[ispec] ){
+
+        if( h_ispec_is_inner[ispec] == *phase_is_inner) {
+          for(k=0;k<5;k++) {
+            for(j=0;j<5;j++) {
+              for(i=0;i<5;i++) {
+                // copies all 3 components for this GLL point
+                for(int icomp = 0; icomp < 3; icomp++) {
+                  mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
+                                                  i,j,k,icomp,
+                                                  irec_local-1)]
+                          = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
+                                                      *NTSTEP_BETWEEN_READ_ADJSRC,
+                                                      3,5,5,
+                                                      irec_local-1,
+                                                      *time_index-1,
+                                                      icomp,i,j,k)];
+                }
+              }
+            }
+          }
+        } // phase_is_inner
+      } // h_ispec_is_elastic
+    }
+  }
+  // check all local sources were added
+  if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
+
+  // copies extracted array values onto GPU
+  // note: wrapped in print_CUDA_error_if_any for consistency with the acoustic
+  //       counterpart (the transfer result was previously unchecked)
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
+                                     (mp->nadj_rec_local)*3*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),99099);
+
+  // the irec_local variable needs to be precomputed (as
+  // h_pre_comp..), because normally it is in the loop updating accel,
+  // and due to how it's incremented, it cannot be parallelized
+
+  add_sources_el_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_accel,
+                                                         *nrec,
+                                                         mp->d_adj_sourcearrays,
+                                                         mp->d_ibool,
+                                                         mp->d_ispec_is_inner,
+                                                         mp->d_ispec_is_elastic,
+                                                         mp->d_ispec_selected_rec,
+                                                         *phase_is_inner,
+                                                         mp->d_pre_computed_irec,
+                                                         mp->nadj_rec_local);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("add_sources_SIM_TYPE_2_OR_3_kernel");
+#endif
+}
+

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_coupling_cuda.cu	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_coupling_cuda.cu	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,482 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ACOUSTIC - ELASTIC coupling
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Couples the elastic displacement into the acoustic potential on the
+// acoustic/elastic interface faces: for each GLL point on a coupling face,
+// adds the jacobian-weighted normal component of the elastic displacement
+// to potential_dot_dot_acoustic (continuity of normal displacement).
+// Launch layout: one thread block per coupling face, one thread per GLL
+// point on that face (blocksize = NGLL2).
+__global__ void compute_coupling_acoustic_el_kernel(realw* displ,
+                                                    realw* potential_dot_dot_acoustic,
+                                                    int num_coupling_ac_el_faces,
+                                                    int* coupling_ac_el_ispec,
+                                                    int* coupling_ac_el_ijk,
+                                                    realw* coupling_ac_el_normal,
+                                                    realw* coupling_ac_el_jacobian2Dw,
+                                                    int* ibool,
+                                                    int* ispec_is_inner,
+                                                    int phase_is_inner) {
+
+  // igll = GLL point index on the face, iface = coupling face index
+  // (grid may be 2D when num_coupling_ac_el_faces exceeds the per-dimension block limit)
+  int igll = threadIdx.x;
+  int iface = blockIdx.x + gridDim.x*blockIdx.y;
+
+  int i,j,k,iglob,ispec;
+  realw displ_x,displ_y,displ_z,displ_n;
+  realw nx,ny,nz;
+  realw jacobianw;
+
+  if( iface < num_coupling_ac_el_faces){
+
+    // don't compute points outside NGLLSQUARE==NGLL2==25
+    // way 2: no further check needed since blocksize = 25
+    //  if(igll<NGLL2) {
+
+    // "-1" from index values to convert from Fortran-> C indexing
+    ispec = coupling_ac_el_ispec[iface] - 1;
+
+    // only process elements belonging to the current (inner/outer) phase
+    if(ispec_is_inner[ispec] == phase_is_inner ) {
+
+      // local GLL indices (i,j,k) of this face point and its global node id
+      i = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1;
+      j = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
+      k = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
+      iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1;
+
+      // elastic displacement on global point
+      displ_x = displ[iglob*3] ; // (1,iglob)
+      displ_y = displ[iglob*3+1] ; // (2,iglob)
+      displ_z = displ[iglob*3+2] ; // (3,iglob)
+
+      // gets associated normal on GLL point
+      nx = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,0,igll,iface)]; // (1,igll,iface)
+      ny = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,1,igll,iface)]; // (2,igll,iface)
+      nz = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,2,igll,iface)]; // (3,igll,iface)
+
+      // calculates displacement component along normal
+      // (normal points outwards of acoustic element)
+      displ_n = displ_x*nx + displ_y*ny + displ_z*nz;
+
+      // gets associated, weighted jacobian
+      jacobianw = coupling_ac_el_jacobian2Dw[INDEX2(NGLL2,igll,iface)];
+
+      // continuity of pressure and normal displacement on global point
+
+      // note: Newmark time scheme together with definition of scalar potential:
+      //          pressure = - chi_dot_dot
+      //          requires that this coupling term uses the updated displacement at time step [t+delta_t],
+      //          which is done at the very beginning of the time loop
+      //          (see e.g. Chaljub & Vilotte, Nissen-Meyer thesis...)
+      //          it also means you have to calculate and update this here first before
+      //          calculating the coupling on the elastic side for the acceleration...
+      // atomic update: several faces can share the same global node
+      atomicAdd(&potential_dot_dot_acoustic[iglob],+ jacobianw*displ_n);
+
+    }
+  //  }
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Host wrapper (callable from Fortran) that launches the acoustic-side
+// coupling kernel for the forward wavefield, and additionally for the
+// backward/reconstructed wavefield when SIMULATION_TYPE == 3.
+// All scalar arguments arrive as pointers because of the Fortran calling
+// convention.
+extern "C"
+void FC_FUNC_(compute_coupling_ac_el_cuda,
+              COMPUTE_COUPLING_AC_EL_CUDA)(
+                                            long* Mesh_pointer_f,
+                                            int* phase_is_innerf,
+                                            int* num_coupling_ac_el_facesf,
+                                            int* SIMULATION_TYPEf) {
+  TRACE("compute_coupling_ac_el_cuda");
+  //double start_time = get_time();
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  int phase_is_inner            = *phase_is_innerf;
+  int num_coupling_ac_el_faces  = *num_coupling_ac_el_facesf;
+  int SIMULATION_TYPE           = *SIMULATION_TYPEf;
+
+  // way 1: exact blocksize to match NGLLSQUARE
+  int blocksize = NGLL2;
+  // folds the 1D face count into a 2D grid to stay below the
+  // 65535 blocks-per-grid-dimension hardware limit
+  int num_blocks_x = num_coupling_ac_el_faces;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  // launches GPU kernel
+  compute_coupling_acoustic_el_kernel<<<grid,threads>>>(mp->d_displ,
+                                                       mp->d_potential_dot_dot_acoustic,
+                                                       num_coupling_ac_el_faces,
+                                                       mp->d_coupling_ac_el_ispec,
+                                                       mp->d_coupling_ac_el_ijk,
+                                                       mp->d_coupling_ac_el_normal,
+                                                       mp->d_coupling_ac_el_jacobian2Dw,
+                                                       mp->d_ibool,
+                                                       mp->d_ispec_is_inner,
+                                                       phase_is_inner);
+
+  //  adjoint simulations
+  // same kernel on the backward/reconstructed fields (b_displ -> b_potential_dot_dot)
+  if (SIMULATION_TYPE == 3 ){
+    compute_coupling_acoustic_el_kernel<<<grid,threads>>>(mp->d_b_displ,
+                                                          mp->d_b_potential_dot_dot_acoustic,
+                                                          num_coupling_ac_el_faces,
+                                                          mp->d_coupling_ac_el_ispec,
+                                                          mp->d_coupling_ac_el_ijk,
+                                                          mp->d_coupling_ac_el_normal,
+                                                          mp->d_coupling_ac_el_jacobian2Dw,
+                                                          mp->d_ibool,
+                                                          mp->d_ispec_is_inner,
+                                                          phase_is_inner);
+
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  exit_on_cuda_error("compute_coupling_acoustic_el_kernel");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ELASTIC - ACOUSTIC coupling
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Couples the acoustic pressure into the elastic acceleration on the
+// acoustic/elastic interface faces: for each GLL point on a coupling face,
+// evaluates the acoustic pressure (with an optional gravity correction term)
+// and adds the jacobian-weighted pressure times the face normal to accel
+// (continuity of traction). One block per face, one thread per GLL point.
+__global__ void compute_coupling_elastic_ac_kernel(realw* potential_dot_dot_acoustic,
+                                                    realw* accel,
+                                                    int num_coupling_ac_el_faces,
+                                                    int* coupling_ac_el_ispec,
+                                                    int* coupling_ac_el_ijk,
+                                                    realw* coupling_ac_el_normal,
+                                                    realw* coupling_ac_el_jacobian2Dw,
+                                                    int* ibool,
+                                                    int* ispec_is_inner,
+                                                    int phase_is_inner,
+                                                    int gravity,
+                                                    realw* minus_g,
+                                                    realw* rhostore,
+                                                    realw* displ) {
+
+  // igll = GLL point index on the face, iface = coupling face index
+  int igll = threadIdx.x;
+  int iface = blockIdx.x + gridDim.x*blockIdx.y;
+
+  int i,j,k,iglob,ispec;
+  realw pressure;
+  realw nx,ny,nz;
+  realw jacobianw;
+  realw rhol;
+
+  if( iface < num_coupling_ac_el_faces){
+
+    // don't compute points outside NGLLSQUARE==NGLL2==25
+    // way 2: no further check needed since blocksize = 25
+    //  if(igll<NGLL2) {
+
+    // "-1" from index values to convert from Fortran-> C indexing
+    ispec = coupling_ac_el_ispec[iface] - 1;
+
+    // only process elements belonging to the current (inner/outer) phase
+    if(ispec_is_inner[ispec] == phase_is_inner ) {
+
+      // local GLL indices (i,j,k) of this face point and its global node id
+      i = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1;
+      j = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
+      k = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
+      iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1;
+
+      // gets associated normal on GLL point
+      // note: normal points away from acoustic element
+      nx = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,0,igll,iface)]; // (1,igll,iface)
+      ny = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,1,igll,iface)]; // (2,igll,iface)
+      nz = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,2,igll,iface)]; // (3,igll,iface)
+
+      // gets associated, weighted jacobian
+      jacobianw = coupling_ac_el_jacobian2Dw[INDEX2(NGLL2,igll,iface)];
+
+      // acoustic pressure on global point
+      if( gravity ){
+        // takes density (from acoustic? element)
+        rhol = rhostore[INDEX4_PADDED(NGLLX,NGLLX,NGLLX,i,j,k,ispec)];
+
+        // note: uses potential chi such that displacement s = grad(chi),
+        //         pressure becomes: p = - kappa ( div( s ) ) = rho ( - dot_dot_chi + g * s )
+        //  g only acting in negative z-direction
+
+        // daniel: TODO - check gravity and coupling would be displ * nz  correct?
+        pressure = rhol*( - potential_dot_dot_acoustic[iglob]
+                         + minus_g[iglob] * displ[iglob*3+2] );
+
+        //daniel: TODO - check gravity and coupling
+        //pressure = - potential_dot_dot_acoustic[iglob] ;
+        //if( iface == 128 && igll == 5 ){
+        //  printf("coupling acoustic: %f %f \n",potential_dot_dot_acoustic[iglob],
+        //             minus_g[iglob] * displ[iglob*3+2]);
+        //}
+
+      }else{
+        // no gravity: uses potential chi such that displacement s = 1/rho grad(chi)
+        //                  pressure p = - kappa ( div( s ) ) then becomes: p = - dot_dot_chi
+        //                  ( multiplied with factor 1/kappa due to setup of equation of motion )
+        pressure = - potential_dot_dot_acoustic[iglob];
+      }
+
+      // continuity of displacement and pressure on global point
+      //
+      // note: Newmark time scheme together with definition of scalar potential:
+      //          pressure = - chi_dot_dot
+      //          requires that this coupling term uses the *UPDATED* pressure (chi_dot_dot), i.e.
+      //          pressure at time step [t + delta_t]
+      //          (see e.g. Chaljub & Vilotte, Nissen-Meyer thesis...)
+      //          it means you have to calculate and update the acoustic pressure first before
+      //          calculating this term...
+      // atomic updates: several faces can share the same global node
+      atomicAdd(&accel[iglob*3],+ jacobianw*nx*pressure);
+      atomicAdd(&accel[iglob*3+1],+ jacobianw*ny*pressure);
+      atomicAdd(&accel[iglob*3+2],+ jacobianw*nz*pressure);
+    }
+    //  }
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Host wrapper (callable from Fortran) that launches the elastic-side
+// coupling kernel for the forward wavefield, and additionally for the
+// backward/reconstructed wavefield when SIMULATION_TYPE == 3.
+// All scalar arguments arrive as pointers because of the Fortran calling
+// convention.
+extern "C"
+void FC_FUNC_(compute_coupling_el_ac_cuda,
+              COMPUTE_COUPLING_EL_AC_CUDA)(
+                                                 long* Mesh_pointer_f,
+                                                 int* phase_is_innerf,
+                                                 int* num_coupling_ac_el_facesf,
+                                                 int* SIMULATION_TYPEf) {
+  TRACE("compute_coupling_el_ac_cuda");
+  //double start_time = get_time();
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  int phase_is_inner            = *phase_is_innerf;
+  int num_coupling_ac_el_faces  = *num_coupling_ac_el_facesf;
+  int SIMULATION_TYPE           = *SIMULATION_TYPEf;
+
+  // way 1: exact blocksize to match NGLLSQUARE
+  // (uses the NGLL2 constant instead of a hard-coded 25,
+  //  for consistency with compute_coupling_ac_el_cuda)
+  int blocksize = NGLL2;
+
+  // folds the 1D face count into a 2D grid to stay below the
+  // 65535 blocks-per-grid-dimension hardware limit
+  int num_blocks_x = num_coupling_ac_el_faces;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  // launches GPU kernel
+  compute_coupling_elastic_ac_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
+                                                       mp->d_accel,
+                                                       num_coupling_ac_el_faces,
+                                                       mp->d_coupling_ac_el_ispec,
+                                                       mp->d_coupling_ac_el_ijk,
+                                                       mp->d_coupling_ac_el_normal,
+                                                       mp->d_coupling_ac_el_jacobian2Dw,
+                                                       mp->d_ibool,
+                                                       mp->d_ispec_is_inner,
+                                                       phase_is_inner,
+                                                       mp->gravity,
+                                                       mp->d_minus_g,
+                                                       mp->d_rhostore,
+                                                       mp->d_displ);
+
+  //  adjoint simulations
+  // same kernel on the backward/reconstructed fields
+  if (SIMULATION_TYPE == 3 ){
+    compute_coupling_elastic_ac_kernel<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
+                                                         mp->d_b_accel,
+                                                         num_coupling_ac_el_faces,
+                                                         mp->d_coupling_ac_el_ispec,
+                                                         mp->d_coupling_ac_el_ijk,
+                                                         mp->d_coupling_ac_el_normal,
+                                                         mp->d_coupling_ac_el_jacobian2Dw,
+                                                         mp->d_ibool,
+                                                         mp->d_ispec_is_inner,
+                                                         phase_is_inner,
+                                                         mp->gravity,
+                                                         mp->d_minus_g,
+                                                         mp->d_rhostore,
+                                                         mp->d_b_displ);
+
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  exit_on_cuda_error("compute_coupling_el_ac_cuda");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/* OCEANS load coupled on free surface */
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+// Applies the ocean load approximation on the free surface: for each GLL
+// point on a free-surface face, replaces the normal component of the
+// acceleration contribution using the ocean-loaded mass matrix.
+// Each global node must be updated exactly once, which is enforced with an
+// atomicExch-based mutex on the temporary updated_dof_ocean_load array
+// (must be zero-initialized by the caller before each launch).
+__global__ void compute_coupling_ocean_cuda_kernel(realw* accel,
+                                               realw* rmass,
+                                               realw* rmass_ocean_load,
+                                               int num_free_surface_faces,
+                                               int* free_surface_ispec,
+                                               int* free_surface_ijk,
+                                               realw* free_surface_normal,
+                                               int* ibool,
+                                               int* updated_dof_ocean_load) {
+  // gets spectral element face id
+  int igll = threadIdx.x ;  //  threadIdx.y*blockDim.x will be always = 0 for thread block (25,1,1)
+  int iface = blockIdx.x + gridDim.x*blockIdx.y;
+  realw nx,ny,nz;
+  realw force_normal_comp,additional_term;
+
+  // for all faces on free surface
+  if( iface < num_free_surface_faces ){
+
+    // "-1" converts from Fortran to C indexing
+    int ispec = free_surface_ispec[iface]-1;
+
+    // gets global point index
+    int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1; // (1,igll,iface)
+    int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
+    int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
+
+    int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1;
+
+    //if(igll == 0 ) printf("igll %d %d %d %d\n",igll,i,j,k,iglob);
+
+    // only update this global point once
+
+    // daniel: TODO - there might be better ways to implement a mutex like below,
+    //            and find a workaround to not use the temporary update array.
+    //            atomicExch: returns the old value, i.e. 0 indicates that we still have to do this point
+
+    if( atomicExch(&updated_dof_ocean_load[iglob],1) == 0){
+
+      // get normal
+      nx = free_surface_normal[INDEX3(NDIM,NGLL2,0,igll,iface)]; //(1,igll,iface)
+      ny = free_surface_normal[INDEX3(NDIM,NGLL2,1,igll,iface)];
+      nz = free_surface_normal[INDEX3(NDIM,NGLL2,2,igll,iface)];
+
+      // make updated component of right-hand side
+      // we divide by rmass() which is 1 / M
+      // we use the total force which includes the Coriolis term above
+      force_normal_comp = ( accel[iglob*3]*nx + accel[iglob*3+1]*ny + accel[iglob*3+2]*nz ) / rmass[iglob];
+
+      // correction uses the difference between ocean-loaded and regular mass matrices
+      additional_term = (rmass_ocean_load[iglob] - rmass[iglob]) * force_normal_comp;
+
+      // probably wouldn't need atomicAdd anymore, but just to be sure...
+      atomicAdd(&accel[iglob*3], + additional_term * nx);
+      atomicAdd(&accel[iglob*3+1], + additional_term * ny);
+      atomicAdd(&accel[iglob*3+2], + additional_term * nz);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Host wrapper (callable from Fortran) that applies the ocean load on the
+// free surface. Zero-initializes the per-node "already updated" flag array
+// before each kernel launch (the kernel uses it as a mutex), then launches
+// the ocean coupling kernel on the forward field, and again on the
+// backward/reconstructed field when SIMULATION_TYPE == 3.
+extern "C"
+void FC_FUNC_(compute_coupling_ocean_cuda,
+              COMPUTE_COUPLING_OCEAN_CUDA)(long* Mesh_pointer_f,
+                                       int* SIMULATION_TYPE) {
+
+  TRACE("compute_coupling_ocean_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  // checks if anything to do
+  if( mp->num_free_surface_faces == 0 ) return;
+
+  // block sizes: exact blocksize to match NGLLSQUARE
+  int blocksize = NGLL2;
+
+  // folds the 1D face count into a 2D grid to stay below the
+  // 65535 blocks-per-grid-dimension hardware limit
+  int num_blocks_x = mp->num_free_surface_faces;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+
+  // initializes temporary array to zero
+  print_CUDA_error_if_any(cudaMemset(mp->d_updated_dof_ocean_load,0,
+                                     sizeof(int)*mp->NGLOB_AB),88501);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("before kernel compute_coupling_ocean_cuda");
+#endif
+
+  compute_coupling_ocean_cuda_kernel<<<grid,threads>>>(mp->d_accel,
+                                                       mp->d_rmass,
+                                                       mp->d_rmass_ocean_load,
+                                                       mp->num_free_surface_faces,
+                                                       mp->d_free_surface_ispec,
+                                                       mp->d_free_surface_ijk,
+                                                       mp->d_free_surface_normal,
+                                                       mp->d_ibool,
+                                                       mp->d_updated_dof_ocean_load);
+  // for backward/reconstructed potentials
+  if(*SIMULATION_TYPE == 3) {
+    // re-initializes array
+    print_CUDA_error_if_any(cudaMemset(mp->d_updated_dof_ocean_load,0,
+                                       sizeof(int)*mp->NGLOB_AB),88502);
+
+    compute_coupling_ocean_cuda_kernel<<<grid,threads>>>(mp->d_b_accel,
+                                                         mp->d_rmass,
+                                                         mp->d_rmass_ocean_load,
+                                                         mp->num_free_surface_faces,
+                                                         mp->d_free_surface_ispec,
+                                                         mp->d_free_surface_ijk,
+                                                         mp->d_free_surface_normal,
+                                                         mp->d_ibool,
+                                                         mp->d_updated_dof_ocean_load);
+
+  }
+
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_coupling_ocean_cuda");
+#endif
+}
+

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,1826 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+
+
+//  cuda constant arrays
+// note: __constant__ variables are visible only to kernels in this
+//       compilation unit; the setConst_* routines below copy the host
+//       arrays in and export the device addresses (see long note below)
+__constant__ realw d_hprime_xx[NGLL2];
+__constant__ realw d_hprime_yy[NGLL2];
+__constant__ realw d_hprime_zz[NGLL2];
+__constant__ realw d_hprimewgll_xx[NGLL2];
+__constant__ realw d_hprimewgll_yy[NGLL2];
+__constant__ realw d_hprimewgll_zz[NGLL2];
+__constant__ realw d_wgllwgll_xy[NGLL2];
+__constant__ realw d_wgllwgll_xz[NGLL2];
+__constant__ realw d_wgllwgll_yz[NGLL2];
+
+__constant__ realw d_wgll_cube[NGLL3]; // needed only for gravity case
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// CONSTANT arrays setup
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/* note:
+ constant arrays when used in other compute_forces_***_cuda.cu routines stay zero,
+ constant declaration and cudaMemcpyToSymbol would have to be in the same file...
+ 
+ extern keyword doesn't work for __constant__ declarations.
+ 
+ also:
+ cudaMemcpyToSymbol("deviceCaseParams", caseParams, sizeof(CaseParams));
+ ..
+ and compile with -arch=sm_20
+ 
+ see also: http://stackoverflow.com/questions/4008031/how-to-use-cuda-constant-memory-in-a-programmer-pleasant-way
+ doesn't seem to work.
+ 
+ we could keep arrays separated for acoustic and elastic routines...
+ 
+ workaround:
+ 
+    for now, we store pointers with cudaGetSymbolAddress() function calls.
+    we pass those pointers in all other compute_forces_..() routines
+    
+    in this file, we can use the above constant array declarations without need of the pointers.
+    
+ */
+
+// constant arrays
+
+// copies the hprime_xx coefficients into constant memory and stores the
+// device address of the constant array in the mesh container (so kernels
+// compiled in other files can reach it through a pointer)
+void setConst_hprime_xx(realw* array,Mesh* mp)
+{
+  cudaError_t err = cudaMemcpyToSymbol(d_hprime_xx, array, NGLL2*sizeof(realw));
+  if (err != cudaSuccess)
+  {
+    fprintf(stderr, "Error in setConst_hprime_xx: %s\n", cudaGetErrorString(err));
+    fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
+    exit(1);
+  }
+
+  // passes the symbol itself; the string form of cudaGetSymbolAddress
+  // is deprecated and removed in CUDA 5.0
+  err = cudaGetSymbolAddress((void**)&(mp->d_hprime_xx),d_hprime_xx);
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_hprime_xx: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+}
+
+// copies the hprime_yy coefficients into constant memory and stores the
+// device address of the constant array in the mesh container
+void setConst_hprime_yy(realw* array,Mesh* mp)
+{
+  cudaError_t err = cudaMemcpyToSymbol(d_hprime_yy, array, NGLL2*sizeof(realw));
+  if (err != cudaSuccess)
+  {
+    fprintf(stderr, "Error in setConst_hprime_yy: %s\n", cudaGetErrorString(err));
+    fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
+    exit(1);
+  }
+
+  // passes the symbol itself; the string form is deprecated (removed in CUDA 5.0)
+  err = cudaGetSymbolAddress((void**)&(mp->d_hprime_yy),d_hprime_yy);
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_hprime_yy: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+}
+
+// copies the hprime_zz coefficients into constant memory and stores the
+// device address of the constant array in the mesh container
+void setConst_hprime_zz(realw* array,Mesh* mp)
+{
+  cudaError_t err = cudaMemcpyToSymbol(d_hprime_zz, array, NGLL2*sizeof(realw));
+  if (err != cudaSuccess)
+  {
+    fprintf(stderr, "Error in setConst_hprime_zz: %s\n", cudaGetErrorString(err));
+    fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
+    exit(1);
+  }
+
+  // passes the symbol itself; the string form is deprecated (removed in CUDA 5.0)
+  err = cudaGetSymbolAddress((void**)&(mp->d_hprime_zz),d_hprime_zz);
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_hprime_zz: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+}
+
+
+// copies the hprimewgll_xx coefficients into constant memory and stores the
+// device address of the constant array in the mesh container
+void setConst_hprimewgll_xx(realw* array,Mesh* mp)
+{
+  cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_xx, array, NGLL2*sizeof(realw));
+  if (err != cudaSuccess)
+  {
+    fprintf(stderr, "Error in setConst_hprimewgll_xx: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+
+  // passes the symbol itself; the string form is deprecated (removed in CUDA 5.0)
+  err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_xx),d_hprimewgll_xx);
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_hprimewgll_xx: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+}
+
+// copies the hprimewgll_yy coefficients into constant memory and stores the
+// device address of the constant array in the mesh container
+void setConst_hprimewgll_yy(realw* array,Mesh* mp)
+{
+  cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_yy, array, NGLL2*sizeof(realw));
+  if (err != cudaSuccess)
+  {
+    fprintf(stderr, "Error in setConst_hprimewgll_yy: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+
+  // passes the symbol itself; the string form is deprecated (removed in CUDA 5.0)
+  err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_yy),d_hprimewgll_yy);
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_hprimewgll_yy: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+}
+
+// copies the hprimewgll_zz coefficients into constant memory and stores the
+// device address of the constant array in the mesh container
+void setConst_hprimewgll_zz(realw* array,Mesh* mp)
+{
+  cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_zz, array, NGLL2*sizeof(realw));
+  if (err != cudaSuccess)
+  {
+    fprintf(stderr, "Error in setConst_hprimewgll_zz: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+
+  // passes the symbol itself; the string form is deprecated (removed in CUDA 5.0)
+  err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_zz),d_hprimewgll_zz);
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_hprimewgll_zz: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+}
+
+// copies the wgllwgll_xy weights into constant memory and stores the
+// device address of the constant array in the mesh container
+void setConst_wgllwgll_xy(realw* array,Mesh* mp)
+{
+  cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_xy, array, NGLL2*sizeof(realw));
+  if (err != cudaSuccess)
+  {
+    fprintf(stderr, "Error in setConst_wgllwgll_xy: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+
+  // passes the symbol itself; the string form is deprecated (removed in CUDA 5.0)
+  err = cudaGetSymbolAddress((void**)&(mp->d_wgllwgll_xy),d_wgllwgll_xy);
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_wgllwgll_xy: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+}
+
+// copies the wgllwgll_xz weights into constant memory and stores the
+// device address of the constant array in the mesh container
+void setConst_wgllwgll_xz(realw* array,Mesh* mp)
+{
+  cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_xz, array, NGLL2*sizeof(realw));
+  if (err != cudaSuccess)
+  {
+    fprintf(stderr, "Error in  setConst_wgllwgll_xz: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+
+  // passes the symbol itself; the string form is deprecated (removed in CUDA 5.0)
+  err = cudaGetSymbolAddress((void**)&(mp->d_wgllwgll_xz),d_wgllwgll_xz);
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_wgllwgll_xz: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+}
+
+// copies the wgllwgll_yz weights into constant memory and stores the
+// device address of the constant array in the mesh container
+void setConst_wgllwgll_yz(realw* array,Mesh* mp)
+{
+  cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_yz, array, NGLL2*sizeof(realw));
+  if (err != cudaSuccess)
+  {
+    fprintf(stderr, "Error in setConst_wgllwgll_yz: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+
+  // passes the symbol itself; the string form is deprecated (removed in CUDA 5.0)
+  err = cudaGetSymbolAddress((void**)&(mp->d_wgllwgll_yz),d_wgllwgll_yz);
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_wgllwgll_yz: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+}
+
+// copies the wgll_cube weights (needed only for the gravity case) into
+// constant memory and stores the device address in the mesh container
+void setConst_wgll_cube(realw* array,Mesh* mp)
+{
+  cudaError_t err = cudaMemcpyToSymbol(d_wgll_cube, array, NGLL3*sizeof(realw));
+  if (err != cudaSuccess)
+  {
+    fprintf(stderr, "Error in setConst_wgll_cube: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+
+  // passes the symbol itself; the string form is deprecated (removed in CUDA 5.0)
+  err = cudaGetSymbolAddress((void**)&(mp->d_wgll_cube),d_wgll_cube);
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_wgll_cube: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// elemental routines
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// updates stress
+
+// Subtracts the attenuation memory variables R_** (summed over the N_SLS
+// standard linear solids) from the deviatoric stress components of one
+// crust/mantle GLL point (tx within element working_element).
+// note: R_zz is not stored; sigma_zz is updated with +(R_xx + R_yy),
+//       i.e. R_zz = -(R_xx + R_yy) (trace-free memory variables).
+__device__ void compute_element_cm_att_stress(int tx,int working_element,
+                                              realw* R_xx,
+                                              realw* R_yy,
+                                              realw* R_xy,
+                                              realw* R_xz,
+                                              realw* R_yz,
+                                              reald* sigma_xx,
+                                              reald* sigma_yy,
+                                              reald* sigma_zz,
+                                              reald* sigma_xy,
+                                              reald* sigma_xz,
+                                              reald* sigma_yz) {
+
+  int i_sls,offset;
+  reald R_xx_val,R_yy_val;
+
+  for(i_sls = 0; i_sls < N_SLS; i_sls++){
+    // index
+    // note: index for R_xx,.. here is (i_sls,i,j,k,ispec) and not (i,j,k,ispec,i_sls) as in local version
+    //          local version: offset_sls = tx + NGLL3*(working_element + NSPEC*i_sls);
+    offset = i_sls + N_SLS*(tx + NGLL3*working_element);
+
+    R_xx_val = R_xx[offset];
+    R_yy_val = R_yy[offset];
+
+    *sigma_xx = *sigma_xx - R_xx_val;
+    *sigma_yy = *sigma_yy - R_yy_val;
+    *sigma_zz = *sigma_zz + R_xx_val + R_yy_val;
+    *sigma_xy = *sigma_xy - R_xy[offset];
+    *sigma_xz = *sigma_xz - R_xz[offset];
+    *sigma_yz = *sigma_yz - R_yz[offset];
+  }
+  return;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// updates R_memory
+
+// updates the attenuation memory variables R_xx..R_yz at one GLL point of a
+// crust/mantle element, marching them in time with the recurrence
+//   R_new = alphaval*R + betaval*Sn + gammaval*Snp1
+// where Sn / Snp1 are the per-SLS common factor times the deviatoric strain
+// at the previous (epsilondev_* arrays) / current (epsilondev_*_loc) step;
+// the common factor uses c44 (anisotropic) or mu_v (isotropic) times
+// factor_common, i.e. only Q_mu (shear) attenuation is applied
+__device__ void compute_element_cm_att_memory(int tx,int working_element,
+                                              realw* d_muvstore,
+                                              realw* factor_common,
+                                              realw* alphaval,realw* betaval,realw* gammaval,
+                                              realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
+                                              realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+                                              realw* epsilondev_xz,realw* epsilondev_yz,
+                                              reald epsilondev_xx_loc,reald epsilondev_yy_loc,reald epsilondev_xy_loc,
+                                              reald epsilondev_xz_loc,reald epsilondev_yz_loc,
+                                              int ANISOTROPY,
+                                              realw* d_c44store
+                                              ){
+
+  int i_sls;
+  int ijk_ispec;
+  int offset_align,offset;
+  reald fac;
+  reald alphaval_loc,betaval_loc,gammaval_loc;
+  reald factor_loc,Sn,Snp1;
+
+  // indices
+  // offset_align uses the padded stride of the moduli arrays,
+  // ijk_ispec the unpadded stride of the strain arrays
+  offset_align = tx + NGLL3_PADDED * working_element;
+  ijk_ispec = tx + NGLL3 * working_element;
+
+  // shear moduli for common factor (only Q_mu attenuation)
+  if( ANISOTROPY ){
+    fac = d_c44store[offset_align];
+  }else{
+    fac = d_muvstore[offset_align];
+  }
+
+  // use Runge-Kutta scheme to march in time
+  for(i_sls = 0; i_sls < N_SLS; i_sls++){
+    // indices
+    // note: index for R_xx,... here is (i_sls,i,j,k,ispec) and not (i,j,k,ispec,i_sls) as in local version
+    //          local version: offset_sls = tx + NGLL3*(working_element + NSPEC*i_sls);
+    // index for (i_sls,i,j,k,ispec)
+    offset = i_sls + N_SLS*(tx + NGLL3*working_element);
+    // index for (i,j,k,ispec,i_sls)
+    //offset_sls = tx + NGLL3*(working_element + NSPEC*i_sls);
+
+    // either mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
+    // or       factor_common(i_sls,:,:,:,ispec) * c44store(:,:,:,ispec)
+    factor_loc = fac * factor_common[offset];
+
+    // RK coefficients for this standard linear solid
+    alphaval_loc = alphaval[i_sls]; // (i_sls)
+    betaval_loc = betaval[i_sls];
+    gammaval_loc = gammaval[i_sls];
+
+    // term in xx
+    Sn   = factor_loc * epsilondev_xx[ijk_ispec]; //(i,j,k,ispec)
+    Snp1   = factor_loc * epsilondev_xx_loc; //(i,j,k)
+    R_xx[offset] = alphaval_loc * R_xx[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
+
+    // term in yy
+    Sn   = factor_loc * epsilondev_yy[ijk_ispec];
+    Snp1   = factor_loc * epsilondev_yy_loc;
+    R_yy[offset] = alphaval_loc * R_yy[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
+    // term in zz not computed since zero trace
+
+    // term in xy
+    Sn   = factor_loc * epsilondev_xy[ijk_ispec];
+    Snp1   = factor_loc * epsilondev_xy_loc;
+    R_xy[offset] = alphaval_loc * R_xy[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
+
+    // term in xz
+    Sn   = factor_loc * epsilondev_xz[ijk_ispec];
+    Snp1   = factor_loc * epsilondev_xz_loc;
+    R_xz[offset] = alphaval_loc * R_xz[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
+
+    // term in yz
+    Sn   = factor_loc * epsilondev_yz[ijk_ispec];
+    Snp1   = factor_loc * epsilondev_yz_loc;
+    R_yz[offset] = alphaval_loc * R_yz[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
+  }
+  return;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// pre-computes gravity term
+
+// adds the gravity contribution at one GLL point: makes the stress tensor
+// non-symmetric by subtracting the s.g terms, and precomputes the
+// rho * (s . H) force vector scaled by jacobian * GLL weight (rho_s_H1..3);
+// g, dg/dr and density come from radius-indexed lookup tables, and the
+// spherical angles come from the mesh coordinate arrays (r,theta,phi)
+__device__ void compute_element_cm_gravity(int tx,int working_element,
+                                          int* d_ibool,
+                                          realw* d_xstore,realw* d_ystore,realw* d_zstore,
+                                          realw* d_minus_gravity_table,
+                                          realw* d_minus_deriv_gravity_table,
+                                          realw* d_density_table,
+                                          realw* wgll_cube,
+                                          reald jacobianl,
+                                          reald* s_dummyx_loc,
+                                          reald* s_dummyy_loc,
+                                          reald* s_dummyz_loc,
+                                          reald* sigma_xx,
+                                          reald* sigma_yy,
+                                          reald* sigma_zz,
+                                          reald* sigma_xy,
+                                          reald* sigma_yx,
+                                          reald* sigma_xz,
+                                          reald* sigma_zx,
+                                          reald* sigma_yz,
+                                          reald* sigma_zy,
+                                          reald* rho_s_H1,
+                                          reald* rho_s_H2,
+                                          reald* rho_s_H3){
+
+  reald radius,theta,phi;
+  reald cos_theta,sin_theta,cos_phi,sin_phi;
+  reald minus_g,minus_dg;
+  reald rho;
+  reald gxl,gyl,gzl;
+  reald minus_g_over_radius,minus_dg_plus_g_over_radius;
+  reald cos_theta_sq,sin_theta_sq,cos_phi_sq,sin_phi_sq;
+  reald Hxxl,Hyyl,Hzzl,Hxyl,Hxzl,Hyzl;
+  reald sx_l,sy_l,sz_l;
+  reald factor;
+
+  // R_EARTH_KM is the radius of the bottom of the oceans (radius of Earth in km)
+  const reald R_EARTH_KM = 6371.0f;
+  // uncomment line below for PREM with oceans
+  //const reald R_EARTH_KM = 6368.0f;
+
+  // compute non-symmetric terms for gravity
+
+  // use mesh coordinates to get theta and phi
+  // x y z contain r theta phi
+  // -1 converts the Fortran 1-based ibool entry to a C index
+  int iglob = d_ibool[working_element*NGLL3 + tx]-1;
+
+  radius = d_xstore[iglob];
+  theta = d_ystore[iglob];
+  phi = d_zstore[iglob];
+
+  cos_theta = cos(theta);
+  sin_theta = sin(theta);
+  cos_phi = cos(phi);
+  sin_phi = sin(phi);
+
+  // for efficiency replace with lookup table every 100 m in radial direction
+  // note: radius in crust mantle should never be zero,
+  //          and arrays in C start from 0, thus we need to subtract -1
+  // (radius is normalized by the Earth radius, so radius*R_EARTH_KM*10
+  //  counts 100 m table intervals)
+  int int_radius = rint(radius * R_EARTH_KM * 10.0f ) - 1;
+
+  // get g, rho and dg/dr=dg
+  // spherical components of the gravitational acceleration
+  // for efficiency replace with lookup table every 100 m in radial direction
+  minus_g = d_minus_gravity_table[int_radius];
+  minus_dg = d_minus_deriv_gravity_table[int_radius];
+  rho = d_density_table[int_radius];
+
+  // Cartesian components of the gravitational acceleration
+  gxl = minus_g*sin_theta*cos_phi;
+  gyl = minus_g*sin_theta*sin_phi;
+  gzl = minus_g*cos_theta;
+
+  // Cartesian components of gradient of gravitational acceleration
+  // obtained from spherical components
+
+  minus_g_over_radius = minus_g / radius;
+  minus_dg_plus_g_over_radius = minus_dg - minus_g_over_radius;
+
+  cos_theta_sq = cos_theta*cos_theta;
+  sin_theta_sq = sin_theta*sin_theta;
+  cos_phi_sq = cos_phi*cos_phi;
+  sin_phi_sq = sin_phi*sin_phi;
+
+  Hxxl = minus_g_over_radius*(cos_phi_sq*cos_theta_sq + sin_phi_sq) + cos_phi_sq*minus_dg*sin_theta_sq;
+  Hyyl = minus_g_over_radius*(cos_phi_sq + cos_theta_sq*sin_phi_sq) + minus_dg*sin_phi_sq*sin_theta_sq;
+  Hzzl = cos_theta_sq*minus_dg + minus_g_over_radius*sin_theta_sq;
+  Hxyl = cos_phi*minus_dg_plus_g_over_radius*sin_phi*sin_theta_sq;
+  Hxzl = cos_phi*cos_theta*minus_dg_plus_g_over_radius*sin_theta;
+  Hyzl = cos_theta*minus_dg_plus_g_over_radius*sin_phi*sin_theta;
+
+  // get displacement and multiply by density to compute G tensor
+  sx_l = rho * s_dummyx_loc[tx];
+  sy_l = rho * s_dummyy_loc[tx];
+  sz_l = rho * s_dummyz_loc[tx];
+
+  // compute G tensor from s . g and add to sigma (not symmetric)
+  *sigma_xx = *sigma_xx + sy_l*gyl + sz_l*gzl;
+  *sigma_yy = *sigma_yy + sx_l*gxl + sz_l*gzl;
+  *sigma_zz = *sigma_zz + sx_l*gxl + sy_l*gyl;
+
+  *sigma_xy = *sigma_xy - sx_l * gyl;
+  *sigma_yx = *sigma_yx - sy_l * gxl;
+
+  *sigma_xz = *sigma_xz - sx_l * gzl;
+  *sigma_zx = *sigma_zx - sz_l * gxl;
+
+  *sigma_yz = *sigma_yz - sy_l * gzl;
+  *sigma_zy = *sigma_zy - sz_l * gyl;
+
+  // precompute vector
+  // H is symmetric, hence the reuse of Hxyl, Hxzl, Hyzl below
+  factor = jacobianl * wgll_cube[tx];
+  *rho_s_H1 = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl);
+  *rho_s_H2 = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl);
+  *rho_s_H3 = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl);
+
+  return;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// computes stresses for anisotropic element
+
+// computes the stress tensor from the full 21-coefficient elastic tensor
+// c11..c66 (Voigt notation) at the given array offset; when ATTENUATION is
+// set, the isotropic part of the moduli is shifted to its unrelaxed value
+// using c44 as the reference shear modulus and minus_sum_beta as the shift
+__device__ void compute_element_cm_aniso(int offset,
+                                         realw* d_c11store,realw* d_c12store,realw* d_c13store,
+                                         realw* d_c14store,realw* d_c15store,realw* d_c16store,
+                                         realw* d_c22store,realw* d_c23store,realw* d_c24store,
+                                         realw* d_c25store,realw* d_c26store,realw* d_c33store,
+                                         realw* d_c34store,realw* d_c35store,realw* d_c36store,
+                                         realw* d_c44store,realw* d_c45store,realw* d_c46store,
+                                         realw* d_c55store,realw* d_c56store,realw* d_c66store,
+                                         int ATTENUATION,
+                                         reald minus_sum_beta,
+                                         reald duxdxl,reald duxdyl,reald duxdzl,
+                                         reald duydxl,reald duydyl,reald duydzl,
+                                         reald duzdxl,reald duzdyl,reald duzdzl,
+                                         reald duxdxl_plus_duydyl,reald duxdxl_plus_duzdzl,reald duydyl_plus_duzdzl,
+                                         reald duxdyl_plus_duydxl,reald duzdxl_plus_duxdzl,reald duzdyl_plus_duydzl,
+                                         reald* sigma_xx,reald* sigma_yy,reald* sigma_zz,
+                                         reald* sigma_xy,reald* sigma_xz,reald* sigma_yz
+                                         ){
+
+  reald c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66;
+  reald mul;
+
+  // load the 21 elastic coefficients for this GLL point
+  c11 = d_c11store[offset];
+  c12 = d_c12store[offset];
+  c13 = d_c13store[offset];
+  c14 = d_c14store[offset];
+  c15 = d_c15store[offset];
+  c16 = d_c16store[offset];
+  c22 = d_c22store[offset];
+  c23 = d_c23store[offset];
+  c24 = d_c24store[offset];
+  c25 = d_c25store[offset];
+  c26 = d_c26store[offset];
+  c33 = d_c33store[offset];
+  c34 = d_c34store[offset];
+  c35 = d_c35store[offset];
+  c36 = d_c36store[offset];
+  c44 = d_c44store[offset];
+  c45 = d_c45store[offset];
+  c46 = d_c46store[offset];
+  c55 = d_c55store[offset];
+  c56 = d_c56store[offset];
+  c66 = d_c66store[offset];
+
+  // use unrelaxed parameters if attenuation
+  // note: mul is the relaxed c44, captured before c44 itself is shifted
+  if( ATTENUATION){
+    mul = c44;
+    c11 = c11 + 1.33333333333333333333f * minus_sum_beta * mul;  // + 4/3 shift
+    c12 = c12 - 0.66666666666666666666f * minus_sum_beta * mul;  // - 2/3 shift
+    c13 = c13 - 0.66666666666666666666f * minus_sum_beta * mul;
+    c22 = c22 + 1.33333333333333333333f * minus_sum_beta * mul;
+    c23 = c23 - 0.66666666666666666666f * minus_sum_beta * mul;
+    c33 = c33 + 1.33333333333333333333f * minus_sum_beta * mul;
+    c44 = c44 + minus_sum_beta * mul;
+    c55 = c55 + minus_sum_beta * mul;
+    c66 = c66 + minus_sum_beta * mul;
+  }
+
+  // general expression of stress for full Cijkl (sigma = C : grad(u))
+  *sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl +
+             c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl;
+  *sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl +
+             c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl;
+  *sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl +
+             c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl;
+  *sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl +
+             c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl;
+  *sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl +
+             c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl;
+  *sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl +
+             c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl;
+  return;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// computes stresses for isotropic element
+
+// computes the six stress components for an isotropic elastic element:
+// sigma = lambda * tr(eps) * I + 2 * mu * eps, with mu shifted to its
+// unrelaxed value when attenuation is enabled
+__device__ void compute_element_cm_iso(int offset,
+                                       realw* d_kappavstore,realw* d_muvstore,
+                                       int ATTENUATION,
+                                       reald one_minus_sum_beta_use,
+                                       reald duxdxl,reald duydyl,reald duzdzl,
+                                       reald duxdxl_plus_duydyl,reald duxdxl_plus_duzdzl,reald duydyl_plus_duzdzl,
+                                       reald duxdyl_plus_duydxl,reald duzdxl_plus_duxdzl,reald duzdyl_plus_duydzl,
+                                       reald* sigma_xx,reald* sigma_yy,reald* sigma_zz,
+                                       reald* sigma_xy,reald* sigma_xz,reald* sigma_yz){
+
+  // isotropic moduli at this GLL point
+  reald kappal = d_kappavstore[offset];
+  reald mul = d_muvstore[offset];
+
+  // shift shear modulus to its unrelaxed value if attenuation is on
+  if( ATTENUATION ){
+    mul *= one_minus_sum_beta_use;
+  }
+
+  // Lame parameters: lambda + 2 mu = kappa + 4/3 mu
+  reald lambdalplus2mul = kappal + 1.33333333333333333333f * mul;
+  reald lambdal = lambdalplus2mul - 2.0f * mul;
+
+  // diagonal stress components
+  *sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl;
+  *sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl;
+  *sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl;
+
+  // off-diagonal stress components
+  *sigma_xy = mul*duxdyl_plus_duydxl;
+  *sigma_xz = mul*duzdxl_plus_duxdzl;
+  *sigma_yz = mul*duzdyl_plus_duydzl;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// computes stresses for transversely isotropic element
+
+// computes the stress tensor for a transversely isotropic element: builds
+// the 21 elastic coefficients c11..c66 from the Love parameters
+// (A = rho*vph^2, C = rho*vpv^2, L = rho*vsv^2, N = rho*vsh^2, and F via
+// eta_aniso) rotated to the local (theta,phi) radial symmetry axis taken
+// from the mesh coordinate arrays, then applies the general anisotropic
+// stress-strain relation; moduli are shifted to unrelaxed values when
+// attenuation is enabled
+__device__ void compute_element_cm_tiso(int offset,
+                                        realw* d_kappavstore,realw* d_muvstore,
+                                        realw* d_kappahstore,realw* d_muhstore,realw* d_eta_anisostore,
+                                        int ATTENUATION,
+                                        reald one_minus_sum_beta_use,
+                                        reald duxdxl,reald duxdyl,reald duxdzl,
+                                        reald duydxl,reald duydyl,reald duydzl,
+                                        reald duzdxl,reald duzdyl,reald duzdzl,
+                                        reald duxdxl_plus_duydyl,reald duxdxl_plus_duzdzl,reald duydyl_plus_duzdzl,
+                                        reald duxdyl_plus_duydxl,reald duzdxl_plus_duxdzl,reald duzdyl_plus_duydzl,
+                                        int iglob,int NGLOB,
+                                        realw* d_ystore, realw* d_zstore,
+                                        reald* sigma_xx,reald* sigma_yy,reald* sigma_zz,
+                                        reald* sigma_xy,reald* sigma_xz,reald* sigma_yz){
+
+  reald kappavl,muvl,kappahl,muhl;
+  reald rhovpvsq,rhovphsq,rhovsvsq,rhovshsq,eta_aniso;
+  reald costheta,sintheta,cosphi,sinphi;
+  reald costhetasq,sinthetasq,cosphisq,sinphisq,costhetafour,sinthetafour,cosphifour,sinphifour;
+  reald costwotheta,sintwotheta,costwophi,sintwophi,cosfourtheta,cosfourphi;
+  reald costwothetasq,costwophisq,sintwophisq;
+  reald etaminone,twoetaminone;
+  reald two_eta_aniso,four_eta_aniso,six_eta_aniso;
+  reald two_rhovsvsq,two_rhovshsq; // two_rhovpvsq,two_rhovphsq
+  reald four_rhovsvsq,four_rhovshsq; // four_rhovpvsq,four_rhovphsq
+  reald c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66;
+
+  // cosine and sine function in CUDA only supported for float
+  reald theta,phi;
+
+  // use Kappa and mu from transversely isotropic model
+  kappavl = d_kappavstore[offset];
+  muvl = d_muvstore[offset];
+
+  kappahl = d_kappahstore[offset];
+  muhl = d_muhstore[offset];
+
+  // use unrelaxed parameters if attenuation
+  // eta does not need to be shifted since it is a ratio
+  if( ATTENUATION ){
+    muvl = muvl * one_minus_sum_beta_use;
+    muhl = muhl * one_minus_sum_beta_use;
+  }
+
+  rhovpvsq = kappavl + 1.33333333333333333333f * muvl ; //!!! that is C
+  rhovphsq = kappahl + 1.33333333333333333333f * muhl ; //!!! that is A
+
+  rhovsvsq = muvl; // !!! that is L
+  rhovshsq = muhl; //!!! that is N
+
+  eta_aniso = d_eta_anisostore[offset]; // !!! that is  F / (A - 2 L)
+
+  // use mesh coordinates to get theta and phi
+  //ystore and zstore contain theta and phi
+  theta = d_ystore[iglob];
+  phi = d_zstore[iglob];
+
+  // pick single- or double-precision trig depending on how reald is defined;
+  // the sizeof test is resolved at compile time
+  if( sizeof( theta ) == sizeof( float ) ){
+    // float operations
+
+    // daniel: TODO - sincos function
+    // or: sincosf(theta, &sintheta, &costheta);
+    // or with loss of accuracy:  __sincosf(theta, &sintheta, &costheta);
+    // or compile with: -use_fast_math
+
+    costheta = cosf(theta);
+    sintheta = sinf(theta);
+
+    cosphi = cosf(phi);
+    sinphi = sinf(phi);
+
+    costwotheta = cosf(2.0f * theta);
+    sintwotheta = sinf(2.0f * theta);
+    costwophi = cosf(2.0f * phi);
+    sintwophi = sinf(2.0f * phi);
+    cosfourtheta = cosf(4.0f * theta);
+    cosfourphi = cosf(4.0f * phi);
+  }else{
+    // double operations
+    costheta = cos(theta);
+    sintheta = sin(theta);
+
+    cosphi = cos(phi);
+    sinphi = sin(phi);
+
+    costwotheta = cos(2.0f * theta);
+    sintwotheta = sin(2.0f * theta);
+    costwophi = cos(2.0f * phi);
+    sintwophi = sin(2.0f * phi);
+
+    cosfourtheta = cos(4.0f * theta);
+    cosfourphi = cos(4.0f * phi);
+  }
+
+  // powers of the trig terms used repeatedly in the c_ij formulas below
+  costhetasq = costheta * costheta;
+  sinthetasq = sintheta * sintheta;
+  cosphisq = cosphi * cosphi;
+  sinphisq = sinphi * sinphi;
+
+  costhetafour = costhetasq * costhetasq;
+  sinthetafour = sinthetasq * sinthetasq;
+  cosphifour = cosphisq * cosphisq;
+  sinphifour = sinphisq * sinphisq;
+
+  costwothetasq = costwotheta * costwotheta;
+
+  costwophisq = costwophi * costwophi;
+  sintwophisq = sintwophi * sintwophi;
+
+  etaminone = eta_aniso - 1.0f;
+  twoetaminone = 2.0f * eta_aniso - 1.0f;
+
+  // precompute some products to reduce the CPU time
+
+  two_eta_aniso = 2.0f * eta_aniso;
+  four_eta_aniso = 4.0f * eta_aniso;
+  six_eta_aniso = 6.0f * eta_aniso;
+
+  //two_rhovpvsq = 2.0f * rhovpvsq;
+  //two_rhovphsq = 2.0f * rhovphsq;
+  two_rhovsvsq = 2.0f * rhovsvsq;
+  two_rhovshsq = 2.0f * rhovshsq;
+
+  //four_rhovpvsq = 4.0f * rhovpvsq;
+  //four_rhovphsq = 4.0f * rhovphsq;
+  four_rhovsvsq = 4.0f * rhovsvsq;
+  four_rhovshsq = 4.0f * rhovshsq;
+
+  // the 21 anisotropic coefficients computed using Mathematica
+  // (do not edit by hand; regenerate from the symbolic derivation if needed)
+
+  c11 = rhovphsq*sinphifour + 2.0f*cosphisq*sinphisq*
+        (rhovphsq*costhetasq + (eta_aniso*rhovphsq + two_rhovsvsq - two_eta_aniso*rhovsvsq)*
+        sinthetasq) + cosphifour*
+        (rhovphsq*costhetafour + 2.0f*(eta_aniso*rhovphsq + two_rhovsvsq - two_eta_aniso*rhovsvsq)*
+        costhetasq*sinthetasq + rhovpvsq*sinthetafour);
+
+  c12 = ((rhovphsq - two_rhovshsq)*(3.0f + cosfourphi)*costhetasq)*0.25f -
+        four_rhovshsq*cosphisq*costhetasq*sinphisq +
+        (rhovphsq*(11.0f + 4.0f*costwotheta + cosfourtheta)*sintwophisq)*0.03125f +
+        eta_aniso*(rhovphsq - two_rhovsvsq)*(cosphifour +
+        2.0f*cosphisq*costhetasq*sinphisq + sinphifour)*sinthetasq +
+        rhovpvsq*cosphisq*sinphisq*sinthetafour -
+        rhovsvsq*sintwophisq*sinthetafour;
+
+  c13 = (cosphisq*(rhovphsq + six_eta_aniso*rhovphsq + rhovpvsq - four_rhovsvsq -
+        12.0f*eta_aniso*rhovsvsq + (twoetaminone*rhovphsq - rhovpvsq + four_rhovsvsq -
+        four_eta_aniso*rhovsvsq)*cosfourtheta))*0.125f +
+        sinphisq*(eta_aniso*(rhovphsq - two_rhovsvsq)*costhetasq +
+        (rhovphsq - two_rhovshsq)*sinthetasq);
+
+  c14 = costheta*sinphi*((cosphisq*
+        (-rhovphsq + rhovpvsq + four_rhovshsq - four_rhovsvsq +
+        (-rhovphsq + two_eta_aniso*rhovphsq - rhovpvsq + four_rhovsvsq -
+        four_eta_aniso*rhovsvsq)*costwotheta))*0.5f +
+        (etaminone*rhovphsq + 2.0f*(rhovshsq - eta_aniso*rhovsvsq))*sinphisq)* sintheta;
+
+  c15 = cosphi*costheta*((cosphisq* (-rhovphsq + rhovpvsq +
+        (twoetaminone*rhovphsq - rhovpvsq + four_rhovsvsq - four_eta_aniso*rhovsvsq)*
+        costwotheta))*0.5f + etaminone*(rhovphsq - two_rhovsvsq)*sinphisq)*sintheta;
+
+  c16 = (cosphi*sinphi*(cosphisq* (-rhovphsq + rhovpvsq +
+        (-rhovphsq + two_eta_aniso*rhovphsq - rhovpvsq + four_rhovsvsq -
+        four_eta_aniso*rhovsvsq)*costwotheta) +
+        2.0f*etaminone*(rhovphsq - two_rhovsvsq)*sinphisq)*sinthetasq)*0.5f;
+
+  c22 = rhovphsq*cosphifour + 2.0f*cosphisq*sinphisq*
+        (rhovphsq*costhetasq + (eta_aniso*rhovphsq + two_rhovsvsq - two_eta_aniso*rhovsvsq)*
+        sinthetasq) + sinphifour*
+        (rhovphsq*costhetafour + 2.0f*(eta_aniso*rhovphsq + two_rhovsvsq - two_eta_aniso*rhovsvsq)*
+        costhetasq*sinthetasq + rhovpvsq*sinthetafour);
+
+  c23 = ((rhovphsq + six_eta_aniso*rhovphsq + rhovpvsq - four_rhovsvsq - 12.0f*eta_aniso*rhovsvsq +
+        (twoetaminone*rhovphsq - rhovpvsq + four_rhovsvsq - four_eta_aniso*rhovsvsq)*
+        cosfourtheta)*sinphisq)*0.125f +
+        cosphisq*(eta_aniso*(rhovphsq - two_rhovsvsq)*costhetasq +
+        (rhovphsq - two_rhovshsq)*sinthetasq);
+
+  c24 = costheta*sinphi*(etaminone*(rhovphsq - two_rhovsvsq)*cosphisq +
+        ((-rhovphsq + rhovpvsq + (twoetaminone*rhovphsq - rhovpvsq +
+        four_rhovsvsq - four_eta_aniso*rhovsvsq)*costwotheta)*sinphisq)*0.5f)*sintheta;
+
+  c25 = cosphi*costheta*((etaminone*rhovphsq + 2.0f*(rhovshsq - eta_aniso*rhovsvsq))*
+        cosphisq + ((-rhovphsq + rhovpvsq + four_rhovshsq - four_rhovsvsq +
+        (-rhovphsq + two_eta_aniso*rhovphsq - rhovpvsq + four_rhovsvsq -
+        four_eta_aniso*rhovsvsq)*costwotheta)*sinphisq)*0.5f)*sintheta;
+
+  c26 = (cosphi*sinphi*(2.0f*etaminone*(rhovphsq - two_rhovsvsq)*cosphisq +
+        (-rhovphsq + rhovpvsq + (-rhovphsq + two_eta_aniso*rhovphsq - rhovpvsq + four_rhovsvsq -
+        four_eta_aniso*rhovsvsq)*costwotheta)*sinphisq)*sinthetasq)*0.5f;
+
+  c33 = rhovpvsq*costhetafour + 2.0f*(eta_aniso*(rhovphsq - two_rhovsvsq) + two_rhovsvsq)*
+        costhetasq*sinthetasq + rhovphsq*sinthetafour;
+
+  c34 = -((rhovphsq - rhovpvsq + (twoetaminone*rhovphsq - rhovpvsq + four_rhovsvsq
+        - four_eta_aniso*rhovsvsq)*costwotheta)*sinphi*sintwotheta)*0.25f;
+
+  c35 = -(cosphi*(rhovphsq - rhovpvsq +
+        (twoetaminone*rhovphsq - rhovpvsq + four_rhovsvsq - four_eta_aniso*rhovsvsq)*
+        costwotheta)*sintwotheta)*0.25f;
+
+  c36 = -((rhovphsq - rhovpvsq - four_rhovshsq + four_rhovsvsq +
+        (twoetaminone*rhovphsq - rhovpvsq + four_rhovsvsq - four_eta_aniso*rhovsvsq)*
+        costwotheta)*sintwophi*sinthetasq)*0.25f;
+
+  c44 = cosphisq*(rhovsvsq*costhetasq + rhovshsq*sinthetasq) +
+        sinphisq*(rhovsvsq*costwothetasq +
+        (rhovphsq - two_eta_aniso*rhovphsq + rhovpvsq + four_eta_aniso*rhovsvsq)*costhetasq* sinthetasq);
+
+  c45 = ((rhovphsq - two_eta_aniso*rhovphsq + rhovpvsq - two_rhovshsq - two_rhovsvsq +
+        four_eta_aniso*rhovsvsq + (rhovphsq - two_eta_aniso*rhovphsq + rhovpvsq +
+        4.0f*etaminone*rhovsvsq)*costwotheta)*sintwophi*sinthetasq)*0.25f;
+
+  c46 = -(cosphi*costheta*((rhovshsq - rhovsvsq)*cosphisq -
+        ((rhovphsq - two_eta_aniso*rhovphsq + rhovpvsq - two_rhovshsq - two_rhovsvsq +
+        four_eta_aniso*rhovsvsq + (-rhovphsq + two_eta_aniso*rhovphsq - rhovpvsq +
+        four_rhovsvsq - four_eta_aniso*rhovsvsq)*costwotheta)*sinphisq)*0.5f)* sintheta);
+
+  c55 = sinphisq*(rhovsvsq*costhetasq + rhovshsq*sinthetasq) +
+        cosphisq*(rhovsvsq*costwothetasq +
+        (rhovphsq - two_eta_aniso*rhovphsq + rhovpvsq + four_eta_aniso*rhovsvsq)*costhetasq* sinthetasq);
+
+  c56 = costheta*sinphi*((cosphisq*
+        (rhovphsq - two_eta_aniso*rhovphsq + rhovpvsq - two_rhovshsq - two_rhovsvsq +
+        four_eta_aniso*rhovsvsq + (-rhovphsq + two_eta_aniso*rhovphsq - rhovpvsq +
+        four_rhovsvsq - four_eta_aniso*rhovsvsq)*costwotheta))*0.5f +
+        (-rhovshsq + rhovsvsq)*sinphisq)*sintheta;
+
+  c66 = rhovshsq*costwophisq*costhetasq -
+        2.0f*(rhovphsq - two_rhovshsq)*cosphisq*costhetasq*sinphisq +
+        (rhovphsq*(11.0f + 4.0f*costwotheta + cosfourtheta)*sintwophisq)*0.03125f -
+        (rhovsvsq*(-6.0f - 2.0f*cosfourphi + cos(4.0f*phi - 2.0f*theta) - 2.0f*costwotheta +
+        cos(2.0f*(2.0f*phi + theta)))*sinthetasq)*0.125f +
+        rhovpvsq*cosphisq*sinphisq*sinthetafour -
+        (eta_aniso*(rhovphsq - two_rhovsvsq)*sintwophisq*sinthetafour)*0.5f;
+
+  // general expression of stress tensor for full Cijkl with 21 coefficients
+
+  *sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl +
+              c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl;
+
+  *sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl +
+              c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl;
+
+  *sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl +
+              c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl;
+
+  *sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl +
+              c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl;
+
+  *sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl +
+              c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl;
+
+  *sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl +
+              c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl;
+
+  return;
+}
+
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// KERNEL 2
+//
+// for crust_mantle
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void Kernel_2_crust_mantle_impl(int nb_blocks_to_compute,
+                                          int NGLOB,
+                                          int* d_ibool,
+                                          int* d_ispec_is_tiso,
+                                          int* d_phase_ispec_inner,
+                                          int num_phase_ispec,
+                                          int d_iphase,
+                                          int use_mesh_coloring_gpu,
+                                          realw* d_displ,
+                                          realw* d_accel,
+                                          realw* d_xix, realw* d_xiy, realw* d_xiz,
+                                          realw* d_etax, realw* d_etay, realw* d_etaz,
+                                          realw* d_gammax, realw* d_gammay, realw* d_gammaz,
+                                          realw* d_kappavstore, realw* d_muvstore,
+                                          realw* d_kappahstore, realw* d_muhstore,
+                                          realw* d_eta_anisostore,
+                                          int COMPUTE_AND_STORE_STRAIN,
+                                          realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+                                          realw* epsilondev_xz,realw* epsilondev_yz,
+                                          realw* epsilon_trace_over_3,
+                                          int SIMULATION_TYPE,
+                                          int ATTENUATION,
+                                          int USE_ATTENUATION_MIMIC,
+                                          realw* one_minus_sum_beta,realw* factor_common,
+                                          realw* R_xx, realw* R_yy, realw* R_xy, realw* R_xz, realw* R_yz,
+                                          realw* alphaval,realw* betaval,realw* gammaval,
+                                          int ANISOTROPY,
+                                          realw* d_c11store,realw* d_c12store,realw* d_c13store,
+                                          realw* d_c14store,realw* d_c15store,realw* d_c16store,
+                                          realw* d_c22store,realw* d_c23store,realw* d_c24store,
+                                          realw* d_c25store,realw* d_c26store,realw* d_c33store,
+                                          realw* d_c34store,realw* d_c35store,realw* d_c36store,
+                                          realw* d_c44store,realw* d_c45store,realw* d_c46store,
+                                          realw* d_c55store,realw* d_c56store,realw* d_c66store,
+                                          int GRAVITY,
+                                          realw* d_xstore,realw* d_ystore,realw* d_zstore,
+                                          realw* d_minus_gravity_table,
+                                          realw* d_minus_deriv_gravity_table,
+                                          realw* d_density_table,
+                                          realw* wgll_cube){
+
+  /* int bx = blockIdx.y*blockDim.x+blockIdx.x; //possible bug in original code*/
+  int bx = blockIdx.y*gridDim.x+blockIdx.x;
+  /* int bx = blockIdx.x; */
+  int tx = threadIdx.x;
+
+  //const int NGLLX = 5;
+  // const int NGLL2 = 25;
+  //const int NGLL3 = NGLL3;
+  const int NGLL3_ALIGN = NGLL3_PADDED;
+
+  int K = (tx/NGLL2);
+  int J = ((tx-K*NGLL2)/NGLLX);
+  int I = (tx-K*NGLL2-J*NGLLX);
+
+  int active,offset;
+  int iglob = 0;
+  int working_element;
+
+  reald tempx1l,tempx2l,tempx3l,tempy1l,tempy2l,tempy3l,tempz1l,tempz2l,tempz3l;
+  reald xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
+  reald duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl;
+  reald duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl;
+  reald duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl;
+
+  reald fac1,fac2,fac3;
+  reald minus_sum_beta,one_minus_sum_beta_use;
+
+  reald sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz;
+  reald epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc;
+  //reald c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66;
+  reald sum_terms1,sum_terms2,sum_terms3;
+
+  // gravity variables
+  reald sigma_yx,sigma_zx,sigma_zy;
+  reald rho_s_H1,rho_s_H2,rho_s_H3;
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+    int l;
+    realw hp1,hp2,hp3;
+#endif
+
+    __shared__ reald s_dummyx_loc[NGLL3];
+    __shared__ reald s_dummyy_loc[NGLL3];
+    __shared__ reald s_dummyz_loc[NGLL3];
+
+    __shared__ reald s_tempx1[NGLL3];
+    __shared__ reald s_tempx2[NGLL3];
+    __shared__ reald s_tempx3[NGLL3];
+    __shared__ reald s_tempy1[NGLL3];
+    __shared__ reald s_tempy2[NGLL3];
+    __shared__ reald s_tempy3[NGLL3];
+    __shared__ reald s_tempz1[NGLL3];
+    __shared__ reald s_tempz2[NGLL3];
+    __shared__ reald s_tempz3[NGLL3];
+
+// use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads,
+// because we used memory padding from NGLL^3 = 125 to 128 to get coalescent memory accesses
+    active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0;
+
+// copy from global memory to shared memory
+// each thread writes one of the NGLL^3 = 125 data points
+    if (active) {
+
+#ifdef USE_MESH_COLORING_GPU
+      working_element = bx;
+#else
+      //mesh coloring
+      if( use_mesh_coloring_gpu ){
+        working_element = bx;
+      }else{
+        // iphase-1 and working_element-1 for Fortran->C array conventions
+        working_element = d_phase_ispec_inner[bx + num_phase_ispec*(d_iphase-1)]-1;
+      }
+#endif
+
+      // iglob = d_ibool[working_element*NGLL3_ALIGN + tx]-1;
+      iglob = d_ibool[working_element*NGLL3 + tx]-1;
+
+#ifdef USE_TEXTURES
+      s_dummyx_loc[tx] = tex1Dfetch(tex_displ, iglob);
+      s_dummyy_loc[tx] = tex1Dfetch(tex_displ, iglob + NGLOB);
+      s_dummyz_loc[tx] = tex1Dfetch(tex_displ, iglob + 2*NGLOB);
+#else
+      // changing iglob indexing to match fortran row changes fast style
+      s_dummyx_loc[tx] = d_displ[iglob*3];
+      s_dummyy_loc[tx] = d_displ[iglob*3 + 1];
+      s_dummyz_loc[tx] = d_displ[iglob*3 + 2];
+#endif
+    }
+
+// synchronize all the threads (one thread for each of the NGLL grid points of the
+// current spectral element) because we need the whole element to be ready in order
+// to be able to compute the matrix products along cut planes of the 3D element below
+    __syncthreads();  
+
+#ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
+
+    if (active) {
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+
+      tempx1l = 0.f;
+      tempx2l = 0.f;
+      tempx3l = 0.f;
+
+      tempy1l = 0.f;
+      tempy2l = 0.f;
+      tempy3l = 0.f;
+
+      tempz1l = 0.f;
+      tempz2l = 0.f;
+      tempz3l = 0.f;
+
+      for (l=0;l<NGLLX;l++) {
+          hp1 = d_hprime_xx[l*NGLLX+I];
+          offset = K*NGLL2+J*NGLLX+l;
+          tempx1l += s_dummyx_loc[offset]*hp1;
+          tempy1l += s_dummyy_loc[offset]*hp1;
+          tempz1l += s_dummyz_loc[offset]*hp1;
+
+          hp2 = d_hprime_xx[l*NGLLX+J];
+          offset = K*NGLL2+l*NGLLX+I;
+          tempx2l += s_dummyx_loc[offset]*hp2;
+          tempy2l += s_dummyy_loc[offset]*hp2;
+          tempz2l += s_dummyz_loc[offset]*hp2;
+
+          hp3 = d_hprime_xx[l*NGLLX+K];
+          offset = l*NGLL2+J*NGLLX+I;
+          tempx3l += s_dummyx_loc[offset]*hp3;
+          tempy3l += s_dummyy_loc[offset]*hp3;
+          tempz3l += s_dummyz_loc[offset]*hp3;
+
+      }
+#else
+
+      tempx1l = s_dummyx_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+              + s_dummyx_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+              + s_dummyx_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+              + s_dummyx_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+              + s_dummyx_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+      tempy1l = s_dummyy_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+              + s_dummyy_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+              + s_dummyy_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+              + s_dummyy_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+              + s_dummyy_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+      tempz1l = s_dummyz_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+              + s_dummyz_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+              + s_dummyz_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+              + s_dummyz_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+              + s_dummyz_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+      tempx2l = s_dummyx_loc[K*NGLL2+I]*d_hprime_xx[J]
+              + s_dummyx_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+              + s_dummyx_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+              + s_dummyx_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+              + s_dummyx_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+      tempy2l = s_dummyy_loc[K*NGLL2+I]*d_hprime_xx[J]
+              + s_dummyy_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+              + s_dummyy_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+              + s_dummyy_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+              + s_dummyy_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+      tempz2l = s_dummyz_loc[K*NGLL2+I]*d_hprime_xx[J]
+              + s_dummyz_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+              + s_dummyz_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+              + s_dummyz_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+              + s_dummyz_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+      tempx3l = s_dummyx_loc[J*NGLLX+I]*d_hprime_xx[K]
+              + s_dummyx_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+              + s_dummyx_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+              + s_dummyx_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+              + s_dummyx_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+      tempy3l = s_dummyy_loc[J*NGLLX+I]*d_hprime_xx[K]
+              + s_dummyy_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+              + s_dummyy_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+              + s_dummyy_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+              + s_dummyy_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+      tempz3l = s_dummyz_loc[J*NGLLX+I]*d_hprime_xx[K]
+              + s_dummyz_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+              + s_dummyz_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+              + s_dummyz_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+              + s_dummyz_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+#endif
+
+      // compute derivatives of ux, uy and uz with respect to x, y and z
+      offset = working_element*NGLL3_ALIGN + tx;
+
+      xixl = d_xix[offset];
+      xiyl = d_xiy[offset];
+      xizl = d_xiz[offset];
+      etaxl = d_etax[offset];
+      etayl = d_etay[offset];
+      etazl = d_etaz[offset];
+      gammaxl = d_gammax[offset];
+      gammayl = d_gammay[offset];
+      gammazl = d_gammaz[offset];
+
+      duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l;
+      duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l;
+      duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l;
+
+      duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l;
+      duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l;
+      duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l;
+
+      duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l;
+      duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l;
+      duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l;
+
+      // precompute some sums to save CPU time
+      duxdxl_plus_duydyl = duxdxl + duydyl;
+      duxdxl_plus_duzdzl = duxdxl + duzdzl;
+      duydyl_plus_duzdzl = duydyl + duzdzl;
+      duxdyl_plus_duydxl = duxdyl + duydxl;
+      duzdxl_plus_duxdzl = duzdxl + duxdzl;
+      duzdyl_plus_duydzl = duzdyl + duydzl;
+
+      // computes deviatoric strain attenuation and/or for kernel calculations
+      if(COMPUTE_AND_STORE_STRAIN) {
+        realw templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
+
+        // local storage: stresses at this current time step
+        epsilondev_xx_loc = duxdxl - templ;
+        epsilondev_yy_loc = duydyl - templ;
+        epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl;
+        epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl;
+        epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl;
+
+        if(SIMULATION_TYPE == 3) {
+          epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
+        }
+      }
+
+      // attenuation
+      if(ATTENUATION){
+        // use unrelaxed parameters if attenuation
+        one_minus_sum_beta_use = one_minus_sum_beta[tx+working_element*NGLL3]; // (i,j,k,ispec)
+        minus_sum_beta = one_minus_sum_beta_use - 1.0f;
+      }      
+
+      // computes stresses
+      if(ANISOTROPY){
+        // full anisotropic case, stress calculations
+        compute_element_cm_aniso(offset,
+                              d_c11store,d_c12store,d_c13store,d_c14store,d_c15store,d_c16store,d_c22store,
+                              d_c23store,d_c24store,d_c25store,d_c26store,d_c33store,d_c34store,d_c35store,
+                              d_c36store,d_c44store,d_c45store,d_c46store,d_c55store,d_c56store,d_c66store,
+                              ATTENUATION,
+                              minus_sum_beta,
+                              duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl,
+                              duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl,
+                              duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl,
+                              &sigma_xx,&sigma_yy,&sigma_zz,
+                              &sigma_xy,&sigma_xz,&sigma_yz);
+
+      }else{
+        if( ! d_ispec_is_tiso[working_element] ){
+          // isotropic case
+          compute_element_cm_iso(offset,
+                              d_kappavstore,d_muvstore,
+                              ATTENUATION,
+                              one_minus_sum_beta_use,
+                              duxdxl,duydyl,duzdzl,
+                              duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl,
+                              duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl,
+                              &sigma_xx,&sigma_yy,&sigma_zz,
+                              &sigma_xy,&sigma_xz,&sigma_yz);
+        }else{        
+          // transverse isotropy
+          compute_element_cm_tiso(offset,
+                                d_kappavstore,d_muvstore,
+                                d_kappahstore,d_muhstore,d_eta_anisostore,
+                                ATTENUATION,
+                                one_minus_sum_beta_use,
+                                duxdxl,duxdyl,duxdzl,
+                                duydxl,duydyl,duydzl,
+                                duzdxl,duzdyl,duzdzl,
+                                duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl,
+                                duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl,
+                                iglob, NGLOB,
+                                d_ystore,d_zstore,
+                                &sigma_xx,&sigma_yy,&sigma_zz,
+                                &sigma_xy,&sigma_xz,&sigma_yz);
+        }
+      } // ! end of test whether isotropic or anisotropic element
+      
+
+      if(ATTENUATION && (! USE_ATTENUATION_MIMIC ) ){
+        // subtracts memory variables if attenuation
+        compute_element_cm_att_stress(tx,working_element,
+                                      R_xx,R_yy,R_xy,R_xz,R_yz,
+                                      &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_xz,&sigma_yz);
+      }
+
+      // define symmetric components (needed for non-symmetric dot product and sigma for gravity)
+      sigma_yx = sigma_xy;
+      sigma_zx = sigma_xz;
+      sigma_zy = sigma_yz;
+
+      // jacobian
+      jacobianl = 1.0f / (xixl*(etayl*gammazl-etazl*gammayl)
+                          -xiyl*(etaxl*gammazl-etazl*gammaxl)
+                          +xizl*(etaxl*gammayl-etayl*gammaxl));
+
+      if( GRAVITY ){
+        //  computes non-symmetric terms for gravity
+        compute_element_cm_gravity(tx,working_element,
+                                   d_ibool,d_xstore,d_ystore,d_zstore,
+                                   d_minus_gravity_table,d_minus_deriv_gravity_table,d_density_table,
+                                   wgll_cube,jacobianl,
+                                   s_dummyx_loc,s_dummyy_loc,s_dummyz_loc,
+                                   &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_yx,
+                                   &sigma_xz,&sigma_zx,&sigma_yz,&sigma_zy,                                   
+                                   &rho_s_H1,&rho_s_H2,&rho_s_H3);
+      }
+
+      // form dot product with test vector, non-symmetric form
+      s_tempx1[tx] = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl);
+      s_tempy1[tx] = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl);
+      s_tempz1[tx] = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl);
+
+      s_tempx2[tx] = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl);
+      s_tempy2[tx] = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl);
+      s_tempz2[tx] = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl);
+
+      s_tempx3[tx] = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl);
+      s_tempy3[tx] = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl);
+      s_tempz3[tx] = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl);
+
+    }
+
+// synchronize all the threads (one thread for each of the NGLL grid points of the
+// current spectral element) because we need the whole element to be ready in order
+// to be able to compute the matrix products along cut planes of the 3D element below
+    __syncthreads();
+
+    if (active) {
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+
+      tempx1l = 0.f;
+      tempy1l = 0.f;
+      tempz1l = 0.f;
+
+      tempx2l = 0.f;
+      tempy2l = 0.f;
+      tempz2l = 0.f;
+
+      tempx3l = 0.f;
+      tempy3l = 0.f;
+      tempz3l = 0.f;
+
+      for (l=0;l<NGLLX;l++) {
+
+        fac1 = d_hprimewgll_xx[I*NGLLX+l];
+        offset = K*NGLL2+J*NGLLX+l;
+        tempx1l += s_tempx1[offset]*fac1;
+        tempy1l += s_tempy1[offset]*fac1;
+        tempz1l += s_tempz1[offset]*fac1;
+
+        fac2 = d_hprimewgll_yy[J*NGLLX+l];
+        offset = K*NGLL2+l*NGLLX+I;
+        tempx2l += s_tempx2[offset]*fac2;
+        tempy2l += s_tempy2[offset]*fac2;
+        tempz2l += s_tempz2[offset]*fac2;
+
+        fac3 = d_hprimewgll_zz[K*NGLLX+l];
+        offset = l*NGLL2+J*NGLLX+I;
+        tempx3l += s_tempx3[offset]*fac3;
+        tempy3l += s_tempy3[offset]*fac3;
+        tempz3l += s_tempz3[offset]*fac3;
+
+      }
+#else
+
+      tempx1l = s_tempx1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+              + s_tempx1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+              + s_tempx1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+              + s_tempx1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+              + s_tempx1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+
+      tempy1l = s_tempy1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+              + s_tempy1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+              + s_tempy1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+              + s_tempy1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+              + s_tempy1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+
+      tempz1l = s_tempz1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+              + s_tempz1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+              + s_tempz1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+              + s_tempz1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+              + s_tempz1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+
+      tempx2l = s_tempx2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
+              + s_tempx2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
+              + s_tempx2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
+              + s_tempx2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
+              + s_tempx2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
+
+      tempy2l = s_tempy2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
+              + s_tempy2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
+              + s_tempy2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
+              + s_tempy2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
+              + s_tempy2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
+
+      tempz2l = s_tempz2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
+              + s_tempz2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
+              + s_tempz2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
+              + s_tempz2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
+              + s_tempz2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
+
+      tempx3l = s_tempx3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
+              + s_tempx3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
+              + s_tempx3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
+              + s_tempx3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
+              + s_tempx3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
+
+      tempy3l = s_tempy3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
+              + s_tempy3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
+              + s_tempy3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
+              + s_tempy3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
+              + s_tempy3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
+
+      tempz3l = s_tempz3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
+              + s_tempz3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
+              + s_tempz3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
+              + s_tempz3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
+              + s_tempz3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
+
+#endif
+
+      fac1 = d_wgllwgll_yz[K*NGLLX+J];
+      fac2 = d_wgllwgll_xz[K*NGLLX+I];
+      fac3 = d_wgllwgll_xy[J*NGLLX+I];
+
+      sum_terms1 = - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
+      sum_terms2 = - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
+      sum_terms3 = - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
+
+      // adds gravity term
+      if( GRAVITY ){
+        sum_terms1 += rho_s_H1;
+        sum_terms2 += rho_s_H2;
+        sum_terms3 += rho_s_H3;
+      }
+
+#ifdef USE_TEXTURES
+      d_accel[iglob] = tex1Dfetch(tex_accel, iglob) + sum_terms1);
+      d_accel[iglob + NGLOB] = tex1Dfetch(tex_accel, iglob + NGLOB) + sum_terms2);
+      d_accel[iglob + 2*NGLOB] = tex1Dfetch(tex_accel, iglob + 2*NGLOB) + sum_terms3);
+#else
+  /* OLD/To be implemented version that uses coloring to get around race condition. About 1.6x faster */
+
+
+#ifdef USE_MESH_COLORING_GPU
+      // no atomic operation needed, colors don't share global points between elements
+      d_accel[iglob*3]     += sum_terms1;
+      d_accel[iglob*3 + 1] += sum_terms2;
+      d_accel[iglob*3 + 2] += sum_terms3;
+#else
+      //mesh coloring
+      if( use_mesh_coloring_gpu ){
+
+       // no atomic operation needed, colors don't share global points between elements
+        d_accel[iglob*3]     += sum_terms1;
+        d_accel[iglob*3 + 1] += sum_terms2;
+        d_accel[iglob*3 + 2] += sum_terms3;
+
+      }else{
+
+        // for testing purposes only: w/out atomic updates
+        //d_accel[iglob*3] -= (0.00000001f*tempx1l + 0.00000001f*tempx2l + 0.00000001f*tempx3l);
+        //d_accel[iglob*3 + 1] -= (0.00000001f*tempy1l + 0.00000001f*tempy2l + 0.00000001f*tempy3l);
+        //d_accel[iglob*3 + 2] -= (0.00000001f*tempz1l + 0.00000001f*tempz2l + 0.00000001f*tempz3l);
+
+        atomicAdd(&d_accel[iglob*3], sum_terms1);
+        atomicAdd(&d_accel[iglob*3+1], sum_terms2);
+        atomicAdd(&d_accel[iglob*3+2], sum_terms3);
+
+      }
+#endif
+
+#endif
+
+      // update memory variables based upon the Runge-Kutta scheme
+      if( ATTENUATION && ( ! USE_ATTENUATION_MIMIC ) ){
+        compute_element_cm_att_memory(tx,working_element,
+                                  d_muvstore,
+                                  factor_common,alphaval,betaval,gammaval,
+                                  R_xx,R_yy,R_xy,R_xz,R_yz,
+                                  epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz,
+                                  epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc,
+                                  ANISOTROPY,d_c44store);
+      }
+
+      // save deviatoric strain for Runge-Kutta scheme
+      if( COMPUTE_AND_STORE_STRAIN ){
+        int ijk_ispec = tx + working_element*NGLL3;
+
+        // fortran: epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+        epsilondev_xx[ijk_ispec] = epsilondev_xx_loc;
+        epsilondev_yy[ijk_ispec] = epsilondev_yy_loc;
+        epsilondev_xy[ijk_ispec] = epsilondev_xy_loc;
+        epsilondev_xz[ijk_ispec] = epsilondev_xz_loc;
+        epsilondev_yz[ijk_ispec] = epsilondev_yz_loc;
+      }
+
+    }
+
+#else  // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
+    d_accel[iglob] -= 0.00000001f;
+    d_accel[iglob + NGLOB] -= 0.00000001f;
+    d_accel[iglob + 2*NGLOB] -= 0.00000001f;
+#endif // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Host-side launcher for the crust/mantle elastic-force kernel ("Kernel 2").
+// Sets up the CUDA execution configuration (one thread block per spectral
+// element, NGLL3_PADDED threads per block) and launches
+// Kernel_2_crust_mantle_impl on the forward wavefield; for adjoint/kernel
+// simulations (mp->simulation_type == 3) it launches the kernel a second
+// time on the backward/reconstructed wavefield (the d_b_* arrays and the
+// mp->d_b_* time-scheme coefficients).
+// NOTE(review): the kernel arguments are purely positional and must match
+// the __global__ signature exactly; keep both launch sites in sync.
+void Kernel_2_crust_mantle(int nb_blocks_to_compute,Mesh* mp,
+                          int d_iphase,
+                          int* d_ibool,
+                          int* d_ispec_is_tiso,
+                          realw* d_xix,realw* d_xiy,realw* d_xiz,
+                          realw* d_etax,realw* d_etay,realw* d_etaz,
+                          realw* d_gammax,realw* d_gammay,realw* d_gammaz,
+                          realw* d_kappavstore,realw* d_muvstore,
+                          realw* d_kappahstore,realw* d_muhstore,
+                          realw* d_eta_anisostore,
+                          realw* d_epsilondev_xx,
+                          realw* d_epsilondev_yy,
+                          realw* d_epsilondev_xy,
+                          realw* d_epsilondev_xz,
+                          realw* d_epsilondev_yz,
+                          realw* d_epsilon_trace_over_3,
+                          realw* d_one_minus_sum_beta,
+                          realw* d_factor_common,
+                          realw* d_R_xx,
+                          realw* d_R_yy,
+                          realw* d_R_xy,
+                          realw* d_R_xz,
+                          realw* d_R_yz,
+                          realw* d_b_epsilondev_xx,
+                          realw* d_b_epsilondev_yy,
+                          realw* d_b_epsilondev_xy,
+                          realw* d_b_epsilondev_xz,
+                          realw* d_b_epsilondev_yz,
+                          realw* d_b_epsilon_trace_over_3,
+                          realw* d_b_R_xx,
+                          realw* d_b_R_yy,
+                          realw* d_b_R_xy,
+                          realw* d_b_R_xz,
+                          realw* d_b_R_yz,
+                          realw* d_c11store,realw* d_c12store,realw* d_c13store,
+                          realw* d_c14store,realw* d_c15store,realw* d_c16store,
+                          realw* d_c22store,realw* d_c23store,realw* d_c24store,
+                          realw* d_c25store,realw* d_c26store,realw* d_c33store,
+                          realw* d_c34store,realw* d_c35store,realw* d_c36store,
+                          realw* d_c44store,realw* d_c45store,realw* d_c46store,
+                          realw* d_c55store,realw* d_c56store,realw* d_c66store
+                          ){
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("before kernel Kernel_2_crust_mantle");
+#endif
+
+  /* if the grid can handle the number of blocks, we let it be 1D */
+  /* grid_2_x = nb_elem_color; */
+  /* nb_elem_color is just how many blocks we are computing now */
+
+  // folds the element count into a 2D grid: gridDim.x is limited to 65535
+  // (pre-Fermi/compute capability < 3.0 limit), so halve x and double y until
+  // x fits; the kernel recovers the linear block index as
+  // bx = blockIdx.y*gridDim.x + blockIdx.x
+  int num_blocks_x = nb_blocks_to_compute;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  // one thread per padded GLL point: NGLL3 = 125 active threads plus ghost
+  // threads up to NGLL3_PADDED = 128 for coalesced/aligned memory accesses
+  int blocksize = NGLL3_PADDED;
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  // Cuda timing
+  // cudaEvent_t start, stop;
+  // realw time;
+  // cudaEventCreate(&start);
+  // cudaEventCreate(&stop);
+  // cudaEventRecord( start, 0 );
+
+  // forward wavefield (displ/accel); launch is asynchronous with host code
+  Kernel_2_crust_mantle_impl<<<grid,threads>>>(nb_blocks_to_compute,
+                                  mp->NGLOB_CRUST_MANTLE,
+                                  d_ibool,
+                                  d_ispec_is_tiso,
+                                  mp->d_phase_ispec_inner_crust_mantle,
+                                  mp->num_phase_ispec_crust_mantle,
+                                  d_iphase,
+                                  mp->use_mesh_coloring_gpu,
+                                  mp->d_displ_crust_mantle,
+                                  mp->d_accel_crust_mantle,
+                                  d_xix, d_xiy, d_xiz,
+                                  d_etax, d_etay, d_etaz,
+                                  d_gammax, d_gammay, d_gammaz,
+                                  d_kappavstore, d_muvstore,
+                                  d_kappahstore, d_muhstore,
+                                  d_eta_anisostore,
+                                  mp->compute_and_store_strain,
+                                  d_epsilondev_xx,d_epsilondev_yy,d_epsilondev_xy,
+                                  d_epsilondev_xz,d_epsilondev_yz,
+                                  d_epsilon_trace_over_3,
+                                  mp->simulation_type,
+                                  mp->attenuation,
+                                  mp->use_attenuation_mimic,
+                                  d_one_minus_sum_beta,d_factor_common,
+                                  d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
+                                  mp->d_alphaval,mp->d_betaval,mp->d_gammaval,
+                                  mp->anisotropic_3D_mantle,
+                                  d_c11store,d_c12store,d_c13store,
+                                  d_c14store,d_c15store,d_c16store,
+                                  d_c22store,d_c23store,d_c24store,
+                                  d_c25store,d_c26store,d_c33store,
+                                  d_c34store,d_c35store,d_c36store,
+                                  d_c44store,d_c45store,d_c46store,
+                                  d_c55store,d_c56store,d_c66store,
+                                  mp->gravity,
+                                  mp->d_xstore_crust_mantle,mp->d_ystore_crust_mantle,mp->d_zstore_crust_mantle,
+                                  mp->d_minus_gravity_table,
+                                  mp->d_minus_deriv_gravity_table,
+                                  mp->d_density_table,
+                                  mp->d_wgll_cube);
+
+
+  // adjoint/kernel simulations: same kernel on the backward/reconstructed
+  // wavefield (b_displ/b_accel, d_b_* strain and memory-variable arrays,
+  // and the backward time-scheme coefficients d_b_alphaval/betaval/gammaval)
+  if(mp->simulation_type == 3) {
+    Kernel_2_crust_mantle_impl<<< grid,threads>>>(nb_blocks_to_compute,
+                                     mp->NGLOB_CRUST_MANTLE,
+                                     d_ibool,
+                                     d_ispec_is_tiso,
+                                     mp->d_phase_ispec_inner_crust_mantle,
+                                     mp->num_phase_ispec_crust_mantle,
+                                     d_iphase,
+                                     mp->use_mesh_coloring_gpu,
+                                     mp->d_b_displ_crust_mantle,
+                                     mp->d_b_accel_crust_mantle,
+                                     d_xix, d_xiy, d_xiz,
+                                     d_etax, d_etay, d_etaz,
+                                     d_gammax, d_gammay, d_gammaz,
+                                     d_kappavstore, d_muvstore,
+                                     d_kappahstore, d_muhstore,
+                                     d_eta_anisostore,
+                                     mp->compute_and_store_strain,
+                                     d_b_epsilondev_xx,d_b_epsilondev_yy,d_b_epsilondev_xy,
+                                     d_b_epsilondev_xz,d_b_epsilondev_yz,
+                                     d_b_epsilon_trace_over_3,
+                                     mp->simulation_type,
+                                     mp->attenuation,
+                                     mp->use_attenuation_mimic,
+                                     d_one_minus_sum_beta,d_factor_common,
+                                     d_b_R_xx,d_b_R_yy,d_b_R_xy,d_b_R_xz,d_b_R_yz,
+                                     mp->d_b_alphaval,mp->d_b_betaval,mp->d_b_gammaval,
+                                     mp->anisotropic_3D_mantle,
+                                     d_c11store,d_c12store,d_c13store,
+                                     d_c14store,d_c15store,d_c16store,
+                                     d_c22store,d_c23store,d_c24store,
+                                     d_c25store,d_c26store,d_c33store,
+                                     d_c34store,d_c35store,d_c36store,
+                                     d_c44store,d_c45store,d_c46store,
+                                     d_c55store,d_c56store,d_c66store,
+                                     mp->gravity,
+                                     mp->d_xstore_crust_mantle,mp->d_ystore_crust_mantle,mp->d_zstore_crust_mantle,
+                                     mp->d_minus_gravity_table,
+                                     mp->d_minus_deriv_gravity_table,
+                                     mp->d_density_table,
+                                     mp->d_wgll_cube);
+  }
+
+  // cudaEventRecord( stop, 0 );
+  // cudaEventSynchronize( stop );
+  // cudaEventElapsedTime( &time, start, stop );
+  // cudaEventDestroy( start );
+  // cudaEventDestroy( stop );
+  // printf("Kernel2 Execution Time: %f ms\n",time);
+
+  /* cudaThreadSynchronize(); */
+  /* LOG("Kernel 2 finished"); */
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  // synchronizes and aborts on any launch/runtime error (debugging aid only)
+  exit_on_cuda_error("Kernel_2_crust_mantle");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+// Fortran-callable wrapper (FC_FUNC_ handles name mangling) that launches the
+// crust/mantle "Kernel 2" (stiffness/force computation) on the GPU.
+//
+// Arguments:
+//   Mesh_pointer_f - Fortran integer holding the address of the C Mesh struct
+//   iphase         - 1 = outer elements (on MPI boundaries), else inner elements
+//
+// When mesh coloring is enabled the elements are processed one color at a time,
+// with running offsets into the sorted device arrays; otherwise a single launch
+// covers all elements of the phase and the kernel uses atomic updates.
+extern "C"
+void FC_FUNC_(compute_forces_crust_mantle_cuda,
+              COMPUTE_FORCES_CRUST_MANTLE_CUDA)(long* Mesh_pointer_f,
+                                                int* iphase) {
+
+  TRACE("compute_forces_crust_mantle_cuda");
+
+//daniel: debug time
+//  printf("Running compute_forces\n");
+//  double start_time = get_time();
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
+  int num_elements;
+
+  // phase 1 computes the outer elements first so their results can be sent
+  // over MPI while the inner elements are still being computed
+  if( *iphase == 1 )
+    num_elements = mp->nspec_outer_crust_mantle;
+  else
+    num_elements = mp->nspec_inner_crust_mantle;
+
+  // checks if anything to do
+  if( num_elements == 0 ) return;
+
+  // mesh coloring
+  if( mp->use_mesh_coloring_gpu ){
+
+    // note: array offsets require sorted arrays, such that e.g. ibool starts with elastic elements
+    //         and followed by acoustic ones.
+    //         elastic elements also start with outer then inner element ordering
+
+    int nb_colors,nb_blocks_to_compute;
+    int istart;
+    // separate running offsets because the arrays use different per-element
+    // strides: NGLL3_PADDED (padded), NGLL3 (non-padded), NGLL3*N_SLS
+    // (attenuation factor_common), and 1 (per-ispec flags)
+    int color_offset,color_offset_nonpadded,color_offset_nonpadded_att2;
+    int color_offset_ispec;
+
+    // sets up color loop
+    if( *iphase == 1 ){
+      // outer elements
+      nb_colors = mp->num_colors_outer_crust_mantle;
+      istart = 0;
+
+      // array offsets
+      color_offset = 0;
+      color_offset_nonpadded = 0;
+      color_offset_nonpadded_att2 = 0;
+      color_offset_ispec = 0;
+    }else{
+      // inner elements (start after outer elements)
+      nb_colors = mp->num_colors_outer_crust_mantle + mp->num_colors_inner_crust_mantle;
+      istart = mp->num_colors_outer_crust_mantle;
+
+      // array offsets
+      color_offset = (mp->nspec_outer_crust_mantle) * NGLL3_PADDED;
+      color_offset_nonpadded = (mp->nspec_outer_crust_mantle) * NGLL3;
+      color_offset_nonpadded_att2 = (mp->nspec_outer_crust_mantle) * NGLL3 * N_SLS;
+      color_offset_ispec = mp->nspec_outer_crust_mantle;
+    }
+
+    // loops over colors
+    for(int icolor = istart; icolor < nb_colors; icolor++){
+
+      nb_blocks_to_compute = mp->h_num_elem_colors_crust_mantle[icolor];
+
+      // checks
+      //if( nb_blocks_to_compute <= 0 ){
+      //  printf("error number of elastic color blocks: %d -- color = %d \n",nb_blocks_to_compute,icolor);
+      //  exit(EXIT_FAILURE);
+      //}
+
+      // launches the kernel on this color's slice of each device array
+      Kernel_2_crust_mantle(nb_blocks_to_compute,mp,
+                            *iphase,
+                            mp->d_ibool_crust_mantle + color_offset_nonpadded,
+                            mp->d_ispec_is_tiso_crust_mantle + color_offset_ispec,
+                            mp->d_xix_crust_mantle + color_offset,
+                            mp->d_xiy_crust_mantle + color_offset,
+                            mp->d_xiz_crust_mantle + color_offset,
+                            mp->d_etax_crust_mantle + color_offset,
+                            mp->d_etay_crust_mantle + color_offset,
+                            mp->d_etaz_crust_mantle + color_offset,
+                            mp->d_gammax_crust_mantle + color_offset,
+                            mp->d_gammay_crust_mantle + color_offset,
+                            mp->d_gammaz_crust_mantle + color_offset,
+                            mp->d_kappavstore_crust_mantle + color_offset,
+                            mp->d_muvstore_crust_mantle + color_offset,
+                            mp->d_kappahstore_crust_mantle + color_offset,
+                            mp->d_muhstore_crust_mantle + color_offset,
+                            mp->d_eta_anisostore_crust_mantle + color_offset,
+                            mp->d_epsilondev_xx_crust_mantle + color_offset_nonpadded,
+                            mp->d_epsilondev_yy_crust_mantle + color_offset_nonpadded,
+                            mp->d_epsilondev_xy_crust_mantle + color_offset_nonpadded,
+                            mp->d_epsilondev_xz_crust_mantle + color_offset_nonpadded,
+                            mp->d_epsilondev_yz_crust_mantle + color_offset_nonpadded,
+                            mp->d_eps_trace_over_3_crust_mantle + color_offset_nonpadded,
+                            mp->d_one_minus_sum_beta_crust_mantle + color_offset_nonpadded,
+                            mp->d_factor_common_crust_mantle + color_offset_nonpadded_att2,
+                            mp->d_R_xx_crust_mantle + color_offset_nonpadded,
+                            mp->d_R_yy_crust_mantle + color_offset_nonpadded,
+                            mp->d_R_xy_crust_mantle + color_offset_nonpadded,
+                            mp->d_R_xz_crust_mantle + color_offset_nonpadded,
+                            mp->d_R_yz_crust_mantle + color_offset_nonpadded,
+                            mp->d_b_epsilondev_xx_crust_mantle + color_offset_nonpadded,
+                            mp->d_b_epsilondev_yy_crust_mantle + color_offset_nonpadded,
+                            mp->d_b_epsilondev_xy_crust_mantle + color_offset_nonpadded,
+                            mp->d_b_epsilondev_xz_crust_mantle + color_offset_nonpadded,
+                            mp->d_b_epsilondev_yz_crust_mantle + color_offset_nonpadded,
+                            mp->d_b_eps_trace_over_3_crust_mantle + color_offset_nonpadded,
+                            mp->d_b_R_xx_crust_mantle + color_offset_nonpadded,
+                            mp->d_b_R_yy_crust_mantle + color_offset_nonpadded,
+                            mp->d_b_R_xy_crust_mantle + color_offset_nonpadded,
+                            mp->d_b_R_xz_crust_mantle + color_offset_nonpadded,
+                            mp->d_b_R_yz_crust_mantle + color_offset_nonpadded,
+                            mp->d_c11store_crust_mantle + color_offset,
+                            mp->d_c12store_crust_mantle + color_offset,
+                            mp->d_c13store_crust_mantle + color_offset,
+                            mp->d_c14store_crust_mantle + color_offset,
+                            mp->d_c15store_crust_mantle + color_offset,
+                            mp->d_c16store_crust_mantle + color_offset,
+                            mp->d_c22store_crust_mantle + color_offset,
+                            mp->d_c23store_crust_mantle + color_offset,
+                            mp->d_c24store_crust_mantle + color_offset,
+                            mp->d_c25store_crust_mantle + color_offset,
+                            mp->d_c26store_crust_mantle + color_offset,
+                            mp->d_c33store_crust_mantle + color_offset,
+                            mp->d_c34store_crust_mantle + color_offset,
+                            mp->d_c35store_crust_mantle + color_offset,
+                            mp->d_c36store_crust_mantle + color_offset,
+                            mp->d_c44store_crust_mantle + color_offset,
+                            mp->d_c45store_crust_mantle + color_offset,
+                            mp->d_c46store_crust_mantle + color_offset,
+                            mp->d_c55store_crust_mantle + color_offset,
+                            mp->d_c56store_crust_mantle + color_offset,
+                            mp->d_c66store_crust_mantle + color_offset
+                            );
+
+      // for padded and aligned arrays
+      color_offset += nb_blocks_to_compute * NGLL3_PADDED;
+      // for non-padded arrays
+      color_offset_nonpadded += nb_blocks_to_compute * NGLL3;
+      // for factor_common array
+      color_offset_nonpadded_att2 += nb_blocks_to_compute * NGLL3 * N_SLS;
+      // for array(ispec)
+      color_offset_ispec += nb_blocks_to_compute;
+    }
+
+  }else{
+
+    // no mesh coloring: uses atomic updates
+
+    Kernel_2_crust_mantle(num_elements,mp,
+                          *iphase,
+                          mp->d_ibool_crust_mantle,
+                          mp->d_ispec_is_tiso_crust_mantle,
+                          mp->d_xix_crust_mantle,mp->d_xiy_crust_mantle,mp->d_xiz_crust_mantle,
+                          mp->d_etax_crust_mantle,mp->d_etay_crust_mantle,mp->d_etaz_crust_mantle,
+                          mp->d_gammax_crust_mantle,mp->d_gammay_crust_mantle,mp->d_gammaz_crust_mantle,
+                          mp->d_kappavstore_crust_mantle,mp->d_muvstore_crust_mantle,
+                          mp->d_kappahstore_crust_mantle,mp->d_muhstore_crust_mantle,
+                          mp->d_eta_anisostore_crust_mantle,
+                          mp->d_epsilondev_xx_crust_mantle,
+                          mp->d_epsilondev_yy_crust_mantle,
+                          mp->d_epsilondev_xy_crust_mantle,
+                          mp->d_epsilondev_xz_crust_mantle,
+                          mp->d_epsilondev_yz_crust_mantle,
+                          mp->d_eps_trace_over_3_crust_mantle,
+                          mp->d_one_minus_sum_beta_crust_mantle,
+                          mp->d_factor_common_crust_mantle,
+                          mp->d_R_xx_crust_mantle,
+                          mp->d_R_yy_crust_mantle,
+                          mp->d_R_xy_crust_mantle,
+                          mp->d_R_xz_crust_mantle,
+                          mp->d_R_yz_crust_mantle,
+                          mp->d_b_epsilondev_xx_crust_mantle,
+                          mp->d_b_epsilondev_yy_crust_mantle,
+                          mp->d_b_epsilondev_xy_crust_mantle,
+                          mp->d_b_epsilondev_xz_crust_mantle,
+                          mp->d_b_epsilondev_yz_crust_mantle,
+                          mp->d_b_eps_trace_over_3_crust_mantle,
+                          mp->d_b_R_xx_crust_mantle,
+                          mp->d_b_R_yy_crust_mantle,
+                          mp->d_b_R_xy_crust_mantle,
+                          mp->d_b_R_xz_crust_mantle,
+                          mp->d_b_R_yz_crust_mantle,
+                          mp->d_c11store_crust_mantle,mp->d_c12store_crust_mantle,mp->d_c13store_crust_mantle,
+                          mp->d_c14store_crust_mantle,mp->d_c15store_crust_mantle,mp->d_c16store_crust_mantle,
+                          mp->d_c22store_crust_mantle,mp->d_c23store_crust_mantle,mp->d_c24store_crust_mantle,
+                          mp->d_c25store_crust_mantle,mp->d_c26store_crust_mantle,mp->d_c33store_crust_mantle,
+                          mp->d_c34store_crust_mantle,mp->d_c35store_crust_mantle,mp->d_c36store_crust_mantle,
+                          mp->d_c44store_crust_mantle,mp->d_c45store_crust_mantle,mp->d_c46store_crust_mantle,
+                          mp->d_c55store_crust_mantle,mp->d_c56store_crust_mantle,mp->d_c66store_crust_mantle
+                          );
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  exit_on_cuda_error("compute_forces_crust_mantle_cuda");
+#endif
+}

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,1217 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// elemental routines
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// updates stress
+
+__device__ void compute_element_ic_att_stress(int tx,int working_element,
+                                             realw* R_xx,
+                                             realw* R_yy,
+                                             realw* R_xy,
+                                             realw* R_xz,
+                                             realw* R_yz,
+                                             reald* sigma_xx,
+                                             reald* sigma_yy,
+                                             reald* sigma_zz,
+                                             reald* sigma_xy,
+                                             reald* sigma_xz,
+                                             reald* sigma_yz) {
+
+  // subtracts the attenuation memory variables from the stress components;
+  // the zz component gains +(R_xx+R_yy) because the memory variables are
+  // trace-free (R_zz = -R_xx-R_yy is never stored)
+  //
+  // note: R_xx,.. here use (i_sls,i,j,k,ispec) ordering and not (i,j,k,ispec,i_sls)
+  //          as in the local (CPU) version, where offset_sls = tx + NGLL3*(working_element + NSPEC*i_sls)
+  int base_offset = N_SLS * (tx + NGLL3 * working_element);
+
+  for(int i_sls = 0; i_sls < N_SLS; i_sls++){
+    int offset = base_offset + i_sls;
+
+    reald rxx = R_xx[offset];
+    reald ryy = R_yy[offset];
+
+    *sigma_xx -= rxx;
+    *sigma_yy -= ryy;
+    *sigma_zz += rxx + ryy;
+    *sigma_xy -= R_xy[offset];
+    *sigma_xz -= R_xz[offset];
+    *sigma_yz -= R_yz[offset];
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// updates R_memory
+
+__device__ void compute_element_ic_att_memory(int tx,int working_element,
+                                              realw* d_muv,
+                                              realw* factor_common,
+                                              realw* alphaval,realw* betaval,realw* gammaval,
+                                              realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
+                                              realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+                                              realw* epsilondev_xz,realw* epsilondev_yz,
+                                              reald epsilondev_xx_loc,reald epsilondev_yy_loc,reald epsilondev_xy_loc,
+                                              reald epsilondev_xz_loc,reald epsilondev_yz_loc
+                                              ){
+
+  // array indexing: d_muv is padded (NGLL3_PADDED stride), the deviatoric
+  // strain arrays are non-padded (NGLL3 stride)
+  int offset_align = tx + NGLL3_PADDED * working_element;
+  int ijk_ispec = tx + NGLL3 * working_element;
+
+  reald mul = d_muv[offset_align];
+
+  // base of the (i_sls,i,j,k,ispec) ordered R_* / factor_common entries;
+  // note this differs from the CPU ordering (i,j,k,ispec,i_sls)
+  int base_offset = N_SLS * (tx + NGLL3 * working_element);
+
+  // marches the memory variables in time with the Runge-Kutta scheme:
+  //   R_new = alpha*R_old + beta*Sn + gamma*Snp1
+  // where Sn/Snp1 are the old/new deviatoric strains scaled by mu*factor_common
+  for(int i_sls = 0; i_sls < N_SLS; i_sls++){
+
+    int offset = base_offset + i_sls;
+
+    reald factor_loc = mul * factor_common[offset]; //mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
+
+    reald a_loc = alphaval[i_sls];
+    reald b_loc = betaval[i_sls];
+    reald g_loc = gammaval[i_sls];
+
+    reald Sn,Snp1;
+
+    // xx term
+    Sn   = factor_loc * epsilondev_xx[ijk_ispec]; //(i,j,k,ispec)
+    Snp1 = factor_loc * epsilondev_xx_loc; //(i,j,k)
+    R_xx[offset] = a_loc * R_xx[offset] + b_loc * Sn + g_loc * Snp1;
+
+    // yy term (zz is not marched since the deviatoric strain has zero trace)
+    Sn   = factor_loc * epsilondev_yy[ijk_ispec];
+    Snp1 = factor_loc * epsilondev_yy_loc;
+    R_yy[offset] = a_loc * R_yy[offset] + b_loc * Sn + g_loc * Snp1;
+
+    // xy term
+    Sn   = factor_loc * epsilondev_xy[ijk_ispec];
+    Snp1 = factor_loc * epsilondev_xy_loc;
+    R_xy[offset] = a_loc * R_xy[offset] + b_loc * Sn + g_loc * Snp1;
+
+    // xz term
+    Sn   = factor_loc * epsilondev_xz[ijk_ispec];
+    Snp1 = factor_loc * epsilondev_xz_loc;
+    R_xz[offset] = a_loc * R_xz[offset] + b_loc * Sn + g_loc * Snp1;
+
+    // yz term
+    Sn   = factor_loc * epsilondev_yz[ijk_ispec];
+    Snp1 = factor_loc * epsilondev_yz_loc;
+    R_yz[offset] = a_loc * R_yz[offset] + b_loc * Sn + g_loc * Snp1;
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// pre-computes gravity term
+
+// Adds the gravity contribution for one GLL point (thread tx) of an inner-core
+// element: the s.g coupling terms make the stress tensor non-symmetric (hence
+// the separate sigma_xy/sigma_yx etc. outputs), and the precomputed
+// rho_s_H1..3 vector is added later to the element force sums.
+// Gravity and density are read from 1D radial lookup tables sampled every
+// 100 m, using the point radius stored in d_xstore.
+__device__ void compute_element_ic_gravity(int tx,int working_element,
+                                           int* d_ibool,
+                                           realw* d_xstore,realw* d_ystore,realw* d_zstore,
+                                           realw* d_minus_gravity_table,
+                                           realw* d_minus_deriv_gravity_table,
+                                           realw* d_density_table,
+                                           realw* wgll_cube,
+                                           reald jacobianl,
+                                           reald* s_dummyx_loc,
+                                           reald* s_dummyy_loc,
+                                           reald* s_dummyz_loc,
+                                           reald* sigma_xx,
+                                           reald* sigma_yy,
+                                           reald* sigma_zz,
+                                           reald* sigma_xy,
+                                           reald* sigma_yx,
+                                           reald* sigma_xz,
+                                           reald* sigma_zx,
+                                           reald* sigma_yz,
+                                           reald* sigma_zy,
+                                           reald* rho_s_H1,
+                                           reald* rho_s_H2,
+                                           reald* rho_s_H3){
+
+  reald radius,theta,phi;
+  reald cos_theta,sin_theta,cos_phi,sin_phi;
+  reald minus_g,minus_dg;
+  reald rho;
+  reald gxl,gyl,gzl;
+  reald minus_g_over_radius,minus_dg_plus_g_over_radius;
+  reald cos_theta_sq,sin_theta_sq,cos_phi_sq,sin_phi_sq;
+  reald Hxxl,Hyyl,Hzzl,Hxyl,Hxzl,Hyzl;
+  reald sx_l,sy_l,sz_l;
+  reald factor;
+
+  // R_EARTH_KM is the radius of the bottom of the oceans
+  const reald R_EARTH = 6371000.0f; // in m
+  const reald R_EARTH_KM = 6371.0f; // in km
+  // uncomment line below for PREM with oceans
+  //const reald R_EARTH = 6368000.0f;
+  //const reald R_EARTH_KM = 6368.0f;
+
+  // compute non-symmetric terms for gravity
+
+  // use mesh coordinates to get theta and phi
+  // x y z contain r theta phi
+  // (note: ibool values are 1-based Fortran indices, hence the -1)
+  int iglob = d_ibool[working_element*NGLL3 + tx]-1;
+
+  // radius here is non-dimensionalized (scaled by R_EARTH) - the 100/R_EARTH
+  // floor below corresponds to 100 m
+  radius = d_xstore[iglob];
+  // make sure radius is never zero even for points at center of cube
+  // because we later divide by radius
+  if(radius < 100.f / R_EARTH){ radius = 100.f / R_EARTH; }
+
+  theta = d_ystore[iglob];
+  phi = d_zstore[iglob];
+
+  cos_theta = cos(theta);
+  sin_theta = sin(theta);
+  cos_phi = cos(phi);
+  sin_phi = sin(phi);
+
+  // for efficiency replace with lookup table every 100 m in radial direction
+  // note: radius in crust mantle should never be zero,
+  //          and arrays in C start from 0, thus we need to subtract 1
+  int int_radius = rint(radius * R_EARTH_KM * 10.0f ) - 1;
+  //make sure we never use below zero for point exactly at the center of the Earth
+  if( int_radius < 0 ){int_radius = 0;}
+
+  // get g, rho and dg/dr=dg
+  // spherical components of the gravitational acceleration
+  // for efficiency replace with lookup table every 100 m in radial direction
+  minus_g = d_minus_gravity_table[int_radius];
+  minus_dg = d_minus_deriv_gravity_table[int_radius];
+  rho = d_density_table[int_radius];
+
+  // Cartesian components of the gravitational acceleration
+  gxl = minus_g*sin_theta*cos_phi;
+  gyl = minus_g*sin_theta*sin_phi;
+  gzl = minus_g*cos_theta;
+
+  // Cartesian components of gradient of gravitational acceleration
+  // obtained from spherical components
+
+  minus_g_over_radius = minus_g / radius;
+  minus_dg_plus_g_over_radius = minus_dg - minus_g_over_radius;
+
+  cos_theta_sq = cos_theta*cos_theta;
+  sin_theta_sq = sin_theta*sin_theta;
+  cos_phi_sq = cos_phi*cos_phi;
+  sin_phi_sq = sin_phi*sin_phi;
+
+  // H tensor = gradient of g in Cartesian coordinates (symmetric)
+  Hxxl = minus_g_over_radius*(cos_phi_sq*cos_theta_sq + sin_phi_sq) + cos_phi_sq*minus_dg*sin_theta_sq;
+  Hyyl = minus_g_over_radius*(cos_phi_sq + cos_theta_sq*sin_phi_sq) + minus_dg*sin_phi_sq*sin_theta_sq;
+  Hzzl = cos_theta_sq*minus_dg + minus_g_over_radius*sin_theta_sq;
+  Hxyl = cos_phi*minus_dg_plus_g_over_radius*sin_phi*sin_theta_sq;
+  Hxzl = cos_phi*cos_theta*minus_dg_plus_g_over_radius*sin_theta;
+  Hyzl = cos_theta*minus_dg_plus_g_over_radius*sin_phi*sin_theta;
+
+  // get displacement and multiply by density to compute G tensor
+  // (s_dummy*_loc hold the element displacement copied to shared memory)
+  sx_l = rho * s_dummyx_loc[tx];
+  sy_l = rho * s_dummyy_loc[tx];
+  sz_l = rho * s_dummyz_loc[tx];
+
+  // compute G tensor from s . g and add to sigma (not symmetric)
+  *sigma_xx = *sigma_xx + sy_l*gyl + sz_l*gzl;
+  *sigma_yy = *sigma_yy + sx_l*gxl + sz_l*gzl;
+  *sigma_zz = *sigma_zz + sx_l*gxl + sy_l*gyl;
+
+  *sigma_xy = *sigma_xy - sx_l * gyl;
+  *sigma_yx = *sigma_yx - sy_l * gxl;
+
+  *sigma_xz = *sigma_xz - sx_l * gzl;
+  *sigma_zx = *sigma_zx - sz_l * gxl;
+
+  *sigma_yz = *sigma_yz - sy_l * gzl;
+  *sigma_zy = *sigma_zy - sz_l * gyl;
+
+  // precompute vector: rho * H . s, weighted by jacobian and GLL quadrature weight
+  factor = jacobianl * wgll_cube[tx];
+  *rho_s_H1 = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl);
+  *rho_s_H2 = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl);
+  *rho_s_H3 = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl);
+
+  return;
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// KERNEL 2
+//
+// for inner_core
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void Kernel_2_inner_core_impl(int nb_blocks_to_compute,
+                                        int NGLOB,
+                                        int* d_ibool,
+                                        int* d_idoubling,
+                                        int* d_phase_ispec_inner,
+                                        int num_phase_ispec,
+                                        int d_iphase,
+                                        int use_mesh_coloring_gpu,
+                                        realw* d_displ, 
+                                        realw* d_accel,
+                                        realw* d_xix, realw* d_xiy, realw* d_xiz,
+                                        realw* d_etax, realw* d_etay, realw* d_etaz,
+                                        realw* d_gammax, realw* d_gammay, realw* d_gammaz,
+                                        realw* d_hprime_xx, realw* d_hprime_yy, realw* d_hprime_zz,
+                                        realw* d_hprimewgll_xx, realw* d_hprimewgll_yy, realw* d_hprimewgll_zz,
+                                        realw* d_wgllwgll_xy,realw* d_wgllwgll_xz,realw* d_wgllwgll_yz,
+                                        realw* d_kappav,
+                                        realw* d_muv,
+                                        int COMPUTE_AND_STORE_STRAIN,
+                                        realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+                                        realw* epsilondev_xz,realw* epsilondev_yz,
+                                        realw* epsilon_trace_over_3,
+                                        int SIMULATION_TYPE,
+                                        int ATTENUATION,
+                                        int USE_ATTENUATION_MIMIC,
+                                        realw* one_minus_sum_beta,realw* factor_common,
+                                        realw* R_xx, realw* R_yy, realw* R_xy, realw* R_xz, realw* R_yz,
+                                        realw* alphaval,realw* betaval,realw* gammaval,
+                                        int ANISOTROPY,
+                                        realw* d_c11store,realw* d_c12store,realw* d_c13store,
+                                        realw* d_c33store,realw* d_c44store,
+                                        int GRAVITY,
+                                        realw* d_xstore,realw* d_ystore,realw* d_zstore,
+                                        realw* d_minus_gravity_table,
+                                        realw* d_minus_deriv_gravity_table,
+                                        realw* d_density_table,
+                                        realw* wgll_cube){
+
+  /* int bx = blockIdx.y*blockDim.x+blockIdx.x; //possible bug in original code*/
+  int bx = blockIdx.y*gridDim.x+blockIdx.x;
+  /* int bx = blockIdx.x; */
+  int tx = threadIdx.x;
+
+  //const int NGLLX = 5;
+  // const int NGLL2 = 25;
+  //const int NGLL3 = NGLL3;
+  const int NGLL3_ALIGN = NGLL3_PADDED;
+  const int IFLAG_IN_FICTITIOUS_CUBE = 11; // from constants.h
+
+  int K = (tx/NGLL2);
+  int J = ((tx-K*NGLL2)/NGLLX);
+  int I = (tx-K*NGLL2-J*NGLLX);
+
+  int active,offset;
+  int iglob = 0;
+  int working_element;
+
+  reald tempx1l,tempx2l,tempx3l,tempy1l,tempy2l,tempy3l,tempz1l,tempz2l,tempz3l;
+  reald xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
+  reald duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl;
+  reald duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl;
+  reald duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl;
+  reald fac1,fac2,fac3;
+  reald lambdal,mul,lambdalplus2mul,kappal;
+  reald mul_iso,mul_aniso;
+  reald sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz;
+  reald epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc;
+  reald c11,c12,c13,c33,c44;
+  reald sum_terms1,sum_terms2,sum_terms3;
+
+  // gravity variables
+  reald sigma_yx,sigma_zx,sigma_zy;
+  reald rho_s_H1,rho_s_H2,rho_s_H3;
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+    int l;
+    realw hp1,hp2,hp3;
+#endif
+
+    __shared__ reald s_dummyx_loc[NGLL3];
+    __shared__ reald s_dummyy_loc[NGLL3];
+    __shared__ reald s_dummyz_loc[NGLL3];
+
+    __shared__ reald s_tempx1[NGLL3];
+    __shared__ reald s_tempx2[NGLL3];
+    __shared__ reald s_tempx3[NGLL3];
+    __shared__ reald s_tempy1[NGLL3];
+    __shared__ reald s_tempy2[NGLL3];
+    __shared__ reald s_tempy3[NGLL3];
+    __shared__ reald s_tempz1[NGLL3];
+    __shared__ reald s_tempz2[NGLL3];
+    __shared__ reald s_tempz3[NGLL3];
+
+// use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads,
+// because we used memory padding from NGLL^3 = 125 to 128 to get coalescent memory accesses
+    active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0;
+
+// copy from global memory to shared memory
+// each thread writes one of the NGLL^3 = 125 data points
+    if (active) {
+
+#ifdef USE_MESH_COLORING_GPU
+      working_element = bx;
+#else
+      //mesh coloring
+      if( use_mesh_coloring_gpu ){
+        working_element = bx;
+      }else{
+        // iphase-1 and working_element-1 for Fortran->C array conventions
+        working_element = d_phase_ispec_inner[bx + num_phase_ispec*(d_iphase-1)]-1;
+      }
+#endif
+
+      // exclude fictitious elements in central cube
+      if( d_idoubling[working_element] == IFLAG_IN_FICTITIOUS_CUBE ){
+        active = 0;
+      }else{
+        // iglob = d_ibool[working_element*NGLL3_ALIGN + tx]-1;
+        iglob = d_ibool[working_element*NGLL3 + tx]-1;
+
+#ifdef USE_TEXTURES
+        s_dummyx_loc[tx] = tex1Dfetch(tex_displ, iglob);
+        s_dummyy_loc[tx] = tex1Dfetch(tex_displ, iglob + NGLOB);
+        s_dummyz_loc[tx] = tex1Dfetch(tex_displ, iglob + 2*NGLOB);
+#else
+        // changing iglob indexing to match fortran row changes fast style
+        s_dummyx_loc[tx] = d_displ[iglob*3];
+        s_dummyy_loc[tx] = d_displ[iglob*3 + 1];
+        s_dummyz_loc[tx] = d_displ[iglob*3 + 2];
+#endif
+      }
+    }
+
+// synchronize all the threads (one thread for each of the NGLL grid points of the
+// current spectral element) because we need the whole element to be ready in order
+// to be able to compute the matrix products along cut planes of the 3D element below
+    __syncthreads();
+
+#ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
+
+    if (active) {
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+
+      tempx1l = 0.f;
+      tempx2l = 0.f;
+      tempx3l = 0.f;
+
+      tempy1l = 0.f;
+      tempy2l = 0.f;
+      tempy3l = 0.f;
+
+      tempz1l = 0.f;
+      tempz2l = 0.f;
+      tempz3l = 0.f;
+
+      for (l=0;l<NGLLX;l++) {
+          hp1 = d_hprime_xx[l*NGLLX+I];
+          offset = K*NGLL2+J*NGLLX+l;
+          tempx1l += s_dummyx_loc[offset]*hp1;
+          tempy1l += s_dummyy_loc[offset]*hp1;
+          tempz1l += s_dummyz_loc[offset]*hp1;
+
+          hp2 = d_hprime_xx[l*NGLLX+J];
+          offset = K*NGLL2+l*NGLLX+I;
+          tempx2l += s_dummyx_loc[offset]*hp2;
+          tempy2l += s_dummyy_loc[offset]*hp2;
+          tempz2l += s_dummyz_loc[offset]*hp2;
+
+          hp3 = d_hprime_xx[l*NGLLX+K];
+          offset = l*NGLL2+J*NGLLX+I;
+          tempx3l += s_dummyx_loc[offset]*hp3;
+          tempy3l += s_dummyy_loc[offset]*hp3;
+          tempz3l += s_dummyz_loc[offset]*hp3;
+
+      }
+#else
+
+      tempx1l = s_dummyx_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+              + s_dummyx_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+              + s_dummyx_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+              + s_dummyx_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+              + s_dummyx_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+      tempy1l = s_dummyy_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+              + s_dummyy_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+              + s_dummyy_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+              + s_dummyy_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+              + s_dummyy_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+      tempz1l = s_dummyz_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+              + s_dummyz_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+              + s_dummyz_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+              + s_dummyz_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+              + s_dummyz_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+      tempx2l = s_dummyx_loc[K*NGLL2+I]*d_hprime_xx[J]
+              + s_dummyx_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+              + s_dummyx_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+              + s_dummyx_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+              + s_dummyx_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+      tempy2l = s_dummyy_loc[K*NGLL2+I]*d_hprime_xx[J]
+              + s_dummyy_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+              + s_dummyy_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+              + s_dummyy_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+              + s_dummyy_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+      tempz2l = s_dummyz_loc[K*NGLL2+I]*d_hprime_xx[J]
+              + s_dummyz_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+              + s_dummyz_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+              + s_dummyz_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+              + s_dummyz_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+      tempx3l = s_dummyx_loc[J*NGLLX+I]*d_hprime_xx[K]
+              + s_dummyx_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+              + s_dummyx_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+              + s_dummyx_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+              + s_dummyx_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+      tempy3l = s_dummyy_loc[J*NGLLX+I]*d_hprime_xx[K]
+              + s_dummyy_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+              + s_dummyy_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+              + s_dummyy_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+              + s_dummyy_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+      tempz3l = s_dummyz_loc[J*NGLLX+I]*d_hprime_xx[K]
+              + s_dummyz_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+              + s_dummyz_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+              + s_dummyz_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+              + s_dummyz_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+#endif
+
+// compute derivatives of ux, uy and uz with respect to x, y and z
+      offset = working_element*NGLL3_ALIGN + tx;
+
+      xixl = d_xix[offset];
+      xiyl = d_xiy[offset];
+      xizl = d_xiz[offset];
+      etaxl = d_etax[offset];
+      etayl = d_etay[offset];
+      etazl = d_etaz[offset];
+      gammaxl = d_gammax[offset];
+      gammayl = d_gammay[offset];
+      gammazl = d_gammaz[offset];
+
+      duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l;
+      duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l;
+      duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l;
+
+      duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l;
+      duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l;
+      duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l;
+
+      duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l;
+      duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l;
+      duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l;
+
+      // precompute some sums to save CPU time
+      duxdxl_plus_duydyl = duxdxl + duydyl;
+      duxdxl_plus_duzdzl = duxdxl + duzdzl;
+      duydyl_plus_duzdzl = duydyl + duzdzl;
+      duxdyl_plus_duydxl = duxdyl + duydxl;
+      duzdxl_plus_duxdzl = duzdxl + duxdzl;
+      duzdyl_plus_duydzl = duzdyl + duydzl;
+
+      // computes deviatoric strain attenuation and/or for kernel calculations
+      if(COMPUTE_AND_STORE_STRAIN) {
+        realw templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
+
+        // local storage: stresses at this current time step
+        epsilondev_xx_loc = duxdxl - templ;
+        epsilondev_yy_loc = duydyl - templ;
+        epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl;
+        epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl;
+        epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl;
+
+        if(SIMULATION_TYPE == 3) {
+          epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
+        }
+      }
+
+      // compute elements with an elastic isotropic rheology
+      kappal = d_kappav[offset];
+      mul = d_muv[offset];
+
+      // attenuation
+      if(ATTENUATION){
+        // use unrelaxed parameters if attenuation
+        mul_iso  = mul * one_minus_sum_beta[tx+working_element*NGLL3]; // (i,j,k,ispec)
+        mul_aniso = mul *( one_minus_sum_beta[tx+working_element*NGLL3] - 1.0f );
+      }else{
+        mul_iso = mul;
+      }
+
+      // full anisotropic case, stress calculations
+      if(ANISOTROPY){
+
+        // elastic tensor for hexagonal symmetry in reduced notation:
+        //
+        //      c11 c12 c13  0   0        0
+        //      c12 c11 c13  0   0        0
+        //      c13 c13 c33  0   0        0
+        //       0   0   0  c44  0        0
+        //       0   0   0   0  c44       0
+        //       0   0   0   0   0  (c11-c12)/2
+        //
+        //       in terms of the A, C, L, N and F of Love (1927):
+        //
+        //       c11 = A
+        //       c12 = A-2N
+        //       c13 = F
+        //       c33 = C
+        //       c44 = L
+
+        c11 = d_c11store[offset];
+        c12 = d_c12store[offset];
+        c13 = d_c13store[offset];
+        c33 = d_c33store[offset];
+        c44 = d_c44store[offset];
+
+        // use unrelaxed parameters if attenuation
+        if( ATTENUATION){
+          c11 = c11 + 1.33333333333333333333f * mul_aniso; // FOUR_THIRDS = 1.33333
+          c12 = c12 - 0.66666666666666666666f * mul_aniso; // TWO_THIRDS = 0.66666666666666666666f
+          c13 = c13 - 0.66666666666666666666f * mul_aniso;
+          c33 = c33 + 1.33333333333333333333f * mul_aniso;
+          c44 = c44 + mul_aniso;
+        }
+
+        sigma_xx = c11*duxdxl + c12*duydyl + c13*duzdzl;
+        sigma_yy = c12*duxdxl + c11*duydyl + c13*duzdzl;
+        sigma_zz = c13*duxdxl + c13*duydyl + c33*duzdzl;
+        sigma_xy = 0.5f*(c11-c12)*duxdyl_plus_duydxl;
+        sigma_xz = c44*duzdxl_plus_duxdzl;
+        sigma_yz = c44*duzdyl_plus_duydzl;
+
+      }else{
+
+        // isotropic case
+
+        lambdalplus2mul = kappal + 1.33333333333333333333f * mul_iso;  // 4./3. = 1.3333333
+        lambdal = lambdalplus2mul - 2.0f * mul_iso;
+
+        // compute the six components of the stress tensor sigma
+        sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl;
+        sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl;
+        sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl;
+
+        sigma_xy = mul*duxdyl_plus_duydxl;
+        sigma_xz = mul*duzdxl_plus_duxdzl;
+        sigma_yz = mul*duzdyl_plus_duydzl;
+      }
+
+      if(ATTENUATION && ( ! USE_ATTENUATION_MIMIC ) ){
+        // subtracts memory variables if attenuation
+        compute_element_ic_att_stress(tx,working_element,
+                                      R_xx,R_yy,R_xy,R_xz,R_yz,
+                                      &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_xz,&sigma_yz);
+      }
+
+      // define symmetric components (needed for non-symmetric dot product and sigma for gravity)
+      sigma_yx = sigma_xy;
+      sigma_zx = sigma_xz;
+      sigma_zy = sigma_yz;
+
+      // jacobian
+      jacobianl = 1.0f / (xixl*(etayl*gammazl-etazl*gammayl)
+                          -xiyl*(etaxl*gammazl-etazl*gammaxl)
+                          +xizl*(etaxl*gammayl-etayl*gammaxl));
+
+      if( GRAVITY ){
+        //  computes non-symmetric terms for gravity
+        compute_element_ic_gravity(tx,working_element,
+                                   d_ibool,d_xstore,d_ystore,d_zstore,
+                                   d_minus_gravity_table,d_minus_deriv_gravity_table,d_density_table,
+                                   wgll_cube,jacobianl,
+                                   s_dummyx_loc,s_dummyy_loc,s_dummyz_loc,
+                                   &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_yx,
+                                   &sigma_xz,&sigma_zx,&sigma_yz,&sigma_zy,                                   
+                                   &rho_s_H1,&rho_s_H2,&rho_s_H3);
+      }
+
+      // form dot product with test vector, non-symmetric form
+      s_tempx1[tx] = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl);
+      s_tempy1[tx] = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl);
+      s_tempz1[tx] = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl);
+
+      s_tempx2[tx] = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl);
+      s_tempy2[tx] = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl);
+      s_tempz2[tx] = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl);
+
+      s_tempx3[tx] = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl);
+      s_tempy3[tx] = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl);
+      s_tempz3[tx] = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl);
+
+    }
+
+// synchronize all the threads (one thread for each of the NGLL grid points of the
+// current spectral element) because we need the whole element to be ready in order
+// to be able to compute the matrix products along cut planes of the 3D element below
+    __syncthreads();
+
+    if (active) {
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+
+      tempx1l = 0.f;
+      tempy1l = 0.f;
+      tempz1l = 0.f;
+
+      tempx2l = 0.f;
+      tempy2l = 0.f;
+      tempz2l = 0.f;
+
+      tempx3l = 0.f;
+      tempy3l = 0.f;
+      tempz3l = 0.f;
+
+      for (l=0;l<NGLLX;l++) {
+
+        fac1 = d_hprimewgll_xx[I*NGLLX+l];
+        offset = K*NGLL2+J*NGLLX+l;
+        tempx1l += s_tempx1[offset]*fac1;
+        tempy1l += s_tempy1[offset]*fac1;
+        tempz1l += s_tempz1[offset]*fac1;
+
+        fac2 = d_hprimewgll_yy[J*NGLLX+l];
+        offset = K*NGLL2+l*NGLLX+I;
+        tempx2l += s_tempx2[offset]*fac2;
+        tempy2l += s_tempy2[offset]*fac2;
+        tempz2l += s_tempz2[offset]*fac2;
+
+        fac3 = d_hprimewgll_zz[K*NGLLX+l];
+        offset = l*NGLL2+J*NGLLX+I;
+        tempx3l += s_tempx3[offset]*fac3;
+        tempy3l += s_tempy3[offset]*fac3;
+        tempz3l += s_tempz3[offset]*fac3;
+
+      }
+#else
+
+      tempx1l = s_tempx1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+              + s_tempx1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+              + s_tempx1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+              + s_tempx1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+              + s_tempx1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+
+      tempy1l = s_tempy1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+              + s_tempy1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+              + s_tempy1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+              + s_tempy1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+              + s_tempy1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+
+      tempz1l = s_tempz1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+              + s_tempz1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+              + s_tempz1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+              + s_tempz1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+              + s_tempz1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+
+      tempx2l = s_tempx2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
+              + s_tempx2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
+              + s_tempx2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
+              + s_tempx2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
+              + s_tempx2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
+
+      tempy2l = s_tempy2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
+              + s_tempy2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
+              + s_tempy2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
+              + s_tempy2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
+              + s_tempy2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
+
+      tempz2l = s_tempz2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
+              + s_tempz2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
+              + s_tempz2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
+              + s_tempz2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
+              + s_tempz2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
+
+      tempx3l = s_tempx3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
+              + s_tempx3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
+              + s_tempx3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
+              + s_tempx3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
+              + s_tempx3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
+
+      tempy3l = s_tempy3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
+              + s_tempy3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
+              + s_tempy3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
+              + s_tempy3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
+              + s_tempy3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
+
+      tempz3l = s_tempz3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
+              + s_tempz3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
+              + s_tempz3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
+              + s_tempz3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
+              + s_tempz3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
+
+#endif
+
+      fac1 = d_wgllwgll_yz[K*NGLLX+J];
+      fac2 = d_wgllwgll_xz[K*NGLLX+I];
+      fac3 = d_wgllwgll_xy[J*NGLLX+I];
+
+      sum_terms1 = - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
+      sum_terms2 = - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
+      sum_terms3 = - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
+
+      // adds gravity term
+      if( GRAVITY ){
+        sum_terms1 += rho_s_H1;
+        sum_terms2 += rho_s_H2;
+        sum_terms3 += rho_s_H3;
+      }
+
+#ifdef USE_TEXTURES
+      d_accel[iglob] = tex1Dfetch(tex_accel, iglob) + sum_terms1);
+      d_accel[iglob + NGLOB] = tex1Dfetch(tex_accel, iglob + NGLOB) + sum_terms2);
+      d_accel[iglob + 2*NGLOB] = tex1Dfetch(tex_accel, iglob + 2*NGLOB) + sum_terms3);
+#else
+  /* OLD/To be implemented version that uses coloring to get around race condition. About 1.6x faster */
+
+
+#ifdef USE_MESH_COLORING_GPU
+      // no atomic operation needed, colors don't share global points between elements
+      d_accel[iglob*3]     += sum_terms1;
+      d_accel[iglob*3 + 1] += sum_terms2;
+      d_accel[iglob*3 + 2] += sum_terms3;
+#else
+      //mesh coloring
+      if( use_mesh_coloring_gpu ){
+
+       // no atomic operation needed, colors don't share global points between elements
+        d_accel[iglob*3]     += sum_terms1;
+        d_accel[iglob*3 + 1] += sum_terms2;
+        d_accel[iglob*3 + 2] += sum_terms3;
+
+      }else{
+
+        // for testing purposes only: w/out atomic updates
+        //d_accel[iglob*3] -= (0.00000001f*tempx1l + 0.00000001f*tempx2l + 0.00000001f*tempx3l);
+        //d_accel[iglob*3 + 1] -= (0.00000001f*tempy1l + 0.00000001f*tempy2l + 0.00000001f*tempy3l);
+        //d_accel[iglob*3 + 2] -= (0.00000001f*tempz1l + 0.00000001f*tempz2l + 0.00000001f*tempz3l);
+
+        atomicAdd(&d_accel[iglob*3], sum_terms1);
+        atomicAdd(&d_accel[iglob*3+1], sum_terms2);
+        atomicAdd(&d_accel[iglob*3+2], sum_terms3);
+
+      }
+#endif
+
+#endif
+
+      // update memory variables based upon the Runge-Kutta scheme
+      if( ATTENUATION && ! USE_ATTENUATION_MIMIC ){
+        compute_element_ic_att_memory(tx,working_element,
+                                  d_muv,
+                                  factor_common,alphaval,betaval,gammaval,
+                                  R_xx,R_yy,R_xy,R_xz,R_yz,
+                                  epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz,
+                                  epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc);
+      }
+
+      // save deviatoric strain for Runge-Kutta scheme
+      if( COMPUTE_AND_STORE_STRAIN ){
+        int ijk_ispec = tx + working_element*NGLL3;
+
+        // fortran: epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+        epsilondev_xx[ijk_ispec] = epsilondev_xx_loc;
+        epsilondev_yy[ijk_ispec] = epsilondev_yy_loc;
+        epsilondev_xy[ijk_ispec] = epsilondev_xy_loc;
+        epsilondev_xz[ijk_ispec] = epsilondev_xz_loc;
+        epsilondev_yz[ijk_ispec] = epsilondev_yz_loc;
+      }
+
+    }
+
+#else  // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
+    d_accel[iglob] -= 0.00000001f;
+    d_accel[iglob + NGLOB] -= 0.00000001f;
+    d_accel[iglob + 2*NGLOB] -= 0.00000001f;
+#endif // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Host-side launcher for the inner-core elastic force kernel (Kernel_2_inner_core_impl).
+//
+// Launches the kernel once for the forward wavefield (mp->d_displ_inner_core /
+// mp->d_accel_inner_core) and, for adjoint/kernel simulations
+// (mp->simulation_type == 3), a second time on the backward/reconstructed
+// wavefield using the corresponding b_* device arrays with identical geometry.
+//
+// nb_blocks_to_compute : number of spectral elements to process (one CUDA block each)
+// mp                   : mesh container holding the remaining device pointers and flags
+// d_* pointers         : device arrays; the caller may pass them pre-offset
+//                        (e.g. per-color offsets when mesh coloring is used)
+void Kernel_2_inner_core(int nb_blocks_to_compute,Mesh* mp,
+                        int d_iphase,
+                        int* d_ibool,
+                        int* d_idoubling,
+                        realw* d_xix,realw* d_xiy,realw* d_xiz,
+                        realw* d_etax,realw* d_etay,realw* d_etaz,
+                        realw* d_gammax,realw* d_gammay,realw* d_gammaz,
+                        realw* d_kappav,
+                        realw* d_muv,
+                        realw* d_epsilondev_xx,
+                        realw* d_epsilondev_yy,
+                        realw* d_epsilondev_xy,
+                        realw* d_epsilondev_xz,
+                        realw* d_epsilondev_yz,
+                        realw* d_epsilon_trace_over_3,
+                        realw* d_one_minus_sum_beta,
+                        realw* d_factor_common,
+                        realw* d_R_xx,
+                        realw* d_R_yy,
+                        realw* d_R_xy,
+                        realw* d_R_xz,
+                        realw* d_R_yz,
+                        realw* d_b_epsilondev_xx,
+                        realw* d_b_epsilondev_yy,
+                        realw* d_b_epsilondev_xy,
+                        realw* d_b_epsilondev_xz,
+                        realw* d_b_epsilondev_yz,
+                        realw* d_b_epsilon_trace_over_3,
+                        realw* d_b_R_xx,
+                        realw* d_b_R_yy,
+                        realw* d_b_R_xy,
+                        realw* d_b_R_xz,
+                        realw* d_b_R_yz,
+                        realw* d_c11store,realw* d_c12store,realw* d_c13store,
+                        realw* d_c33store,realw* d_c44store
+                        ){
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("before kernel Kernel_2_inner_core");
+#endif
+
+  /* if the grid can handle the number of blocks, we let it be 1D */
+  /* grid_2_x = nb_elem_color; */
+  /* nb_elem_color is just how many blocks we are computing now */
+
+  // splits the 1D element count into a 2D grid so that each grid dimension
+  // stays at or below 65535; the split may over-allocate blocks
+  // (num_blocks_x * num_blocks_y >= nb_blocks_to_compute) — the surplus is
+  // expected to be masked out inside the kernel (cf. its 'active' flag)
+  int num_blocks_x = nb_blocks_to_compute;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  // one thread block per spectral element; NGLL3_PADDED threads cover the
+  // element's GLL points (padded, presumably for coalesced/aligned access —
+  // matches the NGLL3_ALIGN offsets used inside the kernel)
+  int blocksize = NGLL3_PADDED;
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  // Cuda timing
+  // cudaEvent_t start, stop;
+  // realw time;
+  // cudaEventCreate(&start);
+  // cudaEventCreate(&stop);
+  // cudaEventRecord( start, 0 );
+
+  // forward wavefield launch
+  Kernel_2_inner_core_impl<<<grid,threads>>>(nb_blocks_to_compute,
+                                  mp->NGLOB_AB,
+                                  d_ibool,
+                                  d_idoubling,
+                                  mp->d_phase_ispec_inner_inner_core,
+                                  mp->num_phase_ispec_inner_core,
+                                  d_iphase,
+                                  mp->use_mesh_coloring_gpu,
+                                  mp->d_displ_inner_core,
+                                  mp->d_accel_inner_core,
+                                  d_xix, d_xiy, d_xiz,
+                                  d_etax, d_etay, d_etaz,
+                                  d_gammax, d_gammay, d_gammaz,
+                                  mp->d_hprime_xx, mp->d_hprime_yy, mp->d_hprime_zz,
+                                  mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
+                                  mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+                                  d_kappav, d_muv,
+                                  mp->compute_and_store_strain,
+                                  d_epsilondev_xx,
+                                  d_epsilondev_yy,
+                                  d_epsilondev_xy,
+                                  d_epsilondev_xz,
+                                  d_epsilondev_yz,
+                                  d_epsilon_trace_over_3,
+                                  mp->simulation_type,
+                                  mp->attenuation,
+                                  mp->use_attenuation_mimic,
+                                  d_one_minus_sum_beta,
+                                  d_factor_common,
+                                  d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
+                                  mp->d_alphaval,mp->d_betaval,mp->d_gammaval,
+                                  mp->anisotropic_inner_core,
+                                  d_c11store,d_c12store,d_c13store,
+                                  d_c33store,d_c44store,
+                                  mp->gravity,
+                                  mp->d_xstore_inner_core,mp->d_ystore_inner_core,mp->d_zstore_inner_core,
+                                  mp->d_minus_gravity_table,
+                                  mp->d_minus_deriv_gravity_table,
+                                  mp->d_density_table,
+                                  mp->d_wgll_cube);
+
+
+  // adjoint/kernel simulations: repeat the computation for the
+  // backward/reconstructed wavefield, substituting the b_* arrays
+  // (displ/accel, strains, memory variables, Runge-Kutta coefficients)
+  if(mp->simulation_type == 3) {
+    Kernel_2_inner_core_impl<<< grid,threads>>>(nb_blocks_to_compute,
+                                     mp->NGLOB_AB,
+                                     d_ibool,
+                                     d_idoubling,
+                                     mp->d_phase_ispec_inner_inner_core,
+                                     mp->num_phase_ispec_inner_core,
+                                     d_iphase,
+                                     mp->use_mesh_coloring_gpu,
+                                     mp->d_b_displ_inner_core,
+                                     mp->d_b_accel_inner_core,
+                                     d_xix, d_xiy, d_xiz,
+                                     d_etax, d_etay, d_etaz,
+                                     d_gammax, d_gammay, d_gammaz,
+                                     mp->d_hprime_xx, mp->d_hprime_yy, mp->d_hprime_zz,
+                                     mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
+                                     mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+                                     d_kappav, d_muv,
+                                     mp->compute_and_store_strain,
+                                     d_b_epsilondev_xx,
+                                     d_b_epsilondev_yy,
+                                     d_b_epsilondev_xy,
+                                     d_b_epsilondev_xz,
+                                     d_b_epsilondev_yz,
+                                     d_b_epsilon_trace_over_3,
+                                     mp->simulation_type,
+                                     mp->attenuation,
+                                     mp->use_attenuation_mimic,
+                                     d_one_minus_sum_beta,
+                                     d_factor_common,
+                                     d_b_R_xx,d_b_R_yy,d_b_R_xy,d_b_R_xz,d_b_R_yz,
+                                     mp->d_b_alphaval,mp->d_b_betaval,mp->d_b_gammaval,
+                                     mp->anisotropic_inner_core,
+                                     d_c11store,d_c12store,d_c13store,
+                                     d_c33store,d_c44store,
+                                     mp->gravity,
+                                     mp->d_xstore_inner_core,mp->d_ystore_inner_core,mp->d_zstore_inner_core,
+                                     mp->d_minus_gravity_table,
+                                     mp->d_minus_deriv_gravity_table,
+                                     mp->d_density_table,
+                                     mp->d_wgll_cube);
+  }
+
+  // cudaEventRecord( stop, 0 );
+  // cudaEventSynchronize( stop );
+  // cudaEventElapsedTime( &time, start, stop );
+  // cudaEventDestroy( start );
+  // cudaEventDestroy( stop );
+  // printf("Kernel2 Execution Time: %f ms\n",time);
+
+  /* cudaThreadSynchronize(); */
+  /* LOG("Kernel 2 finished"); */
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("Kernel_2_inner_core");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+// Fortran-callable entry point: computes the elastic forces in the inner core
+// on the GPU for the current phase of the time loop.
+//
+// Mesh_pointer_f : Fortran integer holding the address of the Mesh struct
+// iphase         : *iphase == 1 selects the "outer" elements
+//                  (nspec_outer_inner_core), any other value the "inner" ones
+//                  (nspec_inner_inner_core)
+//
+// When mesh coloring is enabled, elements are processed color by color so the
+// kernel can update d_accel without atomics; otherwise a single launch covers
+// all elements and the kernel falls back to atomic updates.
+extern "C"
+void FC_FUNC_(compute_forces_inner_core_cuda,
+              COMPUTE_FORCES_INNER_CORE_CUDA)(long* Mesh_pointer_f,
+                                                int* iphase) {
+
+  TRACE("compute_forces_inner_core_cuda");
+
+//daniel: debug
+  //printf("Running compute_forces_inner_core_cuda\n");
+  //double start_time = get_time();
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
+  int num_elements;
+
+  if( *iphase == 1 )
+    num_elements = mp->nspec_outer_inner_core;
+  else
+    num_elements = mp->nspec_inner_inner_core;
+
+  // checks if anything to do
+  if( num_elements == 0 ) return;
+
+  // mesh coloring
+  if( mp->use_mesh_coloring_gpu ){
+
+    // note: array offsets require sorted arrays, such that e.g. ibool starts with elastic elements
+    //         and followed by acoustic ones.
+    //         elastic elements also start with outer than inner element ordering
+
+    int nb_colors,nb_blocks_to_compute;
+    int istart;
+    // running offsets into the device arrays, advanced after each color:
+    // color_offset               - padded arrays (stride NGLL3_PADDED per element)
+    // color_offset_nonpadded     - non-padded arrays (stride NGLL3 per element)
+    // color_offset_nonpadded_att2 - attenuation factor_common (stride NGLL3*N_SLS)
+    // color_offset_ispec         - per-element arrays (stride 1)
+    int color_offset,color_offset_nonpadded,color_offset_nonpadded_att2;
+    int color_offset_ispec;
+
+    // sets up color loop
+    if( *iphase == 1 ){
+      // outer elements
+      nb_colors = mp->num_colors_outer_inner_core;
+      istart = 0;
+
+      // array offsets
+      color_offset = 0;
+      color_offset_nonpadded = 0;
+      color_offset_nonpadded_att2 = 0;
+      color_offset_ispec = 0;
+    }else{
+      // inner elements (start after outer elements)
+      nb_colors = mp->num_colors_outer_inner_core + mp->num_colors_inner_inner_core;
+      istart = mp->num_colors_outer_inner_core;
+
+      // array offsets
+      color_offset = (mp->nspec_outer_inner_core) * NGLL3_PADDED;
+      color_offset_nonpadded = (mp->nspec_outer_inner_core) * NGLL3;
+      color_offset_nonpadded_att2 = (mp->nspec_outer_inner_core) * NGLL3 * N_SLS;
+      color_offset_ispec = mp->nspec_outer_inner_core;
+    }
+
+    // loops over colors
+    for(int icolor = istart; icolor < nb_colors; icolor++){
+
+      // number of elements in this color; all of them can be processed
+      // concurrently without atomics since same-color elements share no global points
+      nb_blocks_to_compute = mp->h_num_elem_colors_inner_core[icolor];
+
+      // checks
+      //if( nb_blocks_to_compute <= 0 ){
+      //  printf("error number of elastic color blocks: %d -- color = %d \n",nb_blocks_to_compute,icolor);
+      //  exit(EXIT_FAILURE);
+      //}
+
+      // launches the kernel on this color only, passing pre-offset device pointers
+      Kernel_2_inner_core(nb_blocks_to_compute,mp,
+                          *iphase,
+                          mp->d_ibool_inner_core + color_offset_nonpadded,
+                          mp->d_idoubling_inner_core + color_offset_ispec,
+                          mp->d_xix_inner_core + color_offset,
+                          mp->d_xiy_inner_core + color_offset,
+                          mp->d_xiz_inner_core + color_offset,
+                          mp->d_etax_inner_core + color_offset,
+                          mp->d_etay_inner_core + color_offset,
+                          mp->d_etaz_inner_core + color_offset,
+                          mp->d_gammax_inner_core + color_offset,
+                          mp->d_gammay_inner_core + color_offset,
+                          mp->d_gammaz_inner_core + color_offset,
+                          mp->d_kappavstore_inner_core + color_offset,
+                          mp->d_muvstore_inner_core + color_offset,
+                          mp->d_epsilondev_xx_inner_core + color_offset_nonpadded,
+                          mp->d_epsilondev_yy_inner_core + color_offset_nonpadded,
+                          mp->d_epsilondev_xy_inner_core + color_offset_nonpadded,
+                          mp->d_epsilondev_xz_inner_core + color_offset_nonpadded,
+                          mp->d_epsilondev_yz_inner_core + color_offset_nonpadded,
+                          mp->d_eps_trace_over_3_inner_core + color_offset_nonpadded,
+                          mp->d_one_minus_sum_beta_inner_core + color_offset_nonpadded,
+                          mp->d_factor_common_inner_core + color_offset_nonpadded_att2,
+                          mp->d_R_xx_inner_core + color_offset_nonpadded,
+                          mp->d_R_yy_inner_core + color_offset_nonpadded,
+                          mp->d_R_xy_inner_core + color_offset_nonpadded,
+                          mp->d_R_xz_inner_core + color_offset_nonpadded,
+                          mp->d_R_yz_inner_core + color_offset_nonpadded,
+                          mp->d_b_epsilondev_xx_inner_core + color_offset_nonpadded,
+                          mp->d_b_epsilondev_yy_inner_core + color_offset_nonpadded,
+                          mp->d_b_epsilondev_xy_inner_core + color_offset_nonpadded,
+                          mp->d_b_epsilondev_xz_inner_core + color_offset_nonpadded,
+                          mp->d_b_epsilondev_yz_inner_core + color_offset_nonpadded,
+                          mp->d_b_eps_trace_over_3_inner_core + color_offset_nonpadded,
+                          mp->d_b_R_xx_inner_core + color_offset_nonpadded,
+                          mp->d_b_R_yy_inner_core + color_offset_nonpadded,
+                          mp->d_b_R_xy_inner_core + color_offset_nonpadded,
+                          mp->d_b_R_xz_inner_core + color_offset_nonpadded,
+                          mp->d_b_R_yz_inner_core + color_offset_nonpadded,
+                          mp->d_c11store_inner_core + color_offset,
+                          mp->d_c12store_inner_core + color_offset,
+                          mp->d_c13store_inner_core + color_offset,
+                          mp->d_c33store_inner_core + color_offset,
+                          mp->d_c44store_inner_core + color_offset
+                          );
+
+      // for padded and aligned arrays
+      color_offset += nb_blocks_to_compute * NGLL3_PADDED;
+      // for no-aligned arrays
+      color_offset_nonpadded += nb_blocks_to_compute * NGLL3;
+      // for factor_common array
+      color_offset_nonpadded_att2 += nb_blocks_to_compute * NGLL3 * N_SLS;
+      // for array(ispec)
+      color_offset_ispec += nb_blocks_to_compute;
+    }
+
+  }else{
+
+    // no mesh coloring: uses atomic updates
+
+    Kernel_2_inner_core(num_elements,mp,
+                        *iphase,
+                        mp->d_ibool_inner_core,
+                        mp->d_idoubling_inner_core,
+                        mp->d_xix_inner_core,mp->d_xiy_inner_core,mp->d_xiz_inner_core,
+                        mp->d_etax_inner_core,mp->d_etay_inner_core,mp->d_etaz_inner_core,
+                        mp->d_gammax_inner_core,mp->d_gammay_inner_core,mp->d_gammaz_inner_core,
+                        mp->d_kappavstore_inner_core,
+                        mp->d_muvstore_inner_core,
+                        mp->d_epsilondev_xx_inner_core,
+                        mp->d_epsilondev_yy_inner_core,
+                        mp->d_epsilondev_xy_inner_core,
+                        mp->d_epsilondev_xz_inner_core,
+                        mp->d_epsilondev_yz_inner_core,
+                        mp->d_eps_trace_over_3_inner_core,
+                        mp->d_one_minus_sum_beta_inner_core,
+                        mp->d_factor_common_inner_core,
+                        mp->d_R_xx_inner_core,
+                        mp->d_R_yy_inner_core,
+                        mp->d_R_xy_inner_core,
+                        mp->d_R_xz_inner_core,
+                        mp->d_R_yz_inner_core,
+                        mp->d_b_epsilondev_xx_inner_core,
+                        mp->d_b_epsilondev_yy_inner_core,
+                        mp->d_b_epsilondev_xy_inner_core,
+                        mp->d_b_epsilondev_xz_inner_core,
+                        mp->d_b_epsilondev_yz_inner_core,
+                        mp->d_b_eps_trace_over_3_inner_core,
+                        mp->d_b_R_xx_inner_core,
+                        mp->d_b_R_yy_inner_core,
+                        mp->d_b_R_xy_inner_core,
+                        mp->d_b_R_xz_inner_core,
+                        mp->d_b_R_yz_inner_core,
+                        mp->d_c11store_inner_core,mp->d_c12store_inner_core,mp->d_c13store_inner_core,
+                        mp->d_c33store_inner_core,mp->d_c44store_inner_core
+                        );
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  exit_on_cuda_error("compute_forces_inner_core_cuda");
+#endif    
+}
+

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,673 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// elemental routines
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// fluid rotation
+
+// Adds the Coriolis/rotation contribution to the horizontal gradient of the
+// fluid potential for one GLL point of the outer core, and advances the
+// rotation memory variables (A,B) by one explicit Euler step.
+//
+// In:  tx,working_element  - GLL point / element indices
+//      time, two_omega_earth, deltat - current time, 2*Omega, time step
+//      dpotentialdxl, dpotentialdyl  - x/y gradient of the potential
+// Out: *dpotentialdx_with_rot, *dpotentialdy_with_rot - gradient incl. rotation
+// In/out: d_A_array_rotation, d_B_array_rotation - Euler memory variables
+__device__ void compute_element_oc_rotation(int tx,int working_element,
+                                            realw time,
+                                            realw two_omega_earth,
+                                            realw deltat,
+                                            realw* d_A_array_rotation,
+                                            realw* d_B_array_rotation,
+                                            reald dpotentialdxl, reald dpotentialdyl,
+                                            reald* dpotentialdx_with_rot,
+                                            reald* dpotentialdy_with_rot) {
+
+  // linear (non-padded) index of this GLL point in the rotation arrays
+  const int idx = tx + working_element*NGLL3;
+
+  // cosine / sine of 2*Omega*t for the rotation of the frame
+  const reald c2wt = cos(two_omega_earth*time);
+  const reald s2wt = sin(two_omega_earth*time);
+
+  // time step deltat of the Euler scheme is folded directly into the sources
+  const reald dt_two_omega = deltat * two_omega_earth;
+  const reald src_A = dt_two_omega * (c2wt * dpotentialdyl + s2wt * dpotentialdxl);
+  const reald src_B = dt_two_omega * (s2wt * dpotentialdyl - c2wt * dpotentialdxl);
+
+  // current values of the rotation memory variables
+  const reald A_rot = d_A_array_rotation[idx];
+  const reald B_rot = d_B_array_rotation[idx];
+
+  // rotate (A,B) into the current frame and add to the potential gradient
+  *dpotentialdx_with_rot = dpotentialdxl + (  A_rot*c2wt + B_rot*s2wt);
+  *dpotentialdy_with_rot = dpotentialdyl + (- A_rot*s2wt + B_rot*c2wt);
+
+  // advance the rotation terms with the explicit Euler scheme
+  d_A_array_rotation[idx] += src_A;
+  d_B_array_rotation[idx] += src_B;
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// KERNEL 2
+//
+// for outer core ( acoustic domain )
+/* ----------------------------------------------------------------------------------------------- */
+
+
+// Kernel 2 for the fluid (acoustic) outer core: accumulates the stiffness
+// contribution into d_potential_dot_dot for one phase of elements.
+// One thread block per spectral element, NGLL3_PADDED threads per block,
+// of which only the first NGLL3 = 125 are active (padding gives coalescing).
+__global__ void Kernel_2_outer_core_impl(int nb_blocks_to_compute,
+                                       int NGLOB, int* d_ibool,
+                                       int* d_phase_ispec_inner,
+                                       int num_phase_ispec,
+                                       int d_iphase,
+                                       int use_mesh_coloring_gpu,
+                                       realw* d_potential, realw* d_potential_dot_dot,
+                                       realw* d_xix, realw* d_xiy, realw* d_xiz,
+                                       realw* d_etax, realw* d_etay, realw* d_etaz,
+                                       realw* d_gammax, realw* d_gammay, realw* d_gammaz,
+                                       realw* hprime_xx, realw* hprime_yy, realw* hprime_zz,
+                                       realw* hprimewgll_xx, realw* hprimewgll_yy, realw* hprimewgll_zz,
+                                       realw* wgllwgll_xy,realw* wgllwgll_xz,realw* wgllwgll_yz,
+                                       int GRAVITY,
+                                       realw* d_xstore, realw* d_ystore, realw* d_zstore,
+                                       realw* d_d_ln_density_dr_table,
+                                       realw* d_minus_rho_g_over_kappa_fluid,
+                                       realw* wgll_cube,
+                                       int ROTATION,
+                                       realw time,
+                                       realw two_omega_earth,
+                                       realw deltat,
+                                       realw* d_A_array_rotation,realw* d_B_array_rotation){
+
+  // linear block index (the grid may be folded into 2D when the element
+  // count exceeds the 65535 blocks-per-dimension limit, see Kernel_2_outer_core)
+  int bx = blockIdx.y*gridDim.x+blockIdx.x;
+  int tx = threadIdx.x;
+
+  //const int NGLL3 = NGLL3;
+  const int NGLL3_ALIGN = NGLL3_PADDED;
+  // R_EARTH_KM is the radius of the bottom of the oceans (radius of Earth in km)
+  const reald R_EARTH_KM = 6371.0f;
+  // uncomment line below for PREM with oceans
+  //const reald R_EARTH_KM = 6368.0f;
+
+  // (I,J,K) local GLL indices of this thread within the element
+  int K = (tx/NGLL2);
+  int J = ((tx-K*NGLL2)/NGLLX);
+  int I = (tx-K*NGLL2-J*NGLLX);
+
+  int active,offset;
+  int iglob = 0;
+  int working_element;
+  reald temp1l,temp2l,temp3l;
+  reald xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
+  reald dpotentialdxl,dpotentialdyl,dpotentialdzl;
+  reald dpotentialdx_with_rot,dpotentialdy_with_rot;
+  reald fac1,fac2,fac3;
+  reald sum_terms;
+  reald gravity_term;
+  reald gxl,gyl,gzl;
+  reald radius,theta,phi;
+  reald cos_theta,sin_theta,cos_phi,sin_phi;
+  reald grad_x_ln_rho,grad_y_ln_rho,grad_z_ln_rho;
+  int int_radius;
+
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+  int l;
+  int offset1,offset2,offset3;
+  realw hp1,hp2,hp3;
+#endif
+
+  // element-local copy of the potential and the three partial sums
+  __shared__ reald s_dummy_loc[NGLL3];
+
+  __shared__ reald s_temp1[NGLL3];
+  __shared__ reald s_temp2[NGLL3];
+  __shared__ reald s_temp3[NGLL3];
+
+// use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads,
+// because we used memory padding from NGLL^3 = 125 to 128 to get coalescent memory accesses
+  active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0;
+
+// copy from global memory to shared memory
+// each thread writes one of the NGLL^3 = 125 data points
+  if (active) {
+
+#ifdef USE_MESH_COLORING_GPU
+    working_element = bx;
+#else
+    //mesh coloring
+    if( use_mesh_coloring_gpu ){
+      working_element = bx;
+    }else{
+      // iphase-1 and working_element-1 for Fortran->C array conventions
+      working_element = d_phase_ispec_inner[bx + num_phase_ispec*(d_iphase-1)]-1;
+    }
+#endif
+
+    // iglob = d_ibool[working_element*NGLL3_ALIGN + tx]-1;
+    iglob = d_ibool[working_element*NGLL3 + tx]-1;
+
+#ifdef USE_TEXTURES
+    s_dummy_loc[tx] = tex1Dfetch(tex_potential, iglob);
+#else
+    // changing iglob indexing to match fortran row changes fast style
+    s_dummy_loc[tx] = d_potential[iglob];
+#endif
+  }
+
+// synchronize all the threads (one thread for each of the NGLL grid points of the
+// current spectral element) because we need the whole element to be ready in order
+// to be able to compute the matrix products along cut planes of the 3D element below
+  __syncthreads();
+
+#ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
+
+  if (active) {
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+//      if(iglob == 0 )printf("kernel 2: iglob %i  hprime_xx %f %f %f \n",iglob,hprime_xx[0],hprime_xx[1],hprime_xx[2]);
+#endif
+
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+
+    // 1D derivative sums of the potential along xi, eta and gamma
+    temp1l = 0.f;
+    temp2l = 0.f;
+    temp3l = 0.f;
+
+    for (l=0;l<NGLLX;l++) {
+        hp1 = hprime_xx[l*NGLLX+I];
+        offset1 = K*NGLL2+J*NGLLX+l;
+        temp1l += s_dummy_loc[offset1]*hp1;
+
+        // no longer assumes that hprime_xx == hprime_yy == hprime_zz
+        hp2 = hprime_yy[l*NGLLX+J];
+        offset2 = K*NGLL2+l*NGLLX+I;
+        temp2l += s_dummy_loc[offset2]*hp2;
+
+        hp3 = hprime_zz[l*NGLLX+K];
+        offset3 = l*NGLL2+J*NGLLX+I;
+        temp3l += s_dummy_loc[offset3]*hp3;
+    }
+#else
+
+    // manually unrolled version of the loop above (NGLLX == 5 assumed)
+    temp1l = s_dummy_loc[K*NGLL2+J*NGLLX]*hprime_xx[I]
+            + s_dummy_loc[K*NGLL2+J*NGLLX+1]*hprime_xx[NGLLX+I]
+            + s_dummy_loc[K*NGLL2+J*NGLLX+2]*hprime_xx[2*NGLLX+I]
+            + s_dummy_loc[K*NGLL2+J*NGLLX+3]*hprime_xx[3*NGLLX+I]
+            + s_dummy_loc[K*NGLL2+J*NGLLX+4]*hprime_xx[4*NGLLX+I];
+
+    temp2l = s_dummy_loc[K*NGLL2+I]*hprime_yy[J]
+            + s_dummy_loc[K*NGLL2+NGLLX+I]*hprime_yy[NGLLX+J]
+            + s_dummy_loc[K*NGLL2+2*NGLLX+I]*hprime_yy[2*NGLLX+J]
+            + s_dummy_loc[K*NGLL2+3*NGLLX+I]*hprime_yy[3*NGLLX+J]
+            + s_dummy_loc[K*NGLL2+4*NGLLX+I]*hprime_yy[4*NGLLX+J];
+
+    temp3l = s_dummy_loc[J*NGLLX+I]*hprime_zz[K]
+            + s_dummy_loc[NGLL2+J*NGLLX+I]*hprime_zz[NGLLX+K]
+            + s_dummy_loc[2*NGLL2+J*NGLLX+I]*hprime_zz[2*NGLLX+K]
+            + s_dummy_loc[3*NGLL2+J*NGLLX+I]*hprime_zz[3*NGLLX+K]
+            + s_dummy_loc[4*NGLL2+J*NGLLX+I]*hprime_zz[4*NGLLX+K];
+
+#endif
+
+    // compute derivatives of ux, uy and uz with respect to x, y and z
+    // (padded offset for the Jacobian matrix arrays)
+    offset = working_element*NGLL3_ALIGN + tx;
+
+    xixl = d_xix[offset];
+    xiyl = d_xiy[offset];
+    xizl = d_xiz[offset];
+    etaxl = d_etax[offset];
+    etayl = d_etay[offset];
+    etazl = d_etaz[offset];
+    gammaxl = d_gammax[offset];
+    gammayl = d_gammay[offset];
+    gammazl = d_gammaz[offset];
+
+    //  compute the jacobian
+    jacobianl = 1.f / (xixl*(etayl*gammazl-etazl*gammayl)
+                      -xiyl*(etaxl*gammazl-etazl*gammaxl)
+                      +xizl*(etaxl*gammayl-etayl*gammaxl));
+
+    // derivatives of potential
+    dpotentialdxl = xixl*temp1l + etaxl*temp2l + gammaxl*temp3l;
+    dpotentialdyl = xiyl*temp1l + etayl*temp2l + gammayl*temp3l;
+    dpotentialdzl = xizl*temp1l + etazl*temp2l + gammazl*temp3l;
+
+    // compute contribution of rotation and add to gradient of potential
+    // this term has no Z component
+    if(ROTATION){
+      compute_element_oc_rotation(tx,working_element,time,two_omega_earth,deltat,
+                                  d_A_array_rotation,d_B_array_rotation,
+                                  dpotentialdxl,dpotentialdyl,
+                                  &dpotentialdx_with_rot,&dpotentialdy_with_rot);
+
+    }else{
+      dpotentialdx_with_rot = dpotentialdxl;
+      dpotentialdy_with_rot = dpotentialdyl;
+    }
+
+    // pre-computes gravity terms
+
+    // use mesh coordinates to get theta and phi
+    // x y z contain r theta phi (spherical storage convention for these arrays)
+    radius = d_xstore[iglob];
+    theta = d_ystore[iglob];
+    phi = d_zstore[iglob];
+
+    cos_theta = cos(theta);
+    sin_theta = sin(theta);
+    cos_phi = cos(phi);
+    sin_phi = sin(phi);
+
+    // for efficiency replace with lookup table every 100 m in radial direction
+    // note: radius in outer core should never be zero,
+    //          and arrays in C start from 0, thus we subtract 1
+    int_radius = rint(radius * R_EARTH_KM * 10.0f ) - 1;
+
+    // depending on gravity or not, different potential definitions are used
+    if( ! GRAVITY ){
+      // add (chi/rho)grad(rho) term in no gravity case
+
+      // grad(rho)/rho in Cartesian components
+      grad_x_ln_rho = sin_theta * cos_phi * d_d_ln_density_dr_table[int_radius];
+      grad_y_ln_rho = sin_theta * sin_phi * d_d_ln_density_dr_table[int_radius];
+      grad_z_ln_rho = cos_theta * d_d_ln_density_dr_table[int_radius];
+
+      // adding (chi/rho)grad(rho)
+      dpotentialdx_with_rot = dpotentialdx_with_rot + s_dummy_loc[tx] * grad_x_ln_rho;
+      dpotentialdy_with_rot = dpotentialdy_with_rot + s_dummy_loc[tx] * grad_y_ln_rho;
+      dpotentialdzl = dpotentialdzl + s_dummy_loc[tx] * grad_z_ln_rho;
+
+    }else{
+
+      // compute divergence of displacement
+      // precompute and store gravity term
+      //
+      // get g, rho and dg/dr=dg
+      // spherical components of the gravitational acceleration
+      //
+      // Cartesian components of the gravitational acceleration
+      // integrate and multiply by rho / Kappa
+      gxl = sin_theta*cos_phi;
+      gyl = sin_theta*sin_phi;
+      gzl = cos_theta;
+
+      // uses potential definition: s = grad(chi)
+      // gravity term: - rho * g * 1/kappa grad(chi)
+
+      gravity_term = d_minus_rho_g_over_kappa_fluid[int_radius] * jacobianl * wgll_cube[tx] *
+                    ( dpotentialdx_with_rot * gxl + dpotentialdy_with_rot * gyl + dpotentialdzl * gzl);
+
+      // divergence of displacement field with gravity on
+      // note: these calculations are only considered for SIMULATION_TYPE == 1 .and. SAVE_FORWARD
+      //          and one has set MOVIE_VOLUME_TYPE == 4 when MOVIE_VOLUME is .true.;
+      //         in case of SIMULATION_TYPE == 3, it gets overwritten by compute_kernels_outer_core()
+      //if (NSPEC_OUTER_CORE_ADJOINT /= 1 && MOVIE_VOLUME ){
+      //  div_displfluid(i,j,k,ispec) =  d_minus_rho_g_over_kappa_fluid[int_radius] *
+      //        (dpotentialdx_with_rot * gxl + dpotentialdy_with_rot * gyl + dpotentialdzl * gzl);
+      //}
+
+    }
+
+    // form the dot product with the test vector
+    s_temp1[tx] = jacobianl*(xixl*dpotentialdx_with_rot + xiyl*dpotentialdy_with_rot + xizl*dpotentialdzl);
+    s_temp2[tx] = jacobianl*(etaxl*dpotentialdx_with_rot + etayl*dpotentialdy_with_rot + etazl*dpotentialdzl);
+    s_temp3[tx] = jacobianl*(gammaxl*dpotentialdx_with_rot + gammayl*dpotentialdy_with_rot + gammazl*dpotentialdzl);
+  }
+
+// synchronize all the threads (one thread for each of the NGLL grid points of the
+// current spectral element) because we need the whole element to be ready in order
+// to be able to compute the matrix products along cut planes of the 3D element below
+  __syncthreads();
+
+  if (active) {
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+
+    // second round of 1D matrix products, now with the weighted derivatives
+    temp1l = 0.f;
+    temp2l = 0.f;
+    temp3l = 0.f;
+
+    for (l=0;l<NGLLX;l++) {
+        fac1 = hprimewgll_xx[I*NGLLX+l];
+        offset1 = K*NGLL2+J*NGLLX+l;
+        temp1l += s_temp1[offset1]*fac1;
+
+        // no longer assumes hprimewgll_xx == hprimewgll_yy == hprimewgll_zz
+        fac2 = hprimewgll_yy[J*NGLLX+l];
+        offset2 = K*NGLL2+l*NGLLX+I;
+        temp2l += s_temp2[offset2]*fac2;
+
+        fac3 = hprimewgll_zz[K*NGLLX+l];
+        offset3 = l*NGLL2+J*NGLLX+I;
+        temp3l += s_temp3[offset3]*fac3;
+    }
+#else
+
+    // manually unrolled version of the loop above (NGLLX == 5 assumed)
+    temp1l = s_temp1[K*NGLL2+J*NGLLX]*hprimewgll_xx[I*NGLLX]
+            + s_temp1[K*NGLL2+J*NGLLX+1]*hprimewgll_xx[I*NGLLX+1]
+            + s_temp1[K*NGLL2+J*NGLLX+2]*hprimewgll_xx[I*NGLLX+2]
+            + s_temp1[K*NGLL2+J*NGLLX+3]*hprimewgll_xx[I*NGLLX+3]
+            + s_temp1[K*NGLL2+J*NGLLX+4]*hprimewgll_xx[I*NGLLX+4];
+
+
+    temp2l = s_temp2[K*NGLL2+I]*hprimewgll_yy[J*NGLLX]
+            + s_temp2[K*NGLL2+NGLLX+I]*hprimewgll_yy[J*NGLLX+1]
+            + s_temp2[K*NGLL2+2*NGLLX+I]*hprimewgll_yy[J*NGLLX+2]
+            + s_temp2[K*NGLL2+3*NGLLX+I]*hprimewgll_yy[J*NGLLX+3]
+            + s_temp2[K*NGLL2+4*NGLLX+I]*hprimewgll_yy[J*NGLLX+4];
+
+
+    temp3l = s_temp3[J*NGLLX+I]*hprimewgll_zz[K*NGLLX]
+            + s_temp3[NGLL2+J*NGLLX+I]*hprimewgll_zz[K*NGLLX+1]
+            + s_temp3[2*NGLL2+J*NGLLX+I]*hprimewgll_zz[K*NGLLX+2]
+            + s_temp3[3*NGLL2+J*NGLLX+I]*hprimewgll_zz[K*NGLLX+3]
+            + s_temp3[4*NGLL2+J*NGLLX+I]*hprimewgll_zz[K*NGLLX+4];
+
+
+#endif
+
+    // GLL quadrature weights for the three cut planes
+    fac1 = wgllwgll_yz[K*NGLLX+J];
+    fac2 = wgllwgll_xz[K*NGLLX+I];
+    fac3 = wgllwgll_xy[J*NGLLX+I];
+
+    sum_terms = -(fac1*temp1l + fac2*temp2l + fac3*temp3l);
+    if( GRAVITY ) sum_terms += gravity_term;
+
+    iglob = d_ibool[working_element*NGLL3 + tx]-1;
+
+#ifdef USE_TEXTURES
+    d_potential_dot_dot[iglob] = tex1Dfetch(tex_potential_dot_dot, iglob)
+                                            + sum_terms;
+#else
+
+#ifdef USE_MESH_COLORING_GPU
+    // no atomic operation needed, colors don't share global points between elements
+    d_potential_dot_dot[iglob] += sum_terms;
+#else
+    //mesh coloring
+    if( use_mesh_coloring_gpu ){
+
+      // no atomic operation needed, colors don't share global points between elements
+      d_potential_dot_dot[iglob] += sum_terms;
+
+    }else{
+
+      // global points may be shared between elements -> atomic update required
+      atomicAdd(&d_potential_dot_dot[iglob],sum_terms);
+
+    }
+#endif
+
+#endif
+  }
+
+#else  // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
+  d_potential_dot_dot[iglob] = 123.123f;
+#endif // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Host-side launcher for Kernel_2_outer_core_impl.
+// Launches the forward kernel and, for adjoint runs (simulation_type == 3),
+// a second launch on the backward/reconstructed wavefield arrays.
+// NOTE(review): argument lists of the two launches are positional and must
+// stay in sync with the kernel signature above.
+void Kernel_2_outer_core(int nb_blocks_to_compute, Mesh* mp,
+                         int d_iphase,
+                         int* d_ibool,
+                         realw* d_xix,realw* d_xiy,realw* d_xiz,
+                         realw* d_etax,realw* d_etay,realw* d_etaz,
+                         realw* d_gammax,realw* d_gammay,realw* d_gammaz,
+                         realw time, realw b_time,
+                         realw* d_A_array_rotation,realw* d_B_array_rotation,
+                         realw* d_b_A_array_rotation,realw* d_b_B_array_rotation
+                         ){
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("before outer_core kernel Kernel_2");
+#endif
+
+  /* if the grid can handle the number of blocks, we let it be 1D */
+  /* grid_2_x = nb_elem_color; */
+  /* nb_elem_color is just how many blocks we are computing now */
+
+  // folds the grid into 2D when the element count exceeds the
+  // 65535 blocks-per-dimension hardware limit
+  int num_blocks_x = nb_blocks_to_compute;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  int threads_2 = NGLL3_PADDED;//BLOCK_SIZE_K2;
+  dim3 grid_2(num_blocks_x,num_blocks_y);
+
+  // Cuda timing
+  // cudaEvent_t start, stop;
+  // realw time;
+  // cudaEventCreate(&start);
+  // cudaEventCreate(&stop);
+  // cudaEventRecord( start, 0 );
+
+  // forward wavefield
+  Kernel_2_outer_core_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,
+                                                        mp->NGLOB_OUTER_CORE,
+                                                        d_ibool,
+                                                        mp->d_phase_ispec_inner_outer_core,
+                                                        mp->num_phase_ispec_outer_core,
+                                                        d_iphase,
+                                                        mp->use_mesh_coloring_gpu,
+                                                        mp->d_displ_outer_core,
+                                                        mp->d_accel_outer_core,
+                                                        d_xix, d_xiy, d_xiz,
+                                                        d_etax, d_etay, d_etaz,
+                                                        d_gammax, d_gammay, d_gammaz,
+                                                        mp->d_hprime_xx, mp->d_hprime_yy, mp->d_hprime_zz,
+                                                        mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
+                                                        mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+                                                        mp->gravity,
+                                                        mp->d_xstore_outer_core,mp->d_ystore_outer_core,mp->d_zstore_outer_core,
+                                                        mp->d_d_ln_density_dr_table,
+                                                        mp->d_minus_rho_g_over_kappa_fluid,
+                                                        mp->d_wgll_cube,
+                                                        mp->rotation,
+                                                        time,
+                                                        mp->d_two_omega_earth,
+                                                        mp->d_deltat,
+                                                        d_A_array_rotation,d_B_array_rotation);
+
+  // backward/reconstructed wavefield for adjoint simulations
+  if(mp->simulation_type == 3) {
+    Kernel_2_outer_core_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,
+                                                          mp->NGLOB_OUTER_CORE,
+                                                          d_ibool,
+                                                          mp->d_phase_ispec_inner_outer_core,
+                                                          mp->num_phase_ispec_outer_core,
+                                                          d_iphase,
+                                                          mp->use_mesh_coloring_gpu,
+                                                          mp->d_b_displ_outer_core,
+                                                          mp->d_b_accel_outer_core,
+                                                          d_xix, d_xiy, d_xiz,
+                                                          d_etax, d_etay, d_etaz,
+                                                          d_gammax, d_gammay, d_gammaz,
+                                                          mp->d_hprime_xx, mp->d_hprime_yy, mp->d_hprime_zz,
+                                                          mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
+                                                          mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+                                                          mp->gravity,
+                                                          mp->d_xstore_outer_core,mp->d_ystore_outer_core,mp->d_zstore_outer_core,
+                                                          mp->d_d_ln_density_dr_table,
+                                                          mp->d_minus_rho_g_over_kappa_fluid,
+                                                          mp->d_wgll_cube,
+                                                          mp->rotation,
+                                                          b_time,
+                                                          mp->d_b_two_omega_earth,
+                                                          mp->d_b_deltat,
+                                                          d_b_A_array_rotation,d_b_B_array_rotation);
+  }
+
+  // cudaEventRecord( stop, 0 );
+  // cudaEventSynchronize( stop );
+  // cudaEventElapsedTime( &time, start, stop );
+  // cudaEventDestroy( start );
+  // cudaEventDestroy( stop );
+  // printf("Kernel2 Execution Time: %f ms\n",time);
+
+  /* cudaThreadSynchronize(); */
+  /* TRACE("Kernel 2 finished"); */
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //printf("Tried to start with %dx1 blocks\n",nb_blocks_to_compute);
+  exit_on_cuda_error("kernel Kernel_2_outer_core");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// main compute_forces_outer_core CUDA routine
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Fortran-callable entry point for the outer-core (acoustic) forces.
+// Selects the elements of the requested communication phase and dispatches
+// Kernel_2_outer_core, either once per mesh color (no atomics needed) or
+// once for all elements (atomic accumulation).
+extern "C"
+void FC_FUNC_(compute_forces_outer_core_cuda,
+              COMPUTE_FORCES_OUTER_CORE_CUDA)(long* Mesh_pointer_f,
+                                            int* iphase,
+                                            realw* time_f,
+                                            realw* b_time_f) {
+
+  TRACE("compute_forces_outer_core_cuda");
+
+  // retrieves mesh structure from the fortran integer wrapper
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  realw time = *time_f;
+  realw b_time = *b_time_f;
+
+  // number of elements in this phase (1 == outer/MPI-boundary, else inner)
+  int nelem = ( *iphase == 1 ) ? mp->nspec_outer_outer_core
+                               : mp->nspec_inner_outer_core;
+
+  if( nelem == 0 ) return;
+
+  if( mp->use_mesh_coloring_gpu ){
+
+    // mesh coloring: one launch per color, elements of a color share no
+    // global points, so the kernel can update without atomics
+    //
+    // note: array offsets require sorted arrays, such that e.g. ibool starts with elastic elements
+    //         and is followed by acoustic ones;
+    //         acoustic elements also use outer before inner element ordering
+    int first_color,last_color;
+    int offset_padded,offset_nonpadded;
+
+    if( *iphase == 1 ){
+      // outer elements: colors at the beginning of the color list
+      first_color = 0;
+      last_color = mp->num_colors_outer_outer_core;
+
+      // array offsets (acoustic elements start after elastic ones)
+      offset_padded = mp->nspec_elastic * NGLL3_PADDED;
+      offset_nonpadded = mp->nspec_elastic * NGLL3;
+    }else{
+      // inner element colors (start after outer elements)
+      first_color = mp->num_colors_outer_outer_core;
+      last_color = mp->num_colors_outer_outer_core + mp->num_colors_inner_outer_core;
+
+      // array offsets (inner elements start after outer ones)
+      offset_padded = mp->nspec_outer_outer_core * NGLL3_PADDED;
+      offset_nonpadded = mp->nspec_outer_outer_core * NGLL3;
+    }
+
+    for(int icolor = first_color; icolor < last_color; icolor++){
+
+      int nblocks = mp->h_num_elem_colors_outer_core[icolor];
+
+      Kernel_2_outer_core(nblocks,mp,
+                          *iphase,
+                          mp->d_ibool_outer_core + offset_nonpadded,
+                          mp->d_xix_outer_core + offset_padded,
+                          mp->d_xiy_outer_core + offset_padded,
+                          mp->d_xiz_outer_core + offset_padded,
+                          mp->d_etax_outer_core + offset_padded,
+                          mp->d_etay_outer_core + offset_padded,
+                          mp->d_etaz_outer_core + offset_padded,
+                          mp->d_gammax_outer_core + offset_padded,
+                          mp->d_gammay_outer_core + offset_padded,
+                          mp->d_gammaz_outer_core + offset_padded,
+                          time,b_time,
+                          mp->d_A_array_rotation + offset_nonpadded,
+                          mp->d_B_array_rotation + offset_nonpadded,
+                          mp->d_b_A_array_rotation + offset_nonpadded,
+                          mp->d_b_B_array_rotation + offset_nonpadded
+                         );
+
+      // advance offsets past this color's elements:
+      // padded/aligned arrays vs. non-aligned arrays
+      offset_padded += nblocks * NGLL3_PADDED;
+      offset_nonpadded += nblocks * NGLL3;
+    }
+
+  }else{
+
+    // no mesh coloring: a single launch, the kernel uses atomic updates
+    Kernel_2_outer_core(nelem, mp,
+                        *iphase,
+                        mp->d_ibool_outer_core,
+                        mp->d_xix_outer_core,mp->d_xiy_outer_core,mp->d_xiz_outer_core,
+                        mp->d_etax_outer_core,mp->d_etay_outer_core,mp->d_etaz_outer_core,
+                        mp->d_gammax_outer_core,mp->d_gammay_outer_core,mp->d_gammaz_outer_core,
+                        time,b_time,
+                        mp->d_A_array_rotation,mp->d_B_array_rotation,
+                        mp->d_b_A_array_rotation,mp->d_b_B_array_rotation
+                        );
+
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_forces_outer_core_cuda");
+#endif
+}
+

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,646 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ELASTIC SIMULATIONS
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Accumulates one time step's contribution to the isotropic elastic sensitivity
+// kernels (density rho_kl, shear mu_kl, bulk kappa_kl).
+// Launch configuration: one block per spectral element (blockIdx.x/y collapsed),
+// one thread per GLL point (NGLL3 threads per block).
+// accel / b_displ are global 3-component vector fields addressed through ibool;
+// the strain arrays are stored per element point (index ijk + NGLL3*ispec).
+__global__ void compute_kernels_cudakernel(int* ispec_is_elastic,
+                                           int* ibool,
+                                           realw* accel,
+                                           realw* b_displ,
+                                           realw* epsilondev_xx,
+                                           realw* epsilondev_yy,
+                                           realw* epsilondev_xy,
+                                           realw* epsilondev_xz,
+                                           realw* epsilondev_yz,
+                                           realw* b_epsilondev_xx,
+                                           realw* b_epsilondev_yy,
+                                           realw* b_epsilondev_xy,
+                                           realw* b_epsilondev_xz,
+                                           realw* b_epsilondev_yz,
+                                           realw* rho_kl,
+                                           realw deltat,
+                                           realw* mu_kl,
+                                           realw* kappa_kl,
+                                           realw* epsilon_trace_over_3,
+                                           realw* b_epsilon_trace_over_3,
+                                           int NSPEC_AB) {
+
+  int ispec = blockIdx.x + blockIdx.y*gridDim.x;
+
+  // handles case when there is 1 extra block (due to rectangular grid)
+  if(ispec < NSPEC_AB) {
+
+    // elastic elements only
+    if( ispec_is_elastic[ispec] ) {
+
+      int ijk = threadIdx.x;
+      int ijk_ispec = ijk + NGLL3*ispec;
+      int iglob = ibool[ijk_ispec] - 1 ;   // ibool is 1-based (Fortran); shift to 0-based
+
+      // isotropic kernels:
+      // density kernel: deltat * (accel . b_displ) at this global point
+      rho_kl[ijk_ispec] += deltat * (accel[3*iglob]*b_displ[3*iglob]+
+                                     accel[3*iglob+1]*b_displ[3*iglob+1]+
+                                     accel[3*iglob+2]*b_displ[3*iglob+2]);
+
+
+      // shear modulus kernel: contraction of forward and backward deviatoric strains
+      mu_kl[ijk_ispec] += deltat * (epsilondev_xx[ijk_ispec]*b_epsilondev_xx[ijk_ispec]+
+                                    epsilondev_yy[ijk_ispec]*b_epsilondev_yy[ijk_ispec]+
+                                    (epsilondev_xx[ijk_ispec]+epsilondev_yy[ijk_ispec])*
+                                    (b_epsilondev_xx[ijk_ispec]+b_epsilondev_yy[ijk_ispec])+
+                                    2*(epsilondev_xy[ijk_ispec]*b_epsilondev_xy[ijk_ispec]+
+                                       epsilondev_xz[ijk_ispec]*b_epsilondev_xz[ijk_ispec]+
+                                       epsilondev_yz[ijk_ispec]*b_epsilondev_yz[ijk_ispec]));
+
+      // bulk modulus kernel: 9 * (trace/3) * (b_trace/3) = trace(eps)*trace(b_eps)
+      kappa_kl[ijk_ispec] += deltat*(9*epsilon_trace_over_3[ijk_ispec]*
+                                     b_epsilon_trace_over_3[ijk_ispec]);
+
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Fortran-callable host wrapper: launches the elastic-kernel accumulation
+// over all NSPEC_AB elements of the mesh held in Mesh_pointer.
+// deltat_f is the time-step weight applied to this accumulation step.
+extern "C"
+void FC_FUNC_(compute_kernels_elastic_cuda,
+              COMPUTE_KERNELS_ELASTIC_CUDA)(long* Mesh_pointer,
+                                            realw* deltat_f) {
+TRACE("compute_kernels_elastic_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ
+  realw deltat = *deltat_f;
+
+  // fold the 1D block count into a 2D grid: gridDim.x is capped at 65535,
+  // so halve x and double y until it fits (kernel recomputes the linear index)
+  int num_blocks_x = mp->NSPEC_AB;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  compute_kernels_cudakernel<<<grid,threads>>>(mp->d_ispec_is_elastic,mp->d_ibool,
+                                               mp->d_accel, mp->d_b_displ,
+                                               mp->d_epsilondev_xx,
+                                               mp->d_epsilondev_yy,
+                                               mp->d_epsilondev_xy,
+                                               mp->d_epsilondev_xz,
+                                               mp->d_epsilondev_yz,
+                                               mp->d_b_epsilondev_xx,
+                                               mp->d_b_epsilondev_yy,
+                                               mp->d_b_epsilondev_xy,
+                                               mp->d_b_epsilondev_xz,
+                                               mp->d_b_epsilondev_yz,
+                                               mp->d_rho_kl,
+                                               deltat,
+                                               mp->d_mu_kl,
+                                               mp->d_kappa_kl,
+                                               mp->d_epsilon_trace_over_3,
+                                               mp->d_b_epsilon_trace_over_3,
+                                               mp->NSPEC_AB);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_kernels_elastic_cuda");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// NOISE SIMULATIONS
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+// Accumulates the noise strength kernel Sigma_kl on free-surface points.
+// Launch configuration: one block per free-surface face (blockIdx.x/y collapsed),
+// one thread per surface GLL point (NGLL2 threads).
+// eta projects the stored surface movie onto the noise normal; the kernel update
+// weights the surface displacement by the same normal.
+__global__ void compute_kernels_strength_noise_cuda_kernel(realw* displ,
+                                                           int* free_surface_ispec,
+                                                           int* free_surface_ijk,
+                                                           int* ibool,
+                                                           realw* noise_surface_movie,
+                                                           realw* normal_x_noise,
+                                                           realw* normal_y_noise,
+                                                           realw* normal_z_noise,
+                                                           realw* Sigma_kl,
+                                                           realw deltat,
+                                                           int num_free_surface_faces) {
+  int iface = blockIdx.x + blockIdx.y*gridDim.x;
+
+  if(iface < num_free_surface_faces) {
+
+    // face/element/point indices are 1-based (Fortran); shift to 0-based
+    int ispec = free_surface_ispec[iface]-1;
+    int igll = threadIdx.x;
+    int ipoin = igll + NGLL2*iface;
+    int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1 ;
+    int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
+    int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
+
+    // NOTE(review): the hard-coded 5,5,5 dimensions presumably equal NGLLX/NGLLY/NGLLZ
+    // — confirm and prefer the named constants
+    int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1 ;
+
+    // eta = surface movie vector projected onto the noise normal at this point
+    realw eta = ( noise_surface_movie[INDEX3(NDIM,NGLL2,0,igll,iface)]*normal_x_noise[ipoin]+
+                 noise_surface_movie[INDEX3(NDIM,NGLL2,1,igll,iface)]*normal_y_noise[ipoin]+
+                 noise_surface_movie[INDEX3(NDIM,NGLL2,2,igll,iface)]*normal_z_noise[ipoin]);
+
+    Sigma_kl[INDEX4(5,5,5,i,j,k,ispec)] += deltat*eta*(normal_x_noise[ipoin]*displ[3*iglob]+
+                                                       normal_y_noise[ipoin]*displ[1+3*iglob]+
+                                                       normal_z_noise[ipoin]*displ[2+3*iglob]);
+  }
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Fortran-callable host wrapper: uploads the current noise surface movie
+// snapshot to the device, then launches the strength-noise kernel accumulation
+// over all free-surface faces.
+extern "C"
+void FC_FUNC_(compute_kernels_strgth_noise_cu,
+              COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer,
+                                                    realw* h_noise_surface_movie,
+                                                    realw* deltat) {
+
+TRACE("compute_kernels_strgth_noise_cu");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  // blocking host->device copy of the 3-component surface movie field
+  // (3 * NGLL2 values per free-surface face)
+  cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie,
+             3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice);
+
+
+  // fold the face count into a 2D grid (gridDim.x capped at 65535)
+  int num_blocks_x = mp->num_free_surface_faces;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(NGLL2,1,1);
+
+  compute_kernels_strength_noise_cuda_kernel<<<grid,threads>>>(mp->d_displ,
+                                                               mp->d_free_surface_ispec,
+                                                               mp->d_free_surface_ijk,
+                                                               mp->d_ibool,
+                                                               mp->d_noise_surface_movie,
+                                                               mp->d_normal_x_noise,
+                                                               mp->d_normal_y_noise,
+                                                               mp->d_normal_z_noise,
+                                                               mp->d_Sigma_kl,*deltat,
+                                                               mp->num_free_surface_faces);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_kernels_strength_noise_cuda_kernel");
+#endif
+}
+
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ACOUSTIC SIMULATIONS
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+// Device helper: evaluates the gradient of a per-element scalar field at the
+// local GLL point ijk of element ispec, returning (1/rho) * grad(field) in
+// vector_field_element[0..2].
+// The three reference-coordinate derivatives are formed with the 1D derivative
+// matrices hprime_*; the chain rule with the inverse-Jacobian terms
+// (xi/eta/gamma derivatives) maps them to Cartesian components.
+// scalar_field holds the NGLL3 values of this element (e.g. staged in shared
+// memory by the caller); d_* mapping arrays use the padded element stride.
+__device__ void compute_gradient_kernel(int ijk,
+                                        int ispec,
+                                        realw* scalar_field,
+                                        realw* vector_field_element,
+                                        realw* hprime_xx,
+                                        realw* hprime_yy,
+                                        realw* hprime_zz,
+                                        realw* d_xix,
+                                        realw* d_xiy,
+                                        realw* d_xiz,
+                                        realw* d_etax,
+                                        realw* d_etay,
+                                        realw* d_etaz,
+                                        realw* d_gammax,
+                                        realw* d_gammay,
+                                        realw* d_gammaz,
+                                        realw rhol,
+                                        int gravity) {
+
+  realw temp1l,temp2l,temp3l;
+  realw hp1,hp2,hp3;
+  realw xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl;
+  realw rho_invl;
+  int l,offset,offset1,offset2,offset3;
+
+  //const int NGLLX = 5;
+  const int NGLL3_ALIGN = NGLL3_PADDED;
+
+  // decompose the linear thread index into (I,J,K) GLL indices
+  int K = (ijk/NGLL2);
+  int J = ((ijk-K*NGLL2)/NGLLX);
+  int I = (ijk-K*NGLL2-J*NGLLX);
+
+  // derivative along x (xi direction): sum over the I line
+  temp1l = 0.f;
+  for( l=0; l<NGLLX;l++){
+    hp1 = hprime_xx[l*NGLLX+I];
+    offset1 = K*NGLL2+J*NGLLX+l;
+    temp1l += scalar_field[offset1]*hp1;
+  }
+
+  // derivative along y (eta direction): sum over the J line
+  temp2l = 0.f;
+  for( l=0; l<NGLLX;l++){
+    hp2 = hprime_yy[l*NGLLX+J];
+    offset2 = K*NGLL2+l*NGLLX+I;
+    temp2l += scalar_field[offset2]*hp2;
+  }
+
+  // derivative along z (gamma direction): sum over the K line
+  temp3l = 0.f;
+  for( l=0; l<NGLLX;l++){
+    hp3 = hprime_zz[l*NGLLX+K];
+    offset3 = l*NGLL2+J*NGLLX+I;
+    temp3l += scalar_field[offset3]*hp3;
+
+  }
+
+  // mapping terms use the padded (aligned) element stride
+  offset = ispec*NGLL3_ALIGN + ijk;
+
+  xixl = d_xix[offset];
+  xiyl = d_xiy[offset];
+  xizl = d_xiz[offset];
+  etaxl = d_etax[offset];
+  etayl = d_etay[offset];
+  etazl = d_etaz[offset];
+  gammaxl = d_gammax[offset];
+  gammayl = d_gammay[offset];
+  gammazl = d_gammaz[offset];
+
+  // NOTE(review): both branches are currently identical — the gravity case is
+  // a placeholder (see TODO below) and does not yet alter the computation
+  if( gravity ){
+    // daniel: TODO - check gravity case here
+    rho_invl = 1.0f / rhol;
+  }else{
+    rho_invl = 1.0f / rhol;
+  }
+  // derivatives of acoustic scalar potential field on GLL points
+  vector_field_element[0] = (temp1l*xixl + temp2l*etaxl + temp3l*gammaxl) * rho_invl;
+  vector_field_element[1] = (temp1l*xiyl + temp2l*etayl + temp3l*gammayl) * rho_invl;
+  vector_field_element[2] = (temp1l*xizl + temp2l*etazl + temp3l*gammazl) * rho_invl;
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+// Accumulates the acoustic sensitivity kernels (rho_ac_kl, kappa_ac_kl).
+// Launch configuration: one block per spectral element, NGLL3 threads per block.
+// The forward acceleration potential and backward displacement potential are
+// staged in shared memory, then differentiated per point with
+// compute_gradient_kernel to obtain the vector fields entering the kernels.
+__global__ void compute_kernels_acoustic_kernel(int* ispec_is_acoustic,
+                                                int* ibool,
+                                                realw* rhostore,
+                                                realw* kappastore,
+                                                realw* hprime_xx,
+                                                realw* hprime_yy,
+                                                realw* hprime_zz,
+                                                realw* d_xix,
+                                                realw* d_xiy,
+                                                realw* d_xiz,
+                                                realw* d_etax,
+                                                realw* d_etay,
+                                                realw* d_etaz,
+                                                realw* d_gammax,
+                                                realw* d_gammay,
+                                                realw* d_gammaz,
+                                                realw* potential_dot_dot_acoustic,
+                                                realw* b_potential_acoustic,
+                                                realw* b_potential_dot_dot_acoustic,
+                                                realw* rho_ac_kl,
+                                                realw* kappa_ac_kl,
+                                                realw deltat,
+                                                int NSPEC_AB,
+                                                int gravity) {
+
+  int ispec = blockIdx.x + blockIdx.y*gridDim.x;
+
+  // handles case when there is 1 extra block (due to rectangular grid)
+  if( ispec < NSPEC_AB ){
+
+    // acoustic elements only
+    if( ispec_is_acoustic[ispec] ) {
+
+      int ijk = threadIdx.x;
+
+      // local and global indices
+      // note: rhostore uses the padded stride, kappastore the non-padded one
+      int ijk_ispec = ijk + NGLL3*ispec;
+      int ijk_ispec_padded = ijk + NGLL3_PADDED*ispec;
+      int iglob = ibool[ijk_ispec] - 1;   // 1-based (Fortran) -> 0-based
+
+      realw accel_elm[3];
+      realw b_displ_elm[3];
+      realw rhol,kappal;
+
+      // shared memory between all threads within this block
+      __shared__ realw scalar_field_displ[NGLL3];
+      __shared__ realw scalar_field_accel[NGLL3];
+
+      // copy field values; sync so every thread sees the full element
+      // before taking derivatives
+      scalar_field_displ[ijk] = b_potential_acoustic[iglob];
+      scalar_field_accel[ijk] = potential_dot_dot_acoustic[iglob];
+      __syncthreads();
+
+      // gets material parameter
+      rhol = rhostore[ijk_ispec_padded];
+
+      // displacement vector from backward field
+      compute_gradient_kernel(ijk,ispec,scalar_field_displ,b_displ_elm,
+                              hprime_xx,hprime_yy,hprime_zz,
+                              d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
+                              rhol,gravity);
+
+      // acceleration vector
+      compute_gradient_kernel(ijk,ispec,scalar_field_accel,accel_elm,
+                              hprime_xx,hprime_yy,hprime_zz,
+                              d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
+                              rhol,gravity);
+
+      // density kernel: -deltat * rho * (accel . b_displ)
+      rho_ac_kl[ijk_ispec] -= deltat * rhol * (accel_elm[0]*b_displ_elm[0] +
+                                               accel_elm[1]*b_displ_elm[1] +
+                                               accel_elm[2]*b_displ_elm[2]);
+
+      // bulk modulus kernel: -deltat/kappa * pdd * b_pdd
+      kappal = kappastore[ijk_ispec];
+      kappa_ac_kl[ijk_ispec] -= deltat / kappal * potential_dot_dot_acoustic[iglob]
+                                                * b_potential_dot_dot_acoustic[iglob];
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+// Fortran-callable host wrapper: launches the acoustic-kernel accumulation
+// over all NSPEC_AB elements of the mesh held in Mesh_pointer.
+extern "C"
+void FC_FUNC_(compute_kernels_acoustic_cuda,
+              COMPUTE_KERNELS_ACOUSTIC_CUDA)(
+                                             long* Mesh_pointer,
+                                             realw* deltat_f) {
+
+TRACE("compute_kernels_acoustic_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ
+  realw deltat = *deltat_f;
+
+  // fold the element count into a 2D grid (gridDim.x capped at 65535)
+  int num_blocks_x = mp->NSPEC_AB;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  compute_kernels_acoustic_kernel<<<grid,threads>>>(mp->d_ispec_is_acoustic,
+                                                    mp->d_ibool,
+                                                    mp->d_rhostore,
+                                                    mp->d_kappastore,
+                                                    mp->d_hprime_xx,
+                                                    mp->d_hprime_yy,
+                                                    mp->d_hprime_zz,
+                                                    mp->d_xix,
+                                                    mp->d_xiy,
+                                                    mp->d_xiz,
+                                                    mp->d_etax,
+                                                    mp->d_etay,
+                                                    mp->d_etaz,
+                                                    mp->d_gammax,
+                                                    mp->d_gammay,
+                                                    mp->d_gammaz,
+                                                    mp->d_potential_dot_dot_acoustic,
+                                                    mp->d_b_potential_acoustic,
+                                                    mp->d_b_potential_dot_dot_acoustic,
+                                                    mp->d_rho_ac_kl,
+                                                    mp->d_kappa_ac_kl,
+                                                    deltat,
+                                                    mp->NSPEC_AB,
+                                                    mp->gravity);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_kernels_acoustic_kernel");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// preconditioner (approximate Hessian kernel)
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Accumulates the approximate Hessian (preconditioner) for elastic elements:
+// deltat * (accel . b_accel) per GLL point.
+// Launch configuration: one block per spectral element, NGLL3 threads per block.
+__global__ void compute_kernels_hess_el_cudakernel(int* ispec_is_elastic,
+                                                   int* ibool,
+                                                   realw* accel,
+                                                   realw* b_accel,
+                                                   realw* hess_kl,
+                                                   realw deltat,
+                                                   int NSPEC_AB) {
+
+  int ispec = blockIdx.x + blockIdx.y*gridDim.x;
+
+  // handles case when there is 1 extra block (due to rectangular grid)
+  if(ispec < NSPEC_AB) {
+
+    // elastic elements only
+    if( ispec_is_elastic[ispec] ) {
+
+      int ijk = threadIdx.x;
+      int ijk_ispec = ijk + NGLL3*ispec;
+      int iglob = ibool[ijk_ispec] - 1 ;   // 1-based (Fortran) -> 0-based
+
+      // approximate hessian: dot product of forward and backward accelerations
+      hess_kl[ijk_ispec] += deltat * (accel[3*iglob]*b_accel[3*iglob]+
+                                      accel[3*iglob+1]*b_accel[3*iglob+1]+
+                                      accel[3*iglob+2]*b_accel[3*iglob+2]);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Accumulates the approximate Hessian (preconditioner) for acoustic elements.
+// Both acceleration potentials are staged in shared memory, differentiated via
+// compute_gradient_kernel, and the resulting vectors are dotted per GLL point.
+// Launch configuration: one block per spectral element, NGLL3 threads per block.
+__global__ void compute_kernels_hess_ac_cudakernel(int* ispec_is_acoustic,
+                                                   int* ibool,
+                                                   realw* potential_dot_dot_acoustic,
+                                                   realw* b_potential_dot_dot_acoustic,
+                                                   realw* rhostore,
+                                                   realw* hprime_xx,
+                                                   realw* hprime_yy,
+                                                   realw* hprime_zz,
+                                                   realw* d_xix,
+                                                   realw* d_xiy,
+                                                   realw* d_xiz,
+                                                   realw* d_etax,
+                                                   realw* d_etay,
+                                                   realw* d_etaz,
+                                                   realw* d_gammax,
+                                                   realw* d_gammay,
+                                                   realw* d_gammaz,
+                                                   realw* hess_kl,
+                                                   realw deltat,
+                                                   int NSPEC_AB,
+                                                   int gravity) {
+
+  int ispec = blockIdx.x + blockIdx.y*gridDim.x;
+
+  // handles case when there is 1 extra block (due to rectangular grid)
+  if(ispec < NSPEC_AB) {
+
+    // acoustic elements only
+    if( ispec_is_acoustic[ispec] ){
+
+      // local and global indices
+      int ijk = threadIdx.x;
+      int ijk_ispec = ijk + NGLL3*ispec;
+      int iglob = ibool[ijk_ispec] - 1 ;   // 1-based (Fortran) -> 0-based
+
+      // padded stride for the material/mapping arrays
+      int ijk_ispec_padded = ijk + NGLL3_PADDED*ispec;
+
+      realw accel_elm[3];
+      realw b_accel_elm[3];
+      realw rhol;
+
+      // shared memory between all threads within this block
+      __shared__ realw scalar_field_accel[NGLL3];
+      __shared__ realw scalar_field_b_accel[NGLL3];
+
+      // copy field values; sync so the full element is visible to every thread
+      scalar_field_accel[ijk] = potential_dot_dot_acoustic[iglob];
+      scalar_field_b_accel[ijk] = b_potential_dot_dot_acoustic[iglob];
+      __syncthreads();
+
+      // gets material parameter
+      rhol = rhostore[ijk_ispec_padded];
+
+      // acceleration vector
+      compute_gradient_kernel(ijk,ispec,
+                              scalar_field_accel,accel_elm,
+                              hprime_xx,hprime_yy,hprime_zz,
+                              d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
+                              rhol,gravity);
+
+      // acceleration vector from backward field
+      compute_gradient_kernel(ijk,ispec,
+                              scalar_field_b_accel,b_accel_elm,
+                              hprime_xx,hprime_yy,hprime_zz,
+                              d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
+                              rhol,gravity);
+      // approximates hessian: dot product of the two acceleration vectors
+      hess_kl[ijk_ispec] += deltat * (accel_elm[0]*b_accel_elm[0] +
+                                      accel_elm[1]*b_accel_elm[1] +
+                                      accel_elm[2]*b_accel_elm[2]);
+
+    } // ispec_is_acoustic
+
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Fortran-callable host wrapper: launches the approximate-Hessian accumulation
+// for the elastic and/or acoustic parts of the simulation, depending on the
+// ELASTIC_SIMULATION / ACOUSTIC_SIMULATION flags passed from Fortran.
+extern "C"
+void FC_FUNC_(compute_kernels_hess_cuda,
+              COMPUTE_KERNELS_HESS_CUDA)(long* Mesh_pointer,
+                                         realw* deltat_f,
+                                         int* ELASTIC_SIMULATION,
+                                         int* ACOUSTIC_SIMULATION) {
+  TRACE("compute_kernels_hess_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ
+  realw deltat = *deltat_f;
+
+  // fold the element count into a 2D grid (gridDim.x capped at 65535)
+  int num_blocks_x = mp->NSPEC_AB;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  if( *ELASTIC_SIMULATION ) {
+    compute_kernels_hess_el_cudakernel<<<grid,threads>>>(mp->d_ispec_is_elastic,
+                                                         mp->d_ibool,
+                                                         mp->d_accel,
+                                                         mp->d_b_accel,
+                                                         mp->d_hess_el_kl,
+                                                         deltat,
+                                                         mp->NSPEC_AB);
+  }
+
+  if( *ACOUSTIC_SIMULATION ) {
+    compute_kernels_hess_ac_cudakernel<<<grid,threads>>>(mp->d_ispec_is_acoustic,
+                                                         mp->d_ibool,
+                                                         mp->d_potential_dot_dot_acoustic,
+                                                         mp->d_b_potential_dot_dot_acoustic,
+                                                         mp->d_rhostore,
+                                                         mp->d_hprime_xx,
+                                                         mp->d_hprime_yy,
+                                                         mp->d_hprime_zz,
+                                                         mp->d_xix,
+                                                         mp->d_xiy,
+                                                         mp->d_xiz,
+                                                         mp->d_etax,
+                                                         mp->d_etay,
+                                                         mp->d_etaz,
+                                                         mp->d_gammax,
+                                                         mp->d_gammay,
+                                                         mp->d_gammaz,
+                                                         mp->d_hess_ac_kl,
+                                                         deltat,
+                                                         mp->NSPEC_AB,
+                                                         mp->gravity);
+  }
+
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_kernels_hess_cuda");
+#endif
+}
+

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,192 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Absorbing (Sommerfeld) boundary contribution for the acoustic domain.
+//
+// Launch configuration (see host wrapper below): one thread block per
+// absorbing boundary face, blocksize = NGLL2 threads, i.e. one thread per
+// GLL point of the face.
+//
+// For every boundary GLL point the kernel subtracts vel * jacobianw / cp
+// from potential_dot_dot_acoustic (first-order Sommerfeld condition).
+// SIMULATION_TYPE == 3: additionally re-injects the previously stored
+// boundary term into the backward field b_potential_dot_dot_acoustic.
+// SIMULATION_TYPE == 1 with SAVE_FORWARD: records the boundary term into
+// b_absorb_potential so the host can save it for later adjoint runs.
+__global__ void compute_stacey_acoustic_kernel(realw* potential_dot_acoustic,
+                                               realw* potential_dot_dot_acoustic,
+                                               int* abs_boundary_ispec,
+                                               int* abs_boundary_ijk,
+                                               realw* abs_boundary_jacobian2Dw,
+                                               int* ibool,
+                                               realw* rhostore,
+                                               realw* kappastore,
+                                               int* ispec_is_inner,
+                                               int* ispec_is_acoustic,
+                                               int phase_is_inner,
+                                               int SIMULATION_TYPE, int SAVE_FORWARD,
+                                               int num_abs_boundary_faces,
+                                               realw* b_potential_dot_acoustic,
+                                               realw* b_potential_dot_dot_acoustic,
+                                               realw* b_absorb_potential,
+                                               int gravity) {
+
+  int igll = threadIdx.x;                        // GLL point index within the face
+  int iface = blockIdx.x + gridDim.x*blockIdx.y; // boundary face index (2D grid)
+
+  int i,j,k,iglob,ispec;
+  realw rhol,kappal,cpl;
+  realw jacobianw;
+  realw vel;
+
+  // don't compute points outside NGLLSQUARE==NGLL2==25
+  // way 2: no further check needed since blocksize = 25
+  if( iface < num_abs_boundary_faces){
+
+  //  if(igll<NGLL2 && iface < num_abs_boundary_faces) {
+
+    // "-1" from index values to convert from Fortran-> C indexing
+    ispec = abs_boundary_ispec[iface]-1;
+
+    // only process faces matching the current communication phase and acoustic elements
+    if(ispec_is_inner[ispec] == phase_is_inner && ispec_is_acoustic[ispec] ) {
+
+      // local (i,j,k) GLL indices of this face point, then the global node index
+      i = abs_boundary_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
+      j = abs_boundary_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
+      k = abs_boundary_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
+      // NOTE(review): literal 5 here is NGLLX; prefer the named constant as used below
+      iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+
+      // determines bulk sound speed
+      // rhostore is padded for coalesced access, kappastore is not
+      rhol = rhostore[INDEX4_PADDED(NGLLX,NGLLX,NGLLX,i,j,k,ispec)];
+
+      kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
+
+      cpl = sqrt( kappal / rhol );
+
+      // velocity
+      if( gravity ){
+        // daniel: TODO - check gravity and stacey condition here...
+        // uses a potential definition of: s = grad(chi)
+        vel = potential_dot_acoustic[iglob] / rhol ;
+      }else{
+        // uses a potential definition of: s = 1/rho grad(chi)
+        vel = potential_dot_acoustic[iglob] / rhol;
+      }
+      // NOTE(review): both branches above currently evaluate the identical
+      // expression; the gravity case is a placeholder (see TODO).
+
+      // gets associated, weighted jacobian
+      jacobianw = abs_boundary_jacobian2Dw[INDEX2(NGLL2,igll,iface)];
+
+      // Sommerfeld condition
+      // atomicAdd: several faces may share the same global node iglob
+      atomicAdd(&potential_dot_dot_acoustic[iglob],-vel*jacobianw/cpl);
+
+      // adjoint simulations
+      if( SIMULATION_TYPE == 3 ){
+        // Sommerfeld condition
+        atomicAdd(&b_potential_dot_dot_acoustic[iglob],-b_absorb_potential[INDEX2(NGLL2,igll,iface)]);
+      }else if( SIMULATION_TYPE == 1 && SAVE_FORWARD ){
+        // saves boundary values
+        b_absorb_potential[INDEX2(NGLL2,igll,iface)] = vel*jacobianw/cpl;
+      }
+    }
+//  }
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Host-side wrapper (called from Fortran via FC_FUNC_ name mangling):
+// applies the acoustic Stacey absorbing boundary condition on the GPU.
+//
+// SIMULATION_TYPE == 3: uploads h_b_absorb_potential (read from disk on the
+// Fortran side) to the device before the kernel launch.
+// SIMULATION_TYPE == 1 && SAVE_FORWARD: downloads the freshly computed
+// boundary values back to the host so Fortran can write them to disk.
+extern "C"
+void FC_FUNC_(compute_stacey_acoustic_cuda,
+              COMPUTE_STACEY_ACOUSTIC_CUDA)(
+                                    long* Mesh_pointer_f,
+                                    int* phase_is_innerf,
+                                    int* SIMULATION_TYPEf,
+                                    int* SAVE_FORWARDf,
+                                    realw* h_b_absorb_potential) {
+TRACE("compute_stacey_acoustic_cuda");
+  //double start_time = get_time();
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  // dereference Fortran scalars passed by reference
+  int phase_is_inner          = *phase_is_innerf;
+  int SIMULATION_TYPE         = *SIMULATION_TYPEf;
+  int SAVE_FORWARD            = *SAVE_FORWARDf;
+
+  // way 1: Elapsed time: 4.385948e-03
+  // > NGLLSQUARE==NGLL2==25, but we handle this inside kernel
+  //  int blocksize = 32;
+
+  // way 2: Elapsed time: 4.379034e-03
+  // > NGLLSQUARE==NGLL2==25, no further check inside kernel
+  int blocksize = NGLL2;
+
+  // split faces over a 2D grid to respect the 65535 per-dimension block limit
+  int num_blocks_x = mp->d_num_abs_boundary_faces;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  //  adjoint simulations: reads in absorbing boundary
+  if (SIMULATION_TYPE == 3 && mp->d_num_abs_boundary_faces > 0 ){
+    // copies array to GPU (d_b_reclen_potential = record length in bytes)
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_potential,h_b_absorb_potential,
+                                       mp->d_b_reclen_potential,cudaMemcpyHostToDevice),7700);
+  }
+
+  compute_stacey_acoustic_kernel<<<grid,threads>>>(mp->d_potential_dot_acoustic,
+                                                   mp->d_potential_dot_dot_acoustic,
+                                                   mp->d_abs_boundary_ispec,
+                                                   mp->d_abs_boundary_ijk,
+                                                   mp->d_abs_boundary_jacobian2Dw,
+                                                   mp->d_ibool,
+                                                   mp->d_rhostore,
+                                                   mp->d_kappastore,
+                                                   mp->d_ispec_is_inner,
+                                                   mp->d_ispec_is_acoustic,
+                                                   phase_is_inner,
+                                                   SIMULATION_TYPE,SAVE_FORWARD,
+                                                   mp->d_num_abs_boundary_faces,
+                                                   mp->d_b_potential_dot_acoustic,
+                                                   mp->d_b_potential_dot_dot_acoustic,
+                                                   mp->d_b_absorb_potential,
+                                                   mp->gravity);
+
+  //  adjoint simulations: stores absorbed wavefield part
+  if (SIMULATION_TYPE == 1 && SAVE_FORWARD && mp->d_num_abs_boundary_faces > 0 ){
+    // copies array to CPU (implicitly synchronizes with the kernel above)
+    print_CUDA_error_if_any(cudaMemcpy(h_b_absorb_potential,mp->d_b_absorb_potential,
+                                       mp->d_b_reclen_potential,cudaMemcpyDeviceToHost),7701);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_stacey_acoustic_kernel");
+#endif
+}
+

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,214 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Absorbing (Stacey) boundary contribution for the elastic domain.
+//
+// Launch configuration: one thread block per absorbing boundary face,
+// blocksize = NGLL2 threads (one per GLL point of the face).
+//
+// For each boundary GLL point the traction t = rho*vp*vn*n + rho*vs*(v - vn*n)
+// (normal/tangential split of the velocity) weighted by the 2D jacobian is
+// subtracted from accel.  SIMULATION_TYPE == 3 re-injects stored boundary
+// values into the backward field b_accel; SIMULATION_TYPE == 1 with
+// SAVE_FORWARD records them into b_absorb_field instead.
+__global__ void compute_stacey_elastic_kernel(realw* veloc,
+                                              realw* accel,
+                                              int* abs_boundary_ispec,
+                                              int* abs_boundary_ijk,
+                                              realw* abs_boundary_normal,
+                                              realw* abs_boundary_jacobian2Dw,
+                                              int* ibool,
+                                              realw* rho_vp,
+                                              realw* rho_vs,
+                                              int* ispec_is_inner,
+                                              int* ispec_is_elastic,
+                                              int phase_is_inner,
+                                              int SIMULATION_TYPE,
+                                              int SAVE_FORWARD,
+                                              int num_abs_boundary_faces,
+                                              realw* b_accel,
+                                              realw* b_absorb_field) {
+
+  int igll = threadIdx.x; // tx - GLL point index within the face
+  int iface = blockIdx.x + gridDim.x*blockIdx.y; // bx - boundary face index (2D grid)
+
+  int i,j,k,iglob,ispec;
+  realw vx,vy,vz,vn;
+  realw nx,ny,nz;
+  realw rho_vp_temp,rho_vs_temp;
+  realw tx,ty,tz;
+  realw jacobianw;
+
+  // don't compute points outside NGLLSQUARE==NGLL2==25
+  // way 2: no further check needed since blocksize = 25
+  if( iface < num_abs_boundary_faces){
+
+  //if(igll < NGLL2 && iface < num_abs_boundary_faces) {
+
+    // "-1" from index values to convert from Fortran-> C indexing
+    ispec = abs_boundary_ispec[iface]-1;
+
+    // only process faces matching the current communication phase and elastic elements
+    if(ispec_is_inner[ispec] == phase_is_inner && ispec_is_elastic[ispec] ) {
+
+      i = abs_boundary_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
+      j = abs_boundary_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
+      k = abs_boundary_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
+      iglob = ibool[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)]-1;
+
+      // gets associated velocity
+
+      vx = veloc[iglob*3+0];
+      vy = veloc[iglob*3+1];
+      vz = veloc[iglob*3+2];
+
+      // gets associated normal
+      nx = abs_boundary_normal[INDEX3(NDIM,NGLL2,0,igll,iface)];
+      ny = abs_boundary_normal[INDEX3(NDIM,NGLL2,1,igll,iface)];
+      nz = abs_boundary_normal[INDEX3(NDIM,NGLL2,2,igll,iface)];
+
+      // velocity component in normal direction (normal points out of element)
+      vn = vx*nx + vy*ny + vz*nz;
+
+      rho_vp_temp = rho_vp[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)];
+      rho_vs_temp = rho_vs[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)];
+
+      // Stacey traction: P-impedance on the normal part, S-impedance on the tangential part
+      tx = rho_vp_temp*vn*nx + rho_vs_temp*(vx-vn*nx);
+      ty = rho_vp_temp*vn*ny + rho_vs_temp*(vy-vn*ny);
+      tz = rho_vp_temp*vn*nz + rho_vs_temp*(vz-vn*nz);
+
+      // weighted 2D jacobian of this face point
+      jacobianw = abs_boundary_jacobian2Dw[INDEX2(NGLL2,igll,iface)];
+
+      // atomicAdd: several faces may share the same global node iglob
+      atomicAdd(&accel[iglob*3],-tx*jacobianw);
+      atomicAdd(&accel[iglob*3+1],-ty*jacobianw);
+      atomicAdd(&accel[iglob*3+2],-tz*jacobianw);
+
+      if(SIMULATION_TYPE == 3) {
+        // adjoint simulations: re-injects stored boundary contribution into backward field
+        atomicAdd(&b_accel[iglob*3  ],-b_absorb_field[INDEX3(NDIM,NGLL2,0,igll,iface)]);
+        atomicAdd(&b_accel[iglob*3+1],-b_absorb_field[INDEX3(NDIM,NGLL2,1,igll,iface)]);
+        atomicAdd(&b_accel[iglob*3+2],-b_absorb_field[INDEX3(NDIM,NGLL2,2,igll,iface)]);
+      }
+      else if(SAVE_FORWARD && SIMULATION_TYPE == 1) {
+        // forward simulations: saves boundary contribution for later adjoint runs
+        b_absorb_field[INDEX3(NDIM,NGLL2,0,igll,iface)] = tx*jacobianw;
+        b_absorb_field[INDEX3(NDIM,NGLL2,1,igll,iface)] = ty*jacobianw;
+        b_absorb_field[INDEX3(NDIM,NGLL2,2,igll,iface)] = tz*jacobianw;
+      } // SIMULATION_TYPE
+    }
+  } // num_abs_boundary_faces
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+// Host-side wrapper (called from Fortran via FC_FUNC_ name mangling):
+// applies the elastic Stacey absorbing boundary condition on the GPU.
+//
+// SIMULATION_TYPE == 3: uploads h_b_absorb_field (read on the Fortran side)
+// before the kernel launch.
+// SIMULATION_TYPE == 1 && SAVE_FORWARD: downloads the computed boundary
+// values afterwards so Fortran can write them to disk.
+extern "C"
+void FC_FUNC_(compute_stacey_elastic_cuda,
+              COMPUTE_STACEY_ELASTIC_CUDA)(long* Mesh_pointer_f,
+                                           int* phase_is_innerf,
+                                           int* SIMULATION_TYPEf,
+                                           int* SAVE_FORWARDf,
+                                           realw* h_b_absorb_field) {
+
+TRACE("compute_stacey_elastic_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  // check: nothing to do without absorbing boundary faces
+  // NOTE(review): this early return makes the "> 0" guards below redundant;
+  //               the acoustic wrapper above has no such early return.
+  if( mp->d_num_abs_boundary_faces == 0 ) return;
+
+  int phase_is_inner    = *phase_is_innerf;
+  int SIMULATION_TYPE   = *SIMULATION_TYPEf;
+  int SAVE_FORWARD      = *SAVE_FORWARDf;
+
+  // way 1
+  // > NGLLSQUARE==NGLL2==25, but we handle this inside kernel
+  //int blocksize = 32;
+
+  // way 2: seems slightly faster
+  // > NGLLSQUARE==NGLL2==25, no further check inside kernel
+  int blocksize = NGLL2;
+
+  // split faces over a 2D grid to respect the 65535 per-dimension block limit
+  int num_blocks_x = mp->d_num_abs_boundary_faces;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  if(SIMULATION_TYPE == 3 && mp->d_num_abs_boundary_faces > 0) {
+    // The read is done in fortran
+    // (d_b_reclen_field = record length in bytes)
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field,h_b_absorb_field,
+                                       mp->d_b_reclen_field,cudaMemcpyHostToDevice),7700);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("between cudamemcpy and compute_stacey_elastic_kernel");
+#endif
+
+  compute_stacey_elastic_kernel<<<grid,threads>>>(mp->d_veloc,
+                                                  mp->d_accel,
+                                                  mp->d_abs_boundary_ispec,
+                                                  mp->d_abs_boundary_ijk,
+                                                  mp->d_abs_boundary_normal,
+                                                  mp->d_abs_boundary_jacobian2Dw,
+                                                  mp->d_ibool,
+                                                  mp->d_rho_vp,
+                                                  mp->d_rho_vs,
+                                                  mp->d_ispec_is_inner,
+                                                  mp->d_ispec_is_elastic,
+                                                  phase_is_inner,
+                                                  SIMULATION_TYPE,SAVE_FORWARD,
+                                                  mp->d_num_abs_boundary_faces,
+                                                  mp->d_b_accel,
+                                                  mp->d_b_absorb_field);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_stacey_elastic_kernel");
+#endif
+
+  // ! adjoint simulations: stores absorbed wavefield part
+  // if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. num_abs_boundary_faces > 0 ) &
+  //   write(IOABS,rec=it) b_reclen_field,b_absorb_field,b_reclen_field
+
+  if(SIMULATION_TYPE == 1 && SAVE_FORWARD && mp->d_num_abs_boundary_faces > 0 ) {
+    // download boundary values (implicitly synchronizes with the kernel above)
+    print_CUDA_error_if_any(cudaMemcpy(h_b_absorb_field,mp->d_b_absorb_field,
+                                       mp->d_b_reclen_field,cudaMemcpyDeviceToHost),7701);
+    // The write is done in fortran
+    // write_abs_(&fid,(char*)b_absorb_field,&b_reclen_field,&it);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after compute_stacey_elastic after cudamemcpy");
+#endif
+}
+

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,506 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+
+
+#define CUBLAS_ERROR(s,n)  if (s != CUBLAS_STATUS_SUCCESS) {  \
+fprintf (stderr, "CUBLAS Memory Write Error @ %d\n",n); \
+exit(EXIT_FAILURE); }
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// elastic wavefield
+
+// KERNEL 1
+/* ----------------------------------------------------------------------------------------------- */
+
+
+// Newmark time-scheme predictor step for the elastic wavefield (KERNEL 1).
+// One thread per array entry:
+//   displ += deltat*veloc + deltatsqover2*accel
+//   veloc += deltatover2*accel
+//   accel  = 0   (cleared for the upcoming force computation)
+__global__ void UpdateDispVeloc_kernel(realw* displ,
+                                       realw* veloc,
+                                       realw* accel,
+                                       int size,
+                                       realw deltat,
+                                       realw deltatsqover2,
+                                       realw deltatover2) {
+  // global thread id over a 2D grid of 1D blocks
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+  /* because of block and grid sizing problems, there is a small */
+  /* amount of buffer at the end of the calculation */
+  if(id < size) {
+    displ[id] = displ[id] + deltat*veloc[id] + deltatsqover2*accel[id];
+    veloc[id] = veloc[id] + deltatover2*accel[id];
+    accel[id] = 0; // can do this using memset...not sure if faster
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Host-side wrapper (called from Fortran): runs the Newmark predictor update
+// on the forward elastic fields, and on the backward fields as well when
+// SIMULATION_TYPE == 3 (using the b_* time-step constants).
+//
+// NOTE(review): the uppercase mangled name is misspelled "DISPLACMENT"
+// (missing 'E'); it must match the Fortran call site exactly, so confirm
+// against the Fortran code before correcting it.
+extern "C"
+void FC_FUNC_(it_update_displacement_cuda,
+              IT_UPDATE_DISPLACMENT_CUDA)(long* Mesh_pointer_f,
+                                                 int* size_F,
+                                                 realw* deltat_F,
+                                                 realw* deltatsqover2_F,
+                                                 realw* deltatover2_F,
+                                                 int* SIMULATION_TYPE,
+                                                 realw* b_deltat_F,
+                                                 realw* b_deltatsqover2_F,
+                                                 realw* b_deltatover2_F) {
+
+TRACE("it_update_displacement_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
+  //int i,device;
+
+  // dereference Fortran scalars passed by reference
+  int size = *size_F;
+  realw deltat = *deltat_F;
+  realw deltatsqover2 = *deltatsqover2_F;
+  realw deltatover2 = *deltatover2_F;
+  realw b_deltat = *b_deltat_F;
+  realw b_deltatsqover2 = *b_deltatsqover2_F;
+  realw b_deltatover2 = *b_deltatover2_F;
+  //cublasStatus status;
+
+  // pad the array length up to a multiple of the block size
+  int blocksize = BLOCKSIZE_KERNEL1;
+  int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+
+  // split blocks over a 2D grid to respect the 65535 per-dimension limit
+  int num_blocks_x = size_padded/blocksize;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+
+//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+//  exit_on_cuda_error("Before UpdateDispVeloc_kernel");
+//#endif
+
+  //launch kernel
+  UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ,mp->d_veloc,mp->d_accel,
+                                           size,deltat,deltatsqover2,deltatover2);
+
+  //cudaThreadSynchronize();
+//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+//  //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
+//  // sync and check to catch errors from previous async operations
+//  exit_on_cuda_error("UpdateDispVeloc_kernel");
+//#endif
+
+  // kernel for backward fields
+  if(*SIMULATION_TYPE == 3) {
+
+    UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_b_displ,mp->d_b_veloc,mp->d_b_accel,
+                                             size,b_deltat,b_deltatsqover2,b_deltatover2);
+
+//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+//    //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
+//    exit_on_cuda_error("after SIM_TYPE==3 UpdateDispVeloc_kernel");
+//#endif
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("it_update_displacement_cuda");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// acoustic wavefield
+
+// KERNEL 1
+/* ----------------------------------------------------------------------------------------------- */
+
+// Newmark time-scheme predictor step for the acoustic potentials (KERNEL 1).
+// One thread per array entry; mirrors UpdateDispVeloc_kernel for the
+// scalar potential instead of the vector displacement:
+//   chi      += deltat*chi_dot + deltatsqover2*chi_dot_dot
+//   chi_dot  += deltatover2*chi_dot_dot
+//   chi_dot_dot = 0   (cleared for the upcoming force computation)
+__global__ void UpdatePotential_kernel(realw* potential_acoustic,
+                                       realw* potential_dot_acoustic,
+                                       realw* potential_dot_dot_acoustic,
+                                       int size,
+                                       realw deltat,
+                                       realw deltatsqover2,
+                                       realw deltatover2) {
+  // global thread id over a 2D grid of 1D blocks
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+  /* because of block and grid sizing problems, there is a small */
+  /* amount of buffer at the end of the calculation */
+  if(id < size) {
+    potential_acoustic[id] = potential_acoustic[id]
+                            + deltat*potential_dot_acoustic[id]
+                            + deltatsqover2*potential_dot_dot_acoustic[id];
+
+    potential_dot_acoustic[id] = potential_dot_acoustic[id]
+                                + deltatover2*potential_dot_dot_acoustic[id];
+
+    potential_dot_dot_acoustic[id] = 0;
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Host-side wrapper (called from Fortran): runs the Newmark predictor update
+// on the forward acoustic potentials, and on the backward potentials as well
+// when SIMULATION_TYPE == 3 (using the b_* time-step constants).
+//
+// NOTE(review): the second FC_FUNC_ argument is lowercase here, unlike
+// every other wrapper in this file which passes an UPPERCASE variant;
+// harmless only if the build never selects uppercase mangling - confirm.
+extern "C"
+void FC_FUNC_(it_update_displacement_ac_cuda,
+              it_update_displacement_ac_cuda)(long* Mesh_pointer_f,
+                                               int* size_F,
+                                               realw* deltat_F,
+                                               realw* deltatsqover2_F,
+                                               realw* deltatover2_F,
+                                               int* SIMULATION_TYPE,
+                                               realw* b_deltat_F,
+                                               realw* b_deltatsqover2_F,
+                                               realw* b_deltatover2_F) {
+TRACE("it_update_displacement_ac_cuda");
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
+  //int i,device;
+  // dereference Fortran scalars passed by reference
+  int size = *size_F;
+  realw deltat = *deltat_F;
+  realw deltatsqover2 = *deltatsqover2_F;
+  realw deltatover2 = *deltatover2_F;
+  realw b_deltat = *b_deltat_F;
+  realw b_deltatsqover2 = *b_deltatsqover2_F;
+  realw b_deltatover2 = *b_deltatover2_F;
+  //cublasStatus status;
+
+  // pad the array length up to a multiple of the block size
+  int blocksize = BLOCKSIZE_KERNEL1;
+  int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+
+  // split blocks over a 2D grid to respect the 65535 per-dimension limit
+  int num_blocks_x = size_padded/blocksize;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  //launch kernel
+  UpdatePotential_kernel<<<grid,threads>>>(mp->d_potential_acoustic,
+                                           mp->d_potential_dot_acoustic,
+                                           mp->d_potential_dot_dot_acoustic,
+                                           size,deltat,deltatsqover2,deltatover2);
+
+  // backward fields for adjoint/kernel simulations
+  if(*SIMULATION_TYPE == 3) {
+    UpdatePotential_kernel<<<grid,threads>>>(mp->d_b_potential_acoustic,
+                                             mp->d_b_potential_dot_acoustic,
+                                             mp->d_b_potential_dot_dot_acoustic,
+                                             size,b_deltat,b_deltatsqover2,b_deltatover2);
+  }
+
+  //cudaThreadSynchronize();
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
+  exit_on_cuda_error("it_update_displacement_ac_cuda");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// KERNEL 3
+//
+// crust/mantle and inner core regions
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Newmark corrector step (KERNEL 3), combined accel+veloc update.
+// One thread per global node; accel/veloc are interleaved 3-component
+// vectors (x,y,z), rmass has one scalar per node.
+// rmass is applied multiplicatively, so it is expected to hold the
+// assembled inverse mass matrix (1/M) - TODO confirm at setup.
+//   accel *= rmass
+//   veloc += deltatover2*accel
+__global__ void kernel_3_cuda_device(realw* veloc,
+                                     realw* accel, int size,
+                                     realw deltatover2,
+                                     realw* rmass) {
+  // global thread id over a 2D grid of 1D blocks
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+  /* because of block and grid sizing problems, there is a small */
+  /* amount of buffer at the end of the calculation */
+  if(id < size) {
+    accel[3*id] = accel[3*id]*rmass[id];
+    accel[3*id+1] = accel[3*id+1]*rmass[id];
+    accel[3*id+2] = accel[3*id+2]*rmass[id];
+
+    veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
+    veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1];
+    veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2];
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Newmark corrector step (KERNEL 3), acceleration part only:
+//   accel *= rmass   (per node, all 3 components)
+// Used when the velocity update must be deferred (e.g. ocean-load coupling
+// still has to modify accel first - see kernel_3_a_cuda below).
+__global__ void kernel_3_accel_cuda_device(realw* accel,
+                                           int size,
+                                           realw* rmass) {
+  // global thread id over a 2D grid of 1D blocks
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+  /* because of block and grid sizing problems, there is a small */
+  /* amount of buffer at the end of the calculation */
+  if(id < size) {
+    accel[3*id] = accel[3*id]*rmass[id];
+    accel[3*id+1] = accel[3*id+1]*rmass[id];
+    accel[3*id+2] = accel[3*id+2]*rmass[id];
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Newmark corrector step (KERNEL 3), velocity part only:
+//   veloc += deltatover2*accel   (per node, all 3 components)
+// Complements kernel_3_accel_cuda_device when the two halves of the
+// corrector have to run separately.
+__global__ void kernel_3_veloc_cuda_device(realw* veloc,
+                                           realw* accel,
+                                           int size,
+                                           realw deltatover2) {
+  // global thread id over a 2D grid of 1D blocks
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+  /* because of block and grid sizing problems, there is a small */
+  /* amount of buffer at the end of the calculation */
+  if(id < size) {
+    veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
+    veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1];
+    veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2];
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Host-side wrapper (called from Fortran): Newmark corrector, part A,
+// for the crust/mantle region.
+//
+// Without ocean loads (*OCEANS == 0) both halves of the corrector
+// (accel *= rmass; veloc += deltatover2*accel) run in one kernel.
+// With ocean loads only the accel half runs here, because the ocean
+// coupling still has to modify accel before the velocity update
+// (finished later in kernel_3_b_cuda).
+// For SIMULATION_TYPE == 3 the same update is applied to the backward
+// fields with b_deltatover2.
+extern "C"
+void FC_FUNC_(kernel_3_a_cuda,
+              KERNEL_3_A_CUDA)(long* Mesh_pointer,
+                               realw* deltatover2_F,
+                               int* SIMULATION_TYPE_f,
+                               realw* b_deltatover2_F,
+                               int* OCEANS) {
+  TRACE("kernel_3_a_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+  
+  int SIMULATION_TYPE = *SIMULATION_TYPE_f;
+  realw deltatover2 = *deltatover2_F;
+  realw b_deltatover2 = *b_deltatover2_F;
+
+  // pad the node count up to a multiple of the block size
+  int blocksize = BLOCKSIZE_KERNEL3;
+  int size_padded = ((int)ceil(((double)mp->NGLOB_CRUST_MANTLE)/((double)blocksize)))*blocksize;
+
+  // split blocks over a 2D grid to respect the 65535 per-dimension limit
+  int num_blocks_x = size_padded/blocksize;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  // crust/mantle region only
+  // check whether we can update accel and veloc, or only accel at this point
+  if( *OCEANS == 0 ){
+    // updates both, accel and veloc
+    kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_crust_mantle, 
+                                             mp->d_accel_crust_mantle, 
+                                             mp->NGLOB_CRUST_MANTLE, 
+                                             deltatover2, mp->d_rmass_crust_mantle);
+
+    if(SIMULATION_TYPE == 3) {
+      kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_crust_mantle, 
+                                               mp->d_b_accel_crust_mantle, 
+                                               mp->NGLOB_CRUST_MANTLE, 
+                                               b_deltatover2,mp->d_rmass_crust_mantle);
+    }
+  }else{
+    // updates only accel
+    kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_accel_crust_mantle, 
+                                                   mp->NGLOB_CRUST_MANTLE, 
+                                                   mp->d_rmass_crust_mantle);
+
+    if(SIMULATION_TYPE == 3) {
+      kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_b_accel_crust_mantle, 
+                                                     mp->NGLOB_CRUST_MANTLE, 
+                                                     mp->d_rmass_crust_mantle);
+    }
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
+  exit_on_cuda_error("after kernel_3_a");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// kernel 3, part b: finishes the Newmark corrector stage. when ocean loads
+// are present (*OCEANS != 0) the crust/mantle veloc update that was skipped
+// in kernel_3_a_cuda is performed here; afterwards both accel and veloc are
+// updated for the inner core region. backward/reconstructed fields are
+// updated as well when SIMULATION_TYPE == 3.
+extern "C"
+void FC_FUNC_(kernel_3_b_cuda,
+              KERNEL_3_B_CUDA)(long* Mesh_pointer,
+                               realw* deltatover2_F,
+                               int* SIMULATION_TYPE_f,
+                               realw* b_deltatover2_F,
+                               int* OCEANS) {
+  TRACE("kernel_3_b_cuda");
+  int size_padded,num_blocks_x,num_blocks_y;
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+
+  int SIMULATION_TYPE = *SIMULATION_TYPE_f;
+  realw deltatover2 = *deltatover2_F;
+  realw b_deltatover2 = *b_deltatover2_F;
+
+  int blocksize = BLOCKSIZE_KERNEL3;
+  
+  // crust/mantle region
+  // in case of ocean loads, we still have to update the velocity for crust/mantle region
+  if( *OCEANS ){
+    // one thread per crust/mantle global point, rounded up to the block size
+    size_padded = ((int)ceil(((double)mp->NGLOB_CRUST_MANTLE)/((double)blocksize)))*blocksize;
+    num_blocks_x = size_padded/blocksize;
+    num_blocks_y = 1;
+    // folds the grid into 2D to respect the 65535 blocks-per-dimension limit
+    while(num_blocks_x > 65535) {
+      num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+      num_blocks_y = num_blocks_y*2;
+    }
+    // note: this grid/threads pair is scoped to the OCEANS branch; the
+    //       inner core launch below uses its own declarations
+    dim3 grid(num_blocks_x,num_blocks_y);
+    dim3 threads(blocksize,1,1);
+    
+    // updates only veloc at this point
+    kernel_3_veloc_cuda_device<<< grid, threads>>>(mp->d_veloc_crust_mantle,
+                                                   mp->d_accel_crust_mantle,
+                                                   mp->NGLOB_CRUST_MANTLE,
+                                                   deltatover2);
+
+    if(SIMULATION_TYPE == 3) {
+      kernel_3_veloc_cuda_device<<< grid, threads>>>(mp->d_b_veloc_crust_mantle,
+                                                     mp->d_b_accel_crust_mantle,
+                                                     mp->NGLOB_CRUST_MANTLE,
+                                                     b_deltatover2);
+    }
+  }
+  
+  // inner core  
+  size_padded = ((int)ceil(((double)mp->NGLOB_INNER_CORE)/((double)blocksize)))*blocksize;
+  num_blocks_x = size_padded/blocksize;
+  num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+  
+  // updates both, accel and veloc
+  kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_inner_core, 
+                                           mp->d_accel_inner_core, 
+                                           mp->NGLOB_INNER_CORE, 
+                                           deltatover2, mp->d_rmass_inner_core);
+  
+  if(SIMULATION_TYPE == 3) {
+    kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_inner_core, 
+                                             mp->d_b_accel_inner_core, 
+                                             mp->NGLOB_INNER_CORE, 
+                                             b_deltatover2,mp->d_rmass_inner_core);
+  }  
+
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
+  exit_on_cuda_error("after kernel_3_b");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// KERNEL 3
+//
+// for outer_core
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+// corrector update for the (scalar, fluid) outer core wavefield.
+// launched with one thread per global point; the 2D grid layout is flattened
+// back to a linear point index here.
+__global__ void kernel_3_outer_core_cuda_device(realw* veloc,
+                                                realw* accel,int size,
+                                                realw deltatover2,
+                                                realw* rmass) {
+  // linear thread id over a (x,y) grid of 1D blocks
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+  /* because of block and grid sizing problems, there is a small */
+  /* amount of buffer at the end of the calculation */
+  if(id < size) {
+    // multiplies pressure with the inverse of the mass matrix
+    accel[id] = accel[id]*rmass[id];
+    
+    // Newmark time scheme: corrector term
+    veloc[id] = veloc[id] + deltatover2*accel[id];    
+  }
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// kernel 3 for the outer core: applies the inverse mass matrix to the
+// potential "accel" field and advances "veloc" by deltatover2*accel;
+// the backward/reconstructed fields get the same treatment when
+// SIMULATION_TYPE == 3.
+extern "C"
+void FC_FUNC_(kernel_3_outer_core_cuda,
+              KERNEL_3_OUTER_CORE_CUDA)(long* Mesh_pointer,
+                                        realw* deltatover2_F,
+                                        int* SIMULATION_TYPE_f,
+                                        realw* b_deltatover2_F) {
+
+  TRACE("kernel_3_outer_core_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+
+  int SIMULATION_TYPE = *SIMULATION_TYPE_f;
+  realw deltatover2 = *deltatover2_F;
+  realw b_deltatover2 = *b_deltatover2_F;
+  
+  int blocksize = BLOCKSIZE_KERNEL3;
+  // one thread per outer core global point, rounded up to the block size
+  int size_padded = ((int)ceil(((double)mp->NGLOB_OUTER_CORE)/((double)blocksize)))*blocksize;
+  int num_blocks_x = size_padded/blocksize;
+  int num_blocks_y = 1;
+  // folds the grid into 2D to respect the 65535 blocks-per-dimension limit
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  kernel_3_outer_core_cuda_device<<< grid, threads>>>(mp->d_veloc_outer_core,
+                                                      mp->d_accel_outer_core,
+                                                      mp->NGLOB_OUTER_CORE,
+                                                      deltatover2,mp->d_rmass_outer_core);
+
+  if(SIMULATION_TYPE == 3) {
+    kernel_3_outer_core_cuda_device<<< grid, threads>>>(mp->d_b_veloc_outer_core,
+                                                        mp->d_b_accel_outer_core,
+                                                        mp->NGLOB_OUTER_CORE,
+                                                        b_deltatover2,mp->d_rmass_outer_core);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
+  exit_on_cuda_error("after kernel_3_outer_core");
+#endif
+}
+

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,706 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+/* trivia
+
+- for most working arrays we use now "realw" instead of "float" type declarations to make it easier to switch
+  between a real or double precision simulation
+  (matching CUSTOM_REAL == 4 or 8 in fortran routines).
+
+- instead of boolean "logical" declared in fortran routines, in C (or Cuda-C) we have to use "int" variables.
+  ifort / gfortran caveat:
+    to check whether it is true or false, do not check for == 1 to test for true values since ifort just uses
+    non-zero values for true (e.g. can be -1 for true). however, false will be always == 0.
+  thus, rather use: if( var ) {...}  for testing if true instead of if( var == 1){...} (alternative: one could use if( var != 0 ){...} )
+
+*/
+
+#ifndef GPU_MESH_
+#define GPU_MESH_
+
+#include <sys/types.h>
+#include <unistd.h>
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// for debugging and benchmarking
+
+/* ----------------------------------------------------------------------------------------------- */
+
+#define DEBUG 0
+#if DEBUG == 1
+#define TRACE(x) printf("%s\n",x);
+#else
+#define TRACE(x) // printf("%s\n",x);
+#endif
+
+#define MAXDEBUG 0
+#if MAXDEBUG == 1
+#define LOG(x) printf("%s\n",x)
+#define PRINT5(var,offset) for(;print_count<5;print_count++) printf("var(%d)=%2.20f\n",print_count,var[offset+print_count]);
+#define PRINT10(var) if(print_count<10) { printf("var=%1.20e\n",var); print_count++; }
+#define PRINT10i(var) if(print_count<10) { printf("var=%d\n",var); print_count++; }
+#else
+#define LOG(x) // printf("%s\n",x);
+#define PRINT5(var,offset) // for(i=0;i<10;i++) printf("var(%d)=%f\n",i,var[offset+i]);
+// no-op versions so that code using PRINT10/PRINT10i still compiles when
+// MAXDEBUG == 0 (previously these were only defined in the MAXDEBUG == 1
+// branch above, which would break any non-debug build that uses them)
+#define PRINT10(var)
+#define PRINT10i(var)
+#endif
+
+// error checking after cuda function calls
+#define ENABLE_VERY_SLOW_ERROR_CHECKING
+
+// maximum function
+// note: both arguments are evaluated twice - do not pass expressions with
+//       side effects (e.g. MAX(i++,j))
+#define MAX(x,y)                    (((x) < (y)) ? (y) : (x))
+
+// utility functions: defined in check_fields_cuda.cu
+double get_time();
+void get_free_memory(double* free_db, double* used_db, double* total_db);
+void print_CUDA_error_if_any(cudaError_t err, int num);
+void pause_for_debugger(int pause);
+void exit_on_cuda_error(char* kernel_name);
+void exit_on_error(char* info);
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// cuda constant arrays
+
+/* ----------------------------------------------------------------------------------------------- */
+// (must match constants.h definitions)
+
+// dimensions
+#define NDIM 3
+
+// Gauss-Lobatto-Legendre
+#define NGLLX 5
+#define NGLL2 25
+#define NGLL3 125 // no padding: requires same size as in fortran for NGLLX * NGLLY * NGLLZ
+
+// padding: 128 == 2**7 might improve on older graphics cards w/ coalescent memory accesses:
+#define NGLL3_PADDED 128
+// no padding: 125 == 5*5*5 to avoid allocation of extra memory
+//#define NGLL3_PADDED 125
+
+// number of standard linear solids
+#define N_SLS 3
+
+// region ids
+#define IREGION_CRUST_MANTLE  1
+#define IREGION_INNER_CORE  3
+
+/* ----------------------------------------------------------------------------------------------- */
+
+//typedef float real;   // type of variables passed into function
+typedef float realw;  // type of "working" variables
+
+// double precision temporary variables leads to 10% performance
+// decrease in Kernel_2_impl (not very much..)
+typedef float reald;
+
+// (optional) pre-processing directive used in kernels: if defined check that it is also set in src/shared/constants.h:
+// leads up to ~ 5% performance increase
+//#define USE_MESH_COLORING_GPU
+
+// (optional) unrolling loops
+// leads up to ~1% performance increase
+//#define MANUALLY_UNROLLED_LOOPS
+
+// cuda kernel block size for updating displacements/potential (newmark time scheme)
+// current hardware: 128 is slightly faster than 256 ( ~ 4%)
+#define BLOCKSIZE_KERNEL1 128
+#define BLOCKSIZE_KERNEL3 128
+#define BLOCKSIZE_TRANSFER 256
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// indexing: flatten multi-dimensional indices into a linear offset
+// (first index varies fastest, matching fortran column-major array order).
+// arguments and the full expansion are parenthesized so the macros can be
+// used safely inside larger expressions without operator-precedence surprises
+
+#define INDEX2(xsize,x,y) ((x) + (y)*(xsize))
+
+#define INDEX3(xsize,ysize,x,y,z) ((x) + (xsize)*((y) + (ysize)*(z)))
+//#define INDEX3(xsize,ysize,x,y,z) x + (y)*xsize + (z)*xsize*ysize
+
+#define INDEX4(xsize,ysize,zsize,x,y,z,i) ((x) + (xsize)*((y) + (ysize)*((z) + (zsize)*(i))))
+//#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize
+
+#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) ((x) + (xsize)*((y) + (ysize)*((z) + (zsize)*((i) + (isize)*(j)))))
+//#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize + (j)*xsize*ysize*zsize*isize
+
+#define INDEX6(xsize,ysize,zsize,isize,jsize,x,y,z,i,j,k) ((x) + (xsize)*((y) + (ysize)*((z) + (zsize)*((i) + (isize)*((j) + (jsize)*(k))))))
+
+#define INDEX4_PADDED(xsize,ysize,zsize,x,y,z,i) ((x) + (xsize)*((y) + (ysize)*(z)) + (i)*NGLL3_PADDED)
+//#define INDEX4_PADDED(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*NGLL3_PADDED
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// mesh pointer wrapper structure
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Wrapper structure holding all array pointers and sizes for one process.
+// It is allocated once on the C side and handed back to fortran as an
+// opaque integer ("Mesh_pointer") which every cuda wrapper routine casts
+// back to Mesh*. naming convention: d_* members presumably point to device
+// memory and h_* members to host-side copies - TODO confirm against the
+// allocation code in the .cu files.
+typedef struct mesh_ {
+
+  // mesh resolution
+  // ------------------------------------------------------------------ //
+  // crust_mantle
+  // ------------------------------------------------------------------ //
+  int NSPEC_CRUST_MANTLE;
+  int NGLOB_CRUST_MANTLE;
+
+  // interpolators
+  realw* d_xix_crust_mantle; realw* d_xiy_crust_mantle; realw* d_xiz_crust_mantle;
+  realw* d_etax_crust_mantle; realw* d_etay_crust_mantle; realw* d_etaz_crust_mantle;
+  realw* d_gammax_crust_mantle; realw* d_gammay_crust_mantle; realw* d_gammaz_crust_mantle;
+
+  // model parameters
+  realw* d_rhostore_crust_mantle;
+  realw* d_kappavstore_crust_mantle; realw* d_muvstore_crust_mantle;
+  realw* d_kappahstore_crust_mantle; realw* d_muhstore_crust_mantle;
+  realw* d_eta_anisostore_crust_mantle;
+  realw* d_rmass_crust_mantle;
+
+  // global indexing
+  int* d_ibool_crust_mantle;
+  int* d_ispec_is_tiso_crust_mantle;
+
+  // mesh locations
+  realw* d_xstore_crust_mantle; realw* d_ystore_crust_mantle; realw* d_zstore_crust_mantle;
+
+  // anisotropic 3D mantle
+  realw* d_c11store_crust_mantle;
+  realw* d_c12store_crust_mantle;
+  realw* d_c13store_crust_mantle;
+  realw* d_c14store_crust_mantle;
+  realw* d_c15store_crust_mantle;
+  realw* d_c16store_crust_mantle;
+  realw* d_c22store_crust_mantle;
+  realw* d_c23store_crust_mantle;
+  realw* d_c24store_crust_mantle;
+  realw* d_c25store_crust_mantle;
+  realw* d_c26store_crust_mantle;
+  realw* d_c33store_crust_mantle;
+  realw* d_c34store_crust_mantle;
+  realw* d_c35store_crust_mantle;
+  realw* d_c36store_crust_mantle;
+  realw* d_c44store_crust_mantle;
+  realw* d_c45store_crust_mantle;
+  realw* d_c46store_crust_mantle;
+  realw* d_c55store_crust_mantle;
+  realw* d_c56store_crust_mantle;
+  realw* d_c66store_crust_mantle;
+
+  // wavefields
+  // displacement, velocity, acceleration
+  realw* d_displ_crust_mantle; realw* d_veloc_crust_mantle; realw* d_accel_crust_mantle;
+  // backward/reconstructed elastic wavefield
+  realw* d_b_displ_crust_mantle; realw* d_b_veloc_crust_mantle; realw* d_b_accel_crust_mantle;
+
+  // attenuation
+  realw* d_R_xx_crust_mantle;
+  realw* d_R_yy_crust_mantle;
+  realw* d_R_xy_crust_mantle;
+  realw* d_R_xz_crust_mantle;
+  realw* d_R_yz_crust_mantle;
+
+  realw* d_b_R_xx_crust_mantle;
+  realw* d_b_R_yy_crust_mantle;
+  realw* d_b_R_xy_crust_mantle;
+  realw* d_b_R_xz_crust_mantle;
+  realw* d_b_R_yz_crust_mantle;
+
+  realw* d_factor_common_crust_mantle;
+  realw* d_one_minus_sum_beta_crust_mantle;
+
+  realw* d_epsilondev_xx_crust_mantle;
+  realw* d_epsilondev_yy_crust_mantle;
+  realw* d_epsilondev_xy_crust_mantle;
+  realw* d_epsilondev_xz_crust_mantle;
+  realw* d_epsilondev_yz_crust_mantle;
+
+  realw* d_b_epsilondev_xx_crust_mantle;
+  realw* d_b_epsilondev_yy_crust_mantle;
+  realw* d_b_epsilondev_xy_crust_mantle;
+  realw* d_b_epsilondev_xz_crust_mantle;
+  realw* d_b_epsilondev_yz_crust_mantle;
+
+  realw* d_eps_trace_over_3_crust_mantle;
+  realw* d_b_eps_trace_over_3_crust_mantle;
+
+  // inner / outer elements
+  int* d_phase_ispec_inner_crust_mantle;
+  int num_phase_ispec_crust_mantle;
+
+  int nspec_outer_crust_mantle;
+  int nspec_inner_crust_mantle;
+
+  int num_colors_inner_crust_mantle;
+  int num_colors_outer_crust_mantle;
+  int* h_num_elem_colors_crust_mantle;
+
+
+  // ------------------------------------------------------------------ //
+  // outer_core
+  // ------------------------------------------------------------------ //
+  int NSPEC_OUTER_CORE;
+  int NGLOB_OUTER_CORE;
+
+  // interpolators
+  realw* d_xix_outer_core; realw* d_xiy_outer_core; realw* d_xiz_outer_core;
+  realw* d_etax_outer_core; realw* d_etay_outer_core; realw* d_etaz_outer_core;
+  realw* d_gammax_outer_core; realw* d_gammay_outer_core; realw* d_gammaz_outer_core;
+
+  // model parameters
+  realw* d_rhostore_outer_core; realw* d_kappavstore_outer_core;
+  realw* d_rmass_outer_core;
+
+  // global indexing
+  int* d_ibool_outer_core;
+
+  // mesh locations
+  realw* d_xstore_outer_core; realw* d_ystore_outer_core; realw* d_zstore_outer_core;
+
+  // wavefields
+  // displacement, velocity, acceleration
+  realw* d_displ_outer_core; realw* d_veloc_outer_core; realw* d_accel_outer_core;
+  // backward/reconstructed elastic wavefield
+  realw* d_b_displ_outer_core; realw* d_b_veloc_outer_core; realw* d_b_accel_outer_core;
+
+  // inner / outer elements
+  int* d_phase_ispec_inner_outer_core;
+  int num_phase_ispec_outer_core;
+
+  int nspec_outer_outer_core;
+  int nspec_inner_outer_core;
+
+  int num_colors_inner_outer_core;
+  int num_colors_outer_outer_core;
+  int* h_num_elem_colors_outer_core;
+
+
+  // ------------------------------------------------------------------ //
+  // inner_core
+  // ------------------------------------------------------------------ //
+  int NSPEC_INNER_CORE;
+  int NGLOB_INNER_CORE;
+
+  // interpolators
+  realw* d_xix_inner_core; realw* d_xiy_inner_core; realw* d_xiz_inner_core;
+  realw* d_etax_inner_core; realw* d_etay_inner_core; realw* d_etaz_inner_core;
+  realw* d_gammax_inner_core; realw* d_gammay_inner_core; realw* d_gammaz_inner_core;
+
+  // model parameters
+  realw* d_rhostore_inner_core;
+  realw* d_kappavstore_inner_core; realw* d_muvstore_inner_core;
+  realw* d_rmass_inner_core;
+
+  // global indexing
+  int* d_ibool_inner_core;
+  int* d_idoubling_inner_core;
+
+  // mesh locations
+  realw* d_xstore_inner_core; realw* d_ystore_inner_core; realw* d_zstore_inner_core;
+
+  // anisotropic 3D mantle
+  realw* d_c11store_inner_core;
+  realw* d_c12store_inner_core;
+  realw* d_c13store_inner_core;
+  realw* d_c33store_inner_core;
+  realw* d_c44store_inner_core;
+
+  // wavefields
+  // displacement, velocity, acceleration
+  realw* d_displ_inner_core; realw* d_veloc_inner_core; realw* d_accel_inner_core;
+  // backward/reconstructed elastic wavefield
+  realw* d_b_displ_inner_core; realw* d_b_veloc_inner_core; realw* d_b_accel_inner_core;
+
+  // attenuation
+  realw* d_R_xx_inner_core;
+  realw* d_R_yy_inner_core;
+  realw* d_R_xy_inner_core;
+  realw* d_R_xz_inner_core;
+  realw* d_R_yz_inner_core;
+
+  realw* d_b_R_xx_inner_core;
+  realw* d_b_R_yy_inner_core;
+  realw* d_b_R_xy_inner_core;
+  realw* d_b_R_xz_inner_core;
+  realw* d_b_R_yz_inner_core;
+
+
+  realw* d_factor_common_inner_core;
+  realw* d_one_minus_sum_beta_inner_core;
+
+  realw* d_epsilondev_xx_inner_core;
+  realw* d_epsilondev_yy_inner_core;
+  realw* d_epsilondev_xy_inner_core;
+  realw* d_epsilondev_xz_inner_core;
+  realw* d_epsilondev_yz_inner_core;
+
+  realw* d_b_epsilondev_xx_inner_core;
+  realw* d_b_epsilondev_yy_inner_core;
+  realw* d_b_epsilondev_xy_inner_core;
+  realw* d_b_epsilondev_xz_inner_core;
+  realw* d_b_epsilondev_yz_inner_core;
+
+  realw* d_eps_trace_over_3_inner_core;
+  realw* d_b_eps_trace_over_3_inner_core;
+
+  // inner / outer elements
+  int* d_phase_ispec_inner_inner_core;
+  int num_phase_ispec_inner_core;
+
+  int nspec_outer_inner_core;
+  int nspec_inner_inner_core;
+
+  int num_colors_inner_inner_core;
+  int num_colors_outer_inner_core;
+  int* h_num_elem_colors_inner_core;
+
+  // ------------------------------------------------------------------ //
+  // attenuation
+  // ------------------------------------------------------------------ //
+  realw* d_alphaval;
+  realw* d_betaval;
+  realw* d_gammaval;
+
+  realw* d_b_alphaval;
+  realw* d_b_betaval;
+  realw* d_b_gammaval;
+
+  // ------------------------------------------------------------------ //
+  // GLL points & weights
+  // ------------------------------------------------------------------ //
+
+  // pointers to constant memory arrays
+  realw* d_hprime_xx; realw* d_hprime_yy; realw* d_hprime_zz;
+  realw* d_hprimewgll_xx; realw* d_hprimewgll_yy; realw* d_hprimewgll_zz;
+  realw* d_wgllwgll_xy; realw* d_wgllwgll_xz; realw* d_wgllwgll_yz;
+  realw* d_wgll_cube;
+
+  // simulation type: 1 = forward, 2 = adjoint, 3 = kernel
+  int simulation_type;
+
+  // mesh coloring flag
+  int use_mesh_coloring_gpu;
+
+  // simulation flags
+  int save_forward;
+  int absorbing_conditions;
+  int attenuation;
+  int use_attenuation_mimic;
+  int compute_and_store_strain;
+  int anisotropic_3D_mantle;
+  int gravity;
+  int rotation;
+  int anisotropic_inner_core;
+  int save_boundary_mesh;
+
+  // ------------------------------------------------------------------ //
+  // gravity
+  // ------------------------------------------------------------------ //
+  realw* d_d_ln_density_dr_table; // needed for no gravity case
+  realw* d_minus_rho_g_over_kappa_fluid;
+  realw* d_minus_gravity_table;
+  realw* d_minus_deriv_gravity_table;
+  realw* d_density_table;
+
+  //daniel: TODO old...
+  realw* d_minus_g;
+  realw* d_minus_deriv_gravity;
+
+  // ------------------------------------------------------------------ //
+  // rotation
+  // ------------------------------------------------------------------ //
+  realw d_two_omega_earth;
+  realw d_deltat;
+  realw* d_A_array_rotation; realw* d_B_array_rotation;
+
+  // needed for backward/reconstructed fields (kernel runs)
+  realw d_b_two_omega_earth;
+  realw d_b_deltat;
+  realw* d_b_A_array_rotation; realw* d_b_B_array_rotation;
+
+  // ------------------------------------------------------------------ //
+  // sources
+  // ------------------------------------------------------------------ //
+  int nsources_local;
+  realw* d_sourcearrays;
+  double* d_stf_pre_compute;
+  int* d_islice_selected_source;
+  int* d_ispec_selected_source;
+
+  // ------------------------------------------------------------------ //
+  // receivers
+  // ------------------------------------------------------------------ //
+  int* d_number_receiver_global;
+  int* d_ispec_selected_rec;
+  int* d_islice_selected_rec;
+  int nrec_local;
+  realw* d_station_seismo_field;
+  realw* h_station_seismo_field;
+
+  // adjoint receivers/sources
+  int nadj_rec_local;
+  realw* d_adj_sourcearrays;
+  realw* h_adj_sourcearrays_slice;
+  int* d_pre_computed_irec;
+
+  // ------------------------------------------------------------------ //
+  // assembly
+  // ------------------------------------------------------------------ //
+
+  int num_interfaces_crust_mantle;
+  int max_nibool_interfaces_crust_mantle;
+  int* d_nibool_interfaces_crust_mantle;
+  int* d_ibool_interfaces_crust_mantle;
+  realw* d_send_accel_buffer_crust_mantle;
+
+  int num_interfaces_inner_core;
+  int max_nibool_interfaces_inner_core;
+  int* d_nibool_interfaces_inner_core;
+  int* d_ibool_interfaces_inner_core;
+  realw* d_send_accel_buffer_inner_core;
+
+  int num_interfaces_outer_core;
+  int max_nibool_interfaces_outer_core;
+  int* d_nibool_interfaces_outer_core;
+  int* d_ibool_interfaces_outer_core;
+  realw* d_send_accel_buffer_outer_core;
+  
+
+// ------------------------------------------------------------------ //
+//daniel: TODO - former code...
+// NOTE(review): the members below look inherited from the SPECFEM3D
+// (cartesian) solver and may be dead weight in the GLOBE version - left
+// as-is pending the TODO above.
+
+  // mesh resolution
+  int NSPEC_AB;
+  int NGLOB_AB;
+
+  // interpolators
+  realw* d_xix; realw* d_xiy; realw* d_xiz;
+  realw* d_etax; realw* d_etay; realw* d_etaz;
+  realw* d_gammax; realw* d_gammay; realw* d_gammaz;
+
+  // model parameters
+  realw* d_kappav; realw* d_muv;
+
+  // global indexing
+  int* d_ibool;
+
+  // inner / outer elements
+  int* d_ispec_is_inner;
+
+  // mpi buffers
+  int num_interfaces_ext_mesh;
+  int max_nibool_interfaces_ext_mesh;
+
+  // ------------------------------------------------------------------ //
+  // elastic wavefield parameters
+  // ------------------------------------------------------------------ //
+
+  // displacement, velocity, acceleration
+  realw* d_displ; realw* d_veloc; realw* d_accel;
+  // backward/reconstructed elastic wavefield
+  realw* d_b_displ; realw* d_b_veloc; realw* d_b_accel;
+
+  // elastic elements
+  int* d_ispec_is_elastic;
+
+  // elastic domain parameters
+  int* d_phase_ispec_inner_elastic;
+  int num_phase_ispec_elastic;
+
+  // ------------------------------------------------------------------ //
+  // mesh coloring
+  // ------------------------------------------------------------------ //
+  int* h_num_elem_colors_elastic;
+  int num_colors_outer_elastic,num_colors_inner_elastic;
+  int nspec_elastic;
+
+  realw* d_rmass;
+
+  // ------------------------------------------------------------------ //
+  // mpi buffer
+  // ------------------------------------------------------------------ //
+  realw* d_send_accel_buffer;
+
+  // interfaces
+  int* d_nibool_interfaces_ext_mesh;
+  int* d_ibool_interfaces_ext_mesh;
+
+  // ------------------------------------------------------------------ //
+  //used for absorbing stacey boundaries
+  // ------------------------------------------------------------------ //
+  int d_num_abs_boundary_faces;
+  int* d_abs_boundary_ispec;
+  int* d_abs_boundary_ijk;
+  realw* d_abs_boundary_normal;
+  realw* d_abs_boundary_jacobian2Dw;
+
+  realw* d_b_absorb_field;
+  int d_b_reclen_field;
+
+  realw* d_rho_vp;
+  realw* d_rho_vs;
+
+  // surface elements (to save for noise tomography and acoustic simulations)
+  int* d_free_surface_ispec;
+  int* d_free_surface_ijk;
+  int num_free_surface_faces;
+
+  // surface movie elements to save for noise tomography
+  realw* d_noise_surface_movie;
+
+  // attenuation
+  realw* d_R_xx;
+  realw* d_R_yy;
+  realw* d_R_xy;
+  realw* d_R_xz;
+  realw* d_R_yz;
+
+  realw* d_one_minus_sum_beta;
+  realw* d_factor_common;
+
+  // attenuation & kernel
+  realw* d_epsilondev_xx;
+  realw* d_epsilondev_yy;
+  realw* d_epsilondev_xy;
+  realw* d_epsilondev_xz;
+  realw* d_epsilondev_yz;
+  realw* d_epsilon_trace_over_3;
+
+  // anisotropy
+  realw* d_c11store;
+  realw* d_c12store;
+  realw* d_c13store;
+  realw* d_c14store;
+  realw* d_c15store;
+  realw* d_c16store;
+  realw* d_c22store;
+  realw* d_c23store;
+  realw* d_c24store;
+  realw* d_c25store;
+  realw* d_c26store;
+  realw* d_c33store;
+  realw* d_c34store;
+  realw* d_c35store;
+  realw* d_c36store;
+  realw* d_c44store;
+  realw* d_c45store;
+  realw* d_c46store;
+  realw* d_c55store;
+  realw* d_c56store;
+  realw* d_c66store;
+
+  // noise
+  realw* d_normal_x_noise;
+  realw* d_normal_y_noise;
+  realw* d_normal_z_noise;
+  realw* d_mask_noise;
+  realw* d_free_surface_jacobian2Dw;
+
+  realw* d_noise_sourcearray;
+
+  // attenuation & kernel backward fields
+  realw* d_b_R_xx;
+  realw* d_b_R_yy;
+  realw* d_b_R_xy;
+  realw* d_b_R_xz;
+  realw* d_b_R_yz;
+
+  realw* d_b_epsilondev_xx;
+  realw* d_b_epsilondev_yy;
+  realw* d_b_epsilondev_xy;
+  realw* d_b_epsilondev_xz;
+  realw* d_b_epsilondev_yz;
+  realw* d_b_epsilon_trace_over_3;
+
+  // sensitivity kernels
+  realw* d_rho_kl;
+  realw* d_mu_kl;
+  realw* d_kappa_kl;
+
+  // noise sensitivity kernel
+  realw* d_Sigma_kl;
+
+  // approximative hessian for preconditioning kernels
+  realw* d_hess_el_kl;
+
+  // oceans
+  realw* d_rmass_ocean_load;
+  realw* d_free_surface_normal;
+  int* d_updated_dof_ocean_load;
+
+  // ------------------------------------------------------------------ //
+  // acoustic wavefield
+  // ------------------------------------------------------------------ //
+  // potential and first and second time derivative
+  realw* d_potential_acoustic; realw* d_potential_dot_acoustic; realw* d_potential_dot_dot_acoustic;
+  // backward/reconstructed wavefield
+  realw* d_b_potential_acoustic; realw* d_b_potential_dot_acoustic; realw* d_b_potential_dot_dot_acoustic;
+
+  // acoustic domain parameters
+  int* d_ispec_is_acoustic;
+
+  int* d_phase_ispec_inner_acoustic;
+  int num_phase_ispec_acoustic;
+
+  // mesh coloring
+  int* h_num_elem_colors_acoustic;
+  int num_colors_outer_acoustic,num_colors_inner_acoustic;
+  int nspec_acoustic;
+
+  realw* d_rhostore;
+  realw* d_kappastore;
+  realw* d_rmass_acoustic;
+
+  // mpi buffer
+  realw* d_send_potential_dot_dot_buffer;
+
+  realw* d_b_absorb_potential;
+  int d_b_reclen_potential;
+
+  // for writing seismograms
+  realw* d_station_seismo_potential;
+  realw* h_station_seismo_potential;
+
+  // sensitivity kernels
+  realw* d_rho_ac_kl;
+  realw* d_kappa_ac_kl;
+
+  // approximative hessian for preconditioning kernels
+  realw* d_hess_ac_kl;
+
+  // coupling acoustic-elastic
+  int* d_coupling_ac_el_ispec;
+  int* d_coupling_ac_el_ijk;
+  realw* d_coupling_ac_el_normal;
+  realw* d_coupling_ac_el_jacobian2Dw;
+
+
+} Mesh;
+
+
+#endif

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,304 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#ifdef WITH_MPI
+#include <mpi.h>
+#endif
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){ // debug helper callable from Fortran: flushes C-side stdio buffers
+TRACE("fortranflush");
+
+  fflush(stdout); // force out any buffered C output before Fortran writes its own
+  fflush(stderr);
+  printf("Flushing proc %d!\n",*rank); // rank: caller's MPI rank, used only to tag the log line
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id) { // debug helper: prints an integer message id tagged with this process' rank
+TRACE("fortranprint");
+
+  int procid;
+#ifdef WITH_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD,&procid); // rank within the world communicator
+#else
+  procid = 0; // serial build: single process
+#endif
+  printf("%d: sends msg_id %d\n",procid,*id);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) { // debug helper: prints a single-precision value tagged with this process' rank
+TRACE("fortranprintf");
+
+  int procid;
+#ifdef WITH_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD,&procid); // rank within the world communicator
+#else
+  procid = 0; // serial build: single process
+#endif
+  printf("%d: sends val %e\n",procid,*val); // realw (float) is promoted to double for the %e varargs conversion
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val) { // debug helper: prints a double-precision value tagged with this process' rank
+TRACE("fortranprintd");
+
+  int procid;
+#ifdef WITH_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD,&procid); // rank within the world communicator
+#else
+  procid = 0; // serial build: single process
+#endif
+  printf("%d: sends val %e\n",procid,*val);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// randomize displ for testing: fills the host array with pseudo-random values and uploads it to the device
+extern "C"
+void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) {
+TRACE("make_displ_rand");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+  // realw* displ_rnd = (realw*)malloc(mp->NGLOB_AB*3*sizeof(realw));
+  for(int i=0;i<mp->NGLOB_AB*3;i++) { // 3 components per global point
+    h_displ[i] = rand(); // NOTE(review): rand() is never seeded here (deterministic sequence) and its int result is implicitly converted to realw
+  }
+  cudaMemcpy(mp->d_displ,h_displ,mp->NGLOB_AB*3*sizeof(realw),cudaMemcpyHostToDevice); // return status ignored (test-only helper)
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void transfer_surface_to_host_kernel(int* free_surface_ispec,
+                                                int* free_surface_ijk,
+                                                int num_free_surface_faces,
+                                                int* ibool,
+                                                realw* displ,
+                                                realw* noise_surface_movie) {
+  // copies the 3-component displacement at every GLL point of each free-surface face
+  // into the noise_surface_movie buffer; one block per face, one thread per GLL point (NGLL2 threads)
+  int igll = threadIdx.x;
+  int iface = blockIdx.x + blockIdx.y*gridDim.x; // 2-D grid flattened to a face index (grid split when faces > 65535)
+
+  // int id = tx + blockIdx.x*blockDim.x + blockIdx.y*blockDim.x*gridDim.x;
+
+  if(iface < num_free_surface_faces) { // guard: grid splitting may launch extra blocks
+    int ispec = free_surface_ispec[iface]-1; //-1 for C-based indexing
+
+    int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1; // local (i,j,k) of this GLL point, Fortran 1-based -> 0-based
+    int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
+    int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
+
+    int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1; // global point index; hard-coded 5 assumes NGLLX==5
+
+    noise_surface_movie[INDEX3(NDIM,NGLL2,0,igll,iface)] = displ[iglob*3]; // x,y,z components stored interleaved per point
+    noise_surface_movie[INDEX3(NDIM,NGLL2,1,igll,iface)] = displ[iglob*3+1];
+    noise_surface_movie[INDEX3(NDIM,NGLL2,2,igll,iface)] = displ[iglob*3+2];
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_surface_to_host,
+              TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer_f,
+                                        realw* h_noise_surface_movie) {
+// gathers the free-surface displacement on the GPU and copies it back into the host array h_noise_surface_movie
+TRACE("transfer_surface_to_host");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
+  int num_blocks_x = mp->num_free_surface_faces;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) { // split grid in x/y to stay below the 65535 gridDim.x limit of older GPUs
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+  dim3 grid(num_blocks_x,num_blocks_y,1);
+  dim3 threads(NGLL2,1,1); // one thread per GLL point of a face
+
+  transfer_surface_to_host_kernel<<<grid,threads>>>(mp->d_free_surface_ispec,
+                                                    mp->d_free_surface_ijk,
+                                                    mp->num_free_surface_faces,
+                                                    mp->d_ibool,
+                                                    mp->d_displ,
+                                                    mp->d_noise_surface_movie);
+
+  // blocking copy on the default stream: also synchronizes with the kernel above
+  cudaMemcpy(h_noise_surface_movie,mp->d_noise_surface_movie,
+             3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyDeviceToHost);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("transfer_surface_to_host");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void noise_read_add_surface_movie_cuda_kernel(realw* accel, int* ibool,
+                                                         int* free_surface_ispec,
+                                                         int* free_surface_ijk,
+                                                         int num_free_surface_faces,
+                                                         realw* noise_surface_movie,
+                                                         realw* normal_x_noise,
+                                                         realw* normal_y_noise,
+                                                         realw* normal_z_noise,
+                                                         realw* mask_noise,
+                                                         realw* free_surface_jacobian2Dw) {
+  // adds the noise surface source to the acceleration field: projects the stored surface
+  // movie onto the noise normal (eta), then accumulates eta * mask * normal * jacobian-weight
+  // into accel at each free-surface GLL point; one block per face, one thread per GLL point
+
+  int iface = blockIdx.x + gridDim.x*blockIdx.y; // surface element id
+
+  // when nspec_top > 65535, but mod(nspec_top,2) > 0, we end up with an extra block.
+  if(iface < num_free_surface_faces) {
+    int ispec = free_surface_ispec[iface]-1; // -1: Fortran 1-based -> C 0-based
+
+    int igll = threadIdx.x;
+
+    int ipoin = NGLL2*iface + igll; // flat index into the per-surface-point noise arrays
+    int i=free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
+    int j=free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
+    int k=free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
+
+    int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1; // hard-coded 5 assumes NGLLX==5
+
+    realw normal_x = normal_x_noise[ipoin];
+    realw normal_y = normal_y_noise[ipoin];
+    realw normal_z = normal_z_noise[ipoin];
+
+    // eta = surface movie vector dotted with the noise normal at this point
+    realw eta = (noise_surface_movie[INDEX3(NDIM,NGLL2,0,igll,iface)]*normal_x +
+                noise_surface_movie[INDEX3(NDIM,NGLL2,1,igll,iface)]*normal_y +
+                noise_surface_movie[INDEX3(NDIM,NGLL2,2,igll,iface)]*normal_z);
+
+    // error from cuda-memcheck and ddt seems "incorrect", because we
+    // are passing a __constant__ variable pointer around like it was
+    // made using cudaMalloc, which *may* be "incorrect", but produces
+    // correct results.
+
+    // ========= Invalid __global__ read of size
+    // 4 ========= at 0x00000cd8 in
+    // compute_add_sources_cuda.cu:260:noise_read_add_surface_movie_cuda_kernel
+    // ========= by thread (0,0,0) in block (3443,0) ========= Address
+    // 0x203000c8 is out of bounds
+
+    // non atomic version for speed testing -- atomic updates are needed for correctness
+    // accel[3*iglob] +=   eta*mask_noise[ipoin] * normal_x * wgllwgll_xy[tx] * free_surface_jacobian2Dw[tx + NGLL2*ispec2D];
+    // accel[3*iglob+1] += eta*mask_noise[ipoin] * normal_y * wgllwgll_xy[tx] * free_surface_jacobian2Dw[tx + NGLL2*ispec2D];
+    // accel[3*iglob+2] += eta*mask_noise[ipoin] * normal_z * wgllwgll_xy[tx] * free_surface_jacobian2Dw[tx + NGLL2*ispec2D];
+
+    // Fortran version in SVN -- note deletion of wgllwgll_xy?
+    // accel(1,iglob) = accel(1,iglob) + eta * mask_noise(ipoin) * normal_x_noise(ipoin) &
+    // * free_surface_jacobian2Dw(igll,iface)
+    // accel(2,iglob) = accel(2,iglob) + eta * mask_noise(ipoin) * normal_y_noise(ipoin) &
+    // * free_surface_jacobian2Dw(igll,iface)
+    // accel(3,iglob) = accel(3,iglob) + eta * mask_noise(ipoin) * normal_z_noise(ipoin) &
+    // * free_surface_jacobian2Dw(igll,iface) ! wgllwgll_xy(i,j) * jacobian2D_top(i,j,iface)
+
+    // atomicAdd(&accel[iglob*3]  ,eta*mask_noise[ipoin]*normal_x*wgllwgll_xy[tx]*free_surface_jacobian2Dw[igll+NGLL2*iface]);
+    // atomicAdd(&accel[iglob*3+1],eta*mask_noise[ipoin]*normal_y*wgllwgll_xy[tx]*free_surface_jacobian2Dw[igll+NGLL2*iface]);
+    // atomicAdd(&accel[iglob*3+2],eta*mask_noise[ipoin]*normal_z*wgllwgll_xy[tx]*free_surface_jacobian2Dw[igll+NGLL2*iface]);
+
+    // atomics: a global point is shared by several faces/threads, so plain += would race
+    atomicAdd(&accel[iglob*3]  ,eta*mask_noise[ipoin]*normal_x*free_surface_jacobian2Dw[igll+NGLL2*iface]);
+    atomicAdd(&accel[iglob*3+1],eta*mask_noise[ipoin]*normal_y*free_surface_jacobian2Dw[igll+NGLL2*iface]);
+    atomicAdd(&accel[iglob*3+2],eta*mask_noise[ipoin]*normal_z*free_surface_jacobian2Dw[igll+NGLL2*iface]);
+
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(noise_read_add_surface_movie_cu,
+              NOISE_READ_ADD_SURFACE_MOVIE_CU)(long* Mesh_pointer_f,
+                                               realw* h_noise_surface_movie,
+                                               int* NOISE_TOMOGRAPHYf) {
+// uploads the host surface-movie snapshot and injects it as a noise surface source:
+// into d_accel for NOISE_TOMOGRAPHY==2 (forward), into d_b_accel for ==3 (backward); other values are a no-op
+TRACE("noise_read_add_surface_movie_cu");
+
+  // EPIK_TRACER("noise_read_add_surface_movie_cu");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  int NOISE_TOMOGRAPHY = *NOISE_TOMOGRAPHYf;
+
+  cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie,
+             3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice);
+
+  int num_blocks_x = mp->num_free_surface_faces;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) { // split grid in x/y to stay below the 65535 gridDim.x limit of older GPUs
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+  dim3 grid(num_blocks_x,num_blocks_y,1);
+  dim3 threads(NGLL2,1,1); // one thread per GLL point of a face
+
+  if(NOISE_TOMOGRAPHY == 2) { // add surface source to forward field
+    noise_read_add_surface_movie_cuda_kernel<<<grid,threads>>>(mp->d_accel,
+                                                               mp->d_ibool,
+                                                               mp->d_free_surface_ispec,
+                                                               mp->d_free_surface_ijk,
+                                                               mp->num_free_surface_faces,
+                                                               mp->d_noise_surface_movie,
+                                                               mp->d_normal_x_noise,
+                                                               mp->d_normal_y_noise,
+                                                               mp->d_normal_z_noise,
+                                                               mp->d_mask_noise,
+                                                               mp->d_free_surface_jacobian2Dw);
+  }
+  else if(NOISE_TOMOGRAPHY == 3) { // add surface source to adjoint (backward) field
+    noise_read_add_surface_movie_cuda_kernel<<<grid,threads>>>(mp->d_b_accel,
+                                                               mp->d_ibool,
+                                                               mp->d_free_surface_ispec,
+                                                               mp->d_free_surface_ijk,
+                                                               mp->num_free_surface_faces,
+                                                               mp->d_noise_surface_movie,
+                                                               mp->d_normal_x_noise,
+                                                               mp->d_normal_y_noise,
+                                                               mp->d_normal_z_noise,
+                                                               mp->d_mask_noise,
+                                                               mp->d_free_surface_jacobian2Dw);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("noise_read_add_surface_movie_cuda_kernel");
+#endif
+}

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_constants_cuda.h	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_constants_cuda.h	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,128 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#ifndef CUDA_HEADER_H
+#define CUDA_HEADER_H
+
+typedef float realw;  // type of "working" variables (single precision throughout the CUDA kernels)
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// setters for these const arrays (very ugly hack, but will have to do)
+// NOTE(review): implementations live elsewhere; presumably each copies the host GLL array
+// into device __constant__ memory and records a pointer in the Mesh struct -- confirm in the .cu file
+
+// elastic
+void setConst_hprime_xx(realw* array,Mesh* mp); // GLL derivative matrices, one per direction
+void setConst_hprime_yy(realw* array,Mesh* mp);
+void setConst_hprime_zz(realw* array,Mesh* mp);
+
+void setConst_hprimewgll_xx(realw* array,Mesh* mp); // derivative matrices pre-multiplied by GLL weights
+void setConst_hprimewgll_yy(realw* array,Mesh* mp);
+void setConst_hprimewgll_zz(realw* array,Mesh* mp);
+
+void setConst_wgllwgll_xy(realw* array,Mesh* mp); // 2-D products of GLL integration weights per face plane
+void setConst_wgllwgll_xz(realw* array, Mesh* mp);
+void setConst_wgllwgll_yz(realw* array, Mesh* mp);
+
+void setConst_wgll_cube(realw* array, Mesh* mp); // 3-D GLL weight cube
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/* CUDA specific things from specfem3D_kernels.cu */
+
+#ifdef USE_TEXTURES
+  // declaration of textures: optional read-only texture-cache bindings for the wavefields
+  texture<realw, 1, cudaReadModeElementType> tex_displ;
+  texture<realw, 1, cudaReadModeElementType> tex_accel;
+
+  texture<realw, 1, cudaReadModeElementType> tex_potential;
+  texture<realw, 1, cudaReadModeElementType> tex_potential_dot_dot;
+
+  // for binding the textures
+
+  void bindTexturesDispl(realw* d_displ)
+  {
+    // binds the device displacement array (NDIM components per global point) to tex_displ; exits on failure
+    cudaError_t err;
+
+    cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
+
+    err = cudaBindTexture(NULL,tex_displ, d_displ, channelDescFloat, NDIM*NGLOB*sizeof(realw)); // NOTE(review): relies on an NGLOB constant being defined at compile time
+    if (err != cudaSuccess)
+    {
+      fprintf(stderr, "Error in bindTexturesDispl for displ: %s\n", cudaGetErrorString(err));
+      exit(1);
+    }
+  }
+
+  void bindTexturesAccel(realw* d_accel)
+  {
+    // binds the device acceleration array (NDIM components per global point) to tex_accel; exits on failure
+    cudaError_t err;
+
+    cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
+
+    err = cudaBindTexture(NULL,tex_accel, d_accel, channelDescFloat, NDIM*NGLOB*sizeof(realw));
+    if (err != cudaSuccess)
+    {
+      fprintf(stderr, "Error in bindTexturesAccel for accel: %s\n", cudaGetErrorString(err));
+      exit(1);
+    }
+  }
+
+  void bindTexturesPotential(realw* d_potential)
+  {
+    // binds the scalar acoustic potential (one value per global point) to tex_potential; exits on failure
+    cudaError_t err;
+
+    cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
+
+    err = cudaBindTexture(NULL,tex_potential, d_potential,
+                          channelDescFloat, NGLOB*sizeof(realw));
+    if (err != cudaSuccess)
+    {
+      fprintf(stderr, "Error in bindTexturesPotential for potential: %s\n", cudaGetErrorString(err));
+      exit(1);
+    }
+  }
+
+  void bindTexturesPotential_dot_dot(realw* d_potential_dot_dot)
+  {
+    // binds the scalar second time derivative of the potential to tex_potential_dot_dot; exits on failure
+    cudaError_t err;
+
+    cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
+
+    err = cudaBindTexture(NULL,tex_potential_dot_dot, d_potential_dot_dot,
+                          channelDescFloat, NGLOB*sizeof(realw));
+    if (err != cudaSuccess)
+    {
+      fprintf(stderr, "Error in bindTexturesPotential_dot_dot for potential_dot_dot: %s\n", cudaGetErrorString(err));
+      exit(1);
+    }
+  }
+
+#endif // USE_TEXTURES
+
+
+#endif //CUDA_HEADER_H

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,2755 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#ifdef WITH_MPI
+#include <mpi.h>
+#endif
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+#include "prepare_constants_cuda.h"
+
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// GPU preparation
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(prepare_cuda_device,
+              PREPARE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) {
+  // initializes CUDA, selects a device (round-robin by MPI rank when several are present),
+  // checks compute capability >= 1.3, returns the device count to Fortran and logs device info
+  TRACE("prepare_cuda_device");
+
+  // Gets rank number of MPI process
+  int myrank = *myrank_f;
+
+  // cuda initialization (needs -lcuda library)
+  // note:   cuInit initializes the driver API.
+  //             it is needed for any following CUDA driver API function call (format cuFUNCTION(..) )
+  //             however, for the CUDA runtime API functions (format cudaFUNCTION(..) )
+  //             the initialization is implicit, thus cuInit() here would not be needed...
+  CUresult status = cuInit(0);
+  if ( CUDA_SUCCESS != status ) exit_on_error("CUDA driver API device initialization failed\n");
+
+  // returns a handle to the first cuda compute device
+  CUdevice dev;
+  status = cuDeviceGet(&dev, 0);
+  if ( CUDA_SUCCESS != status ) exit_on_error("CUDA device not found\n");
+
+  // gets device properties
+  int major,minor;
+  status = cuDeviceComputeCapability(&major,&minor,dev);
+  if ( CUDA_SUCCESS != status ) exit_on_error("CUDA device information not found\n");
+
+  // make sure that the device has compute capability >= 1.3 (needed e.g. for double precision / atomics)
+  if (major < 1){
+    fprintf(stderr,"Compute capability major number should be at least 1, got: %d \nexiting...\n",major);
+    exit_on_error("CUDA Compute capability major number should be at least 1\n");
+  }
+  if (major == 1 && minor < 3){
+    fprintf(stderr,"Compute capability should be at least 1.3, got: %d.%d \nexiting...\n",major,minor);
+    exit_on_error("CUDA Compute capability should be at least 1.3\n");
+  }
+
+  // note: from here on we use the runtime API  ...
+  // Gets number of GPU devices
+  int device_count = 0;
+  cudaGetDeviceCount(&device_count);
+  exit_on_cuda_error("CUDA runtime cudaGetDeviceCount: check if driver and runtime libraries work together\nexiting...\n");
+
+  // returns device count to fortran
+  if (device_count == 0) exit_on_error("CUDA runtime error: there is no device supporting CUDA\n");
+  *ncuda_devices = device_count;
+
+
+  // Sets the active device
+  if(device_count > 1) {
+    // generalized for more GPUs per node
+    // note: without previous context release, cudaSetDevice will complain with the cuda error
+    //         "setting the device when a process is active is not allowed"
+    // releases previous contexts
+    cudaThreadExit();
+
+    //printf("rank %d: cuda device count = %d sets device = %d \n",myrank,device_count,myrank % device_count);
+    //MPI_Barrier(MPI_COMM_WORLD);
+
+    // sets active device: ranks are distributed round-robin over the available GPUs
+    cudaSetDevice( myrank % device_count );
+    exit_on_cuda_error("cudaSetDevice");
+  }
+
+  // returns a handle to the active device
+  int device;
+  cudaGetDevice(&device);
+
+  // get device properties
+  struct cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp,device);
+
+  // exit if the machine has no CUDA-enabled device (9999.9999 is the emulation-device marker)
+  if (deviceProp.major == 9999 && deviceProp.minor == 9999){
+    fprintf(stderr,"No CUDA-enabled device found, exiting...\n\n");
+    exit_on_error("CUDA runtime error: there is no CUDA-enabled device found\n");
+  }
+
+  // outputs device infos to file (appended, one file per MPI rank)
+  char filename[BUFSIZ];
+  FILE* fp;
+  sprintf(filename,"OUTPUT_FILES/gpu_device_info_proc_%06d.txt",myrank);
+  fp = fopen(filename,"a+");
+  if (fp != NULL){
+    // display device properties
+    fprintf(fp,"Device Name = %s\n",deviceProp.name);
+    fprintf(fp,"multiProcessorCount: %d\n",deviceProp.multiProcessorCount);
+    fprintf(fp,"totalGlobalMem (in MB): %f\n",(double) deviceProp.totalGlobalMem / (1024.0 * 1024.0)); // cast size_t to double: an (unsigned long) cast truncates >4GB on LLP64 platforms
+    fprintf(fp,"totalGlobalMem (in GB): %f\n",(double) deviceProp.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
+    fprintf(fp,"sharedMemPerBlock (in bytes): %lu\n",(unsigned long) deviceProp.sharedMemPerBlock);
+    fprintf(fp,"Maximum number of threads per block: %d\n",deviceProp.maxThreadsPerBlock);
+    fprintf(fp,"Maximum size of each dimension of a block: %d x %d x %d\n",
+            deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]);
+    fprintf(fp,"Maximum sizes of each dimension of a grid: %d x %d x %d\n",
+            deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]);
+    fprintf(fp,"Compute capability of the device = %d.%d\n", deviceProp.major, deviceProp.minor);
+    if(deviceProp.canMapHostMemory){
+      fprintf(fp,"canMapHostMemory: TRUE\n");
+    }else{
+      fprintf(fp,"canMapHostMemory: FALSE\n");
+    }
+    if(deviceProp.deviceOverlap){
+      fprintf(fp,"deviceOverlap: TRUE\n");
+    }else{
+      fprintf(fp,"deviceOverlap: FALSE\n");
+    }
+
+    // make sure that the device has compute capability >= 1.3
+    //if (deviceProp.major < 1){
+    //  fprintf(stderr,"Compute capability major number should be at least 1, exiting...\n\n");
+    //  exit_on_error("CUDA Compute capability major number should be at least 1");
+    //}
+    //if (deviceProp.major == 1 && deviceProp.minor < 3){
+    //  fprintf(stderr,"Compute capability should be at least 1.3, exiting...\n");
+    //  exit_on_error("CUDA Compute capability major number should be at least 1.3");
+    //}
+
+    // outputs initial memory infos via cudaMemGetInfo()
+    double free_db,used_db,total_db;
+    get_free_memory(&free_db,&used_db,&total_db);
+    fprintf(fp,"%d: GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", myrank,
+            used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
+
+    fclose(fp);
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// SIMULATION constants
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(prepare_constants_device,
+              PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
+                                        int* h_NGLLX,
+                                        realw* h_hprime_xx,realw* h_hprime_yy,realw* h_hprime_zz,
+                                        realw* h_hprimewgll_xx,realw* h_hprimewgll_yy,realw* h_hprimewgll_zz,
+                                        realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz,
+                                        int* NSOURCES,int* nsources_local,
+                                        realw* h_sourcearrays,
+                                        int* h_islice_selected_source,
+                                        int* h_ispec_selected_source,
+                                        int* h_number_receiver_global,
+                                        int* h_ispec_selected_rec,
+                                        int* nrec,int* nrec_local,
+                                        int* NSPEC_CRUST_MANTLE, int* NGLOB_CRUST_MANTLE,
+                                        int* NSPEC_OUTER_CORE, int* NGLOB_OUTER_CORE,
+                                        int* NSPEC_INNER_CORE, int* NGLOB_INNER_CORE,
+                                        int* SIMULATION_TYPE,
+                                        int* SAVE_FORWARD_f,
+                                        int* ABSORBING_CONDITIONS_f,
+                                        int* GRAVITY_f,
+                                        int* ROTATION_f,
+                                        int* ATTENUATION_f,
+                                        int* USE_ATTENUATION_MIMIC_f,
+                                        int* COMPUTE_AND_STORE_STRAIN_f,
+                                        int* ANISOTROPIC_3D_MANTLE_f,
+                                        int* ANISOTROPIC_INNER_CORE_f,
+                                        int* SAVE_BOUNDARY_MESH_f,
+                                        int* USE_MESH_COLORING_GPU_f) {
+
+// allocates the Mesh container on the host, uploads GLL constant arrays, simulation flags,
+// source and receiver tables to the device, and hands the Mesh* back to Fortran as a long
+TRACE("prepare_constants_device");
+
+  // allocates mesh parameter structure (freed elsewhere at cleanup; ownership passes to Fortran via the long handle)
+  Mesh* mp = (Mesh*) malloc( sizeof(Mesh) );
+  if (mp == NULL) exit_on_error("error allocating mesh pointer");
+  *Mesh_pointer = (long)mp;
+
+  // checks if NGLLX == 5 (kernels hard-code 5x5x5 element indexing)
+  if( *h_NGLLX != NGLLX ){
+    exit_on_error("NGLLX must be 5 for CUDA devices");
+  }
+
+  // sets constant arrays (copies host GLL matrices into device constant memory)
+  setConst_hprime_xx(h_hprime_xx,mp);
+  setConst_hprime_yy(h_hprime_yy,mp);
+  setConst_hprime_zz(h_hprime_zz,mp);
+  setConst_hprimewgll_xx(h_hprimewgll_xx,mp);
+  setConst_hprimewgll_yy(h_hprimewgll_yy,mp);
+  setConst_hprimewgll_zz(h_hprimewgll_zz,mp);
+  setConst_wgllwgll_xy(h_wgllwgll_xy,mp);
+  setConst_wgllwgll_xz(h_wgllwgll_xz,mp);
+  setConst_wgllwgll_yz(h_wgllwgll_yz,mp);
+
+  // sets global parameters (element/point counts per earth region)
+  mp->NSPEC_CRUST_MANTLE = *NSPEC_CRUST_MANTLE;
+  mp->NGLOB_CRUST_MANTLE = *NGLOB_CRUST_MANTLE;
+  mp->NSPEC_OUTER_CORE = *NSPEC_OUTER_CORE;
+  mp->NGLOB_OUTER_CORE = *NGLOB_OUTER_CORE;
+  mp->NSPEC_INNER_CORE = *NSPEC_INNER_CORE;
+  mp->NGLOB_INNER_CORE = *NGLOB_INNER_CORE;
+
+  // simulation type (1: forward, 3: kernel -- 2 is pure adjoint, see source allocation below)
+  mp->simulation_type = *SIMULATION_TYPE;
+
+  // simulation flags initialization (Fortran logicals arrive as ints)
+  mp->save_forward = *SAVE_FORWARD_f;
+  mp->absorbing_conditions = *ABSORBING_CONDITIONS_f;
+  mp->gravity = *GRAVITY_f;
+  mp->rotation = *ROTATION_f;
+  mp->attenuation = *ATTENUATION_f;
+  mp->use_attenuation_mimic = *USE_ATTENUATION_MIMIC_f;
+  mp->compute_and_store_strain = *COMPUTE_AND_STORE_STRAIN_f;
+  mp->anisotropic_3D_mantle = *ANISOTROPIC_3D_MANTLE_f;
+  mp->anisotropic_inner_core = *ANISOTROPIC_INNER_CORE_f;
+  mp->save_boundary_mesh = *SAVE_BOUNDARY_MESH_f;
+
+
+  // mesh coloring flag
+#ifdef USE_MESH_COLORING_GPU
+  mp->use_mesh_coloring_gpu = 1;
+  if( ! *USE_MESH_COLORING_GPU_f ) exit_on_error("error with USE_MESH_COLORING_GPU constant; please re-compile\n");
+#else
+  // mesh coloring
+  // note: this here passes the coloring as an option to the kernel routines
+  //          the performance seems to be the same if one uses the pre-processing directives above or not
+  mp->use_mesh_coloring_gpu = *USE_MESH_COLORING_GPU_f;
+#endif
+
+  // sources: sourcearrays holds NSOURCES * 3 components * NGLL3 points
+  mp->nsources_local = *nsources_local;
+  if( mp->simulation_type == 1  || mp->simulation_type == 3 ){
+    // not needed in case of pure adjoint simulations (SIMULATION_TYPE == 2)
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_sourcearrays,
+                                       sizeof(realw)* *NSOURCES*3*NGLL3),1301);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_sourcearrays, h_sourcearrays,
+                                       sizeof(realw)* *NSOURCES*3*NGLL3,cudaMemcpyHostToDevice),1302);
+
+    // per-source source-time-function values, filled each time step (double precision)
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_stf_pre_compute,
+                                       *NSOURCES*sizeof(double)),1303);
+  }
+
+  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_islice_selected_source,
+                                     sizeof(int) * *NSOURCES),1401);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_islice_selected_source, h_islice_selected_source,
+                                     sizeof(int)* *NSOURCES,cudaMemcpyHostToDevice),1402);
+
+  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_ispec_selected_source,
+                                     sizeof(int)* *NSOURCES),1403);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_selected_source, h_ispec_selected_source,
+                                     sizeof(int)* *NSOURCES,cudaMemcpyHostToDevice),1404);
+
+
+  // receiver stations
+  // note that:   size(number_receiver_global) = nrec_local
+  //                   size(ispec_selected_rec) = nrec
+  // number of receiver located in this partition
+  mp->nrec_local = *nrec_local;
+  if( mp->nrec_local > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_number_receiver_global),mp->nrec_local*sizeof(int)),1);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_number_receiver_global,h_number_receiver_global,
+                                     mp->nrec_local*sizeof(int),cudaMemcpyHostToDevice),1512);
+  }
+  // ispec_selected_rec covers all nrec receivers, not only the local ones
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_selected_rec),(*nrec)*sizeof(int)),1513);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_selected_rec,h_ispec_selected_rec,
+                                     (*nrec)*sizeof(int),cudaMemcpyHostToDevice),1514);
+
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("prepare_constants_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ROTATION simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(prepare_fields_rotation_device,
+              PREPARE_FIELDS_ROTATION_DEVICE)(long* Mesh_pointer_f,
+                                              realw* two_omega_earth,
+                                              realw* deltat,
+                                              realw* A_array_rotation,
+                                              realw* B_array_rotation,
+                                              realw* b_two_omega_earth,
+                                              realw* b_deltat,
+                                              realw* b_A_array_rotation,
+                                              realw* b_B_array_rotation,
+                                              int* NSPEC_OUTER_CORE_ROTATION
+                                              ) {
+
+// copies the rotation parameters and the A/B rotation memory arrays for the
+// outer core onto the device; for kernel simulations (simulation_type == 3)
+// the backward/reconstructed counterparts are copied as well
+
+  TRACE("prepare_fields_rotation_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  // arrays only needed when rotation is required
+  if( ! mp->rotation ){ exit_on_cuda_error("prepare_fields_rotation_device rotation not properly initialized"); }
+
+  // rotation arrays (needed only for outer core region)
+  mp->d_two_omega_earth = *two_omega_earth;
+  mp->d_deltat = *deltat;
+
+  // note: A_array_rotation/B_array_rotation hold one value per GLL point,
+  //       i.e. NGLL3*NSPEC_OUTER_CORE_ROTATION entries (matching the
+  //       (NGLLX,NGLLY,NGLLZ,NSPEC) dimensioning used for the other per-point
+  //       arrays in this file); allocating only NSPEC entries would truncate them
+  int size = NGLL3*(*NSPEC_OUTER_CORE_ROTATION);
+
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_A_array_rotation,
+                                     size*sizeof(realw)),9000);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_A_array_rotation, A_array_rotation,
+                                     size*sizeof(realw),cudaMemcpyHostToDevice),9001);
+
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_B_array_rotation,
+                                     size*sizeof(realw)),9000);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_B_array_rotation, B_array_rotation,
+                                     size*sizeof(realw),cudaMemcpyHostToDevice),9001);
+
+  // backward/reconstructed fields (adjoint/kernel simulations only)
+  if( mp->simulation_type == 3 ){
+    mp->d_b_two_omega_earth = *b_two_omega_earth;
+    mp->d_b_deltat = *b_deltat;
+
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_A_array_rotation,
+                                       size*sizeof(realw)),9000);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_A_array_rotation, b_A_array_rotation,
+                                       size*sizeof(realw),cudaMemcpyHostToDevice),9001);
+
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_B_array_rotation,
+                                       size*sizeof(realw)),9000);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_B_array_rotation, b_B_array_rotation,
+                                       size*sizeof(realw),cudaMemcpyHostToDevice),9001);
+  }
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// GRAVITY simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(prepare_fields_gravity_device,
+              PREPARE_FIELDS_gravity_DEVICE)(long* Mesh_pointer_f,
+                                             realw* d_ln_density_dr_table,
+                                             realw* minus_rho_g_over_kappa_fluid,
+                                             realw* minus_gravity_table,
+                                             realw* minus_deriv_gravity_table,
+                                             realw* density_table,
+                                             realw* h_wgll_cube,
+                                             int* NRAD_GRAVITY
+                                             ) {
+
+// copies the radial gravity-related lookup tables onto the device;
+// all tables are 1-D profiles with NRAD_GRAVITY entries
+
+  TRACE("prepare_fields_gravity_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  // common byte size of every radial table below
+  size_t bytes = (*NRAD_GRAVITY)*sizeof(realw);
+
+  if( ! mp->gravity ){
+    // no-gravity case: only d ln(rho)/dr is needed for the fluid potential
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_d_ln_density_dr_table,bytes),8000);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_d_ln_density_dr_table, d_ln_density_dr_table,
+                                       bytes,cudaMemcpyHostToDevice),8001);
+
+  }else{
+    // gravity case
+
+    // GLL weights cubed go into constant memory
+    setConst_wgll_cube(h_wgll_cube,mp);
+
+    // radial gravity tables
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_minus_rho_g_over_kappa_fluid,bytes),8000);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_minus_rho_g_over_kappa_fluid, minus_rho_g_over_kappa_fluid,
+                                       bytes,cudaMemcpyHostToDevice),8001);
+
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_minus_gravity_table,bytes),8000);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_minus_gravity_table, minus_gravity_table,
+                                       bytes,cudaMemcpyHostToDevice),8001);
+
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_minus_deriv_gravity_table,bytes),8000);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_minus_deriv_gravity_table, minus_deriv_gravity_table,
+                                       bytes,cudaMemcpyHostToDevice),8001);
+
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_density_table,bytes),8000);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_density_table, density_table,
+                                       bytes,cudaMemcpyHostToDevice),8001);
+  }
+}
+
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ATTENUATION simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(prepare_fields_attenuat_device,
+              PREPARE_FIELDS_ATTENUAT_DEVICE)(long* Mesh_pointer_f,
+                                                 realw* R_xx_crust_mantle,
+                                                 realw* R_yy_crust_mantle,
+                                                 realw* R_xy_crust_mantle,
+                                                 realw* R_xz_crust_mantle,
+                                                 realw* R_yz_crust_mantle,
+                                                 realw* factor_common_crust_mantle,
+                                                 realw* one_minus_sum_beta_crust_mantle,
+                                                 realw* R_xx_inner_core,
+                                                 realw* R_yy_inner_core,
+                                                 realw* R_xy_inner_core,
+                                                 realw* R_xz_inner_core,
+                                                 realw* R_yz_inner_core,
+                                                 realw* factor_common_inner_core,
+                                                 realw* one_minus_sum_beta_inner_core,
+                                                 realw* alphaval,realw* betaval,realw* gammaval,
+                                                 realw* b_alphaval,realw* b_betaval,realw* b_gammaval
+                                                 ) {
+
+// copies attenuation memory variables, common factors and the standard-linear-solid
+// alpha/beta/gamma coefficients for crust/mantle and inner core onto the device
+
+  TRACE("prepare_fields_attenuat_device");
+  // R_size1: per-array size of each R_** memory-variable array (N_SLS*NGLL3*NSPEC)
+  // R_size2: one_minus_sum_beta array size (NGLL3*NSPEC)
+  // R_size3: factor_common array size (N_SLS*NGLL3*NSPEC)
+  int R_size1,R_size2,R_size3;
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  // checks flag
+  if( ! mp->attenuation ){ exit_on_cuda_error("prepare_fields_attenuat_device attenuation not properly initialized"); }
+
+  // crust_mantle
+  R_size1 = N_SLS*NGLL3*mp->NSPEC_CRUST_MANTLE;
+  R_size2 = NGLL3*mp->NSPEC_CRUST_MANTLE;
+  R_size3 = N_SLS*NGLL3*mp->NSPEC_CRUST_MANTLE;
+
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_one_minus_sum_beta_crust_mantle,
+                                     R_size2*sizeof(realw)),4430);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_one_minus_sum_beta_crust_mantle,one_minus_sum_beta_crust_mantle,
+                                     R_size2*sizeof(realw),cudaMemcpyHostToDevice),4431);
+
+  if( ! mp->use_attenuation_mimic ){
+
+    //daniel: TODO - re-add the ones below, for now too little memory...
+    // NOTE: branch below is deliberately disabled (if( 0 == 1 )) until enough
+    //       device memory is available; the crust/mantle R arrays are skipped
+    printf("skipping R_memory arrays, memory too small...\n");
+    if( 0 == 1 ){
+
+    // common factor
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_factor_common_crust_mantle,
+                                         R_size3*sizeof(realw)),4432);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_factor_common_crust_mantle,factor_common_crust_mantle,
+                                         R_size3*sizeof(realw),cudaMemcpyHostToDevice),4433);
+
+    // memory variables
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_R_xx_crust_mantle,
+                                       R_size1*sizeof(realw)),4401);
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_R_yy_crust_mantle,
+                                       R_size1*sizeof(realw)),4401);
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_R_xy_crust_mantle,
+                                       R_size1*sizeof(realw)),4401);
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_R_xz_crust_mantle,
+                                       R_size1*sizeof(realw)),4401);
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_R_yz_crust_mantle,
+                                       R_size1*sizeof(realw)),4401);
+
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xx_crust_mantle,R_xx_crust_mantle,
+                                         R_size1*sizeof(realw),cudaMemcpyHostToDevice),4800);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yy_crust_mantle,R_yy_crust_mantle,
+                                         R_size1*sizeof(realw),cudaMemcpyHostToDevice),4800);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xy_crust_mantle,R_xy_crust_mantle,
+                                         R_size1*sizeof(realw),cudaMemcpyHostToDevice),4800);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xz_crust_mantle,R_xz_crust_mantle,
+                                         R_size1*sizeof(realw),cudaMemcpyHostToDevice),4800);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yz_crust_mantle,R_yz_crust_mantle,
+                                         R_size1*sizeof(realw),cudaMemcpyHostToDevice),4800);
+
+
+    }
+  }
+
+  // inner_core
+  // note: each of the five R_** arrays is allocated and copied separately below,
+  //       thus the per-array size is N_SLS*NGLL3*NSPEC (same as for crust_mantle);
+  //       a factor 5 here would over-allocate device memory and make the memcpy
+  //       calls read past the end of the host arrays
+  R_size1 = N_SLS*NGLL3*mp->NSPEC_INNER_CORE;
+  R_size2 = NGLL3*mp->NSPEC_INNER_CORE;
+  R_size3 = N_SLS*NGLL3*mp->NSPEC_INNER_CORE;
+
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_one_minus_sum_beta_inner_core,
+                                     R_size2*sizeof(realw)),4430);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_one_minus_sum_beta_inner_core,one_minus_sum_beta_inner_core,
+                                     R_size2*sizeof(realw),cudaMemcpyHostToDevice),4431);
+
+  if( ! mp->use_attenuation_mimic ){
+    // common factor
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_factor_common_inner_core,
+                                       R_size3*sizeof(realw)),4432);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_factor_common_inner_core,factor_common_inner_core,
+                                       R_size3*sizeof(realw),cudaMemcpyHostToDevice),4433);
+
+    // memory variables
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_R_xx_inner_core,
+                                     R_size1*sizeof(realw)),4401);
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_R_yy_inner_core,
+                                       R_size1*sizeof(realw)),4401);
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_R_xy_inner_core,
+                                       R_size1*sizeof(realw)),4401);
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_R_xz_inner_core,
+                                       R_size1*sizeof(realw)),4401);
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_R_yz_inner_core,
+                                       R_size1*sizeof(realw)),4401);
+
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xx_inner_core,R_xx_inner_core,
+                                     R_size1*sizeof(realw),cudaMemcpyHostToDevice),4402);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yy_inner_core,R_yy_inner_core,
+                                       R_size1*sizeof(realw),cudaMemcpyHostToDevice),4402);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xy_inner_core,R_xy_inner_core,
+                                       R_size1*sizeof(realw),cudaMemcpyHostToDevice),4402);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xz_inner_core,R_xz_inner_core,
+                                       R_size1*sizeof(realw),cudaMemcpyHostToDevice),4402);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yz_inner_core,R_yz_inner_core,
+                                       R_size1*sizeof(realw),cudaMemcpyHostToDevice),4402);
+  }
+
+  // alpha,beta,gamma factors (one value per standard linear solid)
+  print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_alphaval),
+                                     N_SLS*sizeof(realw)),4434);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_alphaval ,alphaval,
+                                     N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4435);
+
+  print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_betaval),
+                                     N_SLS*sizeof(realw)),4436);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_betaval ,betaval,
+                                     N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4437);
+
+  print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_gammaval),
+                                     N_SLS*sizeof(realw)),4438);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaval ,gammaval,
+                                     N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4439);
+
+  if( mp->simulation_type == 3 ){
+    // alpha,beta,gamma factors for backward fields
+    print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_b_alphaval),
+                                       N_SLS*sizeof(realw)),5434);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_alphaval ,b_alphaval,
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5435);
+
+    print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_b_betaval),
+                                       N_SLS*sizeof(realw)),5436);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_betaval ,b_betaval,
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5437);
+
+    print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_b_gammaval),
+                                       N_SLS*sizeof(realw)),5438);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_gammaval ,b_gammaval,
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5439);
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// STRAIN simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(prepare_fields_strain_device,
+              PREPARE_FIELDS_STRAIN_DEVICE)(long* Mesh_pointer_f,
+                                            realw* epsilondev_xx_crust_mantle,
+                                            realw* epsilondev_yy_crust_mantle,
+                                            realw* epsilondev_xy_crust_mantle,
+                                            realw* epsilondev_xz_crust_mantle,
+                                            realw* epsilondev_yz_crust_mantle,
+                                            realw* b_epsilondev_xx_crust_mantle,
+                                            realw* b_epsilondev_yy_crust_mantle,
+                                            realw* b_epsilondev_xy_crust_mantle,
+                                            realw* b_epsilondev_xz_crust_mantle,
+                                            realw* b_epsilondev_yz_crust_mantle,
+                                            realw* eps_trace_over_3_crust_mantle,
+                                            realw* b_eps_trace_over_3_crust_mantle,
+                                            realw* epsilondev_xx_inner_core,
+                                            realw* epsilondev_yy_inner_core,
+                                            realw* epsilondev_xy_inner_core,
+                                            realw* epsilondev_xz_inner_core,
+                                            realw* epsilondev_yz_inner_core,
+                                            realw* b_epsilondev_xx_inner_core,
+                                            realw* b_epsilondev_yy_inner_core,
+                                            realw* b_epsilondev_xy_inner_core,
+                                            realw* b_epsilondev_xz_inner_core,
+                                            realw* b_epsilondev_yz_inner_core,
+                                            realw* eps_trace_over_3_inner_core,
+                                            realw* b_eps_trace_over_3_inner_core
+                                            ) {
+
+// copies the five deviatoric strain components (epsilondev_**) and the strain
+// trace (eps_trace_over_3) for crust/mantle and inner core onto the device;
+// backward/reconstructed (b_*) counterparts are copied only when
+// simulation_type == 3
+
+  TRACE("prepare_fields_strain_device");
+  // R_size : size of each deviatoric-strain array, R_size2 : size of the trace
+  // array; both are NGLL3*NSPEC for the current region (kept as two variables
+  // in case the dimensioning ever diverges)
+  int R_size,R_size2;
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  // checks flag
+  if( ! mp->compute_and_store_strain ){ exit_on_cuda_error("prepare_fields_strain_device strain not properly initialized"); }
+
+  // crust_mantle
+  R_size = NGLL3*mp->NSPEC_CRUST_MANTLE;
+  R_size2 = NGLL3*mp->NSPEC_CRUST_MANTLE;
+
+  // forward deviatoric strain components
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_epsilondev_xx_crust_mantle,
+                                     R_size*sizeof(realw)),4432);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_epsilondev_yy_crust_mantle,
+                                     R_size*sizeof(realw)),4432);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_epsilondev_xy_crust_mantle,
+                                     R_size*sizeof(realw)),4432);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_epsilondev_xz_crust_mantle,
+                                     R_size*sizeof(realw)),4432);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_epsilondev_yz_crust_mantle,
+                                     R_size*sizeof(realw)),4432);
+
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xx_crust_mantle,epsilondev_xx_crust_mantle,
+                                     R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yy_crust_mantle,epsilondev_yy_crust_mantle,
+                                     R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xy_crust_mantle,epsilondev_xy_crust_mantle,
+                                     R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xz_crust_mantle,epsilondev_xz_crust_mantle,
+                                     R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yz_crust_mantle,epsilondev_yz_crust_mantle,
+                                     R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
+
+  // forward strain trace
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_eps_trace_over_3_crust_mantle,
+                                       R_size2*sizeof(realw)),4401);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_eps_trace_over_3_crust_mantle,eps_trace_over_3_crust_mantle,
+                                       R_size2*sizeof(realw),cudaMemcpyHostToDevice),4402);
+  // backward/reconstructed fields
+  if( mp->simulation_type == 3 ){
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_epsilondev_xx_crust_mantle,
+                                       R_size*sizeof(realw)),4432);
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_epsilondev_yy_crust_mantle,
+                                       R_size*sizeof(realw)),4432);
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_epsilondev_xy_crust_mantle,
+                                       R_size*sizeof(realw)),4432);
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_epsilondev_xz_crust_mantle,
+                                       R_size*sizeof(realw)),4432);
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_epsilondev_yz_crust_mantle,
+                                       R_size*sizeof(realw)),4432);
+
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xx_crust_mantle,b_epsilondev_xx_crust_mantle,
+                                       R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yy_crust_mantle,b_epsilondev_yy_crust_mantle,
+                                       R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xy_crust_mantle,b_epsilondev_xy_crust_mantle,
+                                       R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xz_crust_mantle,b_epsilondev_xz_crust_mantle,
+                                       R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yz_crust_mantle,b_epsilondev_yz_crust_mantle,
+                                       R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
+
+
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_eps_trace_over_3_crust_mantle,
+                                         R_size2*sizeof(realw)),4401);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_eps_trace_over_3_crust_mantle,b_eps_trace_over_3_crust_mantle,
+                                         R_size2*sizeof(realw),cudaMemcpyHostToDevice),4402);
+  }
+
+  // inner_core (same layout as crust_mantle, sized for the inner core region)
+  R_size = NGLL3*mp->NSPEC_INNER_CORE;
+  R_size2 = NGLL3*mp->NSPEC_INNER_CORE;
+
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_epsilondev_xx_inner_core,
+                                     R_size*sizeof(realw)),4432);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_epsilondev_yy_inner_core,
+                                     R_size*sizeof(realw)),4432);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_epsilondev_xy_inner_core,
+                                     R_size*sizeof(realw)),4432);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_epsilondev_xz_inner_core,
+                                     R_size*sizeof(realw)),4432);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_epsilondev_yz_inner_core,
+                                     R_size*sizeof(realw)),4432);
+
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xx_inner_core,epsilondev_xx_inner_core,
+                                     R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yy_inner_core,epsilondev_yy_inner_core,
+                                     R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xy_inner_core,epsilondev_xy_inner_core,
+                                     R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xz_inner_core,epsilondev_xz_inner_core,
+                                     R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yz_inner_core,epsilondev_yz_inner_core,
+                                     R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
+
+
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_eps_trace_over_3_inner_core,
+                                     R_size2*sizeof(realw)),4401);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_eps_trace_over_3_inner_core,eps_trace_over_3_inner_core,
+                                     R_size2*sizeof(realw),cudaMemcpyHostToDevice),4402);
+  // backward/reconstructed fields
+  if( mp->simulation_type == 3 ){
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_epsilondev_xx_inner_core,
+                                       R_size*sizeof(realw)),4432);
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_epsilondev_yy_inner_core,
+                                       R_size*sizeof(realw)),4432);
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_epsilondev_xy_inner_core,
+                                       R_size*sizeof(realw)),4432);
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_epsilondev_xz_inner_core,
+                                       R_size*sizeof(realw)),4432);
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_epsilondev_yz_inner_core,
+                                       R_size*sizeof(realw)),4432);
+
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xx_inner_core,b_epsilondev_xx_inner_core,
+                                       R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yy_inner_core,b_epsilondev_yy_inner_core,
+                                       R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xy_inner_core,b_epsilondev_xy_inner_core,
+                                       R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xz_inner_core,b_epsilondev_xz_inner_core,
+                                       R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yz_inner_core,b_epsilondev_yz_inner_core,
+                                       R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
+
+
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_eps_trace_over_3_inner_core,
+                                       R_size2*sizeof(realw)),4401);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_eps_trace_over_3_inner_core,b_eps_trace_over_3_inner_core,
+                                       R_size2*sizeof(realw),cudaMemcpyHostToDevice),4402);
+  }
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// MPI interfaces
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(prepare_mpi_buffers_device,
+              PREPARE_MPI_BUFFERS_DEVICE)(long* Mesh_pointer_f,
+                                          int* num_interfaces_crust_mantle,
+                                          int* max_nibool_interfaces_crust_mantle,
+                                          int* nibool_interfaces_crust_mantle,
+                                          int* ibool_interfaces_crust_mantle,
+                                          int* num_interfaces_inner_core,
+                                          int* max_nibool_interfaces_inner_core,
+                                          int* nibool_interfaces_inner_core,
+                                          int* ibool_interfaces_inner_core,
+                                          int* num_interfaces_outer_core,
+                                          int* max_nibool_interfaces_outer_core,
+                                          int* nibool_interfaces_outer_core,
+                                          int* ibool_interfaces_outer_core
+                                          ){
+
+// copies the MPI interface bookkeeping for the three mesh regions onto the
+// device and allocates the device-side send buffers used for halo exchange;
+// note that size(nibool_interfaces_*) = num_interfaces_* and
+// size(ibool_interfaces_*) = num_interfaces_* x max_nibool_interfaces_*
+
+  TRACE("prepare_mpi_buffers_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  // crust/mantle region (vector wavefield: 3 components per interface point)
+  mp->num_interfaces_crust_mantle = *num_interfaces_crust_mantle;
+  mp->max_nibool_interfaces_crust_mantle = *max_nibool_interfaces_crust_mantle;
+  if( mp->num_interfaces_crust_mantle > 0 ){
+    int num = mp->num_interfaces_crust_mantle;
+    int max_nibool = mp->max_nibool_interfaces_crust_mantle;
+    // number of interface points per interface
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nibool_interfaces_crust_mantle,
+                                       num*sizeof(int)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_nibool_interfaces_crust_mantle,nibool_interfaces_crust_mantle,
+                                       num*sizeof(int),cudaMemcpyHostToDevice),1202);
+    // global (iglob) indices of the points on each interface
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_interfaces_crust_mantle,
+                                       num*max_nibool*sizeof(int)),1203);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_interfaces_crust_mantle,ibool_interfaces_crust_mantle,
+                                       num*max_nibool*sizeof(int),cudaMemcpyHostToDevice),1204);
+    // device-side staging buffer for the (3-component) acceleration exchange
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_crust_mantle),
+                                       3*max_nibool*num*sizeof(realw)),4004);
+  }
+
+  // inner core region (vector wavefield: 3 components per interface point)
+  mp->num_interfaces_inner_core = *num_interfaces_inner_core;
+  mp->max_nibool_interfaces_inner_core = *max_nibool_interfaces_inner_core;
+  if( mp->num_interfaces_inner_core > 0 ){
+    int num = mp->num_interfaces_inner_core;
+    int max_nibool = mp->max_nibool_interfaces_inner_core;
+    // number of interface points per interface
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nibool_interfaces_inner_core,
+                                       num*sizeof(int)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_nibool_interfaces_inner_core,nibool_interfaces_inner_core,
+                                       num*sizeof(int),cudaMemcpyHostToDevice),1202);
+    // global (iglob) indices of the points on each interface
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_interfaces_inner_core,
+                                       num*max_nibool*sizeof(int)),1203);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_interfaces_inner_core,ibool_interfaces_inner_core,
+                                       num*max_nibool*sizeof(int),cudaMemcpyHostToDevice),1204);
+    // device-side staging buffer for the (3-component) acceleration exchange
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_inner_core),
+                                       3*max_nibool*num*sizeof(realw)),4004);
+  }
+
+  // outer core region (scalar wavefield: a single value per interface point)
+  mp->num_interfaces_outer_core = *num_interfaces_outer_core;
+  mp->max_nibool_interfaces_outer_core = *max_nibool_interfaces_outer_core;
+  if( mp->num_interfaces_outer_core > 0 ){
+    int num = mp->num_interfaces_outer_core;
+    int max_nibool = mp->max_nibool_interfaces_outer_core;
+    // number of interface points per interface
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nibool_interfaces_outer_core,
+                                       num*sizeof(int)),1201);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_nibool_interfaces_outer_core,nibool_interfaces_outer_core,
+                                       num*sizeof(int),cudaMemcpyHostToDevice),1202);
+    // global (iglob) indices of the points on each interface
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_interfaces_outer_core,
+                                       num*max_nibool*sizeof(int)),1203);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_interfaces_outer_core,ibool_interfaces_outer_core,
+                                       num*max_nibool*sizeof(int),cudaMemcpyHostToDevice),1204);
+    // device-side staging buffer for the (scalar) wavefield exchange
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_outer_core),
+                                       max_nibool*num*sizeof(realw)),4004);
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Earth regions
+
+// CRUST / MANTLE
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// prepare_crust_mantle_device:
+// allocates GPU memory for the crust/mantle region and copies the host (Fortran) mesh
+// arrays onto the device: Jacobian derivatives (xix..gammaz), material parameters
+// (muv always; either kappav/kappah/muh/eta_aniso+tiso flags, or the full c11..c66
+// anisotropic tensor), global indexing (ibool), mesh coordinates, inner/outer element
+// phasing, the elastic wavefields and the mass matrix.
+// Mesh_pointer_f holds the address of the Mesh struct (cast back from a Fortran integer).
+// Callable from Fortran via the FC_FUNC_ name-mangling macro.
+// NOTE(review): many print_CUDA_error_if_any() tags are reused (e.g. 1010, 2005, 2100),
+// which makes the reported error site ambiguous — consider unique tags per call.
+extern "C"
+void FC_FUNC_(prepare_crust_mantle_device,
+              PREPARE_CRUST_MANTLE_DEVICE)(long* Mesh_pointer_f,
+                                        realw* h_xix, realw* h_xiy, realw* h_xiz,
+                                        realw* h_etax, realw* h_etay, realw* h_etaz,
+                                        realw* h_gammax, realw* h_gammay, realw* h_gammaz,
+                                        realw* h_rho,
+                                        realw* h_kappav, realw* h_muv,
+                                        realw* h_kappah, realw* h_muh,
+                                        realw* h_eta_aniso,
+                                        realw* h_rmass,
+                                        int* h_ibool,
+                                        realw* h_xstore, realw* h_ystore, realw* h_zstore,
+                                        int* h_ispec_is_tiso,
+                                        realw *c11store,realw *c12store,realw *c13store,
+                                        realw *c14store,realw *c15store,realw *c16store,
+                                        realw *c22store,realw *c23store,realw *c24store,
+                                        realw *c25store,realw *c26store,realw *c33store,
+                                        realw *c34store,realw *c35store,realw *c36store,
+                                        realw *c44store,realw *c45store,realw *c46store,
+                                        realw *c55store,realw *c56store,realw *c66store,
+                                        int* num_phase_ispec,
+                                        int* phase_ispec_inner,
+                                        int* nspec_outer,
+                                        int* nspec_inner
+                                        ) {
+
+  TRACE("prepare_crust_mantle_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  /* Assuming NGLLX=5. Padded is then 128 (5^3+3) */
+  // size_padded: per-element arrays use a 128-entry stride instead of 125
+  // (presumably for aligned/coalesced device access — TODO confirm in kernels)
+  int size_padded = NGLL3_PADDED * (mp->NSPEC_CRUST_MANTLE);
+  int size_glob = mp->NGLOB_CRUST_MANTLE;
+
+  // mesh
+  // Jacobian matrix derivatives, one padded NGLL3 chunk per spectral element
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xix_crust_mantle, size_padded*sizeof(realw)),1001);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiy_crust_mantle, size_padded*sizeof(realw)),1002);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiz_crust_mantle, size_padded*sizeof(realw)),1003);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etax_crust_mantle, size_padded*sizeof(realw)),1004);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etay_crust_mantle, size_padded*sizeof(realw)),1005);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etaz_crust_mantle, size_padded*sizeof(realw)),1006);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammax_crust_mantle, size_padded*sizeof(realw)),1007);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammay_crust_mantle, size_padded*sizeof(realw)),1008);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammaz_crust_mantle, size_padded*sizeof(realw)),1009);
+  // muv needed in case for attenuation (only Q_mu shear attenuation)
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_muvstore_crust_mantle, size_padded*sizeof(realw)),1011);
+
+  // transfer constant element data with padding
+  // one cudaMemcpy of NGLL3 values per element, re-striding from the contiguous
+  // host layout (NGLL3) to the padded device layout (NGLL3_PADDED).
+  // NOTE(review): NSPEC individual H2D copies; a single cudaMemcpy2D with
+  // dpitch=NGLL3_PADDED, spitch=NGLL3 could batch each array — confirm perf need.
+  for(int i=0;i < mp->NSPEC_CRUST_MANTLE;i++) {
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_xix_crust_mantle + i*NGLL3_PADDED, &h_xix[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1501);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_xiy_crust_mantle+i*NGLL3_PADDED,   &h_xiy[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1502);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_xiz_crust_mantle+i*NGLL3_PADDED,   &h_xiz[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1503);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_etax_crust_mantle+i*NGLL3_PADDED,  &h_etax[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1504);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_etay_crust_mantle+i*NGLL3_PADDED,  &h_etay[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1505);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_etaz_crust_mantle+i*NGLL3_PADDED,  &h_etaz[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1506);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_gammax_crust_mantle+i*NGLL3_PADDED,&h_gammax[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1507);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_gammay_crust_mantle+i*NGLL3_PADDED,&h_gammay[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1508);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaz_crust_mantle+i*NGLL3_PADDED,&h_gammaz[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1509);
+    // muvstore
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_muvstore_crust_mantle+i*NGLL3_PADDED,   &h_muv[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1511);
+  }
+
+  // global indexing
+  // note: allocation is padded-size but the copy is one contiguous NGLL3*NSPEC
+  // transfer, i.e. ibool is stored UNPADDED (NGLL3 stride) on the device —
+  // TODO(review) confirm kernels index d_ibool with NGLL3, not NGLL3_PADDED
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_crust_mantle, size_padded*sizeof(int)),1021);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_crust_mantle, h_ibool,
+                                     NGLL3*(mp->NSPEC_CRUST_MANTLE)*sizeof(int),cudaMemcpyHostToDevice),1022);
+
+  // transverse isotropic elements
+  // only needed if not anisotropic 3D mantle
+  if( ! mp->anisotropic_3D_mantle ){
+    // no anisotropy
+
+    // transverse isotropy flag
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ispec_is_tiso_crust_mantle, (mp->NSPEC_CRUST_MANTLE)*sizeof(int)),1025);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_tiso_crust_mantle, h_ispec_is_tiso,
+                                       (mp->NSPEC_CRUST_MANTLE)*sizeof(int),cudaMemcpyHostToDevice),1025);
+    
+    // kappavstore, kappahstore
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_kappavstore_crust_mantle, size_padded*sizeof(realw)),1010);
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_kappahstore_crust_mantle, size_padded*sizeof(realw)),1010);
+    // muhstore
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_muhstore_crust_mantle, size_padded*sizeof(realw)),1010);
+    // eta_anisostore
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_eta_anisostore_crust_mantle, size_padded*sizeof(realw)),1010);
+
+    // transfer with padding
+    for(int i=0;i < mp->NSPEC_CRUST_MANTLE;i++) {
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_kappavstore_crust_mantle+i*NGLL3_PADDED,&h_kappav[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1510);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_kappahstore_crust_mantle+i*NGLL3_PADDED,&h_kappah[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1510);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_muhstore_crust_mantle+i*NGLL3_PADDED,&h_muh[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1510);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_eta_anisostore_crust_mantle+i*NGLL3_PADDED,&h_eta_aniso[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1510);
+    }
+  }else{
+    // anisotropic 3D mantle
+    // full 21-component elastic tensor c11..c66 per GLL point
+    
+    // allocates memory on GPU
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c11store_crust_mantle),
+                                       size_padded*sizeof(realw)),4700);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c12store_crust_mantle),
+                                       size_padded*sizeof(realw)),4701);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c13store_crust_mantle),
+                                       size_padded*sizeof(realw)),4702);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c14store_crust_mantle),
+                                       size_padded*sizeof(realw)),4703);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c15store_crust_mantle),
+                                       size_padded*sizeof(realw)),4704);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c16store_crust_mantle),
+                                       size_padded*sizeof(realw)),4705);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c22store_crust_mantle),
+                                       size_padded*sizeof(realw)),4706);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c23store_crust_mantle),
+                                       size_padded*sizeof(realw)),4707);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c24store_crust_mantle),
+                                       size_padded*sizeof(realw)),4708);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c25store_crust_mantle),
+                                       size_padded*sizeof(realw)),4709);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c26store_crust_mantle),
+                                       size_padded*sizeof(realw)),4710);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c33store_crust_mantle),
+                                       size_padded*sizeof(realw)),4711);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c34store_crust_mantle),
+                                       size_padded*sizeof(realw)),4712);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c35store_crust_mantle),
+                                       size_padded*sizeof(realw)),4713);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c36store_crust_mantle),
+                                       size_padded*sizeof(realw)),4714);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c44store_crust_mantle),
+                                       size_padded*sizeof(realw)),4715);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c45store_crust_mantle),
+                                       size_padded*sizeof(realw)),4716);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c46store_crust_mantle),
+                                       size_padded*sizeof(realw)),4717);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c55store_crust_mantle),
+                                       size_padded*sizeof(realw)),4718);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c56store_crust_mantle),
+                                       size_padded*sizeof(realw)),4719);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c66store_crust_mantle),
+                                       size_padded*sizeof(realw)),4720);
+
+    // transfer constant element data with padding
+    for(int i=0;i < mp->NSPEC_CRUST_MANTLE;i++) {
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c11store_crust_mantle + i*NGLL3_PADDED, &c11store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4800);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c12store_crust_mantle + i*NGLL3_PADDED, &c12store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4801);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c13store_crust_mantle + i*NGLL3_PADDED, &c13store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4802);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c14store_crust_mantle + i*NGLL3_PADDED, &c14store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4803);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c15store_crust_mantle + i*NGLL3_PADDED, &c15store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4804);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c16store_crust_mantle + i*NGLL3_PADDED, &c16store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4805);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c22store_crust_mantle + i*NGLL3_PADDED, &c22store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4806);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c23store_crust_mantle + i*NGLL3_PADDED, &c23store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4807);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c24store_crust_mantle + i*NGLL3_PADDED, &c24store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4808);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c25store_crust_mantle + i*NGLL3_PADDED, &c25store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4809);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c26store_crust_mantle + i*NGLL3_PADDED, &c26store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4810);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c33store_crust_mantle + i*NGLL3_PADDED, &c33store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4811);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c34store_crust_mantle + i*NGLL3_PADDED, &c34store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4812);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c35store_crust_mantle + i*NGLL3_PADDED, &c35store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4813);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c36store_crust_mantle + i*NGLL3_PADDED, &c36store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4814);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c44store_crust_mantle + i*NGLL3_PADDED, &c44store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4815);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c45store_crust_mantle + i*NGLL3_PADDED, &c45store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4816);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c46store_crust_mantle + i*NGLL3_PADDED, &c46store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4817);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c55store_crust_mantle + i*NGLL3_PADDED, &c55store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4818);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c56store_crust_mantle + i*NGLL3_PADDED, &c56store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4819);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c66store_crust_mantle + i*NGLL3_PADDED, &c66store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4820);
+    }
+  }
+
+  // needed for boundary kernel calculations
+  // rho only transferred for adjoint runs (simulation_type == 3) that save the boundary mesh
+  if( mp->simulation_type == 3 && mp->save_boundary_mesh ){
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_rhostore_crust_mantle, size_padded*sizeof(realw)),1010);
+    for(int i=0;i < mp->NSPEC_CRUST_MANTLE;i++) {
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_rhostore_crust_mantle+i*NGLL3_PADDED, &h_rho[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),2106);
+    }
+  }
+
+  // mesh locations
+  // ystore & zstore needed for tiso elements
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ystore_crust_mantle),sizeof(realw)*size_glob),2005);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_ystore_crust_mantle,h_ystore,
+                                     sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_zstore_crust_mantle),sizeof(realw)*size_glob),2005);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_zstore_crust_mantle,h_zstore,
+                                     sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
+  
+  // xstore only needed when gravity is on
+  if( mp->gravity ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_xstore_crust_mantle),sizeof(realw)*size_glob),2005);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_xstore_crust_mantle,h_xstore,
+                                     sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
+  }
+
+  // inner/outer elements
+  // phase_ispec_inner has 2 columns: element lists for the two computation phases
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_crust_mantle),
+                                     mp->num_phase_ispec_crust_mantle*2*sizeof(int)),2008);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_crust_mantle,phase_ispec_inner,
+                                     mp->num_phase_ispec_crust_mantle*2*sizeof(int),cudaMemcpyHostToDevice),2101);
+
+  mp->nspec_outer_crust_mantle = *nspec_outer;
+  mp->nspec_inner_crust_mantle = *nspec_inner;
+
+  // wavefield
+  // elastic region: NDIM components per global point (allocated only, not initialized here)
+  int size = NDIM * mp->NGLOB_CRUST_MANTLE;
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ_crust_mantle),sizeof(realw)*size),4001);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc_crust_mantle),sizeof(realw)*size),4002);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel_crust_mantle),sizeof(realw)*size),4003);
+  // backward/reconstructed wavefield
+  if( mp->simulation_type == 3 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ_crust_mantle),sizeof(realw)*size),4001);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc_crust_mantle),sizeof(realw)*size),4002);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel_crust_mantle),sizeof(realw)*size),4003);
+  }
+
+  // mass matrix
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_crust_mantle),sizeof(realw)*size_glob),2005);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_crust_mantle,h_rmass,
+                                     sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
+
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("prepare_crust_mantle_device");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// OUTER CORE
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// prepare_outer_core_device:
+// allocates GPU memory for the (fluid) outer core region and copies the host (Fortran)
+// mesh arrays onto the device: Jacobian derivatives (xix..gammaz), kappav (plus rho for
+// adjoint/kernel runs), global indexing (ibool), x/y/z coordinates, inner/outer element
+// phasing, the scalar wavefield arrays and the mass matrix.
+// Mesh_pointer_f holds the address of the Mesh struct (cast back from a Fortran integer).
+// Callable from Fortran via the FC_FUNC_ name-mangling macro.
+// NOTE(review): several print_CUDA_error_if_any() tags are reused (2005/2100/1010),
+// making the reported error site ambiguous — consider unique tags per call.
+extern "C"
+void FC_FUNC_(prepare_outer_core_device,
+              PREPARE_OUTER_CORE_DEVICE)(long* Mesh_pointer_f,
+                                         realw* h_xix, realw* h_xiy, realw* h_xiz,
+                                         realw* h_etax, realw* h_etay, realw* h_etaz,
+                                         realw* h_gammax, realw* h_gammay, realw* h_gammaz,
+                                         realw* h_rho, realw* h_kappav,
+                                         realw* h_rmass,
+                                         int* h_ibool,
+                                         realw* h_xstore, realw* h_ystore, realw* h_zstore,
+                                         int* num_phase_ispec,
+                                         int* phase_ispec_inner,
+                                         int* nspec_outer,
+                                         int* nspec_inner
+                                         ) {
+
+  TRACE("prepare_outer_core_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  /* Assuming NGLLX=5. Padded is then 128 (5^3+3) */
+  // size_padded: per-element arrays use a 128-entry stride instead of 125
+  // (presumably for aligned/coalesced device access — TODO confirm in kernels)
+  int size_padded = NGLL3_PADDED * (mp->NSPEC_OUTER_CORE);
+  int size_glob = mp->NGLOB_OUTER_CORE;
+
+  // mesh
+  // Jacobian matrix derivatives, one padded NGLL3 chunk per spectral element
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xix_outer_core, size_padded*sizeof(realw)),1001);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiy_outer_core, size_padded*sizeof(realw)),1002);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiz_outer_core, size_padded*sizeof(realw)),1003);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etax_outer_core, size_padded*sizeof(realw)),1004);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etay_outer_core, size_padded*sizeof(realw)),1005);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etaz_outer_core, size_padded*sizeof(realw)),1006);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammax_outer_core, size_padded*sizeof(realw)),1007);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammay_outer_core, size_padded*sizeof(realw)),1008);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammaz_outer_core, size_padded*sizeof(realw)),1009);
+
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_kappavstore_outer_core, size_padded*sizeof(realw)),1010);
+
+  // transfer constant element data with padding
+  // one cudaMemcpy of NGLL3 values per element, re-striding from the contiguous
+  // host layout (NGLL3) to the padded device layout (NGLL3_PADDED).
+  // NOTE(review): NSPEC individual H2D copies; cudaMemcpy2D could batch each array.
+  for(int i=0;i < mp->NSPEC_OUTER_CORE;i++) {
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_xix_outer_core + i*NGLL3_PADDED, &h_xix[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1501);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_xiy_outer_core+i*NGLL3_PADDED,   &h_xiy[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1502);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_xiz_outer_core+i*NGLL3_PADDED,   &h_xiz[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1503);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_etax_outer_core+i*NGLL3_PADDED,  &h_etax[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1504);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_etay_outer_core+i*NGLL3_PADDED,  &h_etay[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1505);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_etaz_outer_core+i*NGLL3_PADDED,  &h_etaz[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1506);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_gammax_outer_core+i*NGLL3_PADDED,&h_gammax[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1507);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_gammay_outer_core+i*NGLL3_PADDED,&h_gammay[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1508);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaz_outer_core+i*NGLL3_PADDED,&h_gammaz[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1509);
+
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_kappavstore_outer_core+i*NGLL3_PADDED,&h_kappav[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1510);
+  }
+
+  // needed for kernel calculations
+  // rho only transferred for adjoint runs (simulation_type == 3)
+  if( mp->simulation_type == 3 ){
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_rhostore_outer_core, size_padded*sizeof(realw)),1010);
+    for(int i=0;i < mp->NSPEC_OUTER_CORE;i++) {
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_rhostore_outer_core+i*NGLL3_PADDED, &h_rho[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),2106);
+    }
+  }
+
+  // global indexing
+  // note: allocation is padded-size but the copy is one contiguous NGLL3*NSPEC
+  // transfer, i.e. ibool is stored UNPADDED (NGLL3 stride) on the device —
+  // TODO(review) confirm kernels index d_ibool with NGLL3, not NGLL3_PADDED
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_outer_core, size_padded*sizeof(int)),1021);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_outer_core, h_ibool,
+                                     NGLL3*(mp->NSPEC_OUTER_CORE)*sizeof(int),cudaMemcpyHostToDevice),1022);
+
+  // mesh locations
+  // always needed
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_xstore_outer_core),sizeof(realw)*size_glob),2005);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_xstore_outer_core,h_xstore,
+                                     sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ystore_outer_core),sizeof(realw)*size_glob),2005);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_ystore_outer_core,h_ystore,
+                                     sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_zstore_outer_core),sizeof(realw)*size_glob),2005);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_zstore_outer_core,h_zstore,
+                                     sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
+
+  // inner/outer elements
+  // phase_ispec_inner has 2 columns: element lists for the two computation phases
+  mp->num_phase_ispec_outer_core = *num_phase_ispec;
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_outer_core),
+                                     mp->num_phase_ispec_outer_core*2*sizeof(int)),2008);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_outer_core,phase_ispec_inner,
+                                     mp->num_phase_ispec_outer_core*2*sizeof(int),cudaMemcpyHostToDevice),2101);
+
+  mp->nspec_outer_outer_core = *nspec_outer;
+  mp->nspec_inner_outer_core = *nspec_inner;
+
+  // wavefield
+  // fluid region: scalar field, one value per global point (no NDIM factor);
+  // allocated only, not initialized here
+  int size = mp->NGLOB_OUTER_CORE;
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ_outer_core),sizeof(realw)*size),4001);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc_outer_core),sizeof(realw)*size),4002);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel_outer_core),sizeof(realw)*size),4003);
+  // backward/reconstructed wavefield
+  if( mp->simulation_type == 3 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ_outer_core),sizeof(realw)*size),4001);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc_outer_core),sizeof(realw)*size),4002);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel_outer_core),sizeof(realw)*size),4003);
+  }
+
+  // mass matrix
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_outer_core),sizeof(realw)*size_glob),2005);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_outer_core,h_rmass,
+                                     sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
+
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("prepare_outer_core_device");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// INNER CORE
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(prepare_inner_core_device,
+              PREPARE_INNER_CORE_DEVICE)(long* Mesh_pointer_f,
+                                           realw* h_xix, realw* h_xiy, realw* h_xiz,
+                                           realw* h_etax, realw* h_etay, realw* h_etaz,
+                                           realw* h_gammax, realw* h_gammay, realw* h_gammaz,
+                                           realw* h_rho, realw* h_kappav, realw* h_muv,
+                                           realw* h_rmass,
+                                           int* h_ibool,
+                                           realw* h_xstore, realw* h_ystore, realw* h_zstore,
+                                           realw *c11store,realw *c12store,realw *c13store,
+                                           realw *c33store,realw *c44store,
+                                           int* h_idoubling_inner_core,
+                                           int* num_phase_ispec,
+                                           int* phase_ispec_inner,
+                                           int* nspec_outer,
+                                           int* nspec_inner
+                                           //int* iboolleft_xi, int* iboolright_xi,
+                                           //int* iboolleft_eta, int* iboolright_eta,
+                                           //int* npoin2D_xi, int* npoin2D_eta
+                                           ) {
+
+  TRACE("prepare_inner_core_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  /* Assuming NGLLX=5. Padded is then 128 (5^3+3) */
+  int size_padded = NGLL3_PADDED * (mp->NSPEC_INNER_CORE);
+  int size_glob = mp->NGLOB_INNER_CORE;
+
+  // mesh
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xix_inner_core, size_padded*sizeof(realw)),1001);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiy_inner_core, size_padded*sizeof(realw)),1002);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiz_inner_core, size_padded*sizeof(realw)),1003);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etax_inner_core, size_padded*sizeof(realw)),1004);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etay_inner_core, size_padded*sizeof(realw)),1005);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etaz_inner_core, size_padded*sizeof(realw)),1006);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammax_inner_core, size_padded*sizeof(realw)),1007);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammay_inner_core, size_padded*sizeof(realw)),1008);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammaz_inner_core, size_padded*sizeof(realw)),1009);
+
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_muvstore_inner_core, size_padded*sizeof(realw)),1011);
+
+  // transfer constant element data with padding
+  for(int i=0;i < mp->NSPEC_INNER_CORE;i++) {
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_xix_inner_core + i*NGLL3_PADDED, &h_xix[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1501);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_xiy_inner_core+i*NGLL3_PADDED,   &h_xiy[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1502);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_xiz_inner_core+i*NGLL3_PADDED,   &h_xiz[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1503);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_etax_inner_core+i*NGLL3_PADDED,  &h_etax[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1504);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_etay_inner_core+i*NGLL3_PADDED,  &h_etay[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1505);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_etaz_inner_core+i*NGLL3_PADDED,  &h_etaz[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1506);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_gammax_inner_core+i*NGLL3_PADDED,&h_gammax[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1507);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_gammay_inner_core+i*NGLL3_PADDED,&h_gammay[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1508);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaz_inner_core+i*NGLL3_PADDED,&h_gammaz[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1509);
+
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_muvstore_inner_core+i*NGLL3_PADDED,   &h_muv[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1511);
+  }
+
+  // anisotropy
+  if( ! mp->anisotropic_inner_core ){
+    // no anisotropy (uses kappav and muv in inner core)
+    // kappavstore needed
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_kappavstore_inner_core, size_padded*sizeof(realw)),1010);
+    for(int i=0;i < mp->NSPEC_INNER_CORE;i++) {
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_kappavstore_inner_core+i*NGLL3_PADDED,&h_kappav[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1510);
+    }
+  }else{
+    // anisotropic inner core
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c11store_inner_core),
+                                     size_padded*sizeof(realw)),4700);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c12store_inner_core),
+                                     size_padded*sizeof(realw)),4701);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c13store_inner_core),
+                                     size_padded*sizeof(realw)),4702);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c33store_inner_core),
+                                     size_padded*sizeof(realw)),4703);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c44store_inner_core),
+                                     size_padded*sizeof(realw)),4704);
+
+    // transfer constant element data with padding
+    for(int i=0;i < mp->NSPEC_INNER_CORE;i++) {
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c11store_inner_core + i*NGLL3_PADDED, &c11store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4800);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c12store_inner_core + i*NGLL3_PADDED, &c12store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4801);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c13store_inner_core + i*NGLL3_PADDED, &c13store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4802);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c33store_inner_core + i*NGLL3_PADDED, &c33store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4803);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c44store_inner_core + i*NGLL3_PADDED, &c44store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4804);
+    }
+  }
+
+  // needed for boundary kernel calculations
+  if( mp->simulation_type == 3 && mp->save_boundary_mesh ){
+    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_rhostore_inner_core, size_padded*sizeof(realw)),1010);
+    for(int i=0;i < mp->NSPEC_INNER_CORE;i++) {
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_rhostore_inner_core+i*NGLL3_PADDED, &h_rho[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),2106);
+    }
+  }
+
+  // global indexing
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_inner_core, size_padded*sizeof(int)),1021);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_inner_core, h_ibool,
+                                     NGLL3*(mp->NSPEC_INNER_CORE)*sizeof(int),cudaMemcpyHostToDevice),1022);
+
+  // fictitious element flags
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_idoubling_inner_core,
+                                     mp->NSPEC_INNER_CORE*sizeof(int)),2010);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_idoubling_inner_core, h_idoubling_inner_core,
+                                     mp->NSPEC_INNER_CORE*sizeof(int),cudaMemcpyHostToDevice),2011);
+
+  // mesh locations
+  // only needed when gravity is on
+  if( mp->gravity ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_xstore_inner_core),sizeof(realw)*size_glob),2005);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_xstore_inner_core,h_xstore,
+                                       sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ystore_inner_core),sizeof(realw)*size_glob),2005);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_ystore_inner_core,h_ystore,
+                                       sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_zstore_inner_core),sizeof(realw)*size_glob),2005);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_zstore_inner_core,h_zstore,
+                                       sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
+  }
+
+  // inner/outer elements
+  mp->num_phase_ispec_inner_core = *num_phase_ispec;
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_inner_core),
+                                     mp->num_phase_ispec_inner_core*2*sizeof(int)),2008);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_inner_core,phase_ispec_inner,
+                                     mp->num_phase_ispec_inner_core*2*sizeof(int),cudaMemcpyHostToDevice),2101);
+
+  mp->nspec_outer_inner_core = *nspec_outer;
+  mp->nspec_inner_inner_core = *nspec_inner;
+
+
+  // wavefield
+  int size = NDIM * mp->NGLOB_INNER_CORE;
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ_inner_core),sizeof(realw)*size),4001);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc_inner_core),sizeof(realw)*size),4002);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel_inner_core),sizeof(realw)*size),4003);
+  // backward/reconstructed wavefield
+  if( mp->simulation_type == 3 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ_inner_core),sizeof(realw)*size),4001);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc_inner_core),sizeof(realw)*size),4002);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel_inner_core),sizeof(realw)*size),4003);
+  }
+
+  // mass matrix
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_inner_core),sizeof(realw)*size_glob),2005);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_inner_core,h_rmass,
+                                     sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
+
+
+  // mpi communication
+//  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_iboolright_xi_inner_core,npoin2D_xi*sizeof(int)),270);
+//  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_iboolleft_xi_inner_core,npoin2D_xi*sizeof(int)),280);
+//  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_iboolright_eta_inner_core,npoin2D_eta*sizeof(int)),290);
+//  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_iboolleft_eta_inner_core,npoin2D_eta*sizeof(int)),300);
+//  
+//  print_CUDA_error_if_any(cudaMemcpy(mp->d_iboolright_xi_inner_core,iboolright_xi,
+//                                     npoin2D_xi*sizeof(int),cudaMemcpyHostToDevice),91);
+//  print_CUDA_error_if_any(cudaMemcpy(mp->d_iboolleft_xi_inner_core,iboolleft_xi,
+//                                     npoin2D_xi*sizeof(int),cudaMemcpyHostToDevice),92);
+//  print_CUDA_error_if_any(cudaMemcpy(mp->d_iboolright_eta_inner_core,iboolright_eta,
+//                                     npoin2D_eta*sizeof(int),cudaMemcpyHostToDevice),93);
+//  print_CUDA_error_if_any(cudaMemcpy(mp->d_iboolleft_eta_inner_core,iboolleft_eta,
+//                                     npoin2D_eta*sizeof(int),cudaMemcpyHostToDevice),94);
+  
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("prepare_inner_core_device");
+#endif
+}                                  
+ 
+
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// for ACOUSTIC simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+/*
+  NOTE (review): the routine below is disabled (block-commented) in this branch;
+  it appears to be carried over from the Cartesian SPECFEM3D CUDA code and is
+  kept for reference only — confirm before re-enabling.
+
+extern "C"
+void FC_FUNC_(prepare_fields_acoustic_device,
+              PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f,
+                                              realw* rmass_acoustic,
+                                              realw* rhostore,
+                                              realw* kappastore,
+                                              int* num_phase_ispec_acoustic,
+                                              int* phase_ispec_inner_acoustic,
+                                              int* ispec_is_acoustic,
+                                              int* NOISE_TOMOGRAPHY,
+                                              int* num_free_surface_faces,
+                                              int* free_surface_ispec,
+                                              int* free_surface_ijk,
+                                              int* ABSORBING_CONDITIONS,
+                                              int* b_reclen_potential,
+                                              realw* b_absorb_potential,
+                                              int* ELASTIC_SIMULATION,
+                                              int* num_coupling_ac_el_faces,
+                                              int* coupling_ac_el_ispec,
+                                              int* coupling_ac_el_ijk,
+                                              realw* coupling_ac_el_normal,
+                                              realw* coupling_ac_el_jacobian2Dw,
+                                              int* num_colors_outer_acoustic,
+                                              int* num_colors_inner_acoustic,
+                                              int* num_elem_colors_acoustic) {
+
+  TRACE("prepare_fields_acoustic_device");
+
+  // Purpose: allocate and initialize all acoustic-domain GPU arrays:
+  // potentials, MPI send buffer, mass matrix, rho/kappa stores, phase
+  // (inner/outer) element lists, free-surface arrays, absorbing-boundary
+  // buffer, seismogram staging buffers, acoustic-elastic coupling arrays
+  // and mesh-coloring bookkeeping.
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+  // Assuming NGLLX==5. Padded is then 128 (5^3+3)
+  int size_padded = NGLL3_PADDED * mp->NSPEC_AB;
+  int size_nonpadded = NGLL3 * mp->NSPEC_AB;
+  int size_glob = mp->NGLOB_AB;
+
+  // allocates arrays on device (GPU)
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_acoustic),sizeof(realw)*size_glob),2001);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_acoustic),sizeof(realw)*size_glob),2002);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_dot_acoustic),sizeof(realw)*size_glob),2003);
+
+  // mpi buffer
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_potential_dot_dot_buffer),
+                      (mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),2004);
+
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_acoustic),sizeof(realw)*size_glob),2005);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_acoustic,rmass_acoustic,
+                                     sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
+
+  // padded array
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rhostore),size_padded*sizeof(realw)),2006);
+  // transfer constant element data with padding
+  // (element-by-element copy: host layout is NGLL3-strided, device layout is NGLL3_PADDED-strided)
+  for(int i=0; i < mp->NSPEC_AB; i++) {
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_rhostore+i*NGLL3_PADDED, &rhostore[i*NGLL3],
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),2106);
+  }
+
+  // non-padded array
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappastore),size_nonpadded*sizeof(realw)),2007);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_kappastore,kappastore,
+                                     NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),2105);
+
+  // phase elements
+  mp->num_phase_ispec_acoustic = *num_phase_ispec_acoustic;
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_acoustic),
+                                      mp->num_phase_ispec_acoustic*2*sizeof(int)),2008);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_acoustic,phase_ispec_inner_acoustic,
+                                     mp->num_phase_ispec_acoustic*2*sizeof(int),cudaMemcpyHostToDevice),2101);
+
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_acoustic),
+                                     mp->NSPEC_AB*sizeof(int)),2009);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_acoustic,ispec_is_acoustic,
+                                     mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),2102);
+
+  // free surface
+  if( *NOISE_TOMOGRAPHY == 0 ){
+    // allocate surface arrays
+    mp->num_free_surface_faces = *num_free_surface_faces;
+    if( mp->num_free_surface_faces > 0 ){
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec),
+                                       mp->num_free_surface_faces*sizeof(int)),2201);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec,
+                                       mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),2203);
+
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk),
+                                       3*NGLL2*mp->num_free_surface_faces*sizeof(int)),2202);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
+                                       3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),2204);
+    }
+  }
+
+  // absorbing boundaries
+  if( *ABSORBING_CONDITIONS ){
+    // note: d_b_reclen_potential is used directly as the transfer size in bytes below
+    mp->d_b_reclen_potential = *b_reclen_potential;
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_potential),mp->d_b_reclen_potential),2301);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_potential,b_absorb_potential,
+                                       mp->d_b_reclen_potential,cudaMemcpyHostToDevice),2302);
+  }
+
+
+  // for seismograms
+  if( mp->nrec_local > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_potential),
+                                       mp->nrec_local*NGLL3*sizeof(realw)),2107);
+
+    mp->h_station_seismo_potential = (realw*) malloc( mp->nrec_local*NGLL3*sizeof(realw) );
+    if( mp->h_station_seismo_potential == NULL) exit_on_error("error allocating h_station_seismo_potential");
+  }
+
+
+  // coupling with elastic parts
+  if( *ELASTIC_SIMULATION && *num_coupling_ac_el_faces > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_ispec),
+                                       (*num_coupling_ac_el_faces)*sizeof(int)),2601);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_ispec,coupling_ac_el_ispec,
+                                       (*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),2602);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_ijk),
+                                       3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int)),2603);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_ijk,coupling_ac_el_ijk,
+                                       3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),2604);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_normal),
+                                        3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),2605);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_normal,coupling_ac_el_normal,
+                                        3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),2606);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_jacobian2Dw),
+                                        NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),2607);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_jacobian2Dw,coupling_ac_el_jacobian2Dw,
+                                        NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),2608);
+
+  }
+
+  // mesh coloring
+  if( mp->use_mesh_coloring_gpu ){
+    mp->num_colors_outer_acoustic = *num_colors_outer_acoustic;
+    mp->num_colors_inner_acoustic = *num_colors_inner_acoustic;
+    // note: keeps a pointer to the host array (no device copy, no ownership transfer)
+    mp->h_num_elem_colors_acoustic = (int*) num_elem_colors_acoustic;
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("prepare_fields_acoustic_device");
+#endif
+}
+*/
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/*
+  NOTE (review): the routine below is disabled (block-commented) in this branch;
+  kept for reference only — confirm before re-enabling.
+
+extern "C"
+void FC_FUNC_(prepare_fields_acoustic_adj_dev,
+              PREPARE_FIELDS_ACOUSTIC_ADJ_DEV)(long* Mesh_pointer_f,
+                                              int* SIMULATION_TYPE,
+                                              int* APPROXIMATE_HESS_KL) {
+
+  TRACE("prepare_fields_acoustic_adj_dev");
+
+  // Purpose: for kernel simulations (SIMULATION_TYPE == 3) only, allocate the
+  // backward/reconstructed acoustic potentials and zero-initialize the
+  // rho/kappa acoustic kernels (plus optional Hessian preconditioner) on the GPU.
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  int size_glob = mp->NGLOB_AB;
+
+  // kernel simulations
+  if( *SIMULATION_TYPE != 3 ) return;
+
+  // allocates backward/reconstructed arrays on device (GPU)
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_acoustic),sizeof(realw)*size_glob),3014);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_acoustic),sizeof(realw)*size_glob),3015);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_dot_acoustic),sizeof(realw)*size_glob),3016);
+
+  // allocates kernels
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),3017);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),3018);
+
+  // initializes kernel values to zero
+  print_CUDA_error_if_any(cudaMemset(mp->d_rho_ac_kl,0,
+                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),3019);
+  print_CUDA_error_if_any(cudaMemset(mp->d_kappa_ac_kl,0,
+                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),3020);
+
+  // preconditioner
+  if( *APPROXIMATE_HESS_KL ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),3030);
+    // initializes with zeros
+    print_CUDA_error_if_any(cudaMemset(mp->d_hess_ac_kl,0,
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),3031);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("prepare_fields_acoustic_adj_dev");
+#endif
+}
+*/
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// for ELASTIC simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/*
+extern "C"
+void FC_FUNC_(prepare_fields_elastic_device,
+              PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer_f,
+                                             int* size,
+                                             realw* rmass,
+                                             realw* rho_vp,
+                                             realw* rho_vs,
+                                             int* num_phase_ispec_elastic,
+                                             int* phase_ispec_inner_elastic,
+                                             int* ispec_is_elastic,
+                                             int* ABSORBING_CONDITIONS,
+                                             realw* h_b_absorb_field,
+                                             int* h_b_reclen_field,
+                                             int* SIMULATION_TYPE,int* SAVE_FORWARD,
+                                             int* COMPUTE_AND_STORE_STRAIN,
+                                             realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+                                             realw* epsilondev_xz,realw* epsilondev_yz,
+                                             int* ATTENUATION,
+                                             int* R_size,
+                                             realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
+                                             realw* one_minus_sum_beta,realw* factor_common,
+                                             realw* alphaval,realw* betaval,realw* gammaval,
+                                             int* OCEANS,
+                                             realw* rmass_ocean_load,
+                                             int* NOISE_TOMOGRAPHY,
+                                             realw* free_surface_normal,
+                                             int* free_surface_ispec,
+                                             int* free_surface_ijk,
+                                             int* num_free_surface_faces,
+                                             int* ACOUSTIC_SIMULATION,
+                                             int* num_colors_outer_elastic,
+                                             int* num_colors_inner_elastic,
+                                             int* num_elem_colors_elastic,
+                                             int* ANISOTROPY,
+                                             realw *c11store,
+                                             realw *c12store,
+                                             realw *c13store,
+                                             realw *c14store,
+                                             realw *c15store,
+                                             realw *c16store,
+                                             realw *c22store,
+                                             realw *c23store,
+                                             realw *c24store,
+                                             realw *c25store,
+                                             realw *c26store,
+                                             realw *c33store,
+                                             realw *c34store,
+                                             realw *c35store,
+                                             realw *c36store,
+                                             realw *c44store,
+                                             realw *c45store,
+                                             realw *c46store,
+                                             realw *c55store,
+                                             realw *c56store,
+                                             realw *c66store){
+
+TRACE("prepare_fields_elastic_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+  // Assuming NGLLX==5. Padded is then 128 (5^3+3)
+  int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
+  int size_nonpadded = NGLL3 * (mp->NSPEC_AB);
+
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ),sizeof(realw)*(*size)),4001);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(realw)*(*size)),4002);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel),sizeof(realw)*(*size)),4003);
+
+  // mpi buffer
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),
+                        3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),4004);
+
+  // mass matrix
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass),sizeof(realw)*mp->NGLOB_AB),4005);
+  // transfer element data
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass,rmass,
+                                     sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),4010);
+
+
+  // element indices
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_elastic),mp->NSPEC_AB*sizeof(int)),4009);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_elastic,ispec_is_elastic,
+                                     mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),4012);
+
+  // phase elements
+  mp->num_phase_ispec_elastic = *num_phase_ispec_elastic;
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_elastic),
+                                     mp->num_phase_ispec_elastic*2*sizeof(int)),4008);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_elastic,phase_ispec_inner_elastic,
+                                     mp->num_phase_ispec_elastic*2*sizeof(int),cudaMemcpyHostToDevice),4011);
+
+  // for seismograms
+  if( mp->nrec_local > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_field),
+                                     3*NGLL3*(mp->nrec_local)*sizeof(realw)),4015);
+
+    mp->h_station_seismo_field = (realw*) malloc( 3*NGLL3*(mp->nrec_local)*sizeof(realw) );
+    if( mp->h_station_seismo_field == NULL) exit_on_error("h_station_seismo_field not allocated \n");
+  }
+
+  // absorbing conditions
+  if( *ABSORBING_CONDITIONS && mp->d_num_abs_boundary_faces > 0){
+    // non-padded arrays
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vp),size_nonpadded*sizeof(realw)),4006);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vs),size_nonpadded*sizeof(realw)),4007);
+
+    // rho_vp, rho_vs non-padded; they are needed for stacey boundary condition
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vp, rho_vp,
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4013);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vs, rho_vs,
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4014);
+
+    // absorb_field array used for file i/o
+    if(*SIMULATION_TYPE == 3 || ( *SIMULATION_TYPE == 1 && *SAVE_FORWARD )){
+      mp->d_b_reclen_field = *h_b_reclen_field;
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_field),
+                                       mp->d_b_reclen_field),4016);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field, h_b_absorb_field,
+                                       mp->d_b_reclen_field,cudaMemcpyHostToDevice),4017);
+    }
+  }
+
+  // strains used for attenuation and kernel simulations
+  if( *COMPUTE_AND_STORE_STRAIN ){
+    // strains
+    int epsilondev_size = NGLL3*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xx,
+                                       epsilondev_size*sizeof(realw)),4301);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xx,epsilondev_xx,epsilondev_size*sizeof(realw),
+                                       cudaMemcpyHostToDevice),4302);
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yy,
+                                       epsilondev_size*sizeof(realw)),4302);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yy,epsilondev_yy,epsilondev_size*sizeof(realw),
+                                       cudaMemcpyHostToDevice),4303);
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xy,
+                                       epsilondev_size*sizeof(realw)),4304);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xy,epsilondev_xy,epsilondev_size*sizeof(realw),
+                                       cudaMemcpyHostToDevice),4305);
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xz,
+                                       epsilondev_size*sizeof(realw)),4306);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xz,epsilondev_xz,epsilondev_size*sizeof(realw),
+                                       cudaMemcpyHostToDevice),4307);
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yz,
+                                       epsilondev_size*sizeof(realw)),4308);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yz,epsilondev_yz,epsilondev_size*sizeof(realw),
+                                       cudaMemcpyHostToDevice),4309);
+
+  }
+
+  // attenuation memory variables
+  if( *ATTENUATION ){
+    // memory arrays
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xx),
+                                       (*R_size)*sizeof(realw)),4401);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xx,R_xx,(*R_size)*sizeof(realw),
+                                       cudaMemcpyHostToDevice),4402);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yy),
+                                       (*R_size)*sizeof(realw)),4403);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yy,R_yy,(*R_size)*sizeof(realw),
+                                       cudaMemcpyHostToDevice),4404);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xy),
+                                       (*R_size)*sizeof(realw)),4405);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xy,R_xy,(*R_size)*sizeof(realw),
+                                       cudaMemcpyHostToDevice),4406);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xz),
+                                       (*R_size)*sizeof(realw)),4407);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xz,R_xz,(*R_size)*sizeof(realw),
+                                       cudaMemcpyHostToDevice),4408);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yz),
+                                       (*R_size)*sizeof(realw)),4409);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yz,R_yz,(*R_size)*sizeof(realw),
+                                       cudaMemcpyHostToDevice),4410);
+
+    // attenuation factors
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_one_minus_sum_beta),
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),4430);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_one_minus_sum_beta ,one_minus_sum_beta,
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4431);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_factor_common),
+                                       N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw)),4432);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_factor_common ,factor_common,
+                                       N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4433);
+
+    // alpha,beta,gamma factors
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_alphaval),
+                                       N_SLS*sizeof(realw)),4434);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_alphaval ,alphaval,
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4435);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_betaval),
+                                       N_SLS*sizeof(realw)),4436);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_betaval ,betaval,
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4437);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_gammaval),
+                                       N_SLS*sizeof(realw)),4438);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaval ,gammaval,
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4439);
+
+  }
+
+  // anisotropy
+  if( *ANISOTROPY ){
+    // allocates memory on GPU
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c11store),
+                                       size_padded*sizeof(realw)),4700);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c12store),
+                                       size_padded*sizeof(realw)),4701);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c13store),
+                                       size_padded*sizeof(realw)),4702);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c14store),
+                                       size_padded*sizeof(realw)),4703);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c15store),
+                                       size_padded*sizeof(realw)),4704);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c16store),
+                                       size_padded*sizeof(realw)),4705);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c22store),
+                                       size_padded*sizeof(realw)),4706);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c23store),
+                                       size_padded*sizeof(realw)),4707);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c24store),
+                                       size_padded*sizeof(realw)),4708);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c25store),
+                                       size_padded*sizeof(realw)),4709);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c26store),
+                                       size_padded*sizeof(realw)),4710);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c33store),
+                                       size_padded*sizeof(realw)),4711);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c34store),
+                                       size_padded*sizeof(realw)),4712);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c35store),
+                                       size_padded*sizeof(realw)),4713);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c36store),
+                                       size_padded*sizeof(realw)),4714);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c44store),
+                                       size_padded*sizeof(realw)),4715);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c45store),
+                                       size_padded*sizeof(realw)),4716);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c46store),
+                                       size_padded*sizeof(realw)),4717);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c55store),
+                                       size_padded*sizeof(realw)),4718);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c56store),
+                                       size_padded*sizeof(realw)),4719);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c66store),
+                                       size_padded*sizeof(realw)),4720);
+
+    // transfer constant element data with padding
+    for(int i=0;i < mp->NSPEC_AB;i++) {
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c11store + i*NGLL3_PADDED, &c11store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4800);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c12store + i*NGLL3_PADDED, &c12store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4801);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c13store + i*NGLL3_PADDED, &c13store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4802);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c14store + i*NGLL3_PADDED, &c14store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4803);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c15store + i*NGLL3_PADDED, &c15store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4804);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c16store + i*NGLL3_PADDED, &c16store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4805);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c22store + i*NGLL3_PADDED, &c22store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4806);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c23store + i*NGLL3_PADDED, &c23store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4807);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c24store + i*NGLL3_PADDED, &c24store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4808);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c25store + i*NGLL3_PADDED, &c25store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4809);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c26store + i*NGLL3_PADDED, &c26store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4810);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c33store + i*NGLL3_PADDED, &c33store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4811);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c34store + i*NGLL3_PADDED, &c34store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4812);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c35store + i*NGLL3_PADDED, &c35store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4813);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c36store + i*NGLL3_PADDED, &c36store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4814);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c44store + i*NGLL3_PADDED, &c44store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4815);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c45store + i*NGLL3_PADDED, &c45store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4816);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c46store + i*NGLL3_PADDED, &c46store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4817);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c55store + i*NGLL3_PADDED, &c55store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4818);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c56store + i*NGLL3_PADDED, &c56store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4819);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_c66store + i*NGLL3_PADDED, &c66store[i*NGLL3],
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4820);
+    }
+  }
+
+  // ocean load approximation
+  if( *OCEANS ){
+    // oceans needs a free surface
+    mp->num_free_surface_faces = *num_free_surface_faces;
+    if( mp->num_free_surface_faces > 0 ){
+      // mass matrix
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_ocean_load),
+                                         sizeof(realw)*mp->NGLOB_AB),4501);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_ocean_load,rmass_ocean_load,
+                                         sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),4502);
+      // surface normal
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_normal),
+                                         3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw)),4503);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_normal,free_surface_normal,
+                                         3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice),4504);
+
+      // temporary global array: used to synchronize updates on global accel array
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_updated_dof_ocean_load),
+                                         sizeof(int)*mp->NGLOB_AB),4505);
+
+      if( *NOISE_TOMOGRAPHY == 0 && *ACOUSTIC_SIMULATION == 0 ){
+        print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec),
+                                          mp->num_free_surface_faces*sizeof(int)),4601);
+        print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec,
+                                          mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4603);
+        print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk),
+                                          3*NGLL2*mp->num_free_surface_faces*sizeof(int)),4602);
+        print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
+                                          3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4604);
+      }
+    }
+  }
+
+  // mesh coloring
+  if( mp->use_mesh_coloring_gpu ){
+    mp->num_colors_outer_elastic = *num_colors_outer_elastic;
+    mp->num_colors_inner_elastic = *num_colors_inner_elastic;
+    mp->h_num_elem_colors_elastic = (int*) num_elem_colors_elastic;
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("prepare_fields_elastic_device");
+#endif
+}
+*/
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/*
+// prepare_fields_elastic_adj_dev():
+// allocates and initializes device (GPU) arrays needed by adjoint / kernel
+// elastic simulations (SIMULATION_TYPE == 3): backward wavefields
+// (b_displ/b_veloc/b_accel), rho/mu/kappa sensitivity kernels (zeroed),
+// forward & backward strain fields, backward attenuation memory variables
+// and, optionally, the approximate Hessian kernel.
+// NOTE(review): this routine is currently commented out in this branch.
+extern "C"
+void FC_FUNC_(prepare_fields_elastic_adj_dev,
+              PREPARE_FIELDS_ELASTIC_ADJ_DEV)(long* Mesh_pointer_f,
+                                             int* size,
+                                             int* SIMULATION_TYPE,
+                                             int* COMPUTE_AND_STORE_STRAIN,
+                                             realw* epsilon_trace_over_3,
+                                             realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
+                                             realw* b_epsilondev_xz,realw* b_epsilondev_yz,
+                                             realw* b_epsilon_trace_over_3,
+                                             int* ATTENUATION,
+                                             int* R_size,
+                                             realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
+                                             realw* b_alphaval,realw* b_betaval,realw* b_gammaval,
+                                             int* APPROXIMATE_HESS_KL){
+
+  TRACE("prepare_fields_elastic_adj_dev");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  // checks if kernel simulation
+  if( *SIMULATION_TYPE != 3 ) return;
+
+  // kernel simulations
+  // allocates backward/reconstructed arrays on device (GPU)
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ),sizeof(realw)*(*size)),5201);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc),sizeof(realw)*(*size)),5202);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel),sizeof(realw)*(*size)),5203);
+
+  // allocates kernels
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5204);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_mu_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5205);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5206);
+
+  // initializes kernel values to zero
+  print_CUDA_error_if_any(cudaMemset(mp->d_rho_kl,0,
+                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),5207);
+  print_CUDA_error_if_any(cudaMemset(mp->d_mu_kl,0,
+                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),5208);
+  print_CUDA_error_if_any(cudaMemset(mp->d_kappa_kl,0,
+                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),5209);
+
+  // strains used for attenuation and kernel simulations
+  if( *COMPUTE_AND_STORE_STRAIN ){
+    // strains
+    int epsilondev_size = NGLL3*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
+
+    // solid pressure
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_epsilon_trace_over_3),
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),5310);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilon_trace_over_3,epsilon_trace_over_3,
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),5311);
+    // backward solid pressure
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilon_trace_over_3),
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),5312);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilon_trace_over_3 ,b_epsilon_trace_over_3,
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),5313);
+    // prepares backward strains
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xx),
+                                       epsilondev_size*sizeof(realw)),5321);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yy),
+                                       epsilondev_size*sizeof(realw)),5322);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xy),
+                                       epsilondev_size*sizeof(realw)),5323);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xz),
+                                       epsilondev_size*sizeof(realw)),5324);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yz),
+                                       epsilondev_size*sizeof(realw)),5325);
+
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx,
+                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5326);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy,
+                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5327);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy,
+                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5328);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz,
+                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5329);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz,
+                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5330);
+  }
+
+  // attenuation memory variables
+  if( *ATTENUATION ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xx),
+                                       (*R_size)*sizeof(realw)),5421);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xx,b_R_xx,(*R_size)*sizeof(realw),
+                                       cudaMemcpyHostToDevice),5422);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yy),
+                                       (*R_size)*sizeof(realw)),5423);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yy,b_R_yy,(*R_size)*sizeof(realw),
+                                       cudaMemcpyHostToDevice),5424);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xy),
+                                       (*R_size)*sizeof(realw)),5425);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xy,b_R_xy,(*R_size)*sizeof(realw),
+                                       cudaMemcpyHostToDevice),5426);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xz),
+                                       (*R_size)*sizeof(realw)),5427);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xz,b_R_xz,(*R_size)*sizeof(realw),
+                                       cudaMemcpyHostToDevice),5428);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yz),
+                                       (*R_size)*sizeof(realw)),5429);
+    // was 5420: out-of-sequence error id (5421..5429 precede it); 5430 keeps the
+    // identifier unique so print_CUDA_error_if_any reports point at the right call
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yz,b_R_yz,(*R_size)*sizeof(realw),
+                                       cudaMemcpyHostToDevice),5430);
+
+    // alpha,beta,gamma factors for backward fields
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_alphaval),
+                                       N_SLS*sizeof(realw)),5434);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_alphaval ,b_alphaval,
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5435);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_betaval),
+                                       N_SLS*sizeof(realw)),5436);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_betaval ,b_betaval,
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5437);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_gammaval),
+                                       N_SLS*sizeof(realw)),5438);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_gammaval ,b_gammaval,
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5439);
+  }
+
+  if( *APPROXIMATE_HESS_KL ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_el_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5450);
+    // initializes with zeros
+    print_CUDA_error_if_any(cudaMemset(mp->d_hess_el_kl,0,
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),5451);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("prepare_fields_elastic_adj_dev");
+#endif
+}
+*/
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// purely adjoint & kernel simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/*
+// prepare_sim2_or_3_const_device():
+// GPU setup for purely adjoint / kernel runs (name suggests SIMULATION_TYPE 2 or 3;
+// no explicit check here -- the caller presumably guards this).
+// - copies islice_selected_rec (receiver -> MPI slice mapping) to the device,
+// - for slices owning adjoint receivers (nadj_rec_local > 0), allocates the
+//   device adjoint source array and precomputes on the HOST the local receiver
+//   index list h_pre_computed_irec, because the serial irec_local counter in
+//   the original accel-update loop cannot be parallelized on the device,
+// - allocates a host staging buffer (h_adj_sourcearrays_slice) for later
+//   per-timestep transfers of adjoint source values.
+// NOTE(review): this routine is currently commented out in this branch.
+extern "C"
+void FC_FUNC_(prepare_sim2_or_3_const_device,
+              PREPARE_SIM2_OR_3_CONST_DEVICE)(
+                                              long* Mesh_pointer_f,
+                                              int* islice_selected_rec,
+                                              int* islice_selected_rec_size,
+                                              int* nadj_rec_local,
+                                              int* nrec,
+                                              int* myrank) {
+
+  TRACE("prepare_sim2_or_3_const_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  // allocates arrays for receivers
+  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_islice_selected_rec,
+                                     *islice_selected_rec_size*sizeof(int)),6001);
+  // copies arrays to GPU device
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_islice_selected_rec,islice_selected_rec,
+                                     *islice_selected_rec_size*sizeof(int),cudaMemcpyHostToDevice),6002);
+
+  // adjoint source arrays
+  mp->nadj_rec_local = *nadj_rec_local;
+  if( mp->nadj_rec_local > 0 ){
+    // 3 components * NGLL3 points per local adjoint receiver (device only; filled later)
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_adj_sourcearrays,
+                                       (mp->nadj_rec_local)*3*NGLL3*sizeof(realw)),6003);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_pre_computed_irec,
+                                       (mp->nadj_rec_local)*sizeof(int)),6004);
+
+    // prepares local irec array:
+    // the irec_local variable needs to be precomputed (as
+    // h_pre_comp..), because normally it is in the loop updating accel,
+    // and due to how it's incremented, it cannot be parallelized
+    int* h_pre_computed_irec = (int*) malloc( (mp->nadj_rec_local)*sizeof(int) );
+    if( h_pre_computed_irec == NULL ) exit_on_error("prepare_sim2_or_3_const_device: h_pre_computed_irec not allocated\n");
+
+    // collects the global receiver ids handled by this MPI slice, in order
+    int irec_local = 0;
+    for(int irec = 0; irec < *nrec; irec++) {
+      if(*myrank == islice_selected_rec[irec]) {
+        irec_local++;
+        h_pre_computed_irec[irec_local-1] = irec;
+      }
+    }
+    // sanity check: count found here must match the count passed in by the caller
+    if( irec_local != mp->nadj_rec_local ) exit_on_error("prepare_sim2_or_3_const_device: irec_local not equal\n");
+    // copies values onto GPU
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_pre_computed_irec,h_pre_computed_irec,
+                                       (mp->nadj_rec_local)*sizeof(int),cudaMemcpyHostToDevice),6010);
+    free(h_pre_computed_irec);
+
+    // temporary array to prepare extracted source array values
+    mp->h_adj_sourcearrays_slice = (realw*) malloc( (mp->nadj_rec_local)*3*NGLL3*sizeof(realw) );
+    if( mp->h_adj_sourcearrays_slice == NULL ) exit_on_error("h_adj_sourcearrays_slice not allocated\n");
+
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("prepare_sim2_or_3_const_device");
+#endif
+}
+*/
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// for NOISE simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/*
+// prepare_fields_noise_device():
+// GPU setup for noise simulations. Always transfers the free-surface element
+// list and ijk indexing, and allocates a surface movie buffer. Then, depending
+// on the NOISE_TOMOGRAPHY stage:
+//   == 1 : copies the time-dependent noise source array (3*NGLL3*NSTEP values),
+//   >  1 : copies surface normal components (x/y/z), noise mask and 2D
+//          jacobian weights for the free-surface faces,
+//   == 3 : allocates and zero-initializes the noise strength kernel Sigma_kl.
+// NOTE(review): this routine is currently commented out in this branch;
+// NSPEC_AB/NGLOB_AB/SIMULATION_TYPE parameters are unused in the visible body.
+extern "C"
+void FC_FUNC_(prepare_fields_noise_device,
+              PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f,
+                                           int* NSPEC_AB, int* NGLOB_AB,
+                                           int* free_surface_ispec,
+                                           int* free_surface_ijk,
+                                           int* num_free_surface_faces,
+                                           int* SIMULATION_TYPE,
+                                           int* NOISE_TOMOGRAPHY,
+                                           int* NSTEP,
+                                           realw* noise_sourcearray,
+                                           realw* normal_x_noise,
+                                           realw* normal_y_noise,
+                                           realw* normal_z_noise,
+                                           realw* mask_noise,
+                                           realw* free_surface_jacobian2Dw) {
+
+  TRACE("prepare_fields_noise_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  // free surface
+  mp->num_free_surface_faces = *num_free_surface_faces;
+
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ispec,
+                                     mp->num_free_surface_faces*sizeof(int)),7001);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec, free_surface_ispec,
+                                     mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),7002);
+
+  // 3 index values (i,j,k) per NGLL2 surface point per face -- presumably; confirm against mesher layout
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ijk,
+                                     3*NGLL2*mp->num_free_surface_faces*sizeof(int)),7003);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
+                                     3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),7004);
+
+  // alloc storage for the surface buffer to be copied
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_noise_surface_movie,
+                                     3*NGLL2*mp->num_free_surface_faces*sizeof(realw)),7005);
+
+  // prepares noise source array
+  if( *NOISE_TOMOGRAPHY == 1 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_noise_sourcearray,
+                                       3*NGLL3*(*NSTEP)*sizeof(realw)),7101);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_noise_sourcearray, noise_sourcearray,
+                                       3*NGLL3*(*NSTEP)*sizeof(realw),cudaMemcpyHostToDevice),7102);
+  }
+
+  // prepares noise directions
+  if( *NOISE_TOMOGRAPHY > 1 ){
+    // one scalar per surface GLL point on all free-surface faces
+    int nface_size = NGLL2*(*num_free_surface_faces);
+    // allocates memory on GPU
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_x_noise,
+                                       nface_size*sizeof(realw)),7301);
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_y_noise,
+                                       nface_size*sizeof(realw)),7302);
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_z_noise,
+                                       nface_size*sizeof(realw)),7303);
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_mask_noise,
+                                       nface_size*sizeof(realw)),7304);
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_free_surface_jacobian2Dw,
+                                       nface_size*sizeof(realw)),7305);
+    // transfers data onto GPU
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_x_noise, normal_x_noise,
+                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),7306);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_y_noise, normal_y_noise,
+                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),7307);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_z_noise, normal_z_noise,
+                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),7308);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_mask_noise, mask_noise,
+                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),7309);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_jacobian2Dw, free_surface_jacobian2Dw,
+                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),7310);
+  }
+
+  // prepares noise strength kernel
+  if( *NOISE_TOMOGRAPHY == 3 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_Sigma_kl),
+                                       NGLL3*(mp->NSPEC_AB)*sizeof(realw)),7401);
+    // initializes kernel values to zero
+    print_CUDA_error_if_any(cudaMemset(mp->d_Sigma_kl,0,
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),7403);
+
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //printf("jacobian_size = %d\n",25*(*num_free_surface_faces));
+  exit_on_cuda_error("prepare_fields_noise_device");
+#endif
+}
+*/
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// cleanup
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(prepare_cleanup_device,
+              PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f) {
+
+TRACE("prepare_cleanup_device");
+
+  // frees allocated memory arrays
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  // frees memory on GPU
+  // crust_mantle
+  cudaFree(mp->d_xix_crust_mantle);
+  cudaFree(mp->d_xiy_crust_mantle);
+  cudaFree(mp->d_xiz_crust_mantle);
+  cudaFree(mp->d_etax_crust_mantle);
+  cudaFree(mp->d_etay_crust_mantle);
+  cudaFree(mp->d_etaz_crust_mantle);
+  cudaFree(mp->d_gammax_crust_mantle);
+  cudaFree(mp->d_gammay_crust_mantle);
+  cudaFree(mp->d_gammaz_crust_mantle);
+
+  cudaFree(mp->d_muvstore_crust_mantle);
+  cudaFree(mp->d_ibool_crust_mantle);
+
+  if( ! mp->anisotropic_3D_mantle ){
+    cudaFree(mp->d_kappavstore_crust_mantle);
+    cudaFree(mp->d_kappahstore_crust_mantle);
+    cudaFree(mp->d_muhstore_crust_mantle);
+    cudaFree(mp->d_eta_anisostore_crust_mantle);    
+    cudaFree(mp->d_ispec_is_tiso_crust_mantle);
+  }else{
+    cudaFree(mp->d_c11store_crust_mantle);
+    cudaFree(mp->d_c12store_crust_mantle);
+    cudaFree(mp->d_c13store_crust_mantle);
+    cudaFree(mp->d_c14store_crust_mantle);
+    cudaFree(mp->d_c15store_crust_mantle);
+    cudaFree(mp->d_c16store_crust_mantle);
+    cudaFree(mp->d_c22store_crust_mantle);
+    cudaFree(mp->d_c23store_crust_mantle);
+    cudaFree(mp->d_c24store_crust_mantle);
+    cudaFree(mp->d_c25store_crust_mantle);
+    cudaFree(mp->d_c26store_crust_mantle);
+    cudaFree(mp->d_c33store_crust_mantle);
+    cudaFree(mp->d_c34store_crust_mantle);
+    cudaFree(mp->d_c35store_crust_mantle);
+    cudaFree(mp->d_c36store_crust_mantle);
+    cudaFree(mp->d_c44store_crust_mantle);
+    cudaFree(mp->d_c45store_crust_mantle);
+    cudaFree(mp->d_c46store_crust_mantle);
+    cudaFree(mp->d_c55store_crust_mantle);
+    cudaFree(mp->d_c56store_crust_mantle);
+    cudaFree(mp->d_c66store_crust_mantle);
+  }
+
+  if( mp->simulation_type == 3 && mp->save_boundary_mesh ){
+    cudaFree(mp->d_rhostore_crust_mantle);
+  }
+
+  cudaFree(mp->d_ystore_crust_mantle);
+  cudaFree(mp->d_zstore_crust_mantle);
+  if( mp->gravity ){
+    cudaFree(mp->d_xstore_crust_mantle);
+  }
+
+  cudaFree(mp->d_displ_crust_mantle);
+  cudaFree(mp->d_veloc_crust_mantle);
+  cudaFree(mp->d_accel_crust_mantle);
+  if( mp->simulation_type == 3 ){
+    cudaFree(mp->d_b_displ_crust_mantle);
+    cudaFree(mp->d_b_veloc_crust_mantle);
+    cudaFree(mp->d_b_accel_crust_mantle);
+  }
+
+
+  // outer_core
+  cudaFree(mp->d_xix_outer_core);
+  cudaFree(mp->d_xiy_outer_core);
+  cudaFree(mp->d_xiz_outer_core);
+  cudaFree(mp->d_etax_outer_core);
+  cudaFree(mp->d_etay_outer_core);
+  cudaFree(mp->d_etaz_outer_core);
+  cudaFree(mp->d_gammax_outer_core);
+  cudaFree(mp->d_gammay_outer_core);
+  cudaFree(mp->d_gammaz_outer_core);
+
+  cudaFree(mp->d_kappavstore_outer_core);
+  if( mp->simulation_type == 3 ){
+    cudaFree(mp->d_rhostore_outer_core);
+  }
+
+  cudaFree(mp->d_xstore_outer_core);
+  cudaFree(mp->d_ystore_outer_core);
+  cudaFree(mp->d_zstore_outer_core);
+  cudaFree(mp->d_ibool_outer_core);
+
+  cudaFree(mp->d_displ_outer_core);
+  cudaFree(mp->d_veloc_outer_core);
+  cudaFree(mp->d_accel_outer_core);
+  if( mp->simulation_type == 3 ){
+    cudaFree(mp->d_b_displ_outer_core);
+    cudaFree(mp->d_b_veloc_outer_core);
+    cudaFree(mp->d_b_accel_outer_core);
+  }
+
+  // inner_core
+  cudaFree(mp->d_xix_inner_core);
+  cudaFree(mp->d_xiy_inner_core);
+  cudaFree(mp->d_xiz_inner_core);
+  cudaFree(mp->d_etax_inner_core);
+  cudaFree(mp->d_etay_inner_core);
+  cudaFree(mp->d_etaz_inner_core);
+  cudaFree(mp->d_gammax_inner_core);
+  cudaFree(mp->d_gammay_inner_core);
+  cudaFree(mp->d_gammaz_inner_core);
+
+
+  cudaFree(mp->d_muvstore_inner_core);
+  cudaFree(mp->d_ibool_inner_core);
+
+  if( mp->gravity ){
+    cudaFree(mp->d_xstore_inner_core);
+    cudaFree(mp->d_ystore_inner_core);
+    cudaFree(mp->d_zstore_inner_core);
+  }
+
+  if( ! mp->anisotropic_inner_core ){
+    cudaFree(mp->d_kappavstore_inner_core);
+  }else{
+    cudaFree(mp->d_c11store_inner_core);
+    cudaFree(mp->d_c12store_inner_core);
+    cudaFree(mp->d_c13store_inner_core);
+    cudaFree(mp->d_c33store_inner_core);
+    cudaFree(mp->d_c44store_inner_core);
+  }
+
+  if( mp->simulation_type == 3 && mp->save_boundary_mesh ){
+    cudaFree(mp->d_rhostore_inner_core);
+  }
+
+  cudaFree(mp->d_displ_inner_core);
+  cudaFree(mp->d_veloc_inner_core);
+  cudaFree(mp->d_accel_inner_core);
+  if( mp->simulation_type == 3 ) {
+    cudaFree(mp->d_b_displ_inner_core);
+    cudaFree(mp->d_b_veloc_inner_core);
+    cudaFree(mp->d_b_accel_inner_core);
+  }
+
+  // sources
+  if( mp->simulation_type == 1  || mp->simulation_type == 3 ){
+    cudaFree(mp->d_sourcearrays);
+    cudaFree(mp->d_stf_pre_compute);
+  }
+
+  cudaFree(mp->d_islice_selected_source);
+  cudaFree(mp->d_ispec_selected_source);
+
+  // receivers
+  if( mp->nrec_local > 0 ) cudaFree(mp->d_number_receiver_global);
+  cudaFree(mp->d_ispec_selected_rec);
+
+  // rotation arrays
+  if( mp->rotation ){
+    cudaFree(mp->d_A_array_rotation);
+    cudaFree(mp->d_B_array_rotation);
+    if( mp->simulation_type == 3 ){
+      cudaFree(mp->d_b_A_array_rotation);
+      cudaFree(mp->d_b_B_array_rotation);
+    }
+  }
+
+  // gravity arrays
+  if( ! mp->gravity ){
+    cudaFree(mp->d_d_ln_density_dr_table);
+  }else{
+    cudaFree(mp->d_minus_rho_g_over_kappa_fluid);
+    cudaFree(mp->d_minus_gravity_table);
+    cudaFree(mp->d_minus_deriv_gravity_table);
+    cudaFree(mp->d_density_table);
+  }
+
+  // attenuation arrays
+  if( mp->attenuation ){
+    cudaFree(mp->d_one_minus_sum_beta_crust_mantle);
+    cudaFree(mp->d_one_minus_sum_beta_inner_core);
+    if( ! mp->use_attenuation_mimic ){
+      cudaFree(mp->d_R_xx_crust_mantle);
+      cudaFree(mp->d_R_yy_crust_mantle);
+      cudaFree(mp->d_R_xy_crust_mantle);
+      cudaFree(mp->d_R_xz_crust_mantle);
+      cudaFree(mp->d_R_yz_crust_mantle);
+      cudaFree(mp->d_factor_common_crust_mantle);
+      cudaFree(mp->d_R_xx_inner_core);
+      cudaFree(mp->d_R_yy_inner_core);
+      cudaFree(mp->d_R_xy_inner_core);
+      cudaFree(mp->d_R_xz_inner_core);
+      cudaFree(mp->d_R_yz_inner_core);
+      cudaFree(mp->d_factor_common_inner_core);
+    }
+    cudaFree(mp->d_alphaval);
+    cudaFree(mp->d_betaval);
+    cudaFree(mp->d_gammaval);
+    if( mp->simulation_type == 3 ){
+      cudaFree(mp->d_b_alphaval);
+      cudaFree(mp->d_b_betaval);
+      cudaFree(mp->d_b_gammaval);
+    }
+  }
+
+  // strain
+  if( mp->compute_and_store_strain ){
+    cudaFree(mp->d_epsilondev_xx_crust_mantle);
+    cudaFree(mp->d_epsilondev_yy_crust_mantle);
+    cudaFree(mp->d_epsilondev_xy_crust_mantle);
+    cudaFree(mp->d_epsilondev_xz_crust_mantle);
+    cudaFree(mp->d_epsilondev_yz_crust_mantle);
+
+    cudaFree(mp->d_epsilondev_xx_inner_core);
+    cudaFree(mp->d_epsilondev_yy_inner_core);
+    cudaFree(mp->d_epsilondev_xy_inner_core);
+    cudaFree(mp->d_epsilondev_xz_inner_core);
+    cudaFree(mp->d_epsilondev_yz_inner_core);
+
+    cudaFree(mp->d_eps_trace_over_3_crust_mantle);
+    cudaFree(mp->d_eps_trace_over_3_inner_core);
+    if( mp->simulation_type == 3 ){
+      cudaFree(mp->d_b_epsilondev_xx_crust_mantle);
+      cudaFree(mp->d_b_epsilondev_yy_crust_mantle);
+      cudaFree(mp->d_b_epsilondev_xy_crust_mantle);
+      cudaFree(mp->d_b_epsilondev_xz_crust_mantle);
+      cudaFree(mp->d_b_epsilondev_yz_crust_mantle);
+
+      cudaFree(mp->d_b_epsilondev_xx_inner_core);
+      cudaFree(mp->d_b_epsilondev_yy_inner_core);
+      cudaFree(mp->d_b_epsilondev_xy_inner_core);
+      cudaFree(mp->d_b_epsilondev_xz_inner_core);
+      cudaFree(mp->d_b_epsilondev_yz_inner_core);
+
+      cudaFree(mp->d_b_eps_trace_over_3_crust_mantle);
+      cudaFree(mp->d_b_eps_trace_over_3_inner_core);
+    }
+  }
+
+
+/*
+  // absorbing boundaries
+  if( *ABSORBING_CONDITIONS && mp->d_num_abs_boundary_faces > 0 ){
+    cudaFree(mp->d_abs_boundary_ispec);
+    cudaFree(mp->d_abs_boundary_ijk);
+    cudaFree(mp->d_abs_boundary_normal);
+    cudaFree(mp->d_abs_boundary_jacobian2Dw);
+  }
+
+  // interfaces
+  cudaFree(mp->d_nibool_interfaces_ext_mesh);
+  cudaFree(mp->d_ibool_interfaces_ext_mesh);
+
+  // global indexing
+  cudaFree(mp->d_ispec_is_inner);
+  cudaFree(mp->d_ibool);
+
+  // ACOUSTIC arrays
+  if( *ACOUSTIC_SIMULATION ){
+    cudaFree(mp->d_potential_acoustic);
+    cudaFree(mp->d_potential_dot_acoustic);
+    cudaFree(mp->d_potential_dot_dot_acoustic);
+    cudaFree(mp->d_send_potential_dot_dot_buffer);
+    cudaFree(mp->d_rmass_acoustic);
+    cudaFree(mp->d_rhostore);
+    cudaFree(mp->d_kappastore);
+    cudaFree(mp->d_phase_ispec_inner_acoustic);
+    cudaFree(mp->d_ispec_is_acoustic);
+
+    if( *NOISE_TOMOGRAPHY == 0 ){
+      cudaFree(mp->d_free_surface_ispec);
+      cudaFree(mp->d_free_surface_ijk);
+    }
+
+    if( *ABSORBING_CONDITIONS ) cudaFree(mp->d_b_absorb_potential);
+
+    if( *SIMULATION_TYPE == 3 ) {
+      cudaFree(mp->d_b_potential_acoustic);
+      cudaFree(mp->d_b_potential_dot_acoustic);
+      cudaFree(mp->d_b_potential_dot_dot_acoustic);
+      cudaFree(mp->d_rho_ac_kl);
+      cudaFree(mp->d_kappa_ac_kl);
+      if( *APPROXIMATE_HESS_KL) cudaFree(mp->d_hess_ac_kl);
+    }
+
+
+    if(mp->nrec_local > 0 ){
+      cudaFree(mp->d_station_seismo_potential);
+      free(mp->h_station_seismo_potential);
+    }
+
+  } // ACOUSTIC_SIMULATION
+
+  // ELASTIC arrays
+  if( *ELASTIC_SIMULATION ){
+    cudaFree(mp->d_displ);
+    cudaFree(mp->d_veloc);
+    cudaFree(mp->d_accel);
+    cudaFree(mp->d_send_accel_buffer);
+    cudaFree(mp->d_rmass);
+
+    cudaFree(mp->d_phase_ispec_inner_elastic);
+    cudaFree(mp->d_ispec_is_elastic);
+
+    if( mp->nrec_local > 0 ){
+      cudaFree(mp->d_station_seismo_field);
+      free(mp->h_station_seismo_field);
+    }
+
+    if( *ABSORBING_CONDITIONS && mp->d_num_abs_boundary_faces > 0){
+      cudaFree(mp->d_rho_vp);
+      cudaFree(mp->d_rho_vs);
+
+      if(*SIMULATION_TYPE == 3 || ( *SIMULATION_TYPE == 1 && *SAVE_FORWARD ))
+          cudaFree(mp->d_b_absorb_field);
+    }
+
+    if( *SIMULATION_TYPE == 3 ) {
+      cudaFree(mp->d_b_displ);
+      cudaFree(mp->d_b_veloc);
+      cudaFree(mp->d_b_accel);
+      cudaFree(mp->d_rho_kl);
+      cudaFree(mp->d_mu_kl);
+      cudaFree(mp->d_kappa_kl);
+      if( *APPROXIMATE_HESS_KL ) cudaFree(mp->d_hess_el_kl);
+    }
+
+    if( *COMPUTE_AND_STORE_STRAIN ){
+      cudaFree(mp->d_epsilondev_xx);
+      cudaFree(mp->d_epsilondev_yy);
+      cudaFree(mp->d_epsilondev_xy);
+      cudaFree(mp->d_epsilondev_xz);
+      cudaFree(mp->d_epsilondev_yz);
+      if( *SIMULATION_TYPE == 3 ){
+        cudaFree(mp->d_epsilon_trace_over_3);
+        cudaFree(mp->d_b_epsilon_trace_over_3);
+        cudaFree(mp->d_b_epsilondev_xx);
+        cudaFree(mp->d_b_epsilondev_yy);
+        cudaFree(mp->d_b_epsilondev_xy);
+        cudaFree(mp->d_b_epsilondev_xz);
+        cudaFree(mp->d_b_epsilondev_yz);
+      }
+    }
+
+    if( *ATTENUATION ){
+      cudaFree(mp->d_factor_common);
+      cudaFree(mp->d_one_minus_sum_beta);
+      cudaFree(mp->d_alphaval);
+      cudaFree(mp->d_betaval);
+      cudaFree(mp->d_gammaval);
+      cudaFree(mp->d_R_xx);
+      cudaFree(mp->d_R_yy);
+      cudaFree(mp->d_R_xy);
+      cudaFree(mp->d_R_xz);
+      cudaFree(mp->d_R_yz);
+      if( *SIMULATION_TYPE == 3){
+        cudaFree(mp->d_b_R_xx);
+        cudaFree(mp->d_b_R_yy);
+        cudaFree(mp->d_b_R_xy);
+        cudaFree(mp->d_b_R_xz);
+        cudaFree(mp->d_b_R_yz);
+        cudaFree(mp->d_b_alphaval);
+        cudaFree(mp->d_b_betaval);
+        cudaFree(mp->d_b_gammaval);
+      }
+    }
+
+    if( *ANISOTROPY ){
+      cudaFree(mp->d_c11store);
+      cudaFree(mp->d_c12store);
+      cudaFree(mp->d_c13store);
+      cudaFree(mp->d_c14store);
+      cudaFree(mp->d_c15store);
+      cudaFree(mp->d_c16store);
+      cudaFree(mp->d_c22store);
+      cudaFree(mp->d_c23store);
+      cudaFree(mp->d_c24store);
+      cudaFree(mp->d_c25store);
+      cudaFree(mp->d_c26store);
+      cudaFree(mp->d_c33store);
+      cudaFree(mp->d_c34store);
+      cudaFree(mp->d_c35store);
+      cudaFree(mp->d_c36store);
+      cudaFree(mp->d_c44store);
+      cudaFree(mp->d_c45store);
+      cudaFree(mp->d_c46store);
+      cudaFree(mp->d_c55store);
+      cudaFree(mp->d_c56store);
+      cudaFree(mp->d_c66store);
+    }
+
+    if( *OCEANS ){
+      if( mp->num_free_surface_faces > 0 ){
+        cudaFree(mp->d_rmass_ocean_load);
+        cudaFree(mp->d_free_surface_normal);
+        cudaFree(mp->d_updated_dof_ocean_load);
+        if( *NOISE_TOMOGRAPHY == 0){
+          cudaFree(mp->d_free_surface_ispec);
+          cudaFree(mp->d_free_surface_ijk);
+        }
+      }
+    }
+  } // ELASTIC_SIMULATION
+
+  // purely adjoint & kernel array
+  if( *SIMULATION_TYPE == 2 || *SIMULATION_TYPE == 3 ){
+    cudaFree(mp->d_islice_selected_rec);
+    if(mp->nadj_rec_local > 0 ){
+      cudaFree(mp->d_adj_sourcearrays);
+      cudaFree(mp->d_pre_computed_irec);
+      free(mp->h_adj_sourcearrays_slice);
+    }
+  }
+
+  // NOISE arrays
+  if( *NOISE_TOMOGRAPHY > 0 ){
+    cudaFree(mp->d_free_surface_ispec);
+    cudaFree(mp->d_free_surface_ijk);
+    cudaFree(mp->d_noise_surface_movie);
+    if( *NOISE_TOMOGRAPHY == 1 ) cudaFree(mp->d_noise_sourcearray);
+    if( *NOISE_TOMOGRAPHY > 1 ){
+      cudaFree(mp->d_normal_x_noise);
+      cudaFree(mp->d_normal_y_noise);
+      cudaFree(mp->d_normal_z_noise);
+      cudaFree(mp->d_mask_noise);
+      cudaFree(mp->d_free_surface_jacobian2Dw);
+    }
+    if( *NOISE_TOMOGRAPHY == 3 ) cudaFree(mp->d_Sigma_kl);
+  }
+
+*/
+
+  // mesh pointer - not needed anymore
+  free(mp);
+}
+

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,316 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
#include <errno.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>   /* for strerror() — previously called without a prototype */

#ifdef WITH_MPI
#include <mpi.h>
#endif
+
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+
/* Appends one maximum-surface value to a per-process trace file
 * (maxval_surface_proc_<rank>.dat), one value per line.
 * Fortran-callable: the argument is passed by reference. */
void save_to_max_surface_file_(float* maxval) {
  int rank;
  char filename[BUFSIZ];
  FILE* fp;
#ifdef WITH_MPI
  MPI_Comm_rank(MPI_COMM_WORLD,&rank);
#else
  rank = 0;
#endif
  sprintf(filename,"maxval_surface_proc_%03d.dat",rank);
  fp = fopen(filename,"a+");
  if( fp == NULL ) {
    /* fix: fprintf(NULL,...) used to crash if the open failed */
    fprintf(stderr,"Error opening %s: %s\n",filename,strerror(errno));
    return;
  }
  fprintf(fp,"%e\n",*maxval);
  fclose(fp);
}
+
/* Dumps a float vector to a raw binary debug file.
 * cpu_or_gpu selects the name: 0 -> debug_output_cpu_<id>.dat,
 * otherwise debug_output_gpu_<id>.dat. Fortran-callable. */
void save_fvector_(float* vector, int* size, int* id, int* cpu_or_gpu) {
  FILE* fp;
  char filename[BUFSIZ];
  if(*cpu_or_gpu == 0) {
    sprintf(filename, "debug_output_cpu_%d.dat",*id);
  }
  else {
    sprintf(filename, "debug_output_gpu_%d.dat",*id);
  }
  fp = fopen(filename, "wb");
  if( fp == NULL ) {
    /* fix: open failure previously led to fwrite(...,NULL) */
    fprintf(stderr,"Error opening %s: %s\n",filename,strerror(errno));
    return;
  }
  printf("writing vector, vector[0]=%e\n",vector[0]);
  if( fwrite(vector, sizeof(float), *size, fp) != (size_t)*size ) {
    /* fix: short writes were silently ignored */
    fprintf(stderr,"Short write to %s\n",filename);
  }
  fclose(fp);
}
+
/* Dumps an int vector to a raw binary debug file.
 * cpu_or_gpu selects the name: 0 -> debug_output_cpu_<id>.dat,
 * otherwise debug_output_gpu_<id>.dat. Fortran-callable. */
void save_ivector_(int* vector, int* size, int* id, int* cpu_or_gpu) {
  FILE* fp;
  char filename[BUFSIZ];
  if(*cpu_or_gpu == 0) {
    sprintf(filename, "debug_output_cpu_%d.dat",*id);
  }
  else {
    sprintf(filename, "debug_output_gpu_%d.dat",*id);
  }
  fp = fopen(filename, "wb");
  if( fp == NULL ) {
    /* fix: open failure previously led to fwrite(...,NULL) */
    fprintf(stderr,"Error opening %s: %s\n",filename,strerror(errno));
    return;
  }
  if( fwrite(vector, sizeof(int), *size, fp) != (size_t)*size ) {
    /* fix: short writes were silently ignored */
    fprintf(stderr,"Short write to %s\n",filename);
  }
  fclose(fp);
}
+
+
/* Scans the per-process surface-movie binary dump of a previous run and
 * prints the maximum float value found across all NSTEP iterations.
 * NOTE(review): the input path is hard-coded to a scratch filesystem.
 * Fortran-callable: both arguments are passed by reference. */
void get_max_from_surface_file_(int* nodes_per_iterationf,int* NSTEP) {
  int nodes_per_iteration = *nodes_per_iterationf;
  char filename[BUFSIZ];
  int procid;
#ifdef WITH_MPI
  MPI_Comm_rank(MPI_COMM_WORLD,&procid);
#else
  procid = 0;
#endif
  sprintf(filename,"/scratch/eiger/rietmann/SPECFEM3D_AIGLE/in_out_files/DATABASES_MPI/proc%06d_surface_movie",procid);

  FILE* fp; int it;
  printf("Opening %s for analysis\n",filename);
  fp = fopen(filename,"rb");
  if(fp == 0) {
    printf("FILE ERROR:%s\n",strerror(errno));
    perror("file error\n");
    exit(1);
  }

  float* vector = (float*)malloc(nodes_per_iteration*sizeof(float));
  if( vector == NULL ) {
    /* fix: malloc result was not checked */
    fprintf(stderr,"Out of memory in get_max_from_surface_file_\n");
    exit(1);
  }
  float max_val = -HUGE_VALF;   /* fix: max_val was read uninitialized */
  int i;
  for(it=0;it<*NSTEP;it++) {
    /* fix: offset computed in 'int' could overflow for large files */
    long pos = (long)sizeof(float)*(long)nodes_per_iteration*(long)it;
    fseek(fp,pos,SEEK_SET);
    if( fread(vector,sizeof(float),nodes_per_iteration,fp) != (size_t)nodes_per_iteration ) {
      /* fix: read errors / premature EOF were silently ignored */
      fprintf(stderr,"Short read at it=%d\n",it);
      break;
    }
    for(i=0;i<nodes_per_iteration;i++) {
      if( vector[i] > max_val ) max_val = vector[i];
    }
    if(it % 500 == 0) {
      printf("scanning it=%d\n",it);
    }
  }
  printf("max_val=%e\n",max_val);
  free(vector);   /* fix: buffer was leaked */
  fclose(fp);     /* fix: file handle was leaked */
}
+
/* Element-wise exact comparison of two float vectors.
 * Prints up to the first nine mismatches and the total count,
 * and returns the count through num_errors. Fortran-callable. */
void compare_two_vectors_exact_(int* sizef,float* vector1,float* vector2,int* num_errors) {

  int n = *sizef;
  int mismatches = 0;
  int idx;

  for(idx = 0; idx < n; idx++) {
    if(vector1[idx] == vector2[idx]) continue;   /* identical bits-for-value: not an error */
    mismatches++;
    if(mismatches < 10) {
      printf("err[%d]: %e != %e\n",idx,vector1[idx],vector2[idx]);
    }
  }
  printf("**** Error Count: %d ****\n",mismatches);
  *num_errors = mismatches;
}
+
/* Element-wise approximate comparison of two float vectors.
 * An element counts as an error only when vector1[i] is nonzero, the
 * difference relative to vector1[i] exceeds 1%, and the absolute
 * difference exceeds 1e-20. NOTE(review): the relative test divides by
 * the signed reference value, so a negative vector1[i] never triggers —
 * kept as-is to preserve behavior. Prints the first few errors and
 * returns the total through num_errors. Fortran-callable. */
void compare_two_vectors_(int* sizef,float* vector1,float* vector2,int* num_errors) {

  int n = *sizef;
  int mismatches = 0;
  int idx;

  for(idx = 0; idx < n; idx++) {
    float ref = vector1[idx];
    if(ref == 0) continue;                       /* zero reference: skip entirely */

    float diff = fabsf(ref - vector2[idx]);
    if(diff/ref > 0.01 && diff > 1e-20) {
      mismatches++;
      if(mismatches < 10) {
        printf("err[%d]: %e != %e\n",idx,vector1[idx],vector2[idx]);
      }
    }
  }
  printf("**** Error Count: %d ****\n",mismatches);
  *num_errors = mismatches;
}
+
/* Debug utility: compares two hard-coded surface-movie dump files
 * (CPU run vs GPU run) iteration by iteration, reporting relative
 * errors and per-iteration min/max for the first 100 iterations.
 * NOTE(review): paths are hard-coded to a scratch filesystem.
 * Always terminates the process via exit(1) when done (unchanged). */
void compare_surface_files_(int* bytes_per_iteration, int* number_of_iterations) {

  char* cpu_file = "/scratch/eiger/rietmann/SPECFEM3D/in_out_files/DATABASES_MPI/cpu_proc000001_surface_movie";
  char* gpu_file = "/scratch/eiger/rietmann/SPECFEM3D/in_out_files/DATABASES_MPI/cpu_v2_proc000001_surface_movie";

  FILE* fp_cpu = fopen(cpu_file,"rb");
  if(fp_cpu == NULL) {
    printf("CPU FILE ERROR:%s\n",strerror(errno));
    perror("cpu file error\n");
    exit(1);                     /* fix: used to fall through and fread(NULL) */
  }
  FILE* fp_gpu = fopen(gpu_file,"rb");
  if(fp_gpu == NULL) {
    printf("GPU FILE ERROR:%s\n",strerror(errno));
    perror("gpu file error\n");
    exit(1);                     /* fix: used to fall through and fread(NULL) */
  }

  float* gpu_vector = (float*)malloc(*bytes_per_iteration);
  float* cpu_vector = (float*)malloc(*bytes_per_iteration);
  if(gpu_vector == NULL || cpu_vector == NULL) {
    /* fix: malloc results were not checked */
    fprintf(stderr,"Out of memory in compare_surface_files_\n");
    exit(1);
  }

  int i,it,error_count=0;
  int number_of_nodes = *bytes_per_iteration/sizeof(float);
  for(it=0;it<*number_of_iterations;it++) {
    /* fix: offset computed in 'int' could overflow for large files */
    long pos = (long)(*bytes_per_iteration)*(long)it;

    fseek(fp_cpu,pos,SEEK_SET);
    fseek(fp_gpu,pos,SEEK_SET);

    /* fix: read results were ignored; stop at premature EOF / error */
    if(fread(cpu_vector,sizeof(float),number_of_nodes,fp_cpu) != (size_t)number_of_nodes) break;
    if(fread(gpu_vector,sizeof(float),number_of_nodes,fp_gpu) != (size_t)number_of_nodes) break;

    if(it<100) {
      float gpu_min_val=10;
      float gpu_max_val=0;
      float cpu_min_val=10;
      float cpu_max_val=0;
      for(i=0;i<number_of_nodes;i++) {
        /* relative error; tiny denominator offset avoids 0/0 */
        if(fabs(cpu_vector[i] - gpu_vector[i])/(fabs(cpu_vector[i])+1e-31) > 0.01) {
          if(error_count < 30) printf("ERROR[%d]: %g != %g\n",i,cpu_vector[i], gpu_vector[i]);
          if(cpu_vector[i] > 1e-30) error_count++;   /* only count non-negligible values */
        }
        if(gpu_vector[i]>gpu_max_val) gpu_max_val = gpu_vector[i];
        if(gpu_vector[i]<gpu_min_val) gpu_min_val = gpu_vector[i];
        if(cpu_vector[i]>cpu_max_val) cpu_max_val = cpu_vector[i];
        if(cpu_vector[i]<cpu_min_val) cpu_min_val = cpu_vector[i];
      }
      printf("%d Total Errors\n",error_count);
      printf("size:%d\n",number_of_nodes);
      printf("GPU:[min/max]=%e/%e\n",gpu_min_val,gpu_max_val);
      printf("CPU:[min/max]=%e/%e\n",cpu_min_val,cpu_max_val);
    }
  }
  printf("End of Surface Compare\n");
  free(cpu_vector);
  free(gpu_vector);
  fclose(fp_cpu);
  fclose(fp_gpu);
  exit(1);
}
+
+
/* Compares a float vector against the counterpart binary dump written by
 * save_fvector_ (CPU result vs GPU dump or vice versa, selected by
 * cpu_or_gpu), printing up to 30 relative-error mismatches and totals.
 * Fortran-callable. */
void compare_fvector_(float* vector, int* size, int* id, int* cpu_or_gpu) {
  FILE* fp;
  char cmp_filename[BUFSIZ];
  float* compare_vector = (float*)malloc(*size*sizeof(float));
  if(compare_vector == NULL) {
    /* fix: malloc result was not checked */
    fprintf(stderr,"Out of memory in compare_fvector_\n");
    exit(1);
  }
  if(*cpu_or_gpu == 0) { /* swap gpu/cpu for compare */
    sprintf(cmp_filename, "debug_output_gpu_%d.dat",*id);
  }
  else {
    sprintf(cmp_filename, "debug_output_cpu_%d.dat",*id);
  }
  /* fix: a stray extra fopen() here used to leak a FILE handle */
  if((fp=fopen(cmp_filename, "rb"))==NULL) {
    printf("Cannot open comparison file %s.\n",cmp_filename);
    free(compare_vector);
    exit(1);
  }
  /* read the values */
  if(fread(compare_vector, sizeof(float), *size, fp) != (size_t)*size) {
    if(feof(fp))
       printf("Premature end of file.");
    else
       printf("File read error.");
  }

  fclose(fp);

  int i;
  int error_count=0;
  for(i=0;i<*size;i++) {
    /* NOTE(review): divides by the signed, possibly-zero reference value;
       kept to preserve the established comparison semantics */
    if((fabs(vector[i] - compare_vector[i])/vector[i] > 0.0001)) {
      if(error_count < 30) {
        printf("ERROR[%d]: %g != %g\n",i,compare_vector[i], vector[i]);
      }
      error_count++;
    }
  }
  printf("%d Total Errors\n",error_count);
  printf("size:%d\n",*size);
  free(compare_vector);   /* fix: buffer was leaked */
}
+
/* Compares an int vector against the counterpart binary dump written by
 * save_ivector_ (selected by cpu_or_gpu), printing the first 30
 * mismatches and the total count. Fortran-callable. */
void compare_ivector_(int* vector, int* size, int* id, int* cpu_or_gpu) {
  FILE* fp;
  char cmp_filename[BUFSIZ];
  int* compare_vector = (int*)malloc(*size*sizeof(int));
  if(compare_vector == NULL) {
    /* fix: malloc result was not checked */
    fprintf(stderr,"Out of memory in compare_ivector_\n");
    exit(1);
  }
  if(*cpu_or_gpu == 0) { /* swap gpu/cpu for compare */
    sprintf(cmp_filename, "debug_output_gpu_%d.dat",*id);
  }
  else {
    sprintf(cmp_filename, "debug_output_cpu_%d.dat",*id);
  }
  /* fix: a stray extra fopen() here used to leak a FILE handle */
  if((fp=fopen(cmp_filename, "rb"))==NULL) {
    printf("Cannot open comparison file %s.\n",cmp_filename);
    free(compare_vector);
    exit(1);
  }
  /* read the values */
  if(fread(compare_vector, sizeof(int), *size, fp) != (size_t)*size) {
    if(feof(fp))
       printf("Premature end of file.");
    else
       printf("File read error.");
  }

  fclose(fp);

  int i;
  int error_count=0;
  for(i=0;i<*size;i++) {
    /* fix: was integer division (truncated to 0 for almost all inputs and
       could divide by zero); use floating-point relative error instead */
    double ref  = (double)vector[i];
    double diff = fabs((double)vector[i] - (double)compare_vector[i]);
    int differs = (ref != 0.0) ? (diff/fabs(ref) > 0.01) : (diff != 0.0);
    if(differs) {
      if(error_count < 30) {
        /* fix: %g with int arguments was undefined behavior */
        printf("ERROR[%d]: %d != %d\n",i,compare_vector[i], vector[i]);
      }
      error_count++;   /* fix: mismatches past the first 30 are now counted too */
    }
  }
  printf("%d Total Errors\n",error_count);
  free(compare_vector);   /* fix: buffer was leaked */
}

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,896 @@
+/* 
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  2 . 0
+!               ---------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Princeton University, USA and University of Pau / CNRS / INRIA
+! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+!                            April 2011
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "config.h"
+
+typedef float realw;
+
+ 
+
+//
+// src/cuda/assemble_MPI_scalar_cuda.cu
+//
+
/* No-op Fortran-callable stubs for the CUDA MPI-assembly and field-check
 * routines; they let a build configured without GPU/CUDA support link
 * cleanly. FC_FUNC_ (from config.h) applies Fortran name mangling. */
void FC_FUNC_(transfer_boun_pot_from_device,
              TRANSFER_BOUN_POT_FROM_DEVICE)(long* Mesh_pointer_f,
                                             realw* send_potential_dot_dot_buffer,
                                             int* FORWARD_OR_ADJOINT){}

void FC_FUNC_(transfer_asmbl_pot_to_device,
              TRANSFER_ASMBL_POT_TO_DEVICE)(long* Mesh_pointer,
                                            realw* buffer_recv_scalar,
                                            int* FORWARD_OR_ADJOINT) {}


//
// src/cuda/assemble_MPI_vector_cuda.cu
//

void FC_FUNC_(transfer_boun_accel_from_device,
              TRANSFER_BOUN_ACCEL_FROM_DEVICE)(long* Mesh_pointer_f,
                                                  realw* send_accel_buffer,
                                                  int* IREGION,
                                                  int* FORWARD_OR_ADJOINT){}

void FC_FUNC_(transfer_asmbl_accel_to_device,
              TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer,
                                              realw* buffer_recv_vector,
                                              int* IREGION,
                                              int* FORWARD_OR_ADJOINT) {}


//
// src/cuda/check_fields_cuda.cu
//

void FC_FUNC_(pause_for_debug,
              PAUSE_FOR_DEBUG)() {}

void FC_FUNC_(output_free_device_memory,
              OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {}

/* NOTE(review): mixed-case second macro argument ("get_FREE_...") kept
 * as-is — it is the linker-visible name; confirm against the Fortran side. */
void FC_FUNC_(get_free_device_memory,
              get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {}

void FC_FUNC_(check_max_norm_displ_gpu,
              CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {}

void FC_FUNC_(check_max_norm_vector,
              CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {}

void FC_FUNC_(check_max_norm_displ,
              CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {}

void FC_FUNC_(check_max_norm_b_displ_gpu,
              CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {}

void FC_FUNC_(check_max_norm_b_accel_gpu,
              CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {}

void FC_FUNC_(check_max_norm_b_veloc_gpu,
              CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {}

void FC_FUNC_(check_max_norm_b_displ,
              CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {}

void FC_FUNC_(check_max_norm_b_accel,
              CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {}

void FC_FUNC_(check_error_vectors,
              CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {}

void FC_FUNC_(get_max_accel,
              GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {}

void FC_FUNC_(get_norm_acoustic_from_device,
              GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
                                                  long* Mesh_pointer_f,
                                                  int* SIMULATION_TYPE) {}

void FC_FUNC_(get_norm_elastic_from_device,
              GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
                                                 long* Mesh_pointer_f,
                                                 int* SIMULATION_TYPE) {}
+
+
+//
+// src/cuda/compute_add_sources_acoustic_cuda.cu
+//
+
/* No-op Fortran-callable stubs for the CUDA source-injection, coupling,
 * and compute-forces routines; present so non-CUDA builds link cleanly. */
void FC_FUNC_(compute_add_sources_ac_cuda,
              COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f,
                                                 int* phase_is_innerf,
                                                 int* NSOURCESf,
                                                 int* SIMULATION_TYPEf,
                                                 double* h_stf_pre_compute,
                                                 int* myrankf) {}

/* NOTE(review): lowercase "s3" in the second macro argument kept as-is —
 * it is the linker-visible name; confirm against the Fortran side. */
void FC_FUNC_(compute_add_sources_ac_s3_cuda,
              COMPUTE_ADD_SOURCES_AC_s3_CUDA)(long* Mesh_pointer_f,
                                                      int* phase_is_innerf,
                                                      int* NSOURCESf,
                                                      int* SIMULATION_TYPEf,
                                                      double* h_stf_pre_compute,
                                                      int* myrankf) {}

void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
              ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer,
                                               realw* h_adj_sourcearrays,
                                               int* phase_is_inner,
                                               int* h_ispec_is_inner,
                                               int* h_ispec_is_acoustic,
                                               int* h_ispec_selected_rec,
                                               int* myrank,
                                               int* nrec,
                                               int* time_index,
                                               int* h_islice_selected_rec,
                                               int* nadj_rec_local,
                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {}


//
// src/cuda/compute_add_sources_elastic_cuda.cu
//

void FC_FUNC_(compute_add_sources_el_cuda,
              COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f,
                                            int* phase_is_innerf,
                                            int* NSOURCESf,
                                            double* h_stf_pre_compute,
                                            int* myrankf) {}

void FC_FUNC_(compute_add_sources_el_s3_cuda,
              COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer,
                                              double* h_stf_pre_compute,
                                              int* NSOURCESf,
                                              int* phase_is_inner,
                                              int* myrank) {}

void FC_FUNC_(add_source_master_rec_noise_cu,
              ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer_f,
                                                int* myrank_f,
                                                int* it_f,
                                                int* irec_master_noise_f,
                                                int* islice_selected_rec) {}

void FC_FUNC_(add_sources_el_sim_type_2_or_3,
              ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
                                               realw* h_adj_sourcearrays,
                                               int* phase_is_inner,
                                               int* h_ispec_is_inner,
                                               int* h_ispec_is_elastic,
                                               int* h_ispec_selected_rec,
                                               int* myrank,
                                               int* nrec,
                                               int* time_index,
                                               int* h_islice_selected_rec,
                                               int* nadj_rec_local,
                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {}


//
// src/cuda/compute_coupling_cuda.cu
//

void FC_FUNC_(compute_coupling_ac_el_cuda,
              COMPUTE_COUPLING_AC_EL_CUDA)(
                                            long* Mesh_pointer_f,
                                            int* phase_is_innerf,
                                            int* num_coupling_ac_el_facesf,
                                            int* SIMULATION_TYPEf) {}

void FC_FUNC_(compute_coupling_el_ac_cuda,
              COMPUTE_COUPLING_EL_AC_CUDA)(
                                                 long* Mesh_pointer_f,
                                                 int* phase_is_innerf,
                                                 int* num_coupling_ac_el_facesf,
                                                 int* SIMULATION_TYPEf) {}

void FC_FUNC_(compute_coupling_ocean_cuda,
              COMPUTE_COUPLING_OCEAN_CUDA)(long* Mesh_pointer_f,
                                       int* SIMULATION_TYPE) {}


//
// src/cuda/compute_forces_crust_mantle_cuda.cu
//

void FC_FUNC_(compute_forces_crust_mantle_cuda,
              COMPUTE_FORCES_CRUST_MANTLE_CUDA)(long* Mesh_pointer_f,
                                                int* iphase) {}


//
// src/cuda/compute_forces_inner_core_cuda.cu
//

void FC_FUNC_(compute_forces_inner_core_cuda,
              COMPUTE_FORCES_INNER_CORE_CUDA)(long* Mesh_pointer_f,
                                                int* iphase) {}


//
// src/cuda/compute_forces_outer_core_cuda.cu
//

void FC_FUNC_(compute_forces_outer_core_cuda,
              COMPUTE_FORCES_OUTER_CORE_CUDA)(long* Mesh_pointer_f,
                                            int* iphase,
                                            realw* time_f,
                                            realw* b_time_f) {}
+
+
+//
+// src/cuda/compute_kernels_cuda.cu
+//
+
+void FC_FUNC_(compute_kernels_elastic_cuda,
+              COMPUTE_KERNELS_ELASTIC_CUDA)(long* Mesh_pointer,
+                                            realw* deltat_f) {} 
+
+void FC_FUNC_(compute_kernels_strgth_noise_cu,
+              COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer,
+                                                    realw* h_noise_surface_movie,
+                                                    realw* deltat) {} 
+
+void FC_FUNC_(compute_kernels_acoustic_cuda,
+              COMPUTE_KERNELS_ACOUSTIC_CUDA)(
+                                             long* Mesh_pointer,
+                                             realw* deltat_f) {} 
+
+void FC_FUNC_(compute_kernels_hess_cuda,
+              COMPUTE_KERNELS_HESS_CUDA)(long* Mesh_pointer,
+                                         realw* deltat_f,
+                                         int* ELASTIC_SIMULATION,
+                                         int* ACOUSTIC_SIMULATION) {} 
+
+
+//
+// src/cuda/compute_stacey_acoustic_cuda.cu
+//
+
+void FC_FUNC_(compute_stacey_acoustic_cuda,
+              COMPUTE_STACEY_ACOUSTIC_CUDA)(
+                                    long* Mesh_pointer_f,
+                                    int* phase_is_innerf,
+                                    int* SIMULATION_TYPEf,
+                                    int* SAVE_FORWARDf,
+                                    realw* h_b_absorb_potential) {} 
+
+
+//
+// src/cuda/compute_stacey_elastic_cuda.cu
+//
+
+void FC_FUNC_(compute_stacey_elastic_cuda,
+              COMPUTE_STACEY_ELASTIC_CUDA)(long* Mesh_pointer_f,
+                                           int* phase_is_innerf,
+                                           int* SIMULATION_TYPEf,
+                                           int* SAVE_FORWARDf,
+                                           realw* h_b_absorb_field) {} 
+
+
+//
+// src/cuda/it_update_displacement_cuda.cu
+//
+
+void FC_FUNC_(it_update_displacement_cuda,
+              IT_UPDATE_DISPLACMENT_CUDA)(long* Mesh_pointer_f,
+                                                 int* size_F,
+                                                 realw* deltat_F,
+                                                 realw* deltatsqover2_F,
+                                                 realw* deltatover2_F,
+                                                 int* SIMULATION_TYPE,
+                                                 realw* b_deltat_F,
+                                                 realw* b_deltatsqover2_F,
+                                                 realw* b_deltatover2_F) {} 
+
+void FC_FUNC_(it_update_displacement_ac_cuda,
+              it_update_displacement_ac_cuda)(long* Mesh_pointer_f,
+                                               int* size_F,
+                                               realw* deltat_F,
+                                               realw* deltatsqover2_F,
+                                               realw* deltatover2_F,
+                                               int* SIMULATION_TYPE,
+                                               realw* b_deltat_F,
+                                               realw* b_deltatsqover2_F,
+                                               realw* b_deltatover2_F) {} 
+
+// Dummy no-op stubs for the Newmark "kernel 3" update routines (non-CUDA build);
+// argument lists must mirror the Fortran call sites exactly.
+void FC_FUNC_(kernel_3_a_cuda,
+              KERNEL_3_A_CUDA)(long* Mesh_pointer,
+                               realw* deltatover2_F,
+                               int* SIMULATION_TYPE_f,
+                               realw* b_deltatover2_F,
+                               int* OCEANS) {} 
+
+void FC_FUNC_(kernel_3_b_cuda,
+              KERNEL_3_B_CUDA)(long* Mesh_pointer,
+                               realw* deltatover2_F,
+                               int* SIMULATION_TYPE_f,
+                               realw* b_deltatover2_F,
+                               int* OCEANS) {} 
+
+void FC_FUNC_(kernel_3_outer_core_cuda,
+              KERNEL_3_OUTER_CORE_CUDA)(long* Mesh_pointer,
+                                        realw* deltatover2_F,
+                                        int* SIMULATION_TYPE_f,
+                                        realw* b_deltatover2_F) {} 
+
+
+//
+// src/cuda/noise_tomography_cuda.cu
+//
+
+// Dummy no-op stubs for the noise-tomography helpers (non-CUDA build).
+void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){} 
+
+void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id) {} 
+
+void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) {} 
+
+void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val) {} 
+
+void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) {} 
+
+void FC_FUNC_(transfer_surface_to_host,
+              TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer_f,
+                                        realw* h_noise_surface_movie) {} 
+
+void FC_FUNC_(noise_read_add_surface_movie_cu,
+              NOISE_READ_ADD_SURFACE_MOVIE_CU)(long* Mesh_pointer_f,
+                                               realw* h_noise_surface_movie,
+                                               int* NOISE_TOMOGRAPHYf) {} 
+
+
+//
+// src/cuda/prepare_mesh_constants_cuda.cu
+//
+
+// The one stub with real behavior: if the Fortran side requests GPU_MODE but the
+// binary was built without CUDA, print an error and abort instead of silently
+// running with no-op GPU routines.
+void FC_FUNC_(prepare_cuda_device,
+              PREPARE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) { 
+ fprintf(stderr,"ERROR: GPU_MODE enabled without GPU/CUDA Support. To enable GPU support, reconfigure with --with-cuda flag.\n");
+ exit(1);
+} 
+
+// Dummy no-op stub (non-CUDA build); signature mirrors the Fortran call site.
+void FC_FUNC_(prepare_constants_device,
+              PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
+                                        int* h_NGLLX,
+                                        realw* h_hprime_xx,realw* h_hprime_yy,realw* h_hprime_zz,
+                                        realw* h_hprimewgll_xx,realw* h_hprimewgll_yy,realw* h_hprimewgll_zz,
+                                        realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz,
+                                        int* NSOURCES,int* nsources_local,
+                                        realw* h_sourcearrays,
+                                        int* h_islice_selected_source,
+                                        int* h_ispec_selected_source,
+                                        int* h_number_receiver_global,
+                                        int* h_ispec_selected_rec,
+                                        int* nrec,int* nrec_local,
+                                        int* NSPEC_CRUST_MANTLE, int* NGLOB_CRUST_MANTLE,
+                                        int* NSPEC_OUTER_CORE, int* NGLOB_OUTER_CORE,
+                                        int* NSPEC_INNER_CORE, int* NGLOB_INNER_CORE,
+                                        int* SIMULATION_TYPE,
+                                        int* SAVE_FORWARD_f,
+                                        int* ABSORBING_CONDITIONS_f,
+                                        int* GRAVITY_f,
+                                        int* ROTATION_f,
+                                        int* ATTENUATION_f,
+                                        int* USE_ATTENUATION_MIMIC_f,
+                                        int* COMPUTE_AND_STORE_STRAIN_f,
+                                        int* ANISOTROPIC_3D_MANTLE_f,
+                                        int* ANISOTROPIC_INNER_CORE_f,
+                                        int* SAVE_BOUNDARY_MESH_f,
+                                        int* USE_MESH_COLORING_GPU_f) {} 
+
+// Dummy no-op stub (non-CUDA build); signature mirrors the Fortran call site.
+void FC_FUNC_(prepare_fields_rotation_device,
+              PREPARE_FIELDS_ROTATION_DEVICE)(long* Mesh_pointer_f,
+                                              realw* two_omega_earth,
+                                              realw* deltat,
+                                              realw* A_array_rotation,
+                                              realw* B_array_rotation,
+                                              realw* b_two_omega_earth,
+                                              realw* b_deltat,
+                                              realw* b_A_array_rotation,
+                                              realw* b_B_array_rotation,
+                                              int* NSPEC_OUTER_CORE_ROTATION
+                                              ) {} 
+
+// Dummy no-op stub for the non-CUDA build.
+// NOTE(review): FC_FUNC_'s second argument must be fully upper-case (it is the
+// linker symbol under upper-case Fortran name mangling); the original read
+// PREPARE_FIELDS_gravity_DEVICE. Keep the matching CUDA implementation's
+// FC_FUNC_ pair in sync — TODO confirm.
+void FC_FUNC_(prepare_fields_gravity_device,
+              PREPARE_FIELDS_GRAVITY_DEVICE)(long* Mesh_pointer_f,
+                                             realw* d_ln_density_dr_table,
+                                             realw* minus_rho_g_over_kappa_fluid,
+                                             realw* minus_gravity_table,
+                                             realw* minus_deriv_gravity_table,
+                                             realw* density_table,
+                                             realw* h_wgll_cube,
+                                             int* NRAD_GRAVITY
+                                             ) {} 
+
+// Dummy no-op stubs (non-CUDA build); argument lists must mirror the Fortran
+// call sites exactly.
+void FC_FUNC_(prepare_fields_attenuat_device,
+              PREPARE_FIELDS_ATTENUAT_DEVICE)(long* Mesh_pointer_f,
+                                                 realw* R_xx_crust_mantle,
+                                                 realw* R_yy_crust_mantle,
+                                                 realw* R_xy_crust_mantle,
+                                                 realw* R_xz_crust_mantle,
+                                                 realw* R_yz_crust_mantle,
+                                                 realw* factor_common_crust_mantle,
+                                                 realw* one_minus_sum_beta_crust_mantle,
+                                                 realw* R_xx_inner_core,
+                                                 realw* R_yy_inner_core,
+                                                 realw* R_xy_inner_core,
+                                                 realw* R_xz_inner_core,
+                                                 realw* R_yz_inner_core,
+                                                 realw* factor_common_inner_core,
+                                                 realw* one_minus_sum_beta_inner_core,
+                                                 realw* alphaval,realw* betaval,realw* gammaval,
+                                                 realw* b_alphaval,realw* b_betaval,realw* b_gammaval
+                                                 ) {} 
+
+void FC_FUNC_(prepare_fields_strain_device,
+              PREPARE_FIELDS_STRAIN_DEVICE)(long* Mesh_pointer_f,
+                                            realw* epsilondev_xx_crust_mantle,
+                                            realw* epsilondev_yy_crust_mantle,
+                                            realw* epsilondev_xy_crust_mantle,
+                                            realw* epsilondev_xz_crust_mantle,
+                                            realw* epsilondev_yz_crust_mantle,
+                                            realw* b_epsilondev_xx_crust_mantle,
+                                            realw* b_epsilondev_yy_crust_mantle,
+                                            realw* b_epsilondev_xy_crust_mantle,
+                                            realw* b_epsilondev_xz_crust_mantle,
+                                            realw* b_epsilondev_yz_crust_mantle,
+                                            realw* eps_trace_over_3_crust_mantle,
+                                            realw* b_eps_trace_over_3_crust_mantle,
+                                            realw* epsilondev_xx_inner_core,
+                                            realw* epsilondev_yy_inner_core,
+                                            realw* epsilondev_xy_inner_core,
+                                            realw* epsilondev_xz_inner_core,
+                                            realw* epsilondev_yz_inner_core,
+                                            realw* b_epsilondev_xx_inner_core,
+                                            realw* b_epsilondev_yy_inner_core,
+                                            realw* b_epsilondev_xy_inner_core,
+                                            realw* b_epsilondev_xz_inner_core,
+                                            realw* b_epsilondev_yz_inner_core,
+                                            realw* eps_trace_over_3_inner_core,
+                                            realw* b_eps_trace_over_3_inner_core
+                                            ) {} 
+
+void FC_FUNC_(prepare_mpi_buffers_device,
+              PREPARE_MPI_BUFFERS_DEVICE)(long* Mesh_pointer_f,
+                                          int* num_interfaces_crust_mantle,
+                                          int* max_nibool_interfaces_crust_mantle,
+                                          int* nibool_interfaces_crust_mantle,
+                                          int* ibool_interfaces_crust_mantle,
+                                          int* num_interfaces_inner_core,
+                                          int* max_nibool_interfaces_inner_core,
+                                          int* nibool_interfaces_inner_core,
+                                          int* ibool_interfaces_inner_core,
+                                          int* num_interfaces_outer_core,
+                                          int* max_nibool_interfaces_outer_core,
+                                          int* nibool_interfaces_outer_core,
+                                          int* ibool_interfaces_outer_core
+                                          ){} 
+
+// Dummy no-op stubs for the per-region mesh-setup routines (non-CUDA build).
+void FC_FUNC_(prepare_crust_mantle_device,
+              PREPARE_CRUST_MANTLE_DEVICE)(long* Mesh_pointer_f,
+                                        realw* h_xix, realw* h_xiy, realw* h_xiz,
+                                        realw* h_etax, realw* h_etay, realw* h_etaz,
+                                        realw* h_gammax, realw* h_gammay, realw* h_gammaz,
+                                        realw* h_rho,
+                                        realw* h_kappav, realw* h_muv,
+                                        realw* h_kappah, realw* h_muh,
+                                        realw* h_eta_aniso,
+                                        realw* h_rmass,
+                                        int* h_ibool,
+                                        realw* h_xstore, realw* h_ystore, realw* h_zstore,
+                                        int* h_ispec_is_tiso,
+                                        realw *c11store,realw *c12store,realw *c13store,
+                                        realw *c14store,realw *c15store,realw *c16store,
+                                        realw *c22store,realw *c23store,realw *c24store,
+                                        realw *c25store,realw *c26store,realw *c33store,
+                                        realw *c34store,realw *c35store,realw *c36store,
+                                        realw *c44store,realw *c45store,realw *c46store,
+                                        realw *c55store,realw *c56store,realw *c66store,
+                                        int* num_phase_ispec,
+                                        int* phase_ispec_inner,
+                                        int* nspec_outer,
+                                        int* nspec_inner
+                                        ) {} 
+
+void FC_FUNC_(prepare_outer_core_device,
+              PREPARE_OUTER_CORE_DEVICE)(long* Mesh_pointer_f,
+                                         realw* h_xix, realw* h_xiy, realw* h_xiz,
+                                         realw* h_etax, realw* h_etay, realw* h_etaz,
+                                         realw* h_gammax, realw* h_gammay, realw* h_gammaz,
+                                         realw* h_rho, realw* h_kappav,
+                                         realw* h_rmass,
+                                         int* h_ibool,
+                                         realw* h_xstore, realw* h_ystore, realw* h_zstore,
+                                         int* num_phase_ispec,
+                                         int* phase_ispec_inner,
+                                         int* nspec_outer,
+                                         int* nspec_inner
+                                         ) {} 
+
+void FC_FUNC_(prepare_inner_core_device,
+              PREPARE_INNER_CORE_DEVICE)(long* Mesh_pointer_f,
+                                           realw* h_xix, realw* h_xiy, realw* h_xiz,
+                                           realw* h_etax, realw* h_etay, realw* h_etaz,
+                                           realw* h_gammax, realw* h_gammay, realw* h_gammaz,
+                                           realw* h_rho, realw* h_kappav, realw* h_muv,
+                                           realw* h_rmass,
+                                           int* h_ibool,
+                                           realw* h_xstore, realw* h_ystore, realw* h_zstore,
+                                           realw *c11store,realw *c12store,realw *c13store,
+                                           realw *c33store,realw *c44store,
+                                           int* h_idoubling_inner_core,
+                                           int* num_phase_ispec,
+                                           int* phase_ispec_inner,
+                                           int* nspec_outer,
+                                           int* nspec_inner
+                                           //int* iboolleft_xi, int* iboolright_xi,
+                                           //int* iboolleft_eta, int* iboolright_eta,
+                                           //int* npoin2D_xi, int* npoin2D_eta
+                                           ) {} 
+
+// Dummy no-op stubs for the remaining preparation routines (non-CUDA build);
+// argument lists must mirror the Fortran call sites exactly.
+void FC_FUNC_(prepare_fields_acoustic_device,
+              PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f,
+                                              realw* rmass_acoustic,
+                                              realw* rhostore,
+                                              realw* kappastore,
+                                              int* num_phase_ispec_acoustic,
+                                              int* phase_ispec_inner_acoustic,
+                                              int* ispec_is_acoustic,
+                                              int* NOISE_TOMOGRAPHY,
+                                              int* num_free_surface_faces,
+                                              int* free_surface_ispec,
+                                              int* free_surface_ijk,
+                                              int* ABSORBING_CONDITIONS,
+                                              int* b_reclen_potential,
+                                              realw* b_absorb_potential,
+                                              int* ELASTIC_SIMULATION,
+                                              int* num_coupling_ac_el_faces,
+                                              int* coupling_ac_el_ispec,
+                                              int* coupling_ac_el_ijk,
+                                              realw* coupling_ac_el_normal,
+                                              realw* coupling_ac_el_jacobian2Dw,
+                                              int* num_colors_outer_acoustic,
+                                              int* num_colors_inner_acoustic,
+                                              int* num_elem_colors_acoustic) {} 
+
+void FC_FUNC_(prepare_fields_acoustic_adj_dev,
+              PREPARE_FIELDS_ACOUSTIC_ADJ_DEV)(long* Mesh_pointer_f,
+                                              int* SIMULATION_TYPE,
+                                              int* APPROXIMATE_HESS_KL) {} 
+
+void FC_FUNC_(prepare_fields_elastic_device,
+              PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer_f,
+                                             int* size,
+                                             realw* rmass,
+                                             realw* rho_vp,
+                                             realw* rho_vs,
+                                             int* num_phase_ispec_elastic,
+                                             int* phase_ispec_inner_elastic,
+                                             int* ispec_is_elastic,
+                                             int* ABSORBING_CONDITIONS,
+                                             realw* h_b_absorb_field,
+                                             int* h_b_reclen_field,
+                                             int* SIMULATION_TYPE,int* SAVE_FORWARD,
+                                             int* COMPUTE_AND_STORE_STRAIN,
+                                             realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+                                             realw* epsilondev_xz,realw* epsilondev_yz,
+                                             int* ATTENUATION,
+                                             int* R_size,
+                                             realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
+                                             realw* one_minus_sum_beta,realw* factor_common,
+                                             realw* alphaval,realw* betaval,realw* gammaval,
+                                             int* OCEANS,
+                                             realw* rmass_ocean_load,
+                                             int* NOISE_TOMOGRAPHY,
+                                             realw* free_surface_normal,
+                                             int* free_surface_ispec,
+                                             int* free_surface_ijk,
+                                             int* num_free_surface_faces,
+                                             int* ACOUSTIC_SIMULATION,
+                                             int* num_colors_outer_elastic,
+                                             int* num_colors_inner_elastic,
+                                             int* num_elem_colors_elastic,
+                                             int* ANISOTROPY,
+                                             realw *c11store,
+                                             realw *c12store,
+                                             realw *c13store,
+                                             realw *c14store,
+                                             realw *c15store,
+                                             realw *c16store,
+                                             realw *c22store,
+                                             realw *c23store,
+                                             realw *c24store,
+                                             realw *c25store,
+                                             realw *c26store,
+                                             realw *c33store,
+                                             realw *c34store,
+                                             realw *c35store,
+                                             realw *c36store,
+                                             realw *c44store,
+                                             realw *c45store,
+                                             realw *c46store,
+                                             realw *c55store,
+                                             realw *c56store,
+                                             realw *c66store){} 
+
+void FC_FUNC_(prepare_fields_elastic_adj_dev,
+              PREPARE_FIELDS_ELASTIC_ADJ_DEV)(long* Mesh_pointer_f,
+                                             int* size,
+                                             int* SIMULATION_TYPE,
+                                             int* COMPUTE_AND_STORE_STRAIN,
+                                             realw* epsilon_trace_over_3,
+                                             realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
+                                             realw* b_epsilondev_xz,realw* b_epsilondev_yz,
+                                             realw* b_epsilon_trace_over_3,
+                                             int* ATTENUATION,
+                                             int* R_size,
+                                             realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
+                                             realw* b_alphaval,realw* b_betaval,realw* b_gammaval,
+                                             int* APPROXIMATE_HESS_KL){} 
+
+void FC_FUNC_(prepare_sim2_or_3_const_device,
+              PREPARE_SIM2_OR_3_CONST_DEVICE)(
+                                              long* Mesh_pointer_f,
+                                              int* islice_selected_rec,
+                                              int* islice_selected_rec_size,
+                                              int* nadj_rec_local,
+                                              int* nrec,
+                                              int* myrank) {} 
+
+void FC_FUNC_(prepare_fields_noise_device,
+              PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f,
+                                           int* NSPEC_AB, int* NGLOB_AB,
+                                           int* free_surface_ispec,
+                                           int* free_surface_ijk,
+                                           int* num_free_surface_faces,
+                                           int* SIMULATION_TYPE,
+                                           int* NOISE_TOMOGRAPHY,
+                                           int* NSTEP,
+                                           realw* noise_sourcearray,
+                                           realw* normal_x_noise,
+                                           realw* normal_y_noise,
+                                           realw* normal_z_noise,
+                                           realw* mask_noise,
+                                           realw* free_surface_jacobian2Dw) {} 
+
+void FC_FUNC_(prepare_cleanup_device,
+              PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f) {} 
+
+
+//
+// src/cuda/transfer_fields_cuda.cu
+//
+
+// Dummy no-op stubs for the host<->device field-transfer routines (non-CUDA build).
+void FC_FUNC_(transfer_fields_cm_to_device,
+              TRANSFER_FIELDS_CM_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
+
+void FC_FUNC_(transfer_fields_ic_to_device,
+              TRANSFER_FIELDS_IC_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
+
+void FC_FUNC_(transfer_fields_oc_to_device,
+              TRANSFER_FIELDS_OC_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
+
+// Dummy no-op stub for the non-CUDA build.
+// NOTE(review): FC_FUNC_'s second argument must be the upper-cased spelling of the
+// first; the original had TRANSFER_FIELDS_B_CM_TO_DEVICE (word order swapped),
+// which breaks linking under upper-case Fortran name mangling. Keep the matching
+// CUDA implementation's FC_FUNC_ pair in sync — TODO confirm.
+void FC_FUNC_(transfer_b_fields_cm_to_device,
+              TRANSFER_B_FIELDS_CM_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
+                                              long* Mesh_pointer_f) {} 
+
+// Dummy no-op stub for the non-CUDA build.
+// NOTE(review): upper-case FC_FUNC_ name fixed from TRANSFER_FIELDS_B_IC_TO_DEVICE
+// (word order swapped) to match the lower-case name; required for linking under
+// upper-case Fortran mangling. Sync with the CUDA implementation — TODO confirm.
+void FC_FUNC_(transfer_b_fields_ic_to_device,
+              TRANSFER_B_FIELDS_IC_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
+                                              long* Mesh_pointer_f) {} 
+
+// Dummy no-op stub for the non-CUDA build.
+// NOTE(review): upper-case FC_FUNC_ name fixed from TRANSFER_FIELDS_B_OC_TO_DEVICE
+// (word order swapped) to match the lower-case name; required for linking under
+// upper-case Fortran mangling. Sync with the CUDA implementation — TODO confirm.
+void FC_FUNC_(transfer_b_fields_oc_to_device,
+              TRANSFER_B_FIELDS_OC_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
+                                              long* Mesh_pointer_f) {} 
+
+// Dummy no-op stubs for the device-to-host field transfers (non-CUDA build).
+void FC_FUNC_(transfer_fields_cm_from_device,
+              TRANSFER_FIELDS_CM_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
+
+void FC_FUNC_(transfer_fields_ic_from_device,
+              TRANSFER_FIELDS_IC_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
+
+void FC_FUNC_(transfer_fields_oc_from_device,
+              TRANSFER_FIELDS_OC_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
+
+void FC_FUNC_(transfer_b_fields_cm_from_device,
+              TRANSFER_B_FIELDS_CM_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
+                                                long* Mesh_pointer_f) {} 
+
+void FC_FUNC_(transfer_b_fields_ic_from_device,
+              TRANSFER_B_FIELDS_IC_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
+                                                long* Mesh_pointer_f) {} 
+
+void FC_FUNC_(transfer_b_fields_oc_from_device,
+              TRANSFER_B_FIELDS_OC_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
+                                                long* Mesh_pointer_f) {} 
+
+// Dummy no-op stub for the non-CUDA build.
+// NOTE(review): upper-case FC_FUNC_ name fixed from misspelled
+// TRNASFER_ACCEL_CM_TO_DEVICE; required for linking under upper-case Fortran
+// mangling. Sync with the CUDA implementation — TODO confirm.
+void FC_FUNC_(transfer_accel_cm_to_device,
+              TRANSFER_ACCEL_CM_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {} 
+
+// Dummy no-op stub (non-CUDA build).
+void FC_FUNC_(transfer_accel_cm_from_device,
+              TRANSFER_ACCEL_CM_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {} 
+
+// Dummy no-op stub for the non-CUDA build.
+// NOTE(review): upper-case FC_FUNC_ name fixed from misspelled
+// TRNASFER_B_ACCEL_CM_FROM_DEVICE; required for linking under upper-case Fortran
+// mangling. Sync with the CUDA implementation — TODO confirm.
+void FC_FUNC_(transfer_b_accel_cm_from_device,
+              TRANSFER_B_ACCEL_CM_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {} 
+
+// Dummy no-op stubs for the remaining transfer routines (non-CUDA build);
+// argument lists must mirror the Fortran call sites exactly.
+void FC_FUNC_(transfer_sigma_from_device,
+              TRANSFER_SIGMA_FROM_DEVICE)(int* size, realw* sigma_kl,long* Mesh_pointer_f) {} 
+
+void FC_FUNC_(transfer_b_displ_from_device,
+              TRANSFER_B_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {} 
+
+void FC_FUNC_(transfer_displ_from_device,
+              TRANSFER_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {} 
+
+void FC_FUNC_(transfer_compute_kernel_answers_from_device,
+              TRANSFER_COMPUTE_KERNEL_ANSWERS_FROM_DEVICE)(long* Mesh_pointer,
+                                                           realw* rho_kl,int* size_rho,
+                                                           realw* mu_kl, int* size_mu,
+                                                           realw* kappa_kl, int* size_kappa) {} 
+
+void FC_FUNC_(transfer_compute_kernel_fields_from_device,
+              TRANSFER_COMPUTE_KERNEL_FIELDS_FROM_DEVICE)(long* Mesh_pointer,
+                                                          realw* accel, int* size_accel,
+                                                          realw* b_displ, int* size_b_displ,
+                                                          realw* epsilondev_xx,
+                                                          realw* epsilondev_yy,
+                                                          realw* epsilondev_xy,
+                                                          realw* epsilondev_xz,
+                                                          realw* epsilondev_yz,
+                                                          int* size_epsilondev,
+                                                          realw* b_epsilondev_xx,
+                                                          realw* b_epsilondev_yy,
+                                                          realw* b_epsilondev_xy,
+                                                          realw* b_epsilondev_xz,
+                                                          realw* b_epsilondev_yz,
+                                                          int* size_b_epsilondev,
+                                                          realw* rho_kl,int* size_rho,
+                                                          realw* mu_kl, int* size_mu,
+                                                          realw* kappa_kl, int* size_kappa,
+                                                          realw* epsilon_trace_over_3,
+                                                          realw* b_epsilon_trace_over_3,
+                                                          int* size_epsilon_trace_over_3) {} 
+
+void FC_FUNC_(transfer_b_fields_att_to_device,
+              TRANSFER_B_FIELDS_ATT_TO_DEVICE)(long* Mesh_pointer,
+                                             realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
+                                             int* size_R,
+                                             realw* b_epsilondev_xx,
+                                             realw* b_epsilondev_yy,
+                                             realw* b_epsilondev_xy,
+                                             realw* b_epsilondev_xz,
+                                             realw* b_epsilondev_yz,
+                                             int* size_epsilondev) {} 
+
+void FC_FUNC_(transfer_fields_att_from_device,
+              TRANSFER_FIELDS_ATT_FROM_DEVICE)(long* Mesh_pointer,
+                                               realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
+                                               int* size_R,
+                                               realw* epsilondev_xx,
+                                               realw* epsilondev_yy,
+                                               realw* epsilondev_xy,
+                                               realw* epsilondev_xz,
+                                               realw* epsilondev_yz,
+                                               int* size_epsilondev) {} 
+
+void FC_FUNC_(transfer_kernels_el_to_host,
+              TRANSFER_KERNELS_EL_TO_HOST)(long* Mesh_pointer,
+                                                    realw* h_rho_kl,
+                                                    realw* h_mu_kl,
+                                                    realw* h_kappa_kl,
+                                                    int* NSPEC_AB) {} 
+
+void FC_FUNC_(transfer_kernels_noise_to_host,
+              TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer,
+                                                          realw* h_Sigma_kl,
+                                                          int* NSPEC_AB) {} 
+
+void FC_FUNC_(transfer_fields_ac_to_device,
+              TRANSFER_FIELDS_AC_TO_DEVICE)(
+                                                  int* size,
+                                                  realw* potential_acoustic,
+                                                  realw* potential_dot_acoustic,
+                                                  realw* potential_dot_dot_acoustic,
+                                                  long* Mesh_pointer_f) {} 
+
+void FC_FUNC_(transfer_b_fields_ac_to_device,
+              TRANSFER_B_FIELDS_AC_TO_DEVICE)(
+                                                    int* size,
+                                                    realw* b_potential_acoustic,
+                                                    realw* b_potential_dot_acoustic,
+                                                    realw* b_potential_dot_dot_acoustic,
+                                                    long* Mesh_pointer_f) {} 
+
+void FC_FUNC_(transfer_fields_ac_from_device,TRANSFER_FIELDS_AC_FROM_DEVICE)(
+                                                                                         int* size,
+                                                                                         realw* potential_acoustic,
+                                                                                         realw* potential_dot_acoustic,
+                                                                                         realw* potential_dot_dot_acoustic,
+                                                                                         long* Mesh_pointer_f) {} 
+
+void FC_FUNC_(transfer_b_fields_ac_from_device,
+              TRANSFER_B_FIELDS_AC_FROM_DEVICE)(
+                                                      int* size,
+                                                      realw* b_potential_acoustic,
+                                                      realw* b_potential_dot_acoustic,
+                                                      realw* b_potential_dot_dot_acoustic,
+                                                      long* Mesh_pointer_f) {} 
+
+// Dummy no-op stub for the non-CUDA build.
+// NOTE(review): upper-case FC_FUNC_ name fixed from misspelled
+// TRNASFER_DOT_DOT_FROM_DEVICE; required for linking under upper-case Fortran
+// mangling. Sync with the CUDA implementation — TODO confirm.
+void FC_FUNC_(transfer_dot_dot_from_device,
+              TRANSFER_DOT_DOT_FROM_DEVICE)(int* size, realw* potential_dot_dot_acoustic,long* Mesh_pointer_f) {} 
+
+// Dummy no-op stub for the non-CUDA build.
+// NOTE(review): upper-case FC_FUNC_ name fixed from misspelled
+// TRNASFER_B_DOT_DOT_FROM_DEVICE; required for linking under upper-case Fortran
+// mangling. Sync with the CUDA implementation — TODO confirm.
+void FC_FUNC_(transfer_b_dot_dot_from_device,
+              TRANSFER_B_DOT_DOT_FROM_DEVICE)(int* size, realw* b_potential_dot_dot_acoustic,long* Mesh_pointer_f) {} 
+
+// Dummy no-op stubs for the kernel/Hessian transfers and seismogram-station
+// transfers (non-CUDA build).
+void FC_FUNC_(transfer_kernels_ac_to_host,
+              TRANSFER_KERNELS_AC_TO_HOST)(long* Mesh_pointer,
+                                                             realw* h_rho_ac_kl,
+                                                             realw* h_kappa_ac_kl,
+                                                             int* NSPEC_AB) {} 
+
+void FC_FUNC_(transfer_kernels_hess_el_tohost,
+              TRANSFER_KERNELS_HESS_EL_TOHOST)(long* Mesh_pointer,
+                                              realw* h_hess_kl,
+                                              int* NSPEC_AB) {} 
+
+void FC_FUNC_(transfer_kernels_hess_ac_tohost,
+              TRANSFER_KERNELS_HESS_AC_TOHOST)(long* Mesh_pointer,
+                                             realw* h_hess_ac_kl,
+                                             int* NSPEC_AB) {} 
+
+
+//
+// src/cuda/write_seismograms_cuda.cu
+//
+
+void FC_FUNC_(transfer_station_el_from_device,
+              TRANSFER_STATION_EL_FROM_DEVICE)(realw* displ,realw* veloc,realw* accel,
+                                                   realw* b_displ, realw* b_veloc, realw* b_accel,
+                                                   long* Mesh_pointer_f,int* number_receiver_global,
+                                                   int* ispec_selected_rec,int* ispec_selected_source,
+                                                   int* ibool,int* SIMULATION_TYPEf) {} 
+
+void FC_FUNC_(transfer_station_ac_from_device,
+              TRANSFER_STATION_AC_FROM_DEVICE)(
+                                                realw* potential_acoustic,
+                                                realw* potential_dot_acoustic,
+                                                realw* potential_dot_dot_acoustic,
+                                                realw* b_potential_acoustic,
+                                                realw* b_potential_dot_acoustic,
+                                                realw* b_potential_dot_dot_acoustic,
+                                                long* Mesh_pointer_f,
+                                                int* number_receiver_global,
+                                                int* ispec_selected_rec,
+                                                int* ispec_selected_source,
+                                                int* ibool,
+                                                int* SIMULATION_TYPEf) {} 
+

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/transfer_fields_cuda.cu	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/transfer_fields_cuda.cu	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,728 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+#include "prepare_constants_cuda.h"
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Transfer functions
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// transfer memory from CPU host to GPU device
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// crust_mantle
+extern "C"
+void FC_FUNC_(transfer_fields_cm_to_device,
+              TRANSFER_FIELDS_CM_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {
+
+TRACE("transfer_fields_cm_to_device_");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_displ_crust_mantle,displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40003);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_veloc_crust_mantle,veloc,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40004);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_accel_crust_mantle,accel,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40005);
+
+}
+
+// inner_core
+extern "C"
+void FC_FUNC_(transfer_fields_ic_to_device,
+              TRANSFER_FIELDS_IC_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {
+
+  TRACE("transfer_fields_ic_to_device_");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_displ_inner_core,displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40003);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_veloc_inner_core,veloc,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40004);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_accel_inner_core,accel,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40005);
+
+}
+
+// outer_core
+extern "C"
+void FC_FUNC_(transfer_fields_oc_to_device,
+              TRANSFER_FIELDS_OC_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {
+
+  TRACE("transfer_fields_oc_to_device_");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_displ_outer_core,displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40003);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_veloc_outer_core,veloc,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40004);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_accel_outer_core,accel,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40005);
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// backward/reconstructed fields
+
+// crust_mantle
+extern "C"
+void FC_FUNC_(transfer_b_fields_cm_to_device,
+              TRANSFER_FIELDS_B_CM_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
+                                              long* Mesh_pointer_f) {
+
+  TRACE("transfer_fields_b_cm_to_device_");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  cudaMemcpy(mp->d_b_displ_crust_mantle,b_displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_veloc_crust_mantle,b_veloc,sizeof(realw)*(*size),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_accel_crust_mantle,b_accel,sizeof(realw)*(*size),cudaMemcpyHostToDevice);
+
+}
+
+// inner_core
+extern "C"
+void FC_FUNC_(transfer_b_fields_ic_to_device,
+              TRANSFER_FIELDS_B_IC_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
+                                              long* Mesh_pointer_f) {
+
+  TRACE("transfer_fields_b_ic_to_device_");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  cudaMemcpy(mp->d_b_displ_inner_core,b_displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_veloc_inner_core,b_veloc,sizeof(realw)*(*size),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_accel_inner_core,b_accel,sizeof(realw)*(*size),cudaMemcpyHostToDevice);
+
+}
+
+// outer_core
+extern "C"
+void FC_FUNC_(transfer_b_fields_oc_to_device,
+              TRANSFER_FIELDS_B_OC_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
+                                              long* Mesh_pointer_f) {
+
+  TRACE("transfer_fields_b_oc_to_device_");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  cudaMemcpy(mp->d_b_displ_outer_core,b_displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_veloc_outer_core,b_veloc,sizeof(realw)*(*size),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_accel_outer_core,b_accel,sizeof(realw)*(*size),cudaMemcpyHostToDevice);
+
+}
+
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// transfer memory from GPU device to CPU host
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// crust_mantle
+extern "C"
+void FC_FUNC_(transfer_fields_cm_from_device,
+              TRANSFER_FIELDS_CM_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {
+
+  TRACE("transfer_fields_cm_from_device_");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ_crust_mantle,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40006);
+  print_CUDA_error_if_any(cudaMemcpy(veloc,mp->d_veloc_crust_mantle,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40007);
+  print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel_crust_mantle,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40008);
+
+}
+
+// inner_core
+extern "C"
+void FC_FUNC_(transfer_fields_ic_from_device,
+              TRANSFER_FIELDS_IC_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {
+
+  TRACE("transfer_fields_ic_from_device_");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ_inner_core,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40006);
+  print_CUDA_error_if_any(cudaMemcpy(veloc,mp->d_veloc_inner_core,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40007);
+  print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel_inner_core,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40008);
+
+}
+
+// outer_core
+extern "C"
+void FC_FUNC_(transfer_fields_oc_from_device,
+              TRANSFER_FIELDS_OC_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {
+
+  TRACE("transfer_fields_oc_from_device_");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ_outer_core,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40006);
+  print_CUDA_error_if_any(cudaMemcpy(veloc,mp->d_veloc_outer_core,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40007);
+  print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel_outer_core,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40008);
+
+}
+
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// crust_mantle
+extern "C"
+void FC_FUNC_(transfer_b_fields_cm_from_device,
+              TRANSFER_B_FIELDS_CM_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
+                                                long* Mesh_pointer_f) {
+
+TRACE("transfer_b_fields_cm_from_device_");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  cudaMemcpy(b_displ,mp->d_b_displ_crust_mantle,sizeof(realw)*(*size),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_veloc,mp->d_b_veloc_crust_mantle,sizeof(realw)*(*size),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_accel,mp->d_b_accel_crust_mantle,sizeof(realw)*(*size),cudaMemcpyDeviceToHost);
+
+}
+
+// inner_core
+extern "C"
+void FC_FUNC_(transfer_b_fields_ic_from_device,
+              TRANSFER_B_FIELDS_IC_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
+                                                long* Mesh_pointer_f) {
+  
+  TRACE("transfer_fields_b_ic_from_device_");
+  
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  
+  print_CUDA_error_if_any(cudaMemcpy(b_displ,mp->d_b_displ_inner_core,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40006);
+  print_CUDA_error_if_any(cudaMemcpy(b_veloc,mp->d_b_veloc_inner_core,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40007);
+  print_CUDA_error_if_any(cudaMemcpy(b_accel,mp->d_b_accel_inner_core,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40008);
+  
+}
+
+// outer_core
+extern "C"
+void FC_FUNC_(transfer_b_fields_oc_from_device,
+              TRANSFER_B_FIELDS_OC_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
+                                                long* Mesh_pointer_f) {
+  
+  TRACE("transfer_b_fields_oc_from_device_");
+  
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  
+  print_CUDA_error_if_any(cudaMemcpy(b_displ,mp->d_b_displ_outer_core,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40006);
+  print_CUDA_error_if_any(cudaMemcpy(b_veloc,mp->d_b_veloc_outer_core,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40007);
+  print_CUDA_error_if_any(cudaMemcpy(b_accel,mp->d_b_accel_outer_core,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40008);
+  
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_accel_cm_to_device,
+              TRNASFER_ACCEL_CM_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {
+
+TRACE("transfer_accel_cm_to_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_accel_crust_mantle,accel,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40016);
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_accel_cm_from_device,
+              TRANSFER_ACCEL_CM_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {
+
+TRACE("transfer_accel_cm_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel_crust_mantle,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40026);
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_b_accel_cm_from_device,
+              TRNASFER_B_ACCEL_CM_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {
+
+TRACE("transfer_b_accel_cm_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(b_accel,mp->d_b_accel_crust_mantle,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40036);
+
+}
+
+
+//daniel: TODO old code routines...
+
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_sigma_from_device,
+              TRANSFER_SIGMA_FROM_DEVICE)(int* size, realw* sigma_kl,long* Mesh_pointer_f) {
+
+TRACE("transfer_sigma_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(sigma_kl,mp->d_Sigma_kl,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40046);
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_b_displ_from_device,
+              TRANSFER_B_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {
+
+TRACE("transfer_b_displ_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40056);
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_displ_from_device,
+              TRANSFER_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {
+
+TRACE("transfer_displ_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40066);
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+/*
+extern "C"
+void FC_FUNC_(transfer_compute_kernel_answers_from_device,
+              TRANSFER_COMPUTE_KERNEL_ANSWERS_FROM_DEVICE)(long* Mesh_pointer,
+                                                           realw* rho_kl,int* size_rho,
+                                                           realw* mu_kl, int* size_mu,
+                                                           realw* kappa_kl, int* size_kappa) {
+TRACE("transfer_compute_kernel_answers_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+  cudaMemcpy(rho_kl,mp->d_rho_kl,*size_rho*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(mu_kl,mp->d_mu_kl,*size_mu*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(kappa_kl,mp->d_kappa_kl,*size_kappa*sizeof(realw),cudaMemcpyDeviceToHost);
+
+}
+*/
+
+/* ----------------------------------------------------------------------------------------------- */
+/*
+extern "C"
+void FC_FUNC_(transfer_compute_kernel_fields_from_device,
+              TRANSFER_COMPUTE_KERNEL_FIELDS_FROM_DEVICE)(long* Mesh_pointer,
+                                                          realw* accel, int* size_accel,
+                                                          realw* b_displ, int* size_b_displ,
+                                                          realw* epsilondev_xx,
+                                                          realw* epsilondev_yy,
+                                                          realw* epsilondev_xy,
+                                                          realw* epsilondev_xz,
+                                                          realw* epsilondev_yz,
+                                                          int* size_epsilondev,
+                                                          realw* b_epsilondev_xx,
+                                                          realw* b_epsilondev_yy,
+                                                          realw* b_epsilondev_xy,
+                                                          realw* b_epsilondev_xz,
+                                                          realw* b_epsilondev_yz,
+                                                          int* size_b_epsilondev,
+                                                          realw* rho_kl,int* size_rho,
+                                                          realw* mu_kl, int* size_mu,
+                                                          realw* kappa_kl, int* size_kappa,
+                                                          realw* epsilon_trace_over_3,
+                                                          realw* b_epsilon_trace_over_3,
+                                                          int* size_epsilon_trace_over_3) {
+TRACE("transfer_compute_kernel_fields_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+  cudaMemcpy(accel,mp->d_accel,*size_accel*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_displ,mp->d_b_displ,*size_b_displ*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_epsilondev_xx,mp->d_b_epsilondev_xx,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_epsilondev_yy,mp->d_b_epsilondev_yy,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_epsilondev_xy,mp->d_b_epsilondev_xy,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_epsilondev_xz,mp->d_b_epsilondev_xz,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_epsilondev_yz,mp->d_b_epsilondev_yz,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(rho_kl,mp->d_rho_kl,*size_rho*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(mu_kl,mp->d_mu_kl,*size_mu*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(kappa_kl,mp->d_kappa_kl,*size_kappa*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilon_trace_over_3,mp->d_epsilon_trace_over_3,*size_epsilon_trace_over_3*sizeof(realw),
+       cudaMemcpyDeviceToHost);
+  cudaMemcpy(b_epsilon_trace_over_3,mp->d_b_epsilon_trace_over_3,*size_epsilon_trace_over_3*sizeof(realw),
+       cudaMemcpyDeviceToHost);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after transfer_compute_kernel_fields_from_device");
+#endif
+}
+*/
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// attenuation fields
+
+extern "C"
+void FC_FUNC_(transfer_b_fields_att_to_device,
+              TRANSFER_B_FIELDS_ATT_TO_DEVICE)(long* Mesh_pointer,
+                                             realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
+                                             int* size_R,
+                                             realw* b_epsilondev_xx,
+                                             realw* b_epsilondev_yy,
+                                             realw* b_epsilondev_xy,
+                                             realw* b_epsilondev_xz,
+                                             realw* b_epsilondev_yz,
+                                             int* size_epsilondev) {
+  TRACE("transfer_b_fields_att_to_device");
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
+
+  cudaMemcpy(mp->d_b_R_xx,b_R_xx,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_R_yy,b_R_yy,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_R_xy,b_R_xy,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_R_xz,b_R_xz,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_R_yz,b_R_yz,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
+
+  cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
+
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after transfer_b_fields_att_to_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// attenuation fields
+
+extern "C"
+void FC_FUNC_(transfer_fields_att_from_device,
+              TRANSFER_FIELDS_ATT_FROM_DEVICE)(long* Mesh_pointer,
+                                               realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
+                                               int* size_R,
+                                               realw* epsilondev_xx,
+                                               realw* epsilondev_yy,
+                                               realw* epsilondev_xy,
+                                               realw* epsilondev_xz,
+                                               realw* epsilondev_yz,
+                                               int* size_epsilondev) {
+  TRACE("transfer_fields_att_from_device");
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
+
+  cudaMemcpy(R_xx,mp->d_R_xx,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(R_yy,mp->d_R_yy,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(R_xy,mp->d_R_xy,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(R_xz,mp->d_R_xz,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(R_yz,mp->d_R_yz,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
+
+  cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after transfer_fields_att_from_device");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_kernels_el_to_host,
+              TRANSFER_KERNELS_EL_TO_HOST)(long* Mesh_pointer,
+                                                    realw* h_rho_kl,
+                                                    realw* h_mu_kl,
+                                                    realw* h_kappa_kl,
+                                                    int* NSPEC_AB) {
+TRACE("transfer_kernels_el_to_host");
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
+
+  print_CUDA_error_if_any(cudaMemcpy(h_rho_kl,mp->d_rho_kl,*NSPEC_AB*NGLL3*sizeof(realw),
+                                     cudaMemcpyDeviceToHost),40101);
+  print_CUDA_error_if_any(cudaMemcpy(h_mu_kl,mp->d_mu_kl,*NSPEC_AB*NGLL3*sizeof(realw),
+                                     cudaMemcpyDeviceToHost),40102);
+  print_CUDA_error_if_any(cudaMemcpy(h_kappa_kl,mp->d_kappa_kl,*NSPEC_AB*NGLL3*sizeof(realw),
+                                     cudaMemcpyDeviceToHost),40103);
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// for NOISE simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_kernels_noise_to_host,
+              TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer,
+                                                          realw* h_Sigma_kl,
+                                                          int* NSPEC_AB) {
+TRACE("transfer_kernels_noise_to_host");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(h_Sigma_kl,mp->d_Sigma_kl,NGLL3*(*NSPEC_AB)*sizeof(realw),
+                                     cudaMemcpyDeviceToHost),40201);
+
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// for ACOUSTIC simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_fields_ac_to_device,
+              TRANSFER_FIELDS_AC_TO_DEVICE)(
+                                                  int* size,
+                                                  realw* potential_acoustic,
+                                                  realw* potential_dot_acoustic,
+                                                  realw* potential_dot_dot_acoustic,
+                                                  long* Mesh_pointer_f) {
+TRACE("transfer_fields_ac_to_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_potential_acoustic,potential_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyHostToDevice),50110);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_potential_dot_acoustic,potential_dot_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyHostToDevice),50120);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_potential_dot_dot_acoustic,potential_dot_dot_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyHostToDevice),50130);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after transfer_fields_ac_to_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_b_fields_ac_to_device,
+              TRANSFER_B_FIELDS_AC_TO_DEVICE)(
+                                                    int* size,
+                                                    realw* b_potential_acoustic,
+                                                    realw* b_potential_dot_acoustic,
+                                                    realw* b_potential_dot_dot_acoustic,
+                                                    long* Mesh_pointer_f) {
+TRACE("transfer_b_fields_ac_to_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_b_potential_acoustic,b_potential_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyHostToDevice),51110);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_b_potential_dot_acoustic,b_potential_dot_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyHostToDevice),51120);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_b_potential_dot_dot_acoustic,b_potential_dot_dot_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyHostToDevice),51130);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after transfer_b_fields_ac_to_device");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_fields_ac_from_device,TRANSFER_FIELDS_AC_FROM_DEVICE)(
+                                                                                         int* size,
+                                                                                         realw* potential_acoustic,
+                                                                                         realw* potential_dot_acoustic,
+                                                                                         realw* potential_dot_dot_acoustic,
+                                                                                         long* Mesh_pointer_f) {
+TRACE("transfer_fields_ac_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(potential_acoustic,mp->d_potential_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyDeviceToHost),52111);
+  print_CUDA_error_if_any(cudaMemcpy(potential_dot_acoustic,mp->d_potential_dot_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyDeviceToHost),52121);
+  print_CUDA_error_if_any(cudaMemcpy(potential_dot_dot_acoustic,mp->d_potential_dot_dot_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyDeviceToHost),52131);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after transfer_fields_ac_from_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_b_fields_ac_from_device,
+              TRANSFER_B_FIELDS_AC_FROM_DEVICE)(
+                                                      int* size,
+                                                      realw* b_potential_acoustic,
+                                                      realw* b_potential_dot_acoustic,
+                                                      realw* b_potential_dot_dot_acoustic,
+                                                      long* Mesh_pointer_f) {
+TRACE("transfer_b_fields_ac_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(b_potential_acoustic,mp->d_b_potential_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyDeviceToHost),53111);
+  print_CUDA_error_if_any(cudaMemcpy(b_potential_dot_acoustic,mp->d_b_potential_dot_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyDeviceToHost),53121);
+  print_CUDA_error_if_any(cudaMemcpy(b_potential_dot_dot_acoustic,mp->d_b_potential_dot_dot_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyDeviceToHost),53131);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after transfer_b_fields_ac_from_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_dot_dot_from_device,
+              TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, realw* potential_dot_dot_acoustic,long* Mesh_pointer_f) {
+
+  TRACE("transfer_dot_dot_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(potential_dot_dot_acoustic,mp->d_potential_dot_dot_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyDeviceToHost),50041);
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_b_dot_dot_from_device,
+              TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, realw* b_potential_dot_dot_acoustic,long* Mesh_pointer_f) {
+
+  TRACE("transfer_b_dot_dot_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(b_potential_dot_dot_acoustic,mp->d_b_potential_dot_dot_acoustic,
+                                     sizeof(realw)*(*size),cudaMemcpyDeviceToHost),50042);
+
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_kernels_ac_to_host,
+              TRANSFER_KERNELS_AC_TO_HOST)(long* Mesh_pointer,
+                                                             realw* h_rho_ac_kl,
+                                                             realw* h_kappa_ac_kl,
+                                                             int* NSPEC_AB) {
+
+  TRACE("transfer_kernels_ac_to_host");
+
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
+  int size = *NSPEC_AB*NGLL3;
+
+  // copies kernel values over to CPU host
+  print_CUDA_error_if_any(cudaMemcpy(h_rho_ac_kl,mp->d_rho_ac_kl,size*sizeof(realw),
+                                     cudaMemcpyDeviceToHost),54101);
+  print_CUDA_error_if_any(cudaMemcpy(h_kappa_ac_kl,mp->d_kappa_ac_kl,size*sizeof(realw),
+                                     cudaMemcpyDeviceToHost),54102);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// for Hess kernel calculations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_kernels_hess_el_tohost,
+              TRANSFER_KERNELS_HESS_EL_TOHOST)(long* Mesh_pointer,
+                                              realw* h_hess_kl,
+                                              int* NSPEC_AB) {
+TRACE("transfer_kernels_hess_el_tohost");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(h_hess_kl,mp->d_hess_el_kl,NGLL3*(*NSPEC_AB)*sizeof(realw),
+                                     cudaMemcpyDeviceToHost),70201);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_kernels_hess_ac_tohost,
+              TRANSFER_KERNELS_HESS_AC_TOHOST)(long* Mesh_pointer,
+                                             realw* h_hess_ac_kl,
+                                             int* NSPEC_AB) {
+  TRACE("transfer_kernels_hess_ac_tohost");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(h_hess_ac_kl,mp->d_hess_ac_kl,NGLL3*(*NSPEC_AB)*sizeof(realw),
+                                     cudaMemcpyDeviceToHost),70202);
+}
+
+

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/write_seismograms_cuda.cu	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/write_seismograms_cuda.cu	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,316 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ELASTIC simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ELASTIC: gathers the 3-component field values at the NGLL3 GLL points of
+// each local receiver's element into the contiguous staging array
+// station_seismo_field (layout: [receiver block][GLL point][component]).
+// launched with one block per local receiver and NGLL3 threads per block;
+// the grid may be 2D, hence the blockID guard against padded excess blocks.
+__global__ void transfer_stations_fields_from_device_kernel(int* number_receiver_global,
+                                                            int* ispec_selected_rec,
+                                                            int* ibool,
+                                                            realw* station_seismo_field,
+                                                            realw* desired_field,
+                                                            int nrec_local) {
+  int blockID = blockIdx.x + blockIdx.y*gridDim.x;
+  // guard: a 2D grid can contain more blocks than local receivers
+  if(blockID<nrec_local) {
+    // "-1" converts Fortran 1-based indices to C 0-based indices
+    int irec = number_receiver_global[blockID]-1;
+    int ispec = ispec_selected_rec[irec]-1;
+    int iglob = ibool[threadIdx.x + NGLL3*ispec]-1;
+
+    station_seismo_field[3*NGLL3*blockID + 3*threadIdx.x+0] = desired_field[3*iglob];
+    station_seismo_field[3*NGLL3*blockID + 3*threadIdx.x+1] = desired_field[3*iglob+1];
+    station_seismo_field[3*NGLL3*blockID + 3*threadIdx.x+2] = desired_field[3*iglob+2];
+  }
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// gathers an elastic (3-component) wavefield at all local receiver (or source)
+// elements on the device, copies the staged values to the host buffer, then
+// scatters them into the full host array h_field via the host-side ibool map.
+// d_field/h_field: device/host field arrays (e.g. displ, veloc, accel);
+// d_ispec_selected/h_ispec_selected: device/host element selection arrays.
+// no-op when this MPI slice holds no local receivers.
+void transfer_field_from_device(Mesh* mp, realw* d_field,realw* h_field,
+                                          int* number_receiver_global,
+                                          int* d_ispec_selected,
+                                          int* h_ispec_selected,
+                                          int* ibool) {
+
+TRACE("transfer_field_from_device");
+
+  // checks if anything to do
+  if( mp->nrec_local == 0 ) return;
+
+  // one block per local receiver, NGLL3 threads (one per GLL point)
+  int blocksize = NGLL3;
+  int num_blocks_x = mp->nrec_local;
+  int num_blocks_y = 1;
+  // splits into a 2D grid to stay below the 65535 per-dimension block limit
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  // prepare field transfer array on device
+  transfer_stations_fields_from_device_kernel<<<grid,threads>>>(mp->d_number_receiver_global,
+                                                                d_ispec_selected,
+                                                                mp->d_ibool,
+                                                                mp->d_station_seismo_field,
+                                                                d_field,
+                                                                mp->nrec_local);
+
+  // blocking copy on the default stream: also synchronizes with the kernel above
+  // NOTE(review): return value is unchecked here, unlike the acoustic variant below
+  cudaMemcpy(mp->h_station_seismo_field,mp->d_station_seismo_field,
+       (3*NGLL3)*(mp->nrec_local)*sizeof(realw),cudaMemcpyDeviceToHost);
+
+  // scatters staged values back into the global host array;
+  // "-1" converts Fortran 1-based indices to C 0-based indices
+  int irec_local;
+  for(irec_local=0;irec_local<mp->nrec_local;irec_local++) {
+    int irec = number_receiver_global[irec_local] - 1;
+    int ispec = h_ispec_selected[irec] - 1;
+
+    for(int i=0;i<NGLL3;i++) {
+      int iglob = ibool[i+NGLL3*ispec] - 1;
+      h_field[0+3*iglob] = mp->h_station_seismo_field[0+3*i+irec_local*NGLL3*3];
+      h_field[1+3*iglob] = mp->h_station_seismo_field[1+3*i+irec_local*NGLL3*3];
+      h_field[2+3*iglob] = mp->h_station_seismo_field[2+3*i+irec_local*NGLL3*3];
+    }
+
+  }
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("transfer_field_from_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Fortran-callable entry point: transfers elastic wavefields from device to
+// host at selected elements, depending on the simulation type:
+//   SIMULATION_TYPE == 1: displ/veloc/accel at receiver elements (ispec_selected_rec)
+//   SIMULATION_TYPE == 2: displ/veloc/accel at source elements (ispec_selected_source)
+//   SIMULATION_TYPE == 3: backward fields b_displ/b_veloc/b_accel at receiver elements
+// no-op when this MPI slice holds no local receivers.
+extern "C"
+void FC_FUNC_(transfer_station_el_from_device,
+              TRANSFER_STATION_EL_FROM_DEVICE)(realw* displ,realw* veloc,realw* accel,
+                                                   realw* b_displ, realw* b_veloc, realw* b_accel,
+                                                   long* Mesh_pointer_f,int* number_receiver_global,
+                                                   int* ispec_selected_rec,int* ispec_selected_source,
+                                                   int* ibool,int* SIMULATION_TYPEf) {
+TRACE("transfer_station_el_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+  // checks if anything to do
+  if( mp->nrec_local == 0 ) return;
+
+  int SIMULATION_TYPE = *SIMULATION_TYPEf;
+
+  if(SIMULATION_TYPE == 1) {
+    transfer_field_from_device(mp,mp->d_displ,displ, number_receiver_global,
+             mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
+    transfer_field_from_device(mp,mp->d_veloc,veloc, number_receiver_global,
+             mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
+    transfer_field_from_device(mp,mp->d_accel,accel, number_receiver_global,
+             mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
+  }
+  else if(SIMULATION_TYPE == 2) {
+    transfer_field_from_device(mp,mp->d_displ,displ, number_receiver_global,
+             mp->d_ispec_selected_source, ispec_selected_source, ibool);
+    transfer_field_from_device(mp,mp->d_veloc,veloc, number_receiver_global,
+             mp->d_ispec_selected_source, ispec_selected_source, ibool);
+    transfer_field_from_device(mp,mp->d_accel,accel, number_receiver_global,
+             mp->d_ispec_selected_source, ispec_selected_source, ibool);
+  }
+  else if(SIMULATION_TYPE == 3) {
+    transfer_field_from_device(mp,mp->d_b_displ,b_displ, number_receiver_global,
+             mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
+    transfer_field_from_device(mp,mp->d_b_veloc,b_veloc, number_receiver_global,
+             mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
+    transfer_field_from_device(mp,mp->d_b_accel,b_accel, number_receiver_global,
+             mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
+  }
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ACOUSTIC simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ACOUSTIC: gathers the scalar potential at the NGLL3 GLL points of each local
+// receiver's element into the staging array station_seismo_potential.
+// launched with one block per local receiver and NGLL3 threads per block.
+//
+// NOTE(review): unlike the elastic kernel above, there is no
+// if(blockID < nrec_local) guard. If the launch grid is ever padded
+// (num_blocks_x*num_blocks_y > nrec_local, possible once nrec_local exceeds
+// 65535 and the caller splits the grid), excess blocks read
+// number_receiver_global out of bounds -- confirm, or add the same guard.
+__global__ void transfer_stations_fields_acoustic_from_device_kernel(int* number_receiver_global,
+                                                                     int* ispec_selected_rec,
+                                                                     int* ibool,
+                                                                     realw* station_seismo_potential,
+                                                                     realw* desired_potential) {
+
+  int blockID = blockIdx.x + blockIdx.y*gridDim.x;
+  int nodeID = threadIdx.x + blockID*blockDim.x;
+
+  // "-1" converts Fortran 1-based indices to C 0-based indices
+  int irec = number_receiver_global[blockID]-1;
+  int ispec = ispec_selected_rec[irec]-1;
+  int iglob = ibool[threadIdx.x + NGLL3*ispec]-1;
+
+  //if(threadIdx.x == 0 ) printf("node acoustic: %i %i %i %i %i %e \n",blockID,nodeID,irec,ispec,iglob,desired_potential[iglob]);
+
+  station_seismo_potential[nodeID] = desired_potential[iglob];
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// gathers an acoustic (scalar potential) field at all local receiver (or
+// source) elements on the device, copies the staged values to the host,
+// then scatters them into the full host array h_potential via ibool.
+// d_potential/h_potential: device/host potential arrays;
+// d_ispec_selected/h_ispec_selected: device/host element selection arrays.
+// no-op when this MPI slice holds no local receivers.
+void transfer_field_acoustic_from_device(Mesh* mp,
+                                         realw* d_potential,
+                                         realw* h_potential,
+                                         int* number_receiver_global,
+                                         int* d_ispec_selected,
+                                         int* h_ispec_selected,
+                                         int* ibool) {
+
+TRACE("transfer_field_acoustic_from_device");
+
+  int irec_local,irec,ispec,iglob,j;
+
+  // checks if anything to do
+  if( mp->nrec_local == 0 ) return;
+
+  // sets up kernel dimensions: one block per local receiver, NGLL3 threads;
+  // splits into a 2D grid to stay below the 65535 per-dimension block limit
+  int blocksize = NGLL3;
+  int num_blocks_x = mp->nrec_local;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  // prepare field transfer array on device
+  transfer_stations_fields_acoustic_from_device_kernel<<<grid,threads>>>(mp->d_number_receiver_global,
+                                                                         d_ispec_selected,
+                                                                         mp->d_ibool,
+                                                                         mp->d_station_seismo_potential,
+                                                                         d_potential);
+
+
+  // blocking device-to-host copy; 500 identifies this call site in error reports
+  print_CUDA_error_if_any(cudaMemcpy(mp->h_station_seismo_potential,mp->d_station_seismo_potential,
+                                     mp->nrec_local*NGLL3*sizeof(realw),cudaMemcpyDeviceToHost),500);
+
+  //printf("copy local receivers: %i \n",mp->nrec_local);
+
+  // scatters staged values back into the global host array;
+  // "-1" converts Fortran 1-based indices to C 0-based indices
+  for(irec_local=0; irec_local < mp->nrec_local; irec_local++) {
+    irec = number_receiver_global[irec_local]-1;
+    ispec = h_ispec_selected[irec]-1;
+
+    // copy element values
+    // note: iglob may vary and can be irregularly accessing the h_potential array
+    for(j=0; j < NGLL3; j++){
+      iglob = ibool[j+NGLL3*ispec]-1;
+      h_potential[iglob] = mp->h_station_seismo_potential[j+irec_local*NGLL3];
+    }
+
+    // copy each station element's points to working array
+    // note: this works if iglob values would be all aligned...
+    //memcpy(&(h_potential[iglob]),&(mp->h_station_seismo_potential[irec_local*NGLL3]),NGLL3*sizeof(realw));
+
+  }
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("transfer_field_acoustic_from_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Fortran-callable entry point: transfers acoustic potentials from device to
+// host at selected elements, depending on the simulation type:
+//   SIMULATION_TYPE == 1: potential/dot/dot_dot at receiver elements (ispec_selected_rec)
+//   SIMULATION_TYPE == 2: potential/dot/dot_dot at source elements (ispec_selected_source)
+//   SIMULATION_TYPE == 3: backward potentials b_* at receiver elements
+// no-op when this MPI slice holds no local receivers.
+extern "C"
+void FC_FUNC_(transfer_station_ac_from_device,
+              TRANSFER_STATION_AC_FROM_DEVICE)(
+                                                realw* potential_acoustic,
+                                                realw* potential_dot_acoustic,
+                                                realw* potential_dot_dot_acoustic,
+                                                realw* b_potential_acoustic,
+                                                realw* b_potential_dot_acoustic,
+                                                realw* b_potential_dot_dot_acoustic,
+                                                long* Mesh_pointer_f,
+                                                int* number_receiver_global,
+                                                int* ispec_selected_rec,
+                                                int* ispec_selected_source,
+                                                int* ibool,
+                                                int* SIMULATION_TYPEf) {
+
+TRACE("transfer_station_ac_from_device");
+  //double start_time = get_time();
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+  // checks if anything to do
+  if( mp->nrec_local == 0 ) return;
+
+  int SIMULATION_TYPE = *SIMULATION_TYPEf;
+
+  if(SIMULATION_TYPE == 1) {
+    transfer_field_acoustic_from_device(mp,mp->d_potential_acoustic,potential_acoustic,
+                                        number_receiver_global,
+                                        mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
+    transfer_field_acoustic_from_device(mp,mp->d_potential_dot_acoustic,potential_dot_acoustic,
+                                        number_receiver_global,
+                                        mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
+    transfer_field_acoustic_from_device(mp,mp->d_potential_dot_dot_acoustic,potential_dot_dot_acoustic,
+                                        number_receiver_global,
+                                        mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
+  }
+  else if(SIMULATION_TYPE == 2) {
+    transfer_field_acoustic_from_device(mp,mp->d_potential_acoustic,potential_acoustic,
+                                        number_receiver_global,
+                                        mp->d_ispec_selected_source, ispec_selected_source, ibool);
+    transfer_field_acoustic_from_device(mp,mp->d_potential_dot_acoustic,potential_dot_acoustic,
+                                        number_receiver_global,
+                                        mp->d_ispec_selected_source, ispec_selected_source, ibool);
+    transfer_field_acoustic_from_device(mp,mp->d_potential_dot_dot_acoustic,potential_dot_dot_acoustic,
+                                        number_receiver_global,
+                                        mp->d_ispec_selected_source, ispec_selected_source, ibool);
+  }
+  else if(SIMULATION_TYPE == 3) {
+    transfer_field_acoustic_from_device(mp,mp->d_b_potential_acoustic,b_potential_acoustic,
+                                        number_receiver_global,
+                                        mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
+    transfer_field_acoustic_from_device(mp,mp->d_b_potential_dot_acoustic,b_potential_dot_acoustic,
+                                        number_receiver_global,
+                                        mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
+    transfer_field_acoustic_from_device(mp,mp->d_b_potential_dot_dot_acoustic,b_potential_dot_dot_acoustic,
+                                        number_receiver_global,
+                                        mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  exit_on_cuda_error("transfer_station_ac_from_device");
+#endif
+}
+

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/calc_jacobian.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/calc_jacobian.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/calc_jacobian.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -177,9 +177,13 @@
                  xgamma*(yxi*zeta-yeta*zxi)
 
             ! Check the jacobian
+            ! note: when honoring the moho, we squeeze and stretch elements
+            !          thus, it can happen that with a coarse mesh resolution, the jacobian encounters problems
             if(jacobian <= ZERO) then
               call xyz_2_rthetaphi_dble(xmesh,ymesh,zmesh,r,theta,phi)
-              print*,'r/lat/lon:',r*R_EARTH_KM,90.0-theta*180./PI,phi*180./PI
+              print*,'error jacobian rank:',myrank
+              print*,'  location r/lat/lon: ',r*R_EARTH_KM,90.0-theta*180./PI,phi*180./PI
+              print*,'  jacobian: ',jacobian
               call exit_MPI(myrank,'3D Jacobian undefined in recalc_jacobian_gll3D.f90')
             end if
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/model_attenuation.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/model_attenuation.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/model_attenuation.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -34,6 +34,8 @@
 !     University of Rhode Island
 !
 !  <savage at uri.edu>.
+!  <savage13 at gps.caltech.edu>
+!  <savage13 at dtm.ciw.edu>
 !
 !   It is based upon formulation in the following references:
 !
@@ -44,23 +46,11 @@
 !      Velocity dispersion due to anelasticity: implications for seismology and mantle composition
 !      Geophys, J. R. asts. Soc, Vol 47, pp. 41-58
 !
-!   The methodology can be found in Brian Savage, Dimitri Komatitsch and Jeroen Tromp,
-!   Effects of 3D attenuation on seismic wave amplitude and phase measurements, Bulletin of the Seismological Society of America,
-!   vol. 100(3), p. 1241-1251, doi: 10.1785/0120090263 (2010).
+!   The methodology can be found in Savage and Tromp, 2006, unpublished
 !
-! @ARTICLE{SaKoTr10,
-!   author = {Brian Savage and Dimitri Komatitsch and Jeroen Tromp},
-!   title = {Effects of {3D} attenuation on seismic wave amplitude and phase measurements},
-!   journal = {Bull. Seismol. Soc. Am.},
-!   year = {2010},
-!   volume = {100},
-!   pages = {1241-1251},
-!   number = {3},
-!   doi = {10.1785/0120090263}}
-!
-!
 !--------------------------------------------------------------------------------------------------
 
+
   subroutine model_attenuation_broadcast(myrank,AM_V,MIN_ATTENUATION_PERIOD,MAX_ATTENUATION_PERIOD)
 
 ! standard routine to setup model

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/model_s362ani.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/model_s362ani.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/model_s362ani.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -1102,7 +1102,7 @@
 
   integer :: ncon,nver
 
-!daniel: original
+! originally define
 !
 !  real(kind=4) verlat(1)
 !  real(kind=4) verlon(1)
@@ -1111,7 +1111,7 @@
 !  integer icon(1)
 !  real(kind=4) con(1)
 
-!daniel: avoiding out-of-bounds errors
+! avoiding out-of-bounds errors
   real(kind=4) verlat(nver)
   real(kind=4) verlon(nver)
   real(kind=4) verrad(nver)
@@ -1255,11 +1255,12 @@
         call ylm(y,x,lmax,ylmcof(1,ihpa),wk1,wk2,wk3)
       else if(itypehpa(ihpa) == 2) then
         numcof=numcoe(ihpa)
-!daniel
+! originally called
 !        call splcon(y,x,numcof,xlaspl(1,ihpa), &
 !              xlospl(1,ihpa),radspl(1,ihpa), &
 !              nconpt(ihpa),iconpt(1,ihpa),conpt(1,ihpa))
 
+! making sure of array bounds
         call splcon(y,x,numcof,xlaspl(1:numcof,ihpa), &
               xlospl(1:numcof,ihpa),radspl(1:numcof,ihpa), &
               nconpt(ihpa),iconpt(1,ihpa),conpt(1,ihpa))
@@ -1416,11 +1417,12 @@
     else if(itypehpa(ihpa) == 2) then
       numcof=numcoe(ihpa)
 
-!daniel
+! originally called
 !        call splcon(y,x,numcof,xlaspl(1,ihpa), &
 !              xlospl(1,ihpa),radspl(1,ihpa), &
 !              nconpt(ihpa),iconpt(1,ihpa),conpt(1,ihpa))
 
+! making sure array bounds
       call splcon(y,x,numcof,xlaspl(1:numcof,ihpa), &
               xlospl(1:numcof,ihpa),radspl(1:numcof,ihpa), &
               nconpt(ihpa),iconpt(1:maxver,ihpa),conpt(1:maxver,ihpa))

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/broadcast_compute_parameters.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/broadcast_compute_parameters.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/broadcast_compute_parameters.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -317,3 +317,26 @@
   endif
 
   end subroutine broadcast_compute_parameters
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine broadcast_gpu_parameters(myrank,GPU_MODE)
+
+! broadcasts the GPU_MODE flag, read by the master process (rank 0),
+! to all other MPI processes
+
+  implicit none
+
+! standard include of the MPI library
+  include 'mpif.h'
+  include "constants.h"
+  include "precision.h"
+
+  integer:: myrank
+  logical:: GPU_MODE
+  ! local parameters
+  integer :: ier
+
+  call MPI_BCAST(GPU_MODE,1,MPI_LOGICAL,0,MPI_COMM_WORLD,ier)
+  if( ier /= 0 ) call exit_MPI(myrank,'error broadcasting GPU_MODE')
+
+  end subroutine broadcast_gpu_parameters

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/exit_mpi.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/exit_mpi.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/exit_mpi.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -40,10 +40,10 @@
 ! identifier for error message file
   integer, parameter :: IERROR = 30
 
-  integer myrank
+  integer :: myrank
   character(len=*) error_msg
 
-  integer ier
+  integer :: ier
   character(len=80) outputname
   character(len=150) OUTPUT_FILES
 
@@ -93,7 +93,7 @@
 
   character(len=*) error_msg
 
-  integer ier
+  integer :: ier
 
 ! write error message to screen
   write(*,*) error_msg(1:len(error_msg))
@@ -105,3 +105,24 @@
 
   end subroutine exit_MPI_without_rank
 
+!
+!----
+!
+
+  subroutine sync_all()
+
+! synchronizes all MPI processes at a barrier;
+! aborts through exit_mpi if the barrier call fails
+
+  implicit none
+
+! standard include of the MPI library
+  include 'mpif.h'
+
+  integer :: ier,rank
+
+  ! gets callers rank (only used for the error message below)
+  call MPI_COMM_RANK(MPI_COMM_WORLD,rank,ier)
+
+  ! synchronizes MPI processes
+  call MPI_BARRIER(MPI_COMM_WORLD,ier)
+  if( ier /= 0 ) call exit_mpi(rank,'error synchronize MPI processes')
+
+  end subroutine sync_all

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/memory_eval.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/memory_eval.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/memory_eval.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -244,7 +244,7 @@
 ! idoubling_crust_mantle (not needed anymore..)
 !  static_memory_size = static_memory_size + NSPEC(IREGION_CRUST_MANTLE)*dble(SIZE_INTEGER)
 ! idoubling_outer_core
-  static_memory_size = static_memory_size + NSPEC(IREGION_OUTER_CORE)*dble(SIZE_INTEGER)
+!  static_memory_size = static_memory_size + NSPEC(IREGION_OUTER_CORE)*dble(SIZE_INTEGER)
 ! idoubling_inner_core
   static_memory_size = static_memory_size + NSPEC(IREGION_INNER_CORE)*dble(SIZE_INTEGER)
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/read_parameter_file.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/read_parameter_file.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/read_parameter_file.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -192,3 +192,26 @@
 
   end subroutine read_parameter_file
 
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine read_gpu_mode(GPU_MODE)
+
+! reads the GPU_MODE flag from the Par_file;
+! returns .false. if the parameter is absent or not read
+
+  implicit none
+  include "constants.h"
+
+  logical :: GPU_MODE
+
+  ! initializes flags (default: GPU acceleration disabled)
+  GPU_MODE = .false.
+
+  ! opens file Par_file
+  call open_parameter_file()
+
+  call read_value_logical(GPU_MODE, 'solver.GPU_MODE')
+
+  ! close parameter file
+  call close_parameter_file()
+
+  end subroutine read_gpu_mode

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/save_header_file.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/save_header_file.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/save_header_file.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -134,7 +134,7 @@
   endif
 
   write(IOUT,*) '!'
-  write(IOUT,*) '! number of processors = ',NPROCTOT
+  write(IOUT,*) '! number of processors = ',NPROCTOT ! should be = NPROC
   write(IOUT,*) '!'
   write(IOUT,*) '! maximum number of points per region = ',nglob(IREGION_CRUST_MANTLE)
   write(IOUT,*) '!'
@@ -147,16 +147,16 @@
   write(IOUT,*) '! total points per slice = ',sum(nglob)
   write(IOUT,*) '!'
 
-  write(IOUT,*) '! total for full 6-chunk mesh:'
+  write(IOUT,'(1x,a,i1,a)') '! total for full ',NCHUNKS,'-chunk mesh:'
   write(IOUT,*) '! ---------------------------'
   write(IOUT,*) '!'
   write(IOUT,*) '! exact total number of spectral elements in entire mesh = '
-  write(IOUT,*) '! ',6.d0*dble(NPROC)*dble(sum(NSPEC)) - subtract_central_cube_elems
+  write(IOUT,*) '! ',dble(NCHUNKS)*dble(NPROC)*dble(sum(NSPEC)) - subtract_central_cube_elems
   write(IOUT,*) '! approximate total number of points in entire mesh = '
-  write(IOUT,*) '! ',2.d0*dble(NPROC)*(3.d0*dble(sum(nglob))) - subtract_central_cube_points
+  write(IOUT,*) '! ',dble(NCHUNKS)*dble(NPROC)*dble(sum(nglob)) - subtract_central_cube_points
 ! there are 3 DOFs in solid regions, but only 1 in fluid outer core
   write(IOUT,*) '! approximate total number of degrees of freedom in entire mesh = '
-  write(IOUT,*) '! ',6.d0*dble(NPROC)*(3.d0*(dble(sum(nglob))) &
+  write(IOUT,*) '! ',dble(NCHUNKS)*dble(NPROC)*(3.d0*(dble(sum(nglob))) &
     - 2.d0*dble(nglob(IREGION_OUTER_CORE))) &
     - 3.d0*subtract_central_cube_points
   write(IOUT,*) '!'

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/Makefile.in
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/Makefile.in	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/Makefile.in	2012-02-14 15:11:07 UTC (rev 19622)
@@ -27,6 +27,23 @@
 
 # @configure_input@
 
+# CUDA
+ at COND_CUDA_TRUE@CUDA_LIBS = -lcuda -lcudart -lcublas
+ at COND_CUDA_FALSE@CUDA_LIBS = 
+
+# with configure: ./configure CUDA_LIB=.. CUDA_INC=.. MPI_INC=..
+CUDA_LIB_LOCATION = @CUDA_LIB@
+CUDA_LINK = $(CUDA_LIB_LOCATION) $(CUDA_LIBS)
+CUDA_INC = @CUDA_INC@ -I../../setup -I../../
+MPI_INC = @MPI_INC@
+
+ at COND_CUDA_TRUE@NVCC = nvcc
+ at COND_CUDA_FALSE@NVCC = g++
+
+ at COND_CUDA_TRUE@NVCC_FLAGS = $(CUDA_INC) $(MPI_INC) -DWITH_MPI -DCUDA -gencode=arch=compute_20,code=sm_20
+ at COND_CUDA_FALSE@NVCC_FLAGS = $(MPI_INC) -DWITH_MPI
+
+
 FC = @FC@
 FCFLAGS = #@FCFLAGS@
 MPIFC = @MPIFC@
@@ -66,100 +83,134 @@
 SETUP = ../../setup
 ## output file directory
 OUTPUT = ../../OUTPUT_FILES
+# CUDAD : cuda directory
+CUDAD = ../cuda
 
 #######################################
 
 libspecfem_a_OBJECTS_SOLVER = \
-	$O/assemble_MPI_central_cube.o \
-	$O/assemble_MPI_scalar.o \
-	$O/assemble_MPI_vector.o \
-	$O/assemble_MPI_central_cube_block.o \
-	$O/assemble_MPI_scalar_block.o \
-	$O/assemble_MPI_vector_block.o \
-	$O/auto_ner.o \
-	$O/broadcast_compute_parameters.o \
-	$O/calendar.o \
-	$O/comp_source_spectrum.o \
-	$O/comp_source_time_function.o \
-	$O/compute_adj_source_frechet.o \
-	$O/compute_arrays_source.o \
-	$O/convert_time.o \
-	$O/create_central_cube_buffers.o \
-	$O/create_name_database.o \
-	$O/count_number_of_sources.o \
-	$O/define_derivation_matrices.o \
-	$O/euler_angles.o \
-	$O/force_ftz.o \
-	$O/get_attenuation.o \
-	$O/get_backazimuth.o \
-	$O/get_cmt.o \
-	$O/get_event_info.o \
-	$O/get_model_parameters.o \
-	$O/get_value_parameters.o \
-	$O/gll_library.o \
-	$O/hex_nodes.o \
-	$O/intgrl.o \
-	$O/lagrange_poly.o \
-	$O/locate_receivers.o \
-	$O/locate_sources.o \
-	$O/make_ellipticity.o \
-	$O/make_gravity.o \
-	$O/model_prem.o \
-	$O/model_topo_bathy.o \
-	$O/multiply_arrays_source.o \
-	$O/param_reader.o \
-	$O/spline_routines.o \
-	$O/netlib_specfun_erf.o \
-	$O/read_arrays_buffers_solver.o \
-	$O/read_compute_parameters.o \
-	$O/read_parameter_file.o \
-	$O/read_value_parameters.o \
-	$O/recompute_jacobian.o \
-	$O/reduce.o \
-	$O/rthetaphi_xyz.o \
-	$O/write_c_binary.o \
-	$O/write_seismograms.o \
-	$O/write_output_ASCII.o \
-	$O/write_output_SAC.o \
+	$O/assemble_MPI_scalar.mpicheckno.o \
+	$O/assemble_MPI_vector.mpicheckno.o \
+	$O/assemble_MPI_scalar_block.mpicheckno.o \
+	$O/auto_ner.shared.o \
+	$O/broadcast_compute_parameters.sharedmpi.o \
+	$O/calendar.shared.o \
+	$O/comp_source_spectrum.check.o \
+	$O/comp_source_time_function.check.o \
+	$O/compute_adj_source_frechet.check.o \
+	$O/compute_arrays_source.check.o \
+	$O/convert_time.check.o \
+	$O/create_central_cube_buffers.mpicheck.o \
+	$O/create_name_database.shared.o \
+	$O/count_number_of_sources.shared.o \
+	$O/define_derivation_matrices.check.o \
+	$O/euler_angles.shared.o \
+	$O/force_ftz.cc.o \
+	$O/get_attenuation.check.o \
+	$O/get_backazimuth.check.o \
+	$O/get_cmt.check.o \
+	$O/get_event_info.mpicheck.o \
+	$O/get_model_parameters.shared.o \
+	$O/get_value_parameters.shared.o \
+	$O/gll_library.shared.o \
+	$O/hex_nodes.shared.o \
+	$O/intgrl.shared.o \
+	$O/lagrange_poly.shared.o \
+	$O/locate_receivers.mpicheck.o \
+	$O/locate_sources.mpicheck.o \
+	$O/make_ellipticity.shared.o \
+	$O/make_gravity.shared.o \
+	$O/model_prem.shared.o \
+	$O/model_topo_bathy.sharedmpi.o \
+	$O/multiply_arrays_source.check.o \
+	$O/param_reader.cc.o \
+	$O/spline_routines.shared.o \
+	$O/netlib_specfun_erf.check.o \
+	$O/read_arrays_buffers_solver.mpicheck.o \
+	$O/read_compute_parameters.shared.o \
+	$O/read_parameter_file.shared.o \
+	$O/read_value_parameters.shared.o \
+	$O/recompute_jacobian.check.o \
+	$O/reduce.shared.o \
+	$O/rthetaphi_xyz.shared.o \
+	$O/write_c_binary.cc.o \
+	$O/write_seismograms.mpicheck.o \
+	$O/write_output_ASCII.mpicheck.o \
+	$O/write_output_SAC.mpicheck.o \
 	$(EMPTY_MACRO)
 
 # solver objects with statically allocated arrays; dependent upon
 # values_from_mesher.h
 
 SOLVER_ARRAY_OBJECTS = \
-	$O/check_simulation_stability.o \
-	$O/compute_add_sources.o \
-	$O/compute_boundary_kernel.o \
-	$O/compute_coupling.o \
-	$O/compute_element.o \
-	$O/compute_forces_crust_mantle.o \
-	$O/compute_forces_crust_mantle_Dev.o \
-	$O/compute_forces_inner_core.o \
-	$O/compute_forces_inner_core_Dev.o \
-	$O/compute_forces_outer_core.o \
-	$O/compute_forces_outer_core_Dev.o \
-	$O/compute_kernels.o \
-	$O/compute_seismograms.o \
-	$O/compute_stacey_crust_mantle.o \
-	$O/compute_stacey_outer_core.o \
-	$O/fix_non_blocking_flags.o \
-	$O/initialize_simulation.o \
-	$O/prepare_timerun.o \
-	$O/read_arrays_solver.o \
-	$O/read_forward_arrays.o \
-	$O/read_mesh_databases.o \
-	$O/save_forward_arrays.o \
-	$O/save_kernels.o \
-	$O/setup_sources_receivers.o \
-	$O/specfem3D.o \
-	$O/write_movie_volume.o \
-	$O/write_movie_surface.o \
-	$O/noise_tomography.o \
+	$O/specfem3D_par.Solver.o \
+	$O/assemble_MPI_central_cube_block.mpisolver.o \
+	$O/assemble_MPI_central_cube.mpisolver.o \
+	$O/check_simulation_stability.mpisolver.o \
+	$O/compute_add_sources.solver.o \
+	$O/compute_boundary_kernel.solvercheck.o \
+	$O/compute_coupling.solver.o \
+	$O/compute_element.Solver.o \
+	$O/compute_forces_acoustic.Solver.o \
+	$O/compute_forces_elastic.Solver.o \
+	$O/compute_forces_crust_mantle.solver.o \
+	$O/compute_forces_crust_mantle_Dev.Solver.o \
+	$O/compute_forces_inner_core.solver.o \
+	$O/compute_forces_inner_core_Dev.Solver.o \
+	$O/compute_forces_outer_core.solver.o \
+	$O/compute_forces_outer_core_Dev.solver.o \
+	$O/compute_kernels.solver.o \
+	$O/compute_seismograms.solver.o \
+	$O/compute_stacey_crust_mantle.solver.o \
+	$O/compute_stacey_outer_core.solver.o \
+	$O/fix_non_blocking_flags.mpisolvercheck.o \
+	$O/finalize_simulation.mpisolver.o \
+	$O/initialize_simulation.mpisolver.o \
+	$O/iterate_time.mpiSolver.o \
+	$O/noise_tomography.mpisolver.o \
+	$O/prepare_timerun.mpisolver.o \
+	$O/read_arrays_solver.solver.o \
+	$O/read_forward_arrays.solver.o \
+	$O/read_mesh_databases.mpisolver.o \
+	$O/read_topography_bathymetry.mpisolver.o \
+	$O/save_forward_arrays.solver.o \
+	$O/save_kernels.solver.o \
+	$O/setup_GLL_points.mpisolver.o \
+	$O/setup_sources_receivers.mpisolver.o \
+	$O/specfem3D.mpiSolver.o \
+	$O/write_movie_output.mpisolver.o \
+	$O/write_movie_volume.mpisolver.o \
+	$O/write_movie_surface.mpisolver.o \
 	$(EMPTY_MACRO)
 
+CUDA_OBJECTS = \
+  $O/assemble_MPI_scalar_cuda.cuda.o \
+	$O/assemble_MPI_vector_cuda.cuda.o \
+  $O/check_fields_cuda.cuda.o \
+	$O/compute_add_sources_acoustic_cuda.cuda.o \
+	$O/compute_add_sources_elastic_cuda.cuda.o \
+	$O/compute_coupling_cuda.cuda.o \
+	$O/compute_forces_crust_mantle_cuda.cuda.o \
+	$O/compute_forces_inner_core_cuda.cuda.o \
+	$O/compute_forces_outer_core_cuda.cuda.o \
+	$O/compute_kernels_cuda.cuda.o \
+	$O/compute_stacey_acoustic_cuda.cuda.o \
+	$O/compute_stacey_elastic_cuda.cuda.o \
+	$O/it_update_displacement_cuda.cuda.o \
+	$O/noise_tomography_cuda.cuda.o \
+	$O/prepare_mesh_constants_cuda.cuda.o \
+	$O/transfer_fields_cuda.cuda.o \
+	$O/write_seismograms_cuda.cuda.o \
+	$O/save_and_compare_cpu_vs_gpu.cudacc.o 
 
+CUDA_STUBS = \
+	$O/specfem3D_gpu_cuda_method_stubs.cudacc.o
+
+
+
 LIBSPECFEM_SOLVER = $O/libspecfem_solver.a
 
+
 #######################################
 
 ####
@@ -188,11 +239,13 @@
 ####
 
 # solver also depends on values from mesher
-XSPECFEM_OBJECTS = $(SOLVER_ARRAY_OBJECTS) $O/exit_mpi.o $(LIBSPECFEM_SOLVER)
+ at COND_CUDA_TRUE@XSPECFEM_OBJECTS = $(SOLVER_ARRAY_OBJECTS) $O/exit_mpi.sharedmpi.o $(LIBSPECFEM_SOLVER) $(CUDA_OBJECTS)
+ at COND_CUDA_FALSE@XSPECFEM_OBJECTS = $(SOLVER_ARRAY_OBJECTS) $O/exit_mpi.sharedmpi.o $(LIBSPECFEM_SOLVER) $(CUDA_STUBS)
 
+
 xspecfem3D: $(XSPECFEM_OBJECTS)
 ## use MPI here
-	${MPIFCCOMPILE_NO_CHECK} -o ${E}/xspecfem3D $(XSPECFEM_OBJECTS) $(MPILIBS)
+	${MPIFCCOMPILE_NO_CHECK} -o ${E}/xspecfem3D $(XSPECFEM_OBJECTS) $(MPILIBS) $(CUDA_LINK)
 
 reqheader:
 	(cd ../create_header_file; make)
@@ -220,287 +273,75 @@
 #### rule for each .o file below
 ####
 
+
 ##
 ## shared
 ##
-$O/auto_ner.o: ${SETUP}/constants.h ${SHARED}/auto_ner.f90
-	${FCCOMPILE_CHECK} -c -o $O/auto_ner.o ${FCFLAGS_f90} ${SHARED}/auto_ner.f90
+$O/%.shared.o: ${SHARED}/%.f90 ${SETUP}/constants.h
+	${FCCOMPILE_CHECK} ${FCFLAGS_f90} -c -o $@ $<
 
-$O/broadcast_compute_parameters.o: ${SETUP}/constants.h ${SHARED}/broadcast_compute_parameters.f90
-	${MPIFCCOMPILE_CHECK} -c -o $O/broadcast_compute_parameters.o ${FCFLAGS_f90} ${SHARED}/broadcast_compute_parameters.f90
+$O/%.sharedmpi.o: ${SHARED}/%.f90 ${SETUP}/constants.h
+	${MPIFCCOMPILE_CHECK} ${FCFLAGS_f90} -c -o $@ $<
 
-$O/calendar.o: ${SHARED}/calendar.f90
-	${FCCOMPILE_CHECK} -c -o $O/calendar.o ${FCFLAGS_f90} ${SHARED}/calendar.f90
+$O/%.cc.o: ${SHARED}/%.c ${SETUP}/config.h
+	${CC} -c $(CPPFLAGS) $(CFLAGS) -o $@ $<
 
-$O/count_number_of_sources.o: ${SETUP}/constants.h ${SHARED}/count_number_of_sources.f90
-	${FCCOMPILE_CHECK} -c -o $O/count_number_of_sources.o ${FCFLAGS_f90} ${SHARED}/count_number_of_sources.f90
+#######################################
 
-$O/create_name_database.o: ${SETUP}/constants.h ${SHARED}/create_name_database.f90
-	${FCCOMPILE_CHECK} -c -o $O/create_name_database.o ${FCFLAGS_f90} ${SHARED}/create_name_database.f90
-
-$O/create_serial_name_database.o: ${SETUP}/constants.h ${SHARED}/create_serial_name_database.f90
-	${FCCOMPILE_CHECK} -c -o $O/create_serial_name_database.o ${FCFLAGS_f90} ${SHARED}/create_serial_name_database.f90
-
-$O/euler_angles.o: ${SETUP}/constants.h ${SHARED}/euler_angles.f90
-	${FCCOMPILE_CHECK} -c -o $O/euler_angles.o ${FCFLAGS_f90} ${SHARED}/euler_angles.f90
-
-$O/get_model_parameters.o: ${SETUP}/constants.h ${SHARED}/get_model_parameters.f90
-	${FCCOMPILE_CHECK} -c -o $O/get_model_parameters.o ${FCFLAGS_f90} ${SHARED}/get_model_parameters.f90
-
-$O/get_value_parameters.o: ${SETUP}/constants.h ${SHARED}/get_value_parameters.f90
-	${FCCOMPILE_CHECK} -c -o $O/get_value_parameters.o ${FCFLAGS_f90} ${SHARED}/get_value_parameters.f90
-
-$O/gll_library.o: ${SETUP}/constants.h ${SHARED}/gll_library.f90
-	${FCCOMPILE_CHECK} -c -o $O/gll_library.o ${FCFLAGS_f90} ${SHARED}/gll_library.f90
-
-$O/hex_nodes.o: ${SETUP}/constants.h ${SHARED}/hex_nodes.f90
-	${FCCOMPILE_CHECK} -c -o $O/hex_nodes.o ${FCFLAGS_f90} ${SHARED}/hex_nodes.f90
-
-$O/intgrl.o: ${SETUP}/constants.h ${SHARED}/intgrl.f90
-	${FCCOMPILE_CHECK} -c -o $O/intgrl.o ${FCFLAGS_f90} ${SHARED}/intgrl.f90
-
-$O/lagrange_poly.o: ${SETUP}/constants.h ${SHARED}/lagrange_poly.f90
-	${FCCOMPILE_CHECK} -c -o $O/lagrange_poly.o ${FCFLAGS_f90} ${SHARED}/lagrange_poly.f90
-
-$O/make_ellipticity.o: ${SETUP}/constants.h ${SHARED}/make_ellipticity.f90
-	${FCCOMPILE_CHECK} -c -o $O/make_ellipticity.o ${FCFLAGS_f90} ${SHARED}/make_ellipticity.f90
-
-$O/make_gravity.o: ${SETUP}/constants.h ${SHARED}/make_gravity.f90
-	${FCCOMPILE_CHECK} -c -o $O/make_gravity.o ${FCFLAGS_f90} ${SHARED}/make_gravity.f90
-
-$O/memory_eval.o: ${SETUP}/constants.h ${SHARED}/memory_eval.f90
-	${FCCOMPILE_CHECK} -c -o $O/memory_eval.o ${FCFLAGS_f90} ${SHARED}/memory_eval.f90
-
-$O/model_prem.o: ${SETUP}/constants.h ${SHARED}/model_prem.f90
-	${FCCOMPILE_CHECK} -c -o $O/model_prem.o ${FCFLAGS_f90} ${SHARED}/model_prem.f90
-
-### C compilation
-$O/force_ftz.o: ${SHARED}/force_ftz.c ${SETUP}/config.h
-	${CC} -c $(CPPFLAGS) $(CFLAGS) -o $O/force_ftz.o ${SHARED}/force_ftz.c
-
-$O/param_reader.o: ${SHARED}/param_reader.c ${SETUP}/config.h
-	${CC} -c $(CPPFLAGS) $(CFLAGS) -o $O/param_reader.o ${SHARED}/param_reader.c
-
-$O/read_compute_parameters.o: ${SETUP}/constants.h ${SHARED}/read_compute_parameters.f90
-	${FCCOMPILE_CHECK} -c -o $O/read_compute_parameters.o ${FCFLAGS_f90} ${SHARED}/read_compute_parameters.f90
-
-$O/read_parameter_file.o: ${SETUP}/constants.h ${SHARED}/read_parameter_file.f90
-	${FCCOMPILE_CHECK} -c -o $O/read_parameter_file.o ${FCFLAGS_f90} ${SHARED}/read_parameter_file.f90
-
-$O/read_value_parameters.o: ${SETUP}/constants.h ${SHARED}/read_value_parameters.f90
-	${FCCOMPILE_CHECK} -c -o $O/read_value_parameters.o ${FCFLAGS_f90} ${SHARED}/read_value_parameters.f90
-
-$O/reduce.o: ${SETUP}/constants.h ${SHARED}/reduce.f90
-	${FCCOMPILE_CHECK} -c -o $O/reduce.o ${FCFLAGS_f90} ${SHARED}/reduce.f90
-
-$O/rthetaphi_xyz.o: ${SETUP}/constants.h ${SHARED}/rthetaphi_xyz.f90
-	${FCCOMPILE_CHECK} -c -o $O/rthetaphi_xyz.o ${FCFLAGS_f90} ${SHARED}/rthetaphi_xyz.f90
-
-$O/save_header_file.o: ${SETUP}/constants.h ${SHARED}/save_header_file.f90
-	${FCCOMPILE_CHECK} -c -o $O/save_header_file.o ${FCFLAGS_f90} ${SHARED}/save_header_file.f90
-
-$O/spline_routines.o: ${SETUP}/constants.h ${SHARED}/spline_routines.f90
-	${FCCOMPILE_CHECK} -c -o $O/spline_routines.o ${FCFLAGS_f90} ${SHARED}/spline_routines.f90
-
-### C compilation
-$O/write_c_binary.o: ${SHARED}/write_c_binary.c ${SETUP}/config.h
-	$(CC) $(CPPFLAGS) $(CFLAGS) -c -o $O/write_c_binary.o ${SHARED}/write_c_binary.c
-
-##
-## shared objects with mpi compilation
-##
-$O/exit_mpi.o: ${SETUP}/constants.h ${SHARED}/exit_mpi.f90
-	${MPIFCCOMPILE_CHECK} -c -o $O/exit_mpi.o ${FCFLAGS_f90} ${SHARED}/exit_mpi.f90
-
-$O/model_topo_bathy.o: ${SETUP}/constants.h ${SHARED}/model_topo_bathy.f90
-	${MPIFCCOMPILE_CHECK} -c -o $O/model_topo_bathy.o ${FCFLAGS_f90} ${SHARED}/model_topo_bathy.f90
-
-
 ###
 ### specfem3D - optimized flags and dependence on values from mesher here
 ###
-$O/compute_add_sources.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/compute_add_sources.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_add_sources.o ${FCFLAGS_f90} $S/compute_add_sources.f90
+$O/%.solver.o: $S/%.f90 ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h
+	${FCCOMPILE_NO_CHECK} ${FCFLAGS_f90} -c -o $@ $<
 
-$O/compute_boundary_kernel.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/compute_boundary_kernel.f90
-	${FCCOMPILE_CHECK} -c -o $O/compute_boundary_kernel.o ${FCFLAGS_f90} $S/compute_boundary_kernel.f90
+$O/%.Solver.o: $S/%.F90 ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h
+	${FCCOMPILE_NO_CHECK} ${FCFLAGS_f90} -c -o $@ $<
 
-$O/compute_coupling.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/compute_coupling.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_coupling.o ${FCFLAGS_f90} $S/compute_coupling.f90
+$O/%.solvercheck.o: $S/%.f90 ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h
+	${FCCOMPILE_CHECK} ${FCFLAGS_f90} -c -o $@ $<
 
-$O/compute_element.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/compute_element.F90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_element.o ${FCFLAGS_f90} $S/compute_element.F90
+$O/%.mpisolver.o: $S/%.f90 ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h
+	${MPIFCCOMPILE_NO_CHECK} ${FCFLAGS_f90} -c -o $@ $<
 
-$O/compute_forces_crust_mantle.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/compute_forces_crust_mantle.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_crust_mantle.o ${FCFLAGS_f90} $S/compute_forces_crust_mantle.f90
+$O/%.mpiSolver.o: $S/%.F90 ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h
+	${MPIFCCOMPILE_NO_CHECK} ${FCFLAGS_f90} -c -o $@ $<
 
-$O/compute_forces_crust_mantle_Dev.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/compute_forces_crust_mantle_Dev.F90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_crust_mantle_Dev.o ${FCFLAGS_f90} $S/compute_forces_crust_mantle_Dev.F90
+$O/%.mpisolvercheck.o: $S/%.f90 ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h
+	${MPIFCCOMPILE_CHECK} ${FCFLAGS_f90} -c -o $@ $<
 
-$O/compute_forces_outer_core.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/compute_forces_outer_core.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_outer_core.o ${FCFLAGS_f90} $S/compute_forces_outer_core.f90
+#######################################
 
-$O/compute_forces_outer_core_Dev.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/compute_forces_outer_core_Dev.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_outer_core_Dev.o ${FCFLAGS_f90} $S/compute_forces_outer_core_Dev.f90
-
-$O/compute_forces_inner_core.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/compute_forces_inner_core.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_inner_core.o ${FCFLAGS_f90} $S/compute_forces_inner_core.f90
-
-$O/compute_forces_inner_core_Dev.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/compute_forces_inner_core_Dev.F90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_forces_inner_core_Dev.o ${FCFLAGS_f90} $S/compute_forces_inner_core_Dev.F90
-
-$O/compute_kernels.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/compute_kernels.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_kernels.o ${FCFLAGS_f90} $S/compute_kernels.f90
-
-$O/compute_seismograms.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/compute_seismograms.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_seismograms.o ${FCFLAGS_f90} $S/compute_seismograms.f90
-
-$O/compute_stacey_crust_mantle.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/compute_stacey_crust_mantle.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_stacey_crust_mantle.o ${FCFLAGS_f90} $S/compute_stacey_crust_mantle.f90
-
-$O/compute_stacey_outer_core.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/compute_stacey_outer_core.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/compute_stacey_outer_core.o ${FCFLAGS_f90} $S/compute_stacey_outer_core.f90
-
-$O/read_arrays_solver.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/read_arrays_solver.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/read_arrays_solver.o ${FCFLAGS_f90} $S/read_arrays_solver.f90
-
-$O/read_forward_arrays.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/read_forward_arrays.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/read_forward_arrays.o ${FCFLAGS_f90} $S/read_forward_arrays.f90
-
-$O/save_forward_arrays.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/save_forward_arrays.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/save_forward_arrays.o ${FCFLAGS_f90} $S/save_forward_arrays.f90
-
-$O/save_kernels.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/save_kernels.f90
-	${FCCOMPILE_NO_CHECK} -c -o $O/save_kernels.o ${FCFLAGS_f90} $S/save_kernels.f90
-
-###
-### specfem3D - regular compilation options here
-###
-$O/comp_source_time_function.o: $S/comp_source_time_function.f90
-	${FCCOMPILE_CHECK} -c -o $O/comp_source_time_function.o ${FCFLAGS_f90} $S/comp_source_time_function.f90
-
-$O/comp_source_spectrum.o: ${SETUP}/constants.h $S/comp_source_spectrum.f90
-	${FCCOMPILE_CHECK} -c -o $O/comp_source_spectrum.o ${FCFLAGS_f90} $S/comp_source_spectrum.f90
-
-$O/compute_adj_source_frechet.o: ${SETUP}/constants.h $S/compute_adj_source_frechet.f90
-	${FCCOMPILE_CHECK} -c -o $O/compute_adj_source_frechet.o ${FCFLAGS_f90} $S/compute_adj_source_frechet.f90
-
-$O/compute_arrays_source.o: ${SETUP}/constants.h $S/compute_arrays_source.f90
-	${FCCOMPILE_CHECK} -c -o $O/compute_arrays_source.o ${FCFLAGS_f90} $S/compute_arrays_source.f90
-
-$O/convert_time.o: $S/convert_time.f90
-	${FCCOMPILE_CHECK} -c -o $O/convert_time.o ${FCFLAGS_f90} $S/convert_time.f90
-
-$O/define_derivation_matrices.o: ${SETUP}/constants.h $S/define_derivation_matrices.f90
-	${FCCOMPILE_CHECK} -c -o $O/define_derivation_matrices.o ${FCFLAGS_f90} $S/define_derivation_matrices.f90
-
-$O/get_attenuation.o: ${SETUP}/constants.h $S/get_attenuation.f90
-	${FCCOMPILE_CHECK} -c -o $O/get_attenuation.o ${FCFLAGS_f90} $S/get_attenuation.f90
-
-$O/get_backazimuth.o: ${SETUP}/constants.h $S/get_backazimuth.f90
-	${FCCOMPILE_CHECK} -c -o $O/get_backazimuth.o ${FCFLAGS_f90} $S/get_backazimuth.f90
-
-$O/get_cmt.o: ${SETUP}/constants.h $S/get_cmt.f90
-	${FCCOMPILE_CHECK} -c -o $O/get_cmt.o ${FCFLAGS_f90} $S/get_cmt.f90
-
-$O/multiply_arrays_source.o: ${SETUP}/constants.h $S/multiply_arrays_source.f90
-	${FCCOMPILE_CHECK} -c -o $O/multiply_arrays_source.o ${FCFLAGS_f90} $S/multiply_arrays_source.f90
-
-$O/netlib_specfun_erf.o: $S/netlib_specfun_erf.f90
-	${FCCOMPILE_CHECK} -c -o $O/netlib_specfun_erf.o ${FCFLAGS_f90} $S/netlib_specfun_erf.f90
-
-$O/recompute_jacobian.o: ${SETUP}/constants.h $S/recompute_jacobian.f90
-	${FCCOMPILE_CHECK} -c -o $O/recompute_jacobian.o ${FCFLAGS_f90} $S/recompute_jacobian.f90
-
 ##
-## specfem3D - use MPI here & dependent on values from mesher here
-##
-$O/assemble_MPI_central_cube.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/assemble_MPI_central_cube.f90
-	${MPIFCCOMPILE_NO_CHECK} -c -o $O/assemble_MPI_central_cube.o ${FCFLAGS_f90} $S/assemble_MPI_central_cube.f90
-
-$O/assemble_MPI_central_cube_block.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/assemble_MPI_central_cube_block.f90
-	${MPIFCCOMPILE_NO_CHECK} -c -o $O/assemble_MPI_central_cube_block.o ${FCFLAGS_f90} $S/assemble_MPI_central_cube_block.f90
-
-$O/check_simulation_stability.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/check_simulation_stability.f90
-	${MPIFCCOMPILE_NO_CHECK} -c -o $O/check_simulation_stability.o ${FCFLAGS_f90} $S/check_simulation_stability.f90
-
-$O/fix_non_blocking_flags.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/fix_non_blocking_flags.f90
-	${MPIFCCOMPILE_CHECK} -c -o $O/fix_non_blocking_flags.o ${FCFLAGS_f90} $S/fix_non_blocking_flags.f90
-
-$O/initialize_simulation.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/initialize_simulation.f90
-	${MPIFCCOMPILE_NO_CHECK} -c -o $O/initialize_simulation.o ${FCFLAGS_f90} $S/initialize_simulation.f90
-
-$O/noise_tomography.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/noise_tomography.f90
-	${MPIFCCOMPILE_NO_CHECK} -c -o $O/noise_tomography.o ${FCFLAGS_f90} $S/noise_tomography.f90
-
-$O/prepare_timerun.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/prepare_timerun.f90
-	${MPIFCCOMPILE_NO_CHECK} -c -o $O/prepare_timerun.o ${FCFLAGS_f90} $S/prepare_timerun.f90
-
-$O/read_mesh_databases.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/read_mesh_databases.f90
-	${MPIFCCOMPILE_NO_CHECK} -c -o $O/read_mesh_databases.o ${FCFLAGS_f90} $S/read_mesh_databases.f90
-
-$O/setup_sources_receivers.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/setup_sources_receivers.f90
-	${MPIFCCOMPILE_NO_CHECK} -c -o $O/setup_sources_receivers.o ${FCFLAGS_f90} $S/setup_sources_receivers.f90
-
-$O/specfem3D.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/specfem3D.F90
-	${MPIFCCOMPILE_NO_CHECK} -c -o $O/specfem3D.o ${FCFLAGS_f90} $S/specfem3D.F90
-
-$O/write_movie_surface.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/write_movie_surface.f90
-	${MPIFCCOMPILE_NO_CHECK} -c -o $O/write_movie_surface.o ${FCFLAGS_f90} $S/write_movie_surface.f90
-
-$O/write_movie_volume.o: ${SETUP}/constants.h ${OUTPUT}/values_from_mesher.h $S/write_movie_volume.f90
-	${MPIFCCOMPILE_NO_CHECK} -c -o $O/write_movie_volume.o ${FCFLAGS_f90} $S/write_movie_volume.f90
-
-
-##
 ## specfem3D - non-dependent on values from mesher here
 ##
-$O/assemble_MPI_scalar.o: ${SETUP}/constants.h $S/assemble_MPI_scalar.f90
-	${MPIFCCOMPILE_NO_CHECK} -c -o $O/assemble_MPI_scalar.o ${FCFLAGS_f90} $S/assemble_MPI_scalar.f90
+$O/%.check.o: $S/%.f90 ${SETUP}/constants.h
+	${FCCOMPILE_CHECK} ${FCFLAGS_f90} -c -o $@ $<
 
-$O/assemble_MPI_scalar_block.o: ${SETUP}/constants.h $S/assemble_MPI_scalar_block.f90
-	${MPIFCCOMPILE_NO_CHECK} -c -o $O/assemble_MPI_scalar_block.o ${FCFLAGS_f90} $S/assemble_MPI_scalar_block.f90
+$O/%.mpicheckno.o: $S/%.f90 ${SETUP}/constants.h
+	${MPIFCCOMPILE_NO_CHECK} ${FCFLAGS_f90} -c -o $@ $<
 
-$O/assemble_MPI_vector.o: ${SETUP}/constants.h $S/assemble_MPI_vector.f90
-	${MPIFCCOMPILE_NO_CHECK} -c -o $O/assemble_MPI_vector.o ${FCFLAGS_f90} $S/assemble_MPI_vector.f90
+$O/%.mpicheck.o: $S/%.f90 ${SETUP}/constants.h
+	${MPIFCCOMPILE_CHECK} ${FCFLAGS_f90} -c -o $@ $<
 
-$O/assemble_MPI_vector_block.o: ${SETUP}/constants.h $S/assemble_MPI_vector_block.f90
-	${MPIFCCOMPILE_NO_CHECK} -c -o $O/assemble_MPI_vector_block.o ${FCFLAGS_f90} $S/assemble_MPI_vector_block.f90
 
+#######################################
+
 ###
-### specfem3D - regular MPI compilation options here
+### CUDA compilation
 ###
-$O/create_central_cube_buffers.o: ${SETUP}/constants.h $S/create_central_cube_buffers.f90
-	${MPIFCCOMPILE_CHECK} -c -o $O/create_central_cube_buffers.o ${FCFLAGS_f90} $S/create_central_cube_buffers.f90
 
-$O/get_event_info.o: ${SETUP}/constants.h $S/get_event_info.f90
-	${MPIFCCOMPILE_CHECK} -c -o $O/get_event_info.o ${FCFLAGS_f90} $S/get_event_info.f90
+$O/%.cuda.o: ${CUDAD}/%.cu ${SETUP}/config.h ${CUDAD}/mesh_constants_cuda.h ${CUDAD}/prepare_constants_cuda.h
+	${NVCC} -c $< -o $@ $(NVCC_FLAGS)
 
-$O/locate_receivers.o: ${SETUP}/constants.h $S/locate_receivers.f90
-	${MPIFCCOMPILE_CHECK} -c -o $O/locate_receivers.o ${FCFLAGS_f90} $S/locate_receivers.f90
+$O/%.cudacc.o: ${CUDAD}/%.c ${SETUP}/config.h
+	${CC} -c $(CPPFLAGS) $(CFLAGS) $(MPI_INC) -o $@ $<
 
-$O/locate_sources.o: ${SETUP}/constants.h $S/locate_sources.f90
-	${MPIFCCOMPILE_CHECK} -c -o $O/locate_sources.o ${FCFLAGS_f90} $S/locate_sources.f90
-
-$O/read_arrays_buffers_solver.o: ${SETUP}/constants.h $S/read_arrays_buffers_solver.f90
-	${MPIFCCOMPILE_CHECK} -c -o $O/read_arrays_buffers_solver.o ${FCFLAGS_f90} $S/read_arrays_buffers_solver.f90
-
-$O/write_seismograms.o: ${SETUP}/constants.h $S/write_seismograms.f90
-	${MPIFCCOMPILE_CHECK} -c -o $O/write_seismograms.o ${FCFLAGS_f90} $S/write_seismograms.f90
-
-$O/write_output_ASCII.o: ${SETUP}/constants.h $S/write_output_ASCII.f90
-	${MPIFCCOMPILE_CHECK} -c -o $O/write_output_ASCII.o ${FCFLAGS_f90} $S/write_output_ASCII.f90
-
-$O/write_output_SAC.o: ${SETUP}/constants.h $S/write_output_SAC.f90
-	${MPIFCCOMPILE_CHECK} -c -o $O/write_output_SAC.o ${FCFLAGS_f90} $S/write_output_SAC.f90
-
-
 #######################################
 
-
 ###
 ### rule for the header file
 ###
+
 ${OUTPUT}/values_from_mesher.h: reqheader
 	(mkdir -p ${OUTPUT}; cd ${S_TOP}/; ./bin/xcreate_header_file)
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_central_cube.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_central_cube.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_central_cube.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -26,119 +26,100 @@
 !=====================================================================
 
   subroutine assemble_MPI_central_cube(ichunk,nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-    npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
-    receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core, &
-    ibelm_bottom_inner_core,NSPEC2D_BOTTOM_INNER_CORE,vector_assemble,ndim_assemble,iphase_CC)
+                                      npoin2D_cube_from_slices, &
+                                      buffer_all_cube_from_slices,buffer_slices, &
+                                      request_send_cc,request_receive_cc, &
+                                      request_send_array_cc,request_receive_array_cc, &
+                                      ibool_central_cube, &
+                                      receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core, &
+                                      ibelm_bottom_inner_core,NSPEC2D_BOTTOM_INNER_CORE, &
+                                      vector_assemble,ndim_assemble, &
+                                      iphase_comm_CC)
 
   implicit none
 
-! standard include of the MPI library
+  ! standard include of the MPI library
   include 'mpif.h'
   include 'constants.h'
 
-! include values created by the mesher
+  ! include values created by the mesher
   include "OUTPUT_FILES/values_from_mesher.h"
 
-! for matching with central cube in inner core
+  ! for matching with central cube in inner core
   integer, intent(in) :: ichunk, nb_msgs_theor_in_cube, npoin2D_cube_from_slices
   integer, intent(in) :: ndim_assemble
   integer, intent(in) :: receiver_cube_from_slices
-  integer, intent(inout) :: iphase_CC
+  integer, intent(inout) :: iphase_comm_CC
   integer, dimension(nb_msgs_theor_in_cube), intent(in) :: sender_from_slices_to_cube
+
   double precision, dimension(npoin2D_cube_from_slices,ndim_assemble), intent(inout) :: buffer_slices
   double precision, dimension(npoin2D_cube_from_slices,ndim_assemble,nb_msgs_theor_in_cube), intent(inout) :: &
-                                                                                       buffer_all_cube_from_slices
+                                                                                      buffer_all_cube_from_slices
+
+  ! note: these parameters are "saved" now as global parameters
+  ! MPI status of messages to be received
+  integer, intent(inout) :: request_send_cc,request_receive_cc
+  ! maximum value of nb_msgs_theor_in_cube is 5 (when NPROC_XI == 1)
+  ! therefore NPROC_XI+4 is always large enough
+  integer, dimension(NPROC_XI_VAL+4), intent(inout) :: request_send_array_cc,request_receive_array_cc
+
   integer, dimension(nb_msgs_theor_in_cube,npoin2D_cube_from_slices), intent(in) :: ibool_central_cube
 
-! local to global mapping
+  ! local to global mapping
   integer, intent(in) :: NSPEC2D_BOTTOM_INNER_CORE
   integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE), intent(in) :: ibool_inner_core
   integer, dimension(NSPEC_INNER_CORE), intent(in) :: idoubling_inner_core
   integer, dimension(NSPEC2D_BOTTOM_INNER_CORE), intent(in) :: ibelm_bottom_inner_core
 
-! vector
+  ! vector
   real(kind=CUSTOM_REAL), dimension(ndim_assemble,NGLOB_INNER_CORE), intent(inout) :: vector_assemble
 
+! local parameters
+
   integer ipoin,idimension, ispec2D, ispec
   integer i,j,k
   integer sender,receiver,imsg
 
   real(kind=CUSTOM_REAL), dimension(NGLOB_INNER_CORE) :: array_central_cube
 
-! MPI status of messages to be received
-  integer, save :: request_send,request_receive
-! maximum value of nb_msgs_theor_in_cube is 5 (when NPROC_XI == 1)
-! therefore NPROC_XI+4 is always large enough
-  integer, dimension(NPROC_XI_VAL+4), save :: request_send_array,request_receive_array
   logical :: flag_result_test
   integer, dimension(MPI_STATUS_SIZE) :: msg_status
   integer :: ier
 
-! mask
+  ! mask
   logical, dimension(NGLOB_INNER_CORE) :: mask
 
-!---
-!---  use buffers to assemble mass matrix with central cube once and for all
-!---
+  !---
+  !---  use buffers to assemble mass matrix with central cube once and for all
+  !---
 
-  if(iphase_CC == 1) then
+  select case( iphase_comm_CC )
 
-! on chunks AB and AB_ANTIPODE, receive all the messages from slices
-  if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
-    do imsg = 1,nb_msgs_theor_in_cube-1
-! receive buffers from slices
-      sender = sender_from_slices_to_cube(imsg)
-      call MPI_IRECV(buffer_all_cube_from_slices(1,1,imsg), &
-                ndim_assemble*npoin2D_cube_from_slices,MPI_DOUBLE_PRECISION,sender, &
-                itag,MPI_COMM_WORLD,request_receive_array(imsg),ier)
-    enddo
-  endif
+  case( 1 )
 
-! send info to central cube from all the slices except those in CHUNK_AB & CHUNK_AB_ANTIPODE
-  if(ichunk /= CHUNK_AB .and. ichunk /= CHUNK_AB_ANTIPODE) then
-! for bottom elements in contact with central cube from the slices side
-    ipoin = 0
-    do ispec2D = 1,NSPEC2D_BOTTOM_INNER_CORE
-      ispec = ibelm_bottom_inner_core(ispec2D)
-! only for DOFs exactly on surface of central cube (bottom of these elements)
-      k = 1
-      do j = 1,NGLLY
-        do i = 1,NGLLX
-          ipoin = ipoin + 1
-          buffer_slices(ipoin,:) = dble(vector_assemble(1:ndim_assemble,ibool_inner_core(i,j,k,ispec)))
-        enddo
-      enddo
-    enddo
-! send buffer to central cube
-    receiver = receiver_cube_from_slices
-    call MPI_ISEND(buffer_slices,ndim_assemble*npoin2D_cube_from_slices, &
-              MPI_DOUBLE_PRECISION,receiver,itag,MPI_COMM_WORLD,request_send,ier)
- endif  ! end sending info to central cube
+    ! non-central-cube chunks send values to receiver central cube chunks AB or AB_ANTIPODE
 
-  iphase_CC = iphase_CC + 1
-  return ! exit because we have started some communications therefore we need some time
+    ! note: only chunks AB and AB_ANTIPODE contain valid central cube elements,
+    !          all others have only fictitious central cube elements
 
-  endif !!!!!!!!! end of iphase_CC 1
+    ! on chunks AB and AB_ANTIPODE, receive all the messages from slices
+    if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
+      do imsg = 1,nb_msgs_theor_in_cube-1
+        ! receive buffers from slices
+        sender = sender_from_slices_to_cube(imsg)
+        call MPI_IRECV(buffer_all_cube_from_slices(1,1,imsg), &
+                  ndim_assemble*npoin2D_cube_from_slices,MPI_DOUBLE_PRECISION,sender, &
+                  itag,MPI_COMM_WORLD,request_receive_array_cc(imsg),ier)
+      enddo
+    endif
 
-  if(iphase_CC == 2) then
-
-  if(ichunk /= CHUNK_AB .and. ichunk /= CHUNK_AB_ANTIPODE) then
-    call MPI_TEST(request_send,flag_result_test,msg_status,ier)
-    if(.not. flag_result_test) return ! exit if message not sent yet
-  endif
-
-  if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
-    do imsg = 1,nb_msgs_theor_in_cube-1
-      call MPI_TEST(request_receive_array(imsg),flag_result_test,msg_status,ier)
-      if(.not. flag_result_test) return ! exit if message not received yet
-    enddo
-  endif
-
-! exchange of their bottom faces between chunks AB and AB_ANTIPODE
-  if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
-    ipoin = 0
-    do ispec = NSPEC_INNER_CORE, 1, -1
-      if (idoubling_inner_core(ispec) == IFLAG_BOTTOM_CENTRAL_CUBE) then
+    ! send info to central cube from all the slices except those in CHUNK_AB & CHUNK_AB_ANTIPODE
+    if(ichunk /= CHUNK_AB .and. ichunk /= CHUNK_AB_ANTIPODE) then
+      ! for bottom elements in contact with central cube from the slices side
+      ipoin = 0
+      do ispec2D = 1,NSPEC2D_BOTTOM_INNER_CORE
+        ispec = ibelm_bottom_inner_core(ispec2D)
+        ! only for DOFs exactly on surface of central cube (bottom of these elements)
         k = 1
         do j = 1,NGLLY
           do i = 1,NGLLX
@@ -146,183 +127,226 @@
             buffer_slices(ipoin,:) = dble(vector_assemble(1:ndim_assemble,ibool_inner_core(i,j,k,ispec)))
           enddo
         enddo
-      endif
-    enddo
-    sender = sender_from_slices_to_cube(nb_msgs_theor_in_cube)
-!   call MPI_SENDRECV(buffer_slices,ndim_assemble*npoin2D_cube_from_slices,MPI_DOUBLE_PRECISION,receiver_cube_from_slices, &
-!       itag,buffer_slices2,ndim_assemble*npoin2D_cube_from_slices,&
-!       MPI_DOUBLE_PRECISION,sender,itag,MPI_COMM_WORLD,msg_status,ier)
+      enddo
+      ! send buffer to central cube
+      receiver = receiver_cube_from_slices
+      call MPI_ISEND(buffer_slices,ndim_assemble*npoin2D_cube_from_slices, &
+                MPI_DOUBLE_PRECISION,receiver,itag,MPI_COMM_WORLD,request_send_cc,ier)
+    endif  ! end sending info to central cube
 
-    call MPI_IRECV(buffer_all_cube_from_slices(1,1,nb_msgs_theor_in_cube), &
-        ndim_assemble*npoin2D_cube_from_slices,MPI_DOUBLE_PRECISION,sender,itag,MPI_COMM_WORLD,request_receive,ier)
-!! DK DK this merged with previous statement
-!   buffer_all_cube_from_slices(:,:,nb_msgs_theor_in_cube) = buffer_slices2(:,:)
+    iphase_comm_CC = iphase_comm_CC + 1
+    return ! exit because we have started some communications therefore we need some time
 
-    call MPI_ISEND(buffer_slices,ndim_assemble*npoin2D_cube_from_slices,MPI_DOUBLE_PRECISION,receiver_cube_from_slices, &
-        itag,MPI_COMM_WORLD,request_send,ier)
-  endif
+  case( 2 )
 
-  iphase_CC = iphase_CC + 1
-  return ! exit because we have started some communications therefore we need some time
+    ! central cube chunks AB and AB_ANTIPODE send values to each other
 
-  endif !!!!!!!!! end of iphase_CC 2
+    ! checks that chunks have sent out messages
+    if(ichunk /= CHUNK_AB .and. ichunk /= CHUNK_AB_ANTIPODE) then
+      call MPI_TEST(request_send_cc,flag_result_test,msg_status,ier)
+      if(.not. flag_result_test) return ! exit if message not sent yet
+    endif
 
-  if(iphase_CC == 3) then
-
-!--- now we need to assemble the contributions
-
-  if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
-
-    call MPI_TEST(request_send,flag_result_test,msg_status,ier)
-    if(.not. flag_result_test) return ! exit if message not sent yet
-    call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
-    if(.not. flag_result_test) return ! exit if message not received yet
-
-    do idimension = 1,ndim_assemble
-! erase contributions to central cube array
-      array_central_cube(:) = 0._CUSTOM_REAL
-
-! use indirect addressing to store contributions only once
-! distinguish between single and double precision for reals
+    ! checks that central cube chunks have received all (requested) messages
+    if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
       do imsg = 1,nb_msgs_theor_in_cube-1
-        do ipoin = 1,npoin2D_cube_from_slices
-          if(CUSTOM_REAL == SIZE_REAL) then
-            array_central_cube(ibool_central_cube(imsg,ipoin)) = sngl(buffer_all_cube_from_slices(ipoin,idimension,imsg))
-          else
-            array_central_cube(ibool_central_cube(imsg,ipoin)) = buffer_all_cube_from_slices(ipoin,idimension,imsg)
-          endif
-        enddo
+        call MPI_TEST(request_receive_array_cc(imsg),flag_result_test,msg_status,ier)
+        if(.not. flag_result_test) return ! exit if message not received yet
       enddo
-! add the constribution of AB or AB_ANTIPODE to sum with the external slices on the edges
-! use a mask to avoid taking the same point into account several times.
-      mask(:) = .false.
-      do ipoin = 1,npoin2D_cube_from_slices
-        if (.not. mask(ibool_central_cube(nb_msgs_theor_in_cube,ipoin))) then
-          if(CUSTOM_REAL == SIZE_REAL) then
-            array_central_cube(ibool_central_cube(nb_msgs_theor_in_cube,ipoin)) = &
-            array_central_cube(ibool_central_cube(nb_msgs_theor_in_cube,ipoin)) + &
-            sngl(buffer_all_cube_from_slices(ipoin,idimension,nb_msgs_theor_in_cube))
-          else
-            array_central_cube(ibool_central_cube(nb_msgs_theor_in_cube,ipoin)) = &
-            array_central_cube(ibool_central_cube(nb_msgs_theor_in_cube,ipoin)) + &
-            buffer_all_cube_from_slices(ipoin,idimension,nb_msgs_theor_in_cube)
-          endif
-          mask(ibool_central_cube(nb_msgs_theor_in_cube,ipoin)) = .true.
-        endif
-      enddo
+    endif
 
-! suppress degrees of freedom already assembled at top of cube on edges
-      do ispec = 1,NSPEC_INNER_CORE
-        if(idoubling_inner_core(ispec) == IFLAG_TOP_CENTRAL_CUBE) then
-          k = NGLLZ
+    ! exchange of their bottom faces between chunks AB and AB_ANTIPODE
+    if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
+      ipoin = 0
+      do ispec = NSPEC_INNER_CORE, 1, -1
+        if (idoubling_inner_core(ispec) == IFLAG_BOTTOM_CENTRAL_CUBE) then
+          k = 1
           do j = 1,NGLLY
             do i = 1,NGLLX
-              array_central_cube(ibool_inner_core(i,j,k,ispec)) = 0._CUSTOM_REAL
+              ipoin = ipoin + 1
+              buffer_slices(ipoin,:) = dble(vector_assemble(1:ndim_assemble,ibool_inner_core(i,j,k,ispec)))
             enddo
           enddo
         endif
       enddo
+      sender = sender_from_slices_to_cube(nb_msgs_theor_in_cube)
+    !   call MPI_SENDRECV(buffer_slices,ndim_assemble*npoin2D_cube_from_slices,MPI_DOUBLE_PRECISION,receiver_cube_from_slices, &
+    !       itag,buffer_slices2,ndim_assemble*npoin2D_cube_from_slices,&
+    !       MPI_DOUBLE_PRECISION,sender,itag,MPI_COMM_WORLD,msg_status,ier)
 
-! assemble contributions
-      vector_assemble(idimension,:) = vector_assemble(idimension,:) + array_central_cube(:)
+      call MPI_IRECV(buffer_all_cube_from_slices(1,1,nb_msgs_theor_in_cube), &
+          ndim_assemble*npoin2D_cube_from_slices,MPI_DOUBLE_PRECISION,sender,itag,MPI_COMM_WORLD,request_receive_cc,ier)
+    !! DK DK this merged with previous statement
+    !   buffer_all_cube_from_slices(:,:,nb_msgs_theor_in_cube) = buffer_slices2(:,:)
 
-! copy sum back
-      do imsg = 1,nb_msgs_theor_in_cube-1
+      call MPI_ISEND(buffer_slices,ndim_assemble*npoin2D_cube_from_slices,MPI_DOUBLE_PRECISION,receiver_cube_from_slices, &
+          itag,MPI_COMM_WORLD,request_send_cc,ier)
+    endif
+
+    iphase_comm_CC = iphase_comm_CC + 1
+    return ! exit because we have started some communications therefore we need some time
+
+  case( 3 )
+
+    !--- now we need to assemble the contributions
+
+    ! central cube chunks AB and AB_ANTIPODE assemble values and send them out to others
+
+    if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
+
+      ! checks that messages between AB and AB_ANTIPODE have been sent and received
+      call MPI_TEST(request_send_cc,flag_result_test,msg_status,ier)
+      if(.not. flag_result_test) return ! exit if message not sent yet
+      call MPI_TEST(request_receive_cc,flag_result_test,msg_status,ier)
+      if(.not. flag_result_test) return ! exit if message not received yet
+
+      do idimension = 1,ndim_assemble
+        ! erase contributions to central cube array
+        array_central_cube(:) = 0._CUSTOM_REAL
+
+        ! use indirect addressing to store contributions only once
+        ! distinguish between single and double precision for reals
+        do imsg = 1,nb_msgs_theor_in_cube-1
+          do ipoin = 1,npoin2D_cube_from_slices
+            if(CUSTOM_REAL == SIZE_REAL) then
+              array_central_cube(ibool_central_cube(imsg,ipoin)) = sngl(buffer_all_cube_from_slices(ipoin,idimension,imsg))
+            else
+              array_central_cube(ibool_central_cube(imsg,ipoin)) = buffer_all_cube_from_slices(ipoin,idimension,imsg)
+            endif
+          enddo
+        enddo
+        ! add the contribution of AB or AB_ANTIPODE to sum with the external slices on the edges
+        ! use a mask to avoid taking the same point into account several times.
+        mask(:) = .false.
         do ipoin = 1,npoin2D_cube_from_slices
-          buffer_all_cube_from_slices(ipoin,idimension,imsg) = vector_assemble(idimension,ibool_central_cube(imsg,ipoin))
+          if (.not. mask(ibool_central_cube(nb_msgs_theor_in_cube,ipoin))) then
+            if(CUSTOM_REAL == SIZE_REAL) then
+              array_central_cube(ibool_central_cube(nb_msgs_theor_in_cube,ipoin)) = &
+              array_central_cube(ibool_central_cube(nb_msgs_theor_in_cube,ipoin)) + &
+              sngl(buffer_all_cube_from_slices(ipoin,idimension,nb_msgs_theor_in_cube))
+            else
+              array_central_cube(ibool_central_cube(nb_msgs_theor_in_cube,ipoin)) = &
+              array_central_cube(ibool_central_cube(nb_msgs_theor_in_cube,ipoin)) + &
+              buffer_all_cube_from_slices(ipoin,idimension,nb_msgs_theor_in_cube)
+            endif
+            mask(ibool_central_cube(nb_msgs_theor_in_cube,ipoin)) = .true.
+          endif
         enddo
+
+        ! suppress degrees of freedom already assembled at top of cube on edges
+        do ispec = 1,NSPEC_INNER_CORE
+          if(idoubling_inner_core(ispec) == IFLAG_TOP_CENTRAL_CUBE) then
+            k = NGLLZ
+            do j = 1,NGLLY
+              do i = 1,NGLLX
+                array_central_cube(ibool_inner_core(i,j,k,ispec)) = 0._CUSTOM_REAL
+              enddo
+            enddo
+          endif
+        enddo
+
+        ! assemble contributions
+        vector_assemble(idimension,:) = vector_assemble(idimension,:) + array_central_cube(:)
+
+        ! copy sum back
+        do imsg = 1,nb_msgs_theor_in_cube-1
+          do ipoin = 1,npoin2D_cube_from_slices
+            buffer_all_cube_from_slices(ipoin,idimension,imsg) = vector_assemble(idimension,ibool_central_cube(imsg,ipoin))
+          enddo
+        enddo
+
       enddo
 
-    enddo
+    endif
 
-  endif
+    !----------
 
-!----------
+    ! receive info from central cube on all the slices except those in CHUNK_AB & CHUNK_AB_ANTIPODE
+    if(ichunk /= CHUNK_AB .and. ichunk /= CHUNK_AB_ANTIPODE) then
+      ! receive buffers from slices
+      sender = receiver_cube_from_slices
+      call MPI_IRECV(buffer_slices, &
+                  ndim_assemble*npoin2D_cube_from_slices,MPI_DOUBLE_PRECISION,sender, &
+                  itag,MPI_COMM_WORLD,request_receive_cc,ier)
+      ! for bottom elements in contact with central cube from the slices side
+      !   ipoin = 0
+      !   do ispec2D = 1,NSPEC2D_BOTTOM_INNER_CORE
+      !     ispec = ibelm_bottom_inner_core(ispec2D)
+      ! only for DOFs exactly on surface of central cube (bottom of these elements)
+      !     k = 1
+      !     do j = 1,NGLLY
+      !       do i = 1,NGLLX
+      !         ipoin = ipoin + 1
+      ! distinguish between single and double precision for reals
+      !         if(CUSTOM_REAL == SIZE_REAL) then
+      !           vector_assemble(:,ibool_inner_core(i,j,k,ispec)) = sngl(buffer_slices(ipoin,:))
+      !         else
+      !           vector_assemble(:,ibool_inner_core(i,j,k,ispec)) = buffer_slices(ipoin,:)
+      !         endif
+      !       enddo
+      !     enddo
+      !   enddo
+    endif  ! end receiving info from central cube
 
-! receive info from central cube on all the slices except those in CHUNK_AB & CHUNK_AB_ANTIPODE
-  if(ichunk /= CHUNK_AB .and. ichunk /= CHUNK_AB_ANTIPODE) then
-! receive buffers from slices
-  sender = receiver_cube_from_slices
-  call MPI_IRECV(buffer_slices, &
-              ndim_assemble*npoin2D_cube_from_slices,MPI_DOUBLE_PRECISION,sender, &
-              itag,MPI_COMM_WORLD,request_receive,ier)
-! for bottom elements in contact with central cube from the slices side
-!   ipoin = 0
-!   do ispec2D = 1,NSPEC2D_BOTTOM_INNER_CORE
-!     ispec = ibelm_bottom_inner_core(ispec2D)
-! only for DOFs exactly on surface of central cube (bottom of these elements)
-!     k = 1
-!     do j = 1,NGLLY
-!       do i = 1,NGLLX
-!         ipoin = ipoin + 1
-! distinguish between single and double precision for reals
-!         if(CUSTOM_REAL == SIZE_REAL) then
-!           vector_assemble(:,ibool_inner_core(i,j,k,ispec)) = sngl(buffer_slices(ipoin,:))
-!         else
-!           vector_assemble(:,ibool_inner_core(i,j,k,ispec)) = buffer_slices(ipoin,:)
-!         endif
-!       enddo
-!     enddo
-!   enddo
- endif  ! end receiving info from central cube
+    !------- send info back from central cube to slices
 
-!------- send info back from central cube to slices
+    ! on chunk AB & CHUNK_AB_ANTIPODE, send all the messages to slices
+    if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
+      do imsg = 1,nb_msgs_theor_in_cube-1
+        ! send buffers to slices
+        receiver = sender_from_slices_to_cube(imsg)
+        call MPI_ISEND(buffer_all_cube_from_slices(1,1,imsg),ndim_assemble*npoin2D_cube_from_slices, &
+                MPI_DOUBLE_PRECISION,receiver,itag,MPI_COMM_WORLD,request_send_array_cc(imsg),ier)
+      enddo
+    endif
 
-! on chunk AB & CHUNK_AB_ANTIPODE, send all the messages to slices
-  if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
-    do imsg = 1,nb_msgs_theor_in_cube-1
-! send buffers to slices
-      receiver = sender_from_slices_to_cube(imsg)
-      call MPI_ISEND(buffer_all_cube_from_slices(1,1,imsg),ndim_assemble*npoin2D_cube_from_slices, &
-              MPI_DOUBLE_PRECISION,receiver,itag,MPI_COMM_WORLD,request_send_array(imsg),ier)
-    enddo
-  endif
+    iphase_comm_CC = iphase_comm_CC + 1
+    return ! exit because we have started some communications therefore we need some time
 
-  iphase_CC = iphase_CC + 1
-  return ! exit because we have started some communications therefore we need some time
+  case( 4 )
 
-  endif !!!!!!!!! end of iphase_CC 3
+    ! all non-central cube chunks set the values at the common points with central cube
 
-  if(iphase_CC == 4) then
+    ! checks that messages were sent out by central cube chunks AB and AB_ANTIPODE
+    if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
+      do imsg = 1,nb_msgs_theor_in_cube-1
+        call MPI_TEST(request_send_array_cc(imsg),flag_result_test,msg_status,ier)
+        if(.not. flag_result_test) return ! exit if message not sent yet
+      enddo
+    endif
 
-  if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
-    do imsg = 1,nb_msgs_theor_in_cube-1
-      call MPI_TEST(request_send_array(imsg),flag_result_test,msg_status,ier)
-      if(.not. flag_result_test) return ! exit if message not sent yet
-    enddo
-  endif
+    ! checks that messages have been received
+    if(ichunk /= CHUNK_AB .and. ichunk /= CHUNK_AB_ANTIPODE) then
+      call MPI_TEST(request_receive_cc,flag_result_test,msg_status,ier)
+      if(.not. flag_result_test) return ! exit if message not received yet
+    endif
 
-  if(ichunk /= CHUNK_AB .and. ichunk /= CHUNK_AB_ANTIPODE) then
-    call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
-    if(.not. flag_result_test) return ! exit if message not received yet
-  endif
-
-! receive info from central cube on all the slices except those in CHUNK_AB & CHUNK_AB_ANTIPODE
-  if(ichunk /= CHUNK_AB .and. ichunk /= CHUNK_AB_ANTIPODE) then
-! for bottom elements in contact with central cube from the slices side
-    ipoin = 0
-    do ispec2D = 1,NSPEC2D_BOTTOM_INNER_CORE
-      ispec = ibelm_bottom_inner_core(ispec2D)
-! only for DOFs exactly on surface of central cube (bottom of these elements)
-      k = 1
-      do j = 1,NGLLY
-        do i = 1,NGLLX
-          ipoin = ipoin + 1
-! distinguish between single and double precision for reals
-          if(CUSTOM_REAL == SIZE_REAL) then
-            vector_assemble(1:ndim_assemble,ibool_inner_core(i,j,k,ispec)) = sngl(buffer_slices(ipoin,:))
-          else
-            vector_assemble(1:ndim_assemble,ibool_inner_core(i,j,k,ispec)) = buffer_slices(ipoin,:)
-          endif
+    ! receive info from central cube on all the slices except those in CHUNK_AB & CHUNK_AB_ANTIPODE
+    if(ichunk /= CHUNK_AB .and. ichunk /= CHUNK_AB_ANTIPODE) then
+      ! for bottom elements in contact with central cube from the slices side
+      ipoin = 0
+      do ispec2D = 1,NSPEC2D_BOTTOM_INNER_CORE
+        ispec = ibelm_bottom_inner_core(ispec2D)
+        ! only for DOFs exactly on surface of central cube (bottom of these elements)
+        k = 1
+        do j = 1,NGLLY
+          do i = 1,NGLLX
+            ipoin = ipoin + 1
+            ! distinguish between single and double precision for reals
+            if(CUSTOM_REAL == SIZE_REAL) then
+              vector_assemble(1:ndim_assemble,ibool_inner_core(i,j,k,ispec)) = sngl(buffer_slices(ipoin,:))
+            else
+              vector_assemble(1:ndim_assemble,ibool_inner_core(i,j,k,ispec)) = buffer_slices(ipoin,:)
+            endif
+          enddo
         enddo
       enddo
-    enddo
- endif  ! end receiving info from central cube
+    endif  ! end receiving info from central cube
 
-! this is the exit condition, to go beyond the last phase number
-  iphase_CC = iphase_CC + 1
+    ! this is the exit condition, to go beyond the last phase number
+    iphase_comm_CC = iphase_comm_CC + 1
 
-  endif !!!!!!!!! end of iphase_CC 4
+  end select
 
   end subroutine assemble_MPI_central_cube
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_central_cube_block.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_central_cube_block.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_central_cube_block.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -26,9 +26,13 @@
 !=====================================================================
 
 subroutine assemble_MPI_central_cube_block(ichunk,nb_msgs_theor_in_cube, sender_from_slices_to_cube, &
-  npoin2D_cube_from_slices, buffer_all_cube_from_slices, buffer_slices, buffer_slices2, ibool_central_cube, &
-  receiver_cube_from_slices, ibool_inner_core, idoubling_inner_core, NSPEC_INNER_CORE, &
-  ibelm_bottom_inner_core, NSPEC2D_BOTTOM_INNER_CORE,NGLOB_INNER_CORE,vector_assemble,ndim_assemble)
+                                          npoin2D_cube_from_slices, &
+                                          buffer_all_cube_from_slices, buffer_slices, buffer_slices2, &
+                                          ibool_central_cube, &
+                                          receiver_cube_from_slices, ibool_inner_core, &
+                                          idoubling_inner_core, NSPEC_INNER_CORE, &
+                                          ibelm_bottom_inner_core, NSPEC2D_BOTTOM_INNER_CORE,NGLOB_INNER_CORE, &
+                                          vector_assemble,ndim_assemble)
 
 ! this version of the routine is based on blocking MPI calls
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_scalar.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_scalar.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_scalar.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -36,11 +36,15 @@
             iboolfaces,iboolcorner, &
             iprocfrom_faces,iprocto_faces, &
             iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces_scalar,buffer_received_faces_scalar,npoin2D_max_all_CM_IC, &
+            buffer_send_faces_scalar,buffer_received_faces_scalar, &
+            npoin2D_max_all_CM_IC, &
             buffer_send_chunkcorn_scalar,buffer_recv_chunkcorn_scalar, &
+            request_send,request_receive, &
+            request_send_array,request_receive_array, &
             NUMMSGS_FACES,NCORNERSCHUNKS, &
             NPROC_XI,NPROC_ETA,NGLOB1D_RADIAL, &
-            NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX,NGLOB2DMAX_XY,NCHUNKS,iphase)
+            NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX,NGLOB2DMAX_XY,NCHUNKS, &
+            iphase_comm)
 
   implicit none
 
@@ -50,7 +54,7 @@
   include "constants.h"
   include "precision.h"
 
-  integer myrank,nglob,NCHUNKS,iphase
+  integer myrank,nglob,NCHUNKS,iphase_comm
 
 ! array to assemble
   real(kind=CUSTOM_REAL), dimension(nglob), intent(inout) :: array_val
@@ -72,44 +76,49 @@
 
 ! indirect addressing for each corner of the chunks
   integer, dimension(NGLOB1D_RADIAL,NUMCORNERS_SHARED), intent(in) :: iboolcorner
-  integer icount_corners
 
   integer, intent(in) :: npoin2D_max_all_CM_IC
   integer, dimension(NGLOB2DMAX_XY,NUMFACES_SHARED), intent(in) :: iboolfaces
-  real(kind=CUSTOM_REAL), dimension(npoin2D_max_all_CM_IC,NUMFACES_SHARED), intent(inout) :: buffer_send_faces_scalar, &
-                                                                                             buffer_received_faces_scalar
+  real(kind=CUSTOM_REAL), dimension(npoin2D_max_all_CM_IC,NUMFACES_SHARED), intent(inout) :: &
+    buffer_send_faces_scalar, &
+    buffer_received_faces_scalar
 
-! buffers for send and receive between corners of the chunks
+  ! buffers for send and receive between corners of the chunks
   real(kind=CUSTOM_REAL), dimension(NGLOB1D_RADIAL), intent(inout) :: buffer_send_chunkcorn_scalar, &
                                                                       buffer_recv_chunkcorn_scalar
 
-! ---- arrays to assemble between chunks
+  ! ---- arrays to assemble between chunks
 
-! communication pattern for faces between chunks
+  ! communication pattern for faces between chunks
   integer, dimension(NUMMSGS_FACES), intent(in) :: iprocfrom_faces,iprocto_faces
 
-! communication pattern for corners between chunks
+  ! communication pattern for corners between chunks
   integer, dimension(NCORNERSCHUNKS), intent(in) :: iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners
 
-! MPI status of messages to be received
+  ! stored as global parameters
+  integer, intent(inout) :: request_send,request_receive
+  integer, dimension(NUMFACES_SHARED), intent(inout) :: request_send_array,request_receive_array
+
+! local parameters
+
+  ! MPI status of messages to be received
   integer, dimension(MPI_STATUS_SIZE) :: msg_status
 
   integer :: ipoin,ipoin2D,ipoin1D
   integer :: sender,receiver
   integer :: imsg
   integer :: icount_faces,npoin2D_chunks
+  integer :: icount_corners
 
   integer :: ier
-! do not remove the "save" statement because this routine is non blocking
-  integer, save :: request_send,request_receive
-  integer, dimension(NUMFACES_SHARED), save :: request_send_array,request_receive_array
   logical :: flag_result_test
 
+
 ! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
 
 ! check flag to see if we need to assemble (might be turned off when debugging)
   if (.not. ACTUALLY_ASSEMBLE_MPI_SLICES)  then
-    iphase = 9999 ! this means everything is finished
+    iphase_comm = 9999 ! this means everything is finished
     return
   endif
 
@@ -123,452 +132,716 @@
 !---- first assemble along xi using the 2-D topology
 !----
 
-  if(iphase == 1) then
+  if(iphase_comm == 1) then
 
-! slices copy the right face into the buffer
-  do ipoin=1,npoin2D_xi(2)
-    buffer_send_faces_scalar(ipoin,1) = array_val(iboolright_xi(ipoin))
-  enddo
+    ! slices copy the right face into the buffer
+    do ipoin=1,npoin2D_xi(2)
+      buffer_send_faces_scalar(ipoin,1) = array_val(iboolright_xi(ipoin))
+    enddo
 
-! send messages forward along each row
-  if(iproc_xi == 0) then
-    sender = MPI_PROC_NULL
-  else
-    sender = addressing(ichunk,iproc_xi - 1,iproc_eta)
-  endif
-  if(iproc_xi == NPROC_XI-1) then
-    receiver = MPI_PROC_NULL
-  else
-    receiver = addressing(ichunk,iproc_xi + 1,iproc_eta)
-  endif
-  call MPI_IRECV(buffer_received_faces_scalar,npoin2D_xi(1),CUSTOM_MPI_TYPE,sender, &
-        itag,MPI_COMM_WORLD,request_receive,ier)
+    ! send messages forward along each row
+    if(iproc_xi == 0) then
+      sender = MPI_PROC_NULL
+    else
+      sender = addressing(ichunk,iproc_xi - 1,iproc_eta)
+    endif
+    if(iproc_xi == NPROC_XI-1) then
+      receiver = MPI_PROC_NULL
+    else
+      receiver = addressing(ichunk,iproc_xi + 1,iproc_eta)
+    endif
+    call MPI_IRECV(buffer_received_faces_scalar,npoin2D_xi(1),CUSTOM_MPI_TYPE,sender, &
+          itag,MPI_COMM_WORLD,request_receive,ier)
 
-  call MPI_ISEND(buffer_send_faces_scalar,npoin2D_xi(2),CUSTOM_MPI_TYPE,receiver, &
-        itag2,MPI_COMM_WORLD,request_send,ier)
+    call MPI_ISEND(buffer_send_faces_scalar,npoin2D_xi(2),CUSTOM_MPI_TYPE,receiver, &
+          itag2,MPI_COMM_WORLD,request_send,ier)
 
-  iphase = iphase + 1
-  return ! exit because we have started some communications therefore we need some time
+    iphase_comm = iphase_comm + 1
+    return ! exit because we have started some communications therefore we need some time
 
-  endif !!!!!!!!! end of iphase 1
+  endif !!!!!!!!! end of iphase_comm 1
 
-  if(iphase == 2) then
+  if(iphase_comm == 2) then
 
-! call MPI_WAIT(request_send,msg_status,ier)
-! call MPI_WAIT(request_receive,msg_status,ier)
-  call MPI_TEST(request_send,flag_result_test,msg_status,ier)
-  if(.not. flag_result_test) return ! exit if message not sent yet
-  call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
-  if(.not. flag_result_test) return ! exit if message not received yet
+    ! call MPI_WAIT(request_send,msg_status,ier)
+    ! call MPI_WAIT(request_receive,msg_status,ier)
+    call MPI_TEST(request_send,flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not sent yet
+    call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not received yet
 
-! all slices add the buffer received to the contributions on the left face
-  if(iproc_xi > 0) then
-  do ipoin=1,npoin2D_xi(1)
-    array_val(iboolleft_xi(ipoin)) = array_val(iboolleft_xi(ipoin)) + &
-                              buffer_received_faces_scalar(ipoin,1)
-  enddo
-  endif
+    ! all slices add the buffer received to the contributions on the left face
+    if(iproc_xi > 0) then
+      do ipoin=1,npoin2D_xi(1)
+        array_val(iboolleft_xi(ipoin)) = array_val(iboolleft_xi(ipoin)) + &
+                                buffer_received_faces_scalar(ipoin,1)
+      enddo
+    endif
 
-! the contributions are correctly assembled on the left side of each slice
-! now we have to send the result back to the sender
-! all slices copy the left face into the buffer
-  do ipoin=1,npoin2D_xi(1)
-    buffer_send_faces_scalar(ipoin,1) = array_val(iboolleft_xi(ipoin))
-  enddo
+    ! the contributions are correctly assembled on the left side of each slice
+    ! now we have to send the result back to the sender
+    ! all slices copy the left face into the buffer
+    do ipoin=1,npoin2D_xi(1)
+      buffer_send_faces_scalar(ipoin,1) = array_val(iboolleft_xi(ipoin))
+    enddo
 
-! send messages backward along each row
-  if(iproc_xi == NPROC_XI-1) then
-    sender = MPI_PROC_NULL
-  else
-    sender = addressing(ichunk,iproc_xi + 1,iproc_eta)
-  endif
-  if(iproc_xi == 0) then
-    receiver = MPI_PROC_NULL
-  else
-    receiver = addressing(ichunk,iproc_xi - 1,iproc_eta)
-  endif
-  call MPI_IRECV(buffer_received_faces_scalar,npoin2D_xi(2),CUSTOM_MPI_TYPE,sender, &
-        itag,MPI_COMM_WORLD,request_receive,ier)
+    ! send messages backward along each row
+    if(iproc_xi == NPROC_XI-1) then
+      sender = MPI_PROC_NULL
+    else
+      sender = addressing(ichunk,iproc_xi + 1,iproc_eta)
+    endif
+    if(iproc_xi == 0) then
+      receiver = MPI_PROC_NULL
+    else
+      receiver = addressing(ichunk,iproc_xi - 1,iproc_eta)
+    endif
+    call MPI_IRECV(buffer_received_faces_scalar,npoin2D_xi(2),CUSTOM_MPI_TYPE,sender, &
+          itag,MPI_COMM_WORLD,request_receive,ier)
 
-  call MPI_ISEND(buffer_send_faces_scalar,npoin2D_xi(1),CUSTOM_MPI_TYPE,receiver, &
-        itag2,MPI_COMM_WORLD,request_send,ier)
+    call MPI_ISEND(buffer_send_faces_scalar,npoin2D_xi(1),CUSTOM_MPI_TYPE,receiver, &
+          itag2,MPI_COMM_WORLD,request_send,ier)
 
-  iphase = iphase + 1
-  return ! exit because we have started some communications therefore we need some time
+    iphase_comm = iphase_comm + 1
+    return ! exit because we have started some communications therefore we need some time
 
-  endif !!!!!!!!! end of iphase 2
+  endif !!!!!!!!! end of iphase_comm 2
 
-  if(iphase == 3) then
+  if(iphase_comm == 3) then
 
-! call MPI_WAIT(request_send,msg_status,ier)
-! call MPI_WAIT(request_receive,msg_status,ier)
-  call MPI_TEST(request_send,flag_result_test,msg_status,ier)
-  if(.not. flag_result_test) return ! exit if message not sent yet
-  call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
-  if(.not. flag_result_test) return ! exit if message not received yet
+    ! call MPI_WAIT(request_send,msg_status,ier)
+    ! call MPI_WAIT(request_receive,msg_status,ier)
+    call MPI_TEST(request_send,flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not sent yet
+    call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not received yet
 
-! all slices copy the buffer received to the contributions on the right face
-  if(iproc_xi < NPROC_XI-1) then
-  do ipoin=1,npoin2D_xi(2)
-    array_val(iboolright_xi(ipoin)) = buffer_received_faces_scalar(ipoin,1)
-  enddo
-  endif
+    ! all slices copy the buffer received to the contributions on the right face
+    if(iproc_xi < NPROC_XI-1) then
+      do ipoin=1,npoin2D_xi(2)
+        array_val(iboolright_xi(ipoin)) = buffer_received_faces_scalar(ipoin,1)
+      enddo
+    endif
 
-!----
-!---- then assemble along eta using the 2-D topology
-!----
+    !----
+    !---- then assemble along eta using the 2-D topology
+    !----
 
-! slices copy the right face into the buffer
-  do ipoin=1,npoin2D_eta(2)
-    buffer_send_faces_scalar(ipoin,1) = array_val(iboolright_eta(ipoin))
-  enddo
+    ! slices copy the right face into the buffer
+    do ipoin=1,npoin2D_eta(2)
+      buffer_send_faces_scalar(ipoin,1) = array_val(iboolright_eta(ipoin))
+    enddo
 
-! send messages forward along each row
-  if(iproc_eta == 0) then
-    sender = MPI_PROC_NULL
-  else
-    sender = addressing(ichunk,iproc_xi,iproc_eta - 1)
-  endif
-  if(iproc_eta == NPROC_ETA-1) then
-    receiver = MPI_PROC_NULL
-  else
-    receiver = addressing(ichunk,iproc_xi,iproc_eta + 1)
-  endif
-  call MPI_IRECV(buffer_received_faces_scalar,npoin2D_eta(1),CUSTOM_MPI_TYPE,sender, &
-    itag,MPI_COMM_WORLD,request_receive,ier)
+    ! send messages forward along each row
+    if(iproc_eta == 0) then
+      sender = MPI_PROC_NULL
+    else
+      sender = addressing(ichunk,iproc_xi,iproc_eta - 1)
+    endif
+    if(iproc_eta == NPROC_ETA-1) then
+      receiver = MPI_PROC_NULL
+    else
+      receiver = addressing(ichunk,iproc_xi,iproc_eta + 1)
+    endif
+    call MPI_IRECV(buffer_received_faces_scalar,npoin2D_eta(1),CUSTOM_MPI_TYPE,sender, &
+      itag,MPI_COMM_WORLD,request_receive,ier)
 
-  call MPI_ISEND(buffer_send_faces_scalar,npoin2D_eta(2),CUSTOM_MPI_TYPE,receiver, &
-    itag2,MPI_COMM_WORLD,request_send,ier)
+    call MPI_ISEND(buffer_send_faces_scalar,npoin2D_eta(2),CUSTOM_MPI_TYPE,receiver, &
+      itag2,MPI_COMM_WORLD,request_send,ier)
 
-  iphase = iphase + 1
-  return ! exit because we have started some communications therefore we need some time
+    iphase_comm = iphase_comm + 1
+    return ! exit because we have started some communications therefore we need some time
 
-  endif !!!!!!!!! end of iphase 3
+  endif !!!!!!!!! end of iphase_comm 3
 
-  if(iphase == 4) then
+  if(iphase_comm == 4) then
 
-! call MPI_WAIT(request_send,msg_status,ier)
-! call MPI_WAIT(request_receive,msg_status,ier)
-  call MPI_TEST(request_send,flag_result_test,msg_status,ier)
-  if(.not. flag_result_test) return ! exit if message not sent yet
-  call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
-  if(.not. flag_result_test) return ! exit if message not received yet
+    ! call MPI_WAIT(request_send,msg_status,ier)
+    ! call MPI_WAIT(request_receive,msg_status,ier)
+    call MPI_TEST(request_send,flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not sent yet
+    call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not received yet
 
-! all slices add the buffer received to the contributions on the left face
-  if(iproc_eta > 0) then
-  do ipoin=1,npoin2D_eta(1)
-    array_val(iboolleft_eta(ipoin)) = array_val(iboolleft_eta(ipoin)) + &
-                              buffer_received_faces_scalar(ipoin,1)
-  enddo
-  endif
+    ! all slices add the buffer received to the contributions on the left face
+    if(iproc_eta > 0) then
+      do ipoin=1,npoin2D_eta(1)
+        array_val(iboolleft_eta(ipoin)) = array_val(iboolleft_eta(ipoin)) + &
+                                buffer_received_faces_scalar(ipoin,1)
+      enddo
+    endif
 
-! the contributions are correctly assembled on the left side of each slice
-! now we have to send the result back to the sender
-! all slices copy the left face into the buffer
-  do ipoin=1,npoin2D_eta(1)
-    buffer_send_faces_scalar(ipoin,1) = array_val(iboolleft_eta(ipoin))
-  enddo
+    ! the contributions are correctly assembled on the left side of each slice
+    ! now we have to send the result back to the sender
+    ! all slices copy the left face into the buffer
+    do ipoin=1,npoin2D_eta(1)
+      buffer_send_faces_scalar(ipoin,1) = array_val(iboolleft_eta(ipoin))
+    enddo
 
-! send messages backward along each row
-  if(iproc_eta == NPROC_ETA-1) then
-    sender = MPI_PROC_NULL
-  else
-    sender = addressing(ichunk,iproc_xi,iproc_eta + 1)
-  endif
-  if(iproc_eta == 0) then
-    receiver = MPI_PROC_NULL
-  else
-    receiver = addressing(ichunk,iproc_xi,iproc_eta - 1)
-  endif
-  call MPI_IRECV(buffer_received_faces_scalar,npoin2D_eta(2),CUSTOM_MPI_TYPE,sender, &
-    itag,MPI_COMM_WORLD,request_receive,ier)
+    ! send messages backward along each row
+    if(iproc_eta == NPROC_ETA-1) then
+      sender = MPI_PROC_NULL
+    else
+      sender = addressing(ichunk,iproc_xi,iproc_eta + 1)
+    endif
+    if(iproc_eta == 0) then
+      receiver = MPI_PROC_NULL
+    else
+      receiver = addressing(ichunk,iproc_xi,iproc_eta - 1)
+    endif
+    call MPI_IRECV(buffer_received_faces_scalar,npoin2D_eta(2),CUSTOM_MPI_TYPE,sender, &
+      itag,MPI_COMM_WORLD,request_receive,ier)
 
-  call MPI_ISEND(buffer_send_faces_scalar,npoin2D_eta(1),CUSTOM_MPI_TYPE,receiver, &
-    itag2,MPI_COMM_WORLD,request_send,ier)
+    call MPI_ISEND(buffer_send_faces_scalar,npoin2D_eta(1),CUSTOM_MPI_TYPE,receiver, &
+      itag2,MPI_COMM_WORLD,request_send,ier)
 
-  iphase = iphase + 1
-  return ! exit because we have started some communications therefore we need some time
+    iphase_comm = iphase_comm + 1
+    return ! exit because we have started some communications therefore we need some time
 
-  endif !!!!!!!!! end of iphase 4
+  endif !!!!!!!!! end of iphase_comm 4
 
-  if(iphase == 5) then
+  if(iphase_comm == 5) then
 
-! call MPI_WAIT(request_send,msg_status,ier)
-! call MPI_WAIT(request_receive,msg_status,ier)
-  call MPI_TEST(request_send,flag_result_test,msg_status,ier)
-  if(.not. flag_result_test) return ! exit if message not sent yet
-  call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
-  if(.not. flag_result_test) return ! exit if message not received yet
+    ! call MPI_WAIT(request_send,msg_status,ier)
+    ! call MPI_WAIT(request_receive,msg_status,ier)
+    call MPI_TEST(request_send,flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not sent yet
+    call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not received yet
 
-! all slices copy the buffer received to the contributions on the right face
-  if(iproc_eta < NPROC_ETA-1) then
-  do ipoin=1,npoin2D_eta(2)
-    array_val(iboolright_eta(ipoin)) = buffer_received_faces_scalar(ipoin,1)
-  enddo
-  endif
+    ! all slices copy the buffer received to the contributions on the right face
+    if(iproc_eta < NPROC_ETA-1) then
+      do ipoin=1,npoin2D_eta(2)
+        array_val(iboolright_eta(ipoin)) = buffer_received_faces_scalar(ipoin,1)
+      enddo
+    endif
 
-!----
-!---- start MPI assembling phase between chunks
-!----
+    !----
+    !---- start MPI assembling phase between chunks
+    !----
 
-! check flag to see if we need to assemble (might be turned off when debugging)
-! and do not assemble if only one chunk
-  if (.not. ACTUALLY_ASSEMBLE_MPI_CHUNKS .or. NCHUNKS == 1) then
-    iphase = 9999 ! this means everything is finished
-    return
-  endif
+    ! check flag to see if we need to assemble (might be turned off when debugging)
+    ! and do not assemble if only one chunk
+    if (.not. ACTUALLY_ASSEMBLE_MPI_CHUNKS .or. NCHUNKS == 1) then
+      iphase_comm = 9999 ! this means everything is finished
+      return
+    endif
 
-! ***************************************************************
-!  transmit messages in forward direction (iprocfrom -> iprocto)
-! ***************************************************************
+    ! ***************************************************************
+    !  transmit messages in forward direction (iprocfrom -> iprocto)
+    ! ***************************************************************
 
-!---- put slices in receive mode
-!---- a given slice can belong to at most two faces
+    !---- put slices in receive mode
+    !---- a given slice can belong to at most two faces
 
-  icount_faces = 0
-  do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocto_faces(imsg)) then
-    sender = iprocfrom_faces(imsg)
-    npoin2D_chunks = npoin2D_faces(icount_faces)
-    call MPI_IRECV(buffer_received_faces_scalar(1,icount_faces), &
-              npoin2D_chunks,CUSTOM_MPI_TYPE,sender, &
-              itag,MPI_COMM_WORLD,request_receive_array(icount_faces),ier)
-!   do ipoin2D=1,npoin2D_chunks
-!     array_val(iboolfaces(ipoin2D,icount_faces)) = &
-!        array_val(iboolfaces(ipoin2D,icount_faces)) + buffer_received_faces_scalar(ipoin2D)
-!   enddo
-  endif
-  enddo
+    icount_faces = 0
+    do imsg = 1,NUMMSGS_FACES
+      if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+      if(myrank==iprocto_faces(imsg)) then
+        sender = iprocfrom_faces(imsg)
+        npoin2D_chunks = npoin2D_faces(icount_faces)
+        call MPI_IRECV(buffer_received_faces_scalar(1,icount_faces), &
+                      npoin2D_chunks,CUSTOM_MPI_TYPE,sender, &
+                      itag,MPI_COMM_WORLD,request_receive_array(icount_faces),ier)
+    !   do ipoin2D=1,npoin2D_chunks
+    !     array_val(iboolfaces(ipoin2D,icount_faces)) = &
+    !        array_val(iboolfaces(ipoin2D,icount_faces)) + buffer_received_faces_scalar(ipoin2D)
+    !   enddo
+      endif
+    enddo
 
-!---- put slices in send mode
-!---- a given slice can belong to at most two faces
-  icount_faces = 0
-  do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocfrom_faces(imsg)) then
-    receiver = iprocto_faces(imsg)
-    npoin2D_chunks = npoin2D_faces(icount_faces)
-    do ipoin2D=1,npoin2D_chunks
-      buffer_send_faces_scalar(ipoin2D,icount_faces) = array_val(iboolfaces(ipoin2D,icount_faces))
+    !---- put slices in send mode
+    !---- a given slice can belong to at most two faces
+    icount_faces = 0
+    do imsg = 1,NUMMSGS_FACES
+      if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+      if(myrank==iprocfrom_faces(imsg)) then
+        receiver = iprocto_faces(imsg)
+        npoin2D_chunks = npoin2D_faces(icount_faces)
+        do ipoin2D=1,npoin2D_chunks
+          buffer_send_faces_scalar(ipoin2D,icount_faces) = array_val(iboolfaces(ipoin2D,icount_faces))
+        enddo
+        call MPI_ISEND(buffer_send_faces_scalar(1,icount_faces),npoin2D_chunks, &
+                      CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,request_send_array(icount_faces),ier)
+      endif
     enddo
-    call MPI_ISEND(buffer_send_faces_scalar(1,icount_faces),npoin2D_chunks, &
-              CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,request_send_array(icount_faces),ier)
-  endif
-  enddo
 
-  iphase = iphase + 1
-  return ! exit because we have started some communications therefore we need some time
+    iphase_comm = iphase_comm + 1
+    return ! exit because we have started some communications therefore we need some time
 
-  endif !!!!!!!!! end of iphase 5
+  endif !!!!!!!!! end of iphase_comm 5
 
-  if(iphase == 6) then
+  if(iphase_comm == 6) then
 
-  icount_faces = 0
-  do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocto_faces(imsg)) then
-    call MPI_TEST(request_receive_array(icount_faces),flag_result_test,msg_status,ier)
-    if(.not. flag_result_test) return ! exit if message not received yet
-  endif
-  enddo
+    icount_faces = 0
+    do imsg = 1,NUMMSGS_FACES
+      if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+      if(myrank==iprocto_faces(imsg)) then
+        call MPI_TEST(request_receive_array(icount_faces),flag_result_test,msg_status,ier)
+        if(.not. flag_result_test) return ! exit if message not received yet
+      endif
+    enddo
 
-  icount_faces = 0
-  do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocfrom_faces(imsg)) then
-    call MPI_TEST(request_send_array(icount_faces),flag_result_test,msg_status,ier)
-    if(.not. flag_result_test) return ! exit if message not sent yet
-  endif
-  enddo
+    icount_faces = 0
+    do imsg = 1,NUMMSGS_FACES
+      if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+      if(myrank==iprocfrom_faces(imsg)) then
+        call MPI_TEST(request_send_array(icount_faces),flag_result_test,msg_status,ier)
+        if(.not. flag_result_test) return ! exit if message not sent yet
+      endif
+    enddo
 
-  icount_faces = 0
-  do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocto_faces(imsg)) then
-    do ipoin2D=1,npoin2D_faces(icount_faces)
-      array_val(iboolfaces(ipoin2D,icount_faces)) = &
-         array_val(iboolfaces(ipoin2D,icount_faces)) + buffer_received_faces_scalar(ipoin2D,icount_faces)
+    icount_faces = 0
+    do imsg = 1,NUMMSGS_FACES
+      if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+      if(myrank==iprocto_faces(imsg)) then
+        do ipoin2D=1,npoin2D_faces(icount_faces)
+          array_val(iboolfaces(ipoin2D,icount_faces)) = &
+            array_val(iboolfaces(ipoin2D,icount_faces)) + buffer_received_faces_scalar(ipoin2D,icount_faces)
+        enddo
+      endif
     enddo
-  endif
-  enddo
 
-! *********************************************************************
-!  transmit messages back in opposite direction (iprocto -> iprocfrom)
-! *********************************************************************
+    ! *********************************************************************
+    !  transmit messages back in opposite direction (iprocto -> iprocfrom)
+    ! *********************************************************************
 
-!---- put slices in receive mode
-!---- a given slice can belong to at most two faces
+    !---- put slices in receive mode
+    !---- a given slice can belong to at most two faces
 
-  icount_faces = 0
-  do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocfrom_faces(imsg)) then
-    sender = iprocto_faces(imsg)
-    npoin2D_chunks = npoin2D_faces(icount_faces)
-    call MPI_IRECV(buffer_received_faces_scalar(1,icount_faces), &
-              npoin2D_chunks,CUSTOM_MPI_TYPE,sender, &
-              itag,MPI_COMM_WORLD,request_receive_array(icount_faces),ier)
-!   do ipoin2D=1,npoin2D_chunks
-!     array_val(iboolfaces(ipoin2D,icount_faces)) = buffer_received_faces_scalar(ipoin2D)
-!   enddo
-  endif
-  enddo
+    icount_faces = 0
+    do imsg = 1,NUMMSGS_FACES
+      if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+      if(myrank==iprocfrom_faces(imsg)) then
+        sender = iprocto_faces(imsg)
+        npoin2D_chunks = npoin2D_faces(icount_faces)
+        call MPI_IRECV(buffer_received_faces_scalar(1,icount_faces), &
+                      npoin2D_chunks,CUSTOM_MPI_TYPE,sender, &
+                      itag,MPI_COMM_WORLD,request_receive_array(icount_faces),ier)
+      !   do ipoin2D=1,npoin2D_chunks
+      !     array_val(iboolfaces(ipoin2D,icount_faces)) = buffer_received_faces_scalar(ipoin2D)
+      !   enddo
+      endif
+    enddo
 
-!---- put slices in send mode
-!---- a given slice can belong to at most two faces
-  icount_faces = 0
-  do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocto_faces(imsg)) then
-    receiver = iprocfrom_faces(imsg)
-    npoin2D_chunks = npoin2D_faces(icount_faces)
-    do ipoin2D=1,npoin2D_chunks
-      buffer_send_faces_scalar(ipoin2D,icount_faces) = array_val(iboolfaces(ipoin2D,icount_faces))
+    !---- put slices in send mode
+    !---- a given slice can belong to at most two faces
+    icount_faces = 0
+    do imsg = 1,NUMMSGS_FACES
+      if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+      if(myrank==iprocto_faces(imsg)) then
+        receiver = iprocfrom_faces(imsg)
+        npoin2D_chunks = npoin2D_faces(icount_faces)
+        do ipoin2D=1,npoin2D_chunks
+          buffer_send_faces_scalar(ipoin2D,icount_faces) = array_val(iboolfaces(ipoin2D,icount_faces))
+        enddo
+        call MPI_ISEND(buffer_send_faces_scalar(1,icount_faces),npoin2D_chunks, &
+                      CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,request_send_array(icount_faces),ier)
+      endif
     enddo
-    call MPI_ISEND(buffer_send_faces_scalar(1,icount_faces),npoin2D_chunks, &
-              CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,request_send_array(icount_faces),ier)
-  endif
-  enddo
 
-  iphase = iphase + 1
-  return ! exit because we have started some communications therefore we need some time
+    iphase_comm = iphase_comm + 1
+    return ! exit because we have started some communications therefore we need some time
 
-  endif !!!!!!!!! end of iphase 6
+  endif !!!!!!!!! end of iphase_comm 6
 
-  if(iphase == 7) then
+  if(iphase_comm == 7) then
 
-  icount_faces = 0
-  do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocto_faces(imsg)) then
-    call MPI_TEST(request_send_array(icount_faces),flag_result_test,msg_status,ier)
-    if(.not. flag_result_test) return ! exit if message not received yet
-  endif
-  enddo
+    icount_faces = 0
+    do imsg = 1,NUMMSGS_FACES
+      if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+      if(myrank==iprocto_faces(imsg)) then
+        call MPI_TEST(request_send_array(icount_faces),flag_result_test,msg_status,ier)
+        if(.not. flag_result_test) return ! exit if message not sent yet
+      endif
+    enddo
 
-  icount_faces = 0
-  do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocfrom_faces(imsg)) then
-    call MPI_TEST(request_receive_array(icount_faces),flag_result_test,msg_status,ier)
-    if(.not. flag_result_test) return ! exit if message not sent yet
-  endif
-  enddo
+    icount_faces = 0
+    do imsg = 1,NUMMSGS_FACES
+      if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+      if(myrank==iprocfrom_faces(imsg)) then
+        call MPI_TEST(request_receive_array(icount_faces),flag_result_test,msg_status,ier)
+        if(.not. flag_result_test) return ! exit if message not received yet
+      endif
+    enddo
 
-  icount_faces = 0
-  do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocfrom_faces(imsg)) then
-    do ipoin2D=1,npoin2D_faces(icount_faces)
-      array_val(iboolfaces(ipoin2D,icount_faces)) = buffer_received_faces_scalar(ipoin2D,icount_faces)
+    icount_faces = 0
+    do imsg = 1,NUMMSGS_FACES
+      if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+      if(myrank==iprocfrom_faces(imsg)) then
+        do ipoin2D=1,npoin2D_faces(icount_faces)
+          array_val(iboolfaces(ipoin2D,icount_faces)) = buffer_received_faces_scalar(ipoin2D,icount_faces)
+        enddo
+      endif
     enddo
-  endif
-  enddo
 
-! this is the exit condition, to go beyond the last phase number
-  iphase = iphase + 1
+    ! this is the exit condition, to go beyond the last phase number
+    iphase_comm = iphase_comm + 1
 
-!! DK DK do the rest in blocking for now, for simplicity
+    !! DK DK do the rest in blocking for now, for simplicity
 
-!----
-!---- start MPI assembling corners
-!----
+    !----
+    !---- start MPI assembling corners
+    !----
 
-! scheme for corners cannot deadlock even if NPROC_XI = NPROC_ETA = 1
+    ! scheme for corners cannot deadlock even if NPROC_XI = NPROC_ETA = 1
 
-! ***************************************************************
-!  transmit messages in forward direction (two workers -> master)
-! ***************************************************************
+    ! ***************************************************************
+    !  transmit messages in forward direction (two workers -> master)
+    ! ***************************************************************
 
-  icount_corners = 0
+    icount_corners = 0
 
-  do imsg = 1,NCORNERSCHUNKS
+    do imsg = 1,NCORNERSCHUNKS
 
-  if(myrank == iproc_master_corners(imsg) .or. &
-     myrank == iproc_worker1_corners(imsg) .or. &
-     (NCHUNKS /= 2 .and. myrank == iproc_worker2_corners(imsg))) icount_corners = icount_corners + 1
+      if(myrank == iproc_master_corners(imsg) .or. &
+         myrank == iproc_worker1_corners(imsg) .or. &
+         (NCHUNKS /= 2 .and. myrank == iproc_worker2_corners(imsg))) icount_corners = icount_corners + 1
 
-!---- receive messages from the two workers on the master
-  if(myrank==iproc_master_corners(imsg)) then
+      !---- receive messages from the two workers on the master
+      if(myrank==iproc_master_corners(imsg)) then
 
-! receive from worker #1 and add to local array
-    sender = iproc_worker1_corners(imsg)
-    call MPI_RECV(buffer_recv_chunkcorn_scalar,NGLOB1D_RADIAL, &
-          CUSTOM_MPI_TYPE,sender,itag,MPI_COMM_WORLD,msg_status,ier)
-    do ipoin1D=1,NGLOB1D_RADIAL
-      array_val(iboolcorner(ipoin1D,icount_corners)) = array_val(iboolcorner(ipoin1D,icount_corners)) + &
-               buffer_recv_chunkcorn_scalar(ipoin1D)
+        ! receive from worker #1 and add to local array
+        sender = iproc_worker1_corners(imsg)
+        call MPI_RECV(buffer_recv_chunkcorn_scalar,NGLOB1D_RADIAL, &
+              CUSTOM_MPI_TYPE,sender,itag,MPI_COMM_WORLD,msg_status,ier)
+        do ipoin1D=1,NGLOB1D_RADIAL
+          array_val(iboolcorner(ipoin1D,icount_corners)) = array_val(iboolcorner(ipoin1D,icount_corners)) + &
+                   buffer_recv_chunkcorn_scalar(ipoin1D)
+        enddo
+
+        ! receive from worker #2 and add to local array
+        if(NCHUNKS /= 2) then
+          sender = iproc_worker2_corners(imsg)
+          call MPI_RECV(buffer_recv_chunkcorn_scalar,NGLOB1D_RADIAL, &
+                      CUSTOM_MPI_TYPE,sender,itag,MPI_COMM_WORLD,msg_status,ier)
+          do ipoin1D=1,NGLOB1D_RADIAL
+            array_val(iboolcorner(ipoin1D,icount_corners)) = array_val(iboolcorner(ipoin1D,icount_corners)) + &
+                   buffer_recv_chunkcorn_scalar(ipoin1D)
+          enddo
+        endif
+
+      endif
+
+      !---- send messages from the two workers to the master
+      if(myrank==iproc_worker1_corners(imsg) .or. &
+                  (NCHUNKS /= 2 .and. myrank==iproc_worker2_corners(imsg))) then
+
+        receiver = iproc_master_corners(imsg)
+        do ipoin1D=1,NGLOB1D_RADIAL
+          buffer_send_chunkcorn_scalar(ipoin1D) = array_val(iboolcorner(ipoin1D,icount_corners))
+        enddo
+        call MPI_SEND(buffer_send_chunkcorn_scalar,NGLOB1D_RADIAL,CUSTOM_MPI_TYPE, &
+                  receiver,itag,MPI_COMM_WORLD,ier)
+
+      endif
+
+      ! *********************************************************************
+      !  transmit messages back in opposite direction (master -> two workers)
+      ! *********************************************************************
+
+      !---- receive messages from the master on the two workers
+      if(myrank==iproc_worker1_corners(imsg) .or. &
+                  (NCHUNKS /= 2 .and. myrank==iproc_worker2_corners(imsg))) then
+
+      ! receive from master and copy to local array
+        sender = iproc_master_corners(imsg)
+        call MPI_RECV(buffer_recv_chunkcorn_scalar,NGLOB1D_RADIAL, &
+              CUSTOM_MPI_TYPE,sender,itag,MPI_COMM_WORLD,msg_status,ier)
+        do ipoin1D=1,NGLOB1D_RADIAL
+          array_val(iboolcorner(ipoin1D,icount_corners)) = buffer_recv_chunkcorn_scalar(ipoin1D)
+        enddo
+
+      endif
+
+      !---- send messages from the master to the two workers
+      if(myrank==iproc_master_corners(imsg)) then
+
+        do ipoin1D=1,NGLOB1D_RADIAL
+          buffer_send_chunkcorn_scalar(ipoin1D) = array_val(iboolcorner(ipoin1D,icount_corners))
+        enddo
+
+        ! send to worker #1
+        receiver = iproc_worker1_corners(imsg)
+        call MPI_SEND(buffer_send_chunkcorn_scalar,NGLOB1D_RADIAL,CUSTOM_MPI_TYPE, &
+                  receiver,itag,MPI_COMM_WORLD,ier)
+
+        ! send to worker #2
+        if(NCHUNKS /= 2) then
+          receiver = iproc_worker2_corners(imsg)
+          call MPI_SEND(buffer_send_chunkcorn_scalar,NGLOB1D_RADIAL,CUSTOM_MPI_TYPE, &
+                      receiver,itag,MPI_COMM_WORLD,ier)
+        endif
+
+      endif
+
     enddo
 
-! receive from worker #2 and add to local array
-  if(NCHUNKS /= 2) then
-    sender = iproc_worker2_corners(imsg)
-    call MPI_RECV(buffer_recv_chunkcorn_scalar,NGLOB1D_RADIAL, &
-          CUSTOM_MPI_TYPE,sender,itag,MPI_COMM_WORLD,msg_status,ier)
-    do ipoin1D=1,NGLOB1D_RADIAL
-      array_val(iboolcorner(ipoin1D,icount_corners)) = array_val(iboolcorner(ipoin1D,icount_corners)) + &
-               buffer_recv_chunkcorn_scalar(ipoin1D)
+  endif !!!!!!!!! end of iphase_comm 7
+
+  end subroutine assemble_MPI_scalar
+
+
+!
+!-------------------------------------------------------------------------------------------------
+!-------------------------------------------------------------------------------------------------
+!-------------------------------------------------------------------------------------------------
+!
+
+! daniel: TODO - still local versions
+
+
+  subroutine assemble_MPI_scalar_ext_mesh_s(NPROC,NGLOB_AB,array_val, &
+                        buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh, &
+                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                        my_neighbours_ext_mesh, &
+                        request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh)
+
+! non-blocking MPI send
+
+  implicit none
+
+  include "constants.h"
+
+  integer :: NPROC
+  integer :: NGLOB_AB
+  integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
+
+! array to send
+  real(kind=CUSTOM_REAL), dimension(NGLOB_AB) :: array_val
+
+
+  real(kind=CUSTOM_REAL), dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
+       buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh
+
+  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh,my_neighbours_ext_mesh
+  integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
+  integer, dimension(num_interfaces_ext_mesh) :: request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh
+
+  integer ipoin,iinterface
+
+! sends only if more than one partition
+  if(NPROC > 1) then
+
+    ! partition border copy into the buffer
+    do iinterface = 1, num_interfaces_ext_mesh
+      do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
+        buffer_send_scalar_ext_mesh(ipoin,iinterface) = &
+          array_val(ibool_interfaces_ext_mesh(ipoin,iinterface))
+      enddo
     enddo
-  endif
 
+    ! send messages
+    do iinterface = 1, num_interfaces_ext_mesh
+      ! non-blocking synchronous send request
+      call isend_cr(buffer_send_scalar_ext_mesh(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
+           nibool_interfaces_ext_mesh(iinterface), &
+           my_neighbours_ext_mesh(iinterface), &
+           itag, &
+           request_send_scalar_ext_mesh(iinterface) &
+           )
+      ! receive request
+      call irecv_cr(buffer_recv_scalar_ext_mesh(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
+           nibool_interfaces_ext_mesh(iinterface), &
+           my_neighbours_ext_mesh(iinterface), &
+           itag, &
+           request_recv_scalar_ext_mesh(iinterface) &
+           )
+
+    enddo
+
   endif
 
-!---- send messages from the two workers to the master
-  if(myrank==iproc_worker1_corners(imsg) .or. &
-              (NCHUNKS /= 2 .and. myrank==iproc_worker2_corners(imsg))) then
+  end subroutine assemble_MPI_scalar_ext_mesh_s
 
-    receiver = iproc_master_corners(imsg)
-    do ipoin1D=1,NGLOB1D_RADIAL
-      buffer_send_chunkcorn_scalar(ipoin1D) = array_val(iboolcorner(ipoin1D,icount_corners))
+!
+!----
+!
+
+  subroutine assemble_MPI_scalar_ext_mesh_w(NPROC,NGLOB_AB,array_val, &
+                        buffer_recv_scalar_ext_mesh,num_interfaces_ext_mesh, &
+                        max_nibool_interfaces_ext_mesh, &
+                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                        request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh)
+
+! waits for send/receiver to be completed and assembles contributions
+
+  implicit none
+
+  include "constants.h"
+
+  integer :: NPROC
+  integer :: NGLOB_AB
+  integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
+! array to assemble
+  real(kind=CUSTOM_REAL), dimension(NGLOB_AB) :: array_val
+
+
+  real(kind=CUSTOM_REAL), dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
+       buffer_recv_scalar_ext_mesh
+
+  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
+  integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
+  integer, dimension(num_interfaces_ext_mesh) :: request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh
+
+  integer ipoin,iinterface
+
+! assemble only if more than one partition
+  if(NPROC > 1) then
+
+    ! wait for communications completion (recv)
+    do iinterface = 1, num_interfaces_ext_mesh
+      call wait_req(request_recv_scalar_ext_mesh(iinterface))
     enddo
-    call MPI_SEND(buffer_send_chunkcorn_scalar,NGLOB1D_RADIAL,CUSTOM_MPI_TYPE, &
-              receiver,itag,MPI_COMM_WORLD,ier)
 
+    ! adding contributions of neighbours
+    do iinterface = 1, num_interfaces_ext_mesh
+      do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
+        array_val(ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
+             array_val(ibool_interfaces_ext_mesh(ipoin,iinterface)) &
+             + buffer_recv_scalar_ext_mesh(ipoin,iinterface)
+      enddo
+    enddo
+
+    ! wait for communications completion (send)
+    do iinterface = 1, num_interfaces_ext_mesh
+      call wait_req(request_send_scalar_ext_mesh(iinterface))
+    enddo
+
   endif
 
-! *********************************************************************
-!  transmit messages back in opposite direction (master -> two workers)
-! *********************************************************************
+  end subroutine assemble_MPI_scalar_ext_mesh_w
 
-!---- receive messages from the master on the two workers
-  if(myrank==iproc_worker1_corners(imsg) .or. &
-              (NCHUNKS /= 2 .and. myrank==iproc_worker2_corners(imsg))) then
 
-! receive from master and copy to local array
-    sender = iproc_master_corners(imsg)
-    call MPI_RECV(buffer_recv_chunkcorn_scalar,NGLOB1D_RADIAL, &
-          CUSTOM_MPI_TYPE,sender,itag,MPI_COMM_WORLD,msg_status,ier)
-    do ipoin1D=1,NGLOB1D_RADIAL
-      array_val(iboolcorner(ipoin1D,icount_corners)) = buffer_recv_chunkcorn_scalar(ipoin1D)
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+!daniel: TODO - cuda scalar assembly...
+
+  subroutine assemble_MPI_scalar_send_cuda(NPROC, &
+                                          buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh, &
+                                          num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                                          nibool_interfaces_ext_mesh, &
+                                          my_neighbours_ext_mesh, &
+                                          request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh, &
+                                          FORWARD_OR_ADJOINT)
+
+! non-blocking MPI send
+
+  ! sends data
+  ! note: assembling data already filled into buffer_send_scalar_ext_mesh array
+
+  use constants
+  use specfem_par,only: Mesh_pointer
+  
+  implicit none
+
+  integer :: NPROC
+  integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
+
+  real(kind=CUSTOM_REAL), dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
+       buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh
+
+  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh,my_neighbours_ext_mesh
+  integer, dimension(num_interfaces_ext_mesh) :: request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh
+
+  integer :: FORWARD_OR_ADJOINT
+
+  ! local parameters
+  integer iinterface
+
+! sends only if more than one partition
+  if(NPROC > 1) then
+
+    ! preparation of the contribution between partitions using MPI
+    ! transfers mpi buffers to CPU
+    call transfer_boun_pot_from_device(Mesh_pointer, &
+                                      buffer_send_scalar_ext_mesh, &
+                                      FORWARD_OR_ADJOINT)
+
+    ! send messages
+    do iinterface = 1, num_interfaces_ext_mesh
+      ! non-blocking synchronous send request
+      call isend_cr(buffer_send_scalar_ext_mesh(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
+           nibool_interfaces_ext_mesh(iinterface), &
+           my_neighbours_ext_mesh(iinterface), &
+           itag, &
+           request_send_scalar_ext_mesh(iinterface) &
+           )
+      ! receive request
+      call irecv_cr(buffer_recv_scalar_ext_mesh(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
+           nibool_interfaces_ext_mesh(iinterface), &
+           my_neighbours_ext_mesh(iinterface), &
+           itag, &
+           request_recv_scalar_ext_mesh(iinterface) &
+           )
+
     enddo
 
   endif
 
-!---- send messages from the master to the two workers
-  if(myrank==iproc_master_corners(imsg)) then
+  end subroutine assemble_MPI_scalar_send_cuda
 
-    do ipoin1D=1,NGLOB1D_RADIAL
-      buffer_send_chunkcorn_scalar(ipoin1D) = array_val(iboolcorner(ipoin1D,icount_corners))
-    enddo
+!
+!-------------------------------------------------------------------------------------------------
+!
 
-! send to worker #1
-    receiver = iproc_worker1_corners(imsg)
-    call MPI_SEND(buffer_send_chunkcorn_scalar,NGLOB1D_RADIAL,CUSTOM_MPI_TYPE, &
-              receiver,itag,MPI_COMM_WORLD,ier)
+  subroutine assemble_MPI_scalar_write_cuda(NPROC, &
+                                            buffer_recv_scalar_ext_mesh, &
+                                            num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                                            request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh, &
+                                            FORWARD_OR_ADJOINT)
 
-! send to worker #2
-  if(NCHUNKS /= 2) then
-    receiver = iproc_worker2_corners(imsg)
-    call MPI_SEND(buffer_send_chunkcorn_scalar,NGLOB1D_RADIAL,CUSTOM_MPI_TYPE, &
-              receiver,itag,MPI_COMM_WORLD,ier)
-  endif
+! waits for send/receiver to be completed and assembles contributions
 
-  endif
+  use constants
+  use specfem_par,only: Mesh_pointer
+  
+  implicit none
 
-  enddo
+  integer :: NPROC
 
-  endif !!!!!!!!! end of iphase 7
+  integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
 
-  end subroutine assemble_MPI_scalar
+  real(kind=CUSTOM_REAL), dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
+       buffer_recv_scalar_ext_mesh
+  integer, dimension(num_interfaces_ext_mesh) :: request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh
 
+  integer :: FORWARD_OR_ADJOINT
+
+  ! local parameters
+  integer :: iinterface 
+
+! assemble only if more than one partition
+  if(NPROC > 1) then
+
+    ! wait for communications completion (recv)
+    do iinterface = 1, num_interfaces_ext_mesh
+      call wait_req(request_recv_scalar_ext_mesh(iinterface))
+    enddo
+
+    ! adding contributions of neighbours
+    call transfer_asmbl_pot_to_device(Mesh_pointer, &
+                                    buffer_recv_scalar_ext_mesh, &
+                                    FORWARD_OR_ADJOINT)
+
+    ! note: adding contributions of neighbours has been done just above for cuda
+    !do iinterface = 1, num_interfaces_ext_mesh
+    !  do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
+    !    array_val(ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
+    !         array_val(ibool_interfaces_ext_mesh(ipoin,iinterface)) &
+    !         + buffer_recv_scalar_ext_mesh(ipoin,iinterface)
+    !  enddo
+    !enddo
+
+    ! wait for communications completion (send)
+    do iinterface = 1, num_interfaces_ext_mesh
+      call wait_req(request_send_scalar_ext_mesh(iinterface))
+    enddo
+
+  endif
+
+  end subroutine assemble_MPI_scalar_write_cuda

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_scalar_block.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_scalar_block.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_scalar_block.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -117,62 +117,62 @@
 ! assemble along xi only if more than one slice
   if(NPROC_XI > 1) then
 
-! slices copy the right face into the buffer
-  do ipoin=1,npoin2D_xi(2)
-    buffer_send_faces_scalar(ipoin) = array_val(iboolright_xi(ipoin))
-  enddo
+    ! slices copy the right face into the buffer
+    do ipoin=1,npoin2D_xi(2)
+      buffer_send_faces_scalar(ipoin) = array_val(iboolright_xi(ipoin))
+    enddo
 
-! send messages forward along each row
-  if(iproc_xi == 0) then
-    sender = MPI_PROC_NULL
-  else
-    sender = addressing(ichunk,iproc_xi - 1,iproc_eta)
-  endif
-  if(iproc_xi == NPROC_XI-1) then
-    receiver = MPI_PROC_NULL
-  else
-    receiver = addressing(ichunk,iproc_xi + 1,iproc_eta)
-  endif
-  call MPI_SENDRECV(buffer_send_faces_scalar,npoin2D_xi(2),CUSTOM_MPI_TYPE,receiver, &
-        itag2,buffer_received_faces_scalar,npoin2D_xi(1),CUSTOM_MPI_TYPE,sender, &
-        itag,MPI_COMM_WORLD,msg_status,ier)
+    ! send messages forward along each row
+    if(iproc_xi == 0) then
+      sender = MPI_PROC_NULL
+    else
+      sender = addressing(ichunk,iproc_xi - 1,iproc_eta)
+    endif
+    if(iproc_xi == NPROC_XI-1) then
+      receiver = MPI_PROC_NULL
+    else
+      receiver = addressing(ichunk,iproc_xi + 1,iproc_eta)
+    endif
+    call MPI_SENDRECV(buffer_send_faces_scalar,npoin2D_xi(2),CUSTOM_MPI_TYPE,receiver, &
+          itag2,buffer_received_faces_scalar,npoin2D_xi(1),CUSTOM_MPI_TYPE,sender, &
+          itag,MPI_COMM_WORLD,msg_status,ier)
 
-! all slices add the buffer received to the contributions on the left face
-  if(iproc_xi > 0) then
-  do ipoin=1,npoin2D_xi(1)
-    array_val(iboolleft_xi(ipoin)) = array_val(iboolleft_xi(ipoin)) + &
-                              buffer_received_faces_scalar(ipoin)
-  enddo
-  endif
+    ! all slices add the buffer received to the contributions on the left face
+    if(iproc_xi > 0) then
+      do ipoin=1,npoin2D_xi(1)
+        array_val(iboolleft_xi(ipoin)) = array_val(iboolleft_xi(ipoin)) + &
+                                buffer_received_faces_scalar(ipoin)
+      enddo
+    endif
 
-! the contributions are correctly assembled on the left side of each slice
-! now we have to send the result back to the sender
-! all slices copy the left face into the buffer
-  do ipoin=1,npoin2D_xi(1)
-    buffer_send_faces_scalar(ipoin) = array_val(iboolleft_xi(ipoin))
-  enddo
+    ! the contributions are correctly assembled on the left side of each slice
+    ! now we have to send the result back to the sender
+    ! all slices copy the left face into the buffer
+    do ipoin=1,npoin2D_xi(1)
+      buffer_send_faces_scalar(ipoin) = array_val(iboolleft_xi(ipoin))
+    enddo
 
-! send messages backward along each row
-  if(iproc_xi == NPROC_XI-1) then
-    sender = MPI_PROC_NULL
-  else
-    sender = addressing(ichunk,iproc_xi + 1,iproc_eta)
-  endif
-  if(iproc_xi == 0) then
-    receiver = MPI_PROC_NULL
-  else
-    receiver = addressing(ichunk,iproc_xi - 1,iproc_eta)
-  endif
-  call MPI_SENDRECV(buffer_send_faces_scalar,npoin2D_xi(1),CUSTOM_MPI_TYPE,receiver, &
-        itag2,buffer_received_faces_scalar,npoin2D_xi(2),CUSTOM_MPI_TYPE,sender, &
-        itag,MPI_COMM_WORLD,msg_status,ier)
+    ! send messages backward along each row
+    if(iproc_xi == NPROC_XI-1) then
+      sender = MPI_PROC_NULL
+    else
+      sender = addressing(ichunk,iproc_xi + 1,iproc_eta)
+    endif
+    if(iproc_xi == 0) then
+      receiver = MPI_PROC_NULL
+    else
+      receiver = addressing(ichunk,iproc_xi - 1,iproc_eta)
+    endif
+    call MPI_SENDRECV(buffer_send_faces_scalar,npoin2D_xi(1),CUSTOM_MPI_TYPE,receiver, &
+          itag2,buffer_received_faces_scalar,npoin2D_xi(2),CUSTOM_MPI_TYPE,sender, &
+          itag,MPI_COMM_WORLD,msg_status,ier)
 
-! all slices copy the buffer received to the contributions on the right face
-  if(iproc_xi < NPROC_XI-1) then
-  do ipoin=1,npoin2D_xi(2)
-    array_val(iboolright_xi(ipoin)) = buffer_received_faces_scalar(ipoin)
-  enddo
-  endif
+    ! all slices copy the buffer received to the contributions on the right face
+    if(iproc_xi < NPROC_XI-1) then
+      do ipoin=1,npoin2D_xi(2)
+        array_val(iboolright_xi(ipoin)) = buffer_received_faces_scalar(ipoin)
+      enddo
+    endif
 
   endif
 
@@ -183,62 +183,62 @@
 ! assemble along eta only if more than one slice
   if(NPROC_ETA > 1) then
 
-! slices copy the right face into the buffer
-  do ipoin=1,npoin2D_eta(2)
-    buffer_send_faces_scalar(ipoin) = array_val(iboolright_eta(ipoin))
-  enddo
+    ! slices copy the right face into the buffer
+    do ipoin=1,npoin2D_eta(2)
+      buffer_send_faces_scalar(ipoin) = array_val(iboolright_eta(ipoin))
+    enddo
 
-! send messages forward along each row
-  if(iproc_eta == 0) then
-    sender = MPI_PROC_NULL
-  else
-    sender = addressing(ichunk,iproc_xi,iproc_eta - 1)
-  endif
-  if(iproc_eta == NPROC_ETA-1) then
-    receiver = MPI_PROC_NULL
-  else
-    receiver = addressing(ichunk,iproc_xi,iproc_eta + 1)
-  endif
-  call MPI_SENDRECV(buffer_send_faces_scalar,npoin2D_eta(2),CUSTOM_MPI_TYPE,receiver, &
-    itag2,buffer_received_faces_scalar,npoin2D_eta(1),CUSTOM_MPI_TYPE,sender, &
-    itag,MPI_COMM_WORLD,msg_status,ier)
+    ! send messages forward along each row
+    if(iproc_eta == 0) then
+      sender = MPI_PROC_NULL
+    else
+      sender = addressing(ichunk,iproc_xi,iproc_eta - 1)
+    endif
+    if(iproc_eta == NPROC_ETA-1) then
+      receiver = MPI_PROC_NULL
+    else
+      receiver = addressing(ichunk,iproc_xi,iproc_eta + 1)
+    endif
+    call MPI_SENDRECV(buffer_send_faces_scalar,npoin2D_eta(2),CUSTOM_MPI_TYPE,receiver, &
+      itag2,buffer_received_faces_scalar,npoin2D_eta(1),CUSTOM_MPI_TYPE,sender, &
+      itag,MPI_COMM_WORLD,msg_status,ier)
 
-! all slices add the buffer received to the contributions on the left face
-  if(iproc_eta > 0) then
-  do ipoin=1,npoin2D_eta(1)
-    array_val(iboolleft_eta(ipoin)) = array_val(iboolleft_eta(ipoin)) + &
-                              buffer_received_faces_scalar(ipoin)
-  enddo
-  endif
+    ! all slices add the buffer received to the contributions on the left face
+    if(iproc_eta > 0) then
+      do ipoin=1,npoin2D_eta(1)
+        array_val(iboolleft_eta(ipoin)) = array_val(iboolleft_eta(ipoin)) + &
+                                buffer_received_faces_scalar(ipoin)
+      enddo
+    endif
 
-! the contributions are correctly assembled on the left side of each slice
-! now we have to send the result back to the sender
-! all slices copy the left face into the buffer
-  do ipoin=1,npoin2D_eta(1)
-    buffer_send_faces_scalar(ipoin) = array_val(iboolleft_eta(ipoin))
-  enddo
+    ! the contributions are correctly assembled on the left side of each slice
+    ! now we have to send the result back to the sender
+    ! all slices copy the left face into the buffer
+    do ipoin=1,npoin2D_eta(1)
+      buffer_send_faces_scalar(ipoin) = array_val(iboolleft_eta(ipoin))
+    enddo
 
-! send messages backward along each row
-  if(iproc_eta == NPROC_ETA-1) then
-    sender = MPI_PROC_NULL
-  else
-    sender = addressing(ichunk,iproc_xi,iproc_eta + 1)
-  endif
-  if(iproc_eta == 0) then
-    receiver = MPI_PROC_NULL
-  else
-    receiver = addressing(ichunk,iproc_xi,iproc_eta - 1)
-  endif
-  call MPI_SENDRECV(buffer_send_faces_scalar,npoin2D_eta(1),CUSTOM_MPI_TYPE,receiver, &
-    itag2,buffer_received_faces_scalar,npoin2D_eta(2),CUSTOM_MPI_TYPE,sender, &
-    itag,MPI_COMM_WORLD,msg_status,ier)
+    ! send messages backward along each row
+    if(iproc_eta == NPROC_ETA-1) then
+      sender = MPI_PROC_NULL
+    else
+      sender = addressing(ichunk,iproc_xi,iproc_eta + 1)
+    endif
+    if(iproc_eta == 0) then
+      receiver = MPI_PROC_NULL
+    else
+      receiver = addressing(ichunk,iproc_xi,iproc_eta - 1)
+    endif
+    call MPI_SENDRECV(buffer_send_faces_scalar,npoin2D_eta(1),CUSTOM_MPI_TYPE,receiver, &
+      itag2,buffer_received_faces_scalar,npoin2D_eta(2),CUSTOM_MPI_TYPE,sender, &
+      itag,MPI_COMM_WORLD,msg_status,ier)
 
-! all slices copy the buffer received to the contributions on the right face
-  if(iproc_eta < NPROC_ETA-1) then
-  do ipoin=1,npoin2D_eta(2)
-    array_val(iboolright_eta(ipoin)) = buffer_received_faces_scalar(ipoin)
-  enddo
-  endif
+    ! all slices copy the buffer received to the contributions on the right face
+    if(iproc_eta < NPROC_ETA-1) then
+      do ipoin=1,npoin2D_eta(2)
+        array_val(iboolright_eta(ipoin)) = buffer_received_faces_scalar(ipoin)
+      enddo
+    endif
 
   endif
 
@@ -261,79 +261,79 @@
 ! scheme for faces cannot deadlock even if NPROC_XI = NPROC_ETA = 1
   do imsg_loop = 1,NUM_MSG_TYPES
 
-  icount_faces = 0
-  do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. &
-       myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocto_faces(imsg) .and. imsg_type(imsg) == imsg_loop) then
-    sender = iprocfrom_faces(imsg)
-    npoin2D_chunks = npoin2D_faces(icount_faces)
-    call MPI_RECV(buffer_received_faces_scalar, &
-              npoin2D_chunks,CUSTOM_MPI_TYPE,sender, &
-              itag,MPI_COMM_WORLD,msg_status,ier)
-    do ipoin2D=1,npoin2D_chunks
-      array_val(iboolfaces(ipoin2D,icount_faces)) = &
-         array_val(iboolfaces(ipoin2D,icount_faces)) + buffer_received_faces_scalar(ipoin2D)
+    icount_faces = 0
+    do imsg = 1,NUMMSGS_FACES
+      if(myrank==iprocfrom_faces(imsg) .or. &
+           myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+      if(myrank==iprocto_faces(imsg) .and. imsg_type(imsg) == imsg_loop) then
+        sender = iprocfrom_faces(imsg)
+        npoin2D_chunks = npoin2D_faces(icount_faces)
+        call MPI_RECV(buffer_received_faces_scalar, &
+                  npoin2D_chunks,CUSTOM_MPI_TYPE,sender, &
+                  itag,MPI_COMM_WORLD,msg_status,ier)
+        do ipoin2D=1,npoin2D_chunks
+          array_val(iboolfaces(ipoin2D,icount_faces)) = &
+             array_val(iboolfaces(ipoin2D,icount_faces)) + buffer_received_faces_scalar(ipoin2D)
+        enddo
+      endif
     enddo
-  endif
-  enddo
 
-!---- put slices in send mode
-!---- a given slice can belong to at most two faces
-  icount_faces = 0
-  do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. &
-       myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocfrom_faces(imsg) .and. imsg_type(imsg) == imsg_loop) then
-    receiver = iprocto_faces(imsg)
-    npoin2D_chunks = npoin2D_faces(icount_faces)
-    do ipoin2D=1,npoin2D_chunks
-      buffer_send_faces_scalar(ipoin2D) = array_val(iboolfaces(ipoin2D,icount_faces))
+    !---- put slices in send mode
+    !---- a given slice can belong to at most two faces
+    icount_faces = 0
+    do imsg = 1,NUMMSGS_FACES
+      if(myrank==iprocfrom_faces(imsg) .or. &
+           myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+      if(myrank==iprocfrom_faces(imsg) .and. imsg_type(imsg) == imsg_loop) then
+        receiver = iprocto_faces(imsg)
+        npoin2D_chunks = npoin2D_faces(icount_faces)
+        do ipoin2D=1,npoin2D_chunks
+          buffer_send_faces_scalar(ipoin2D) = array_val(iboolfaces(ipoin2D,icount_faces))
+        enddo
+        call MPI_SEND(buffer_send_faces_scalar,npoin2D_chunks, &
+                  CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,ier)
+      endif
     enddo
-    call MPI_SEND(buffer_send_faces_scalar,npoin2D_chunks, &
-              CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,ier)
-  endif
-  enddo
 
-! *********************************************************************
-!  transmit messages back in opposite direction (iprocto -> iprocfrom)
-! *********************************************************************
+    ! *********************************************************************
+    !  transmit messages back in opposite direction (iprocto -> iprocfrom)
+    ! *********************************************************************
 
-!---- put slices in receive mode
-!---- a given slice can belong to at most two faces
+    !---- put slices in receive mode
+    !---- a given slice can belong to at most two faces
 
-  icount_faces = 0
-  do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. &
-       myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocfrom_faces(imsg) .and. imsg_type(imsg) == imsg_loop) then
-    sender = iprocto_faces(imsg)
-    npoin2D_chunks = npoin2D_faces(icount_faces)
-    call MPI_RECV(buffer_received_faces_scalar, &
-              npoin2D_chunks,CUSTOM_MPI_TYPE,sender, &
-              itag,MPI_COMM_WORLD,msg_status,ier)
-    do ipoin2D=1,npoin2D_chunks
-      array_val(iboolfaces(ipoin2D,icount_faces)) = buffer_received_faces_scalar(ipoin2D)
+    icount_faces = 0
+    do imsg = 1,NUMMSGS_FACES
+      if(myrank==iprocfrom_faces(imsg) .or. &
+           myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+      if(myrank==iprocfrom_faces(imsg) .and. imsg_type(imsg) == imsg_loop) then
+        sender = iprocto_faces(imsg)
+        npoin2D_chunks = npoin2D_faces(icount_faces)
+        call MPI_RECV(buffer_received_faces_scalar, &
+                  npoin2D_chunks,CUSTOM_MPI_TYPE,sender, &
+                  itag,MPI_COMM_WORLD,msg_status,ier)
+        do ipoin2D=1,npoin2D_chunks
+          array_val(iboolfaces(ipoin2D,icount_faces)) = buffer_received_faces_scalar(ipoin2D)
+        enddo
+      endif
     enddo
-  endif
-  enddo
 
-!---- put slices in send mode
-!---- a given slice can belong to at most two faces
-  icount_faces = 0
-  do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. &
-       myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocto_faces(imsg) .and. imsg_type(imsg) == imsg_loop) then
-    receiver = iprocfrom_faces(imsg)
-    npoin2D_chunks = npoin2D_faces(icount_faces)
-    do ipoin2D=1,npoin2D_chunks
-      buffer_send_faces_scalar(ipoin2D) = array_val(iboolfaces(ipoin2D,icount_faces))
+    !---- put slices in send mode
+    !---- a given slice can belong to at most two faces
+    icount_faces = 0
+    do imsg = 1,NUMMSGS_FACES
+      if(myrank==iprocfrom_faces(imsg) .or. &
+           myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+      if(myrank==iprocto_faces(imsg) .and. imsg_type(imsg) == imsg_loop) then
+        receiver = iprocfrom_faces(imsg)
+        npoin2D_chunks = npoin2D_faces(icount_faces)
+        do ipoin2D=1,npoin2D_chunks
+          buffer_send_faces_scalar(ipoin2D) = array_val(iboolfaces(ipoin2D,icount_faces))
+        enddo
+        call MPI_SEND(buffer_send_faces_scalar,npoin2D_chunks, &
+                  CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,ier)
+      endif
     enddo
-    call MPI_SEND(buffer_send_faces_scalar,npoin2D_chunks, &
-              CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,ier)
-  endif
-  enddo
 
 ! end of anti-deadlocking loop
   enddo
@@ -352,86 +352,86 @@
 
   do imsg = 1,NCORNERSCHUNKS
 
-  if(myrank == iproc_master_corners(imsg) .or. &
-     myrank == iproc_worker1_corners(imsg) .or. &
-     (NCHUNKS /= 2 .and. myrank == iproc_worker2_corners(imsg))) icount_corners = icount_corners + 1
+    if(myrank == iproc_master_corners(imsg) .or. &
+       myrank == iproc_worker1_corners(imsg) .or. &
+       (NCHUNKS /= 2 .and. myrank == iproc_worker2_corners(imsg))) icount_corners = icount_corners + 1
 
-!---- receive messages from the two workers on the master
-  if(myrank==iproc_master_corners(imsg)) then
+    !---- receive messages from the two workers on the master
+    if(myrank==iproc_master_corners(imsg)) then
 
-! receive from worker #1 and add to local array
-    sender = iproc_worker1_corners(imsg)
-    call MPI_RECV(buffer_recv_chunkcorn_scalar,NGLOB1D_RADIAL, &
-          CUSTOM_MPI_TYPE,sender,itag,MPI_COMM_WORLD,msg_status,ier)
-    do ipoin1D=1,NGLOB1D_RADIAL
-      array_val(iboolcorner(ipoin1D,icount_corners)) = array_val(iboolcorner(ipoin1D,icount_corners)) + &
-               buffer_recv_chunkcorn_scalar(ipoin1D)
-    enddo
+      ! receive from worker #1 and add to local array
+      sender = iproc_worker1_corners(imsg)
+      call MPI_RECV(buffer_recv_chunkcorn_scalar,NGLOB1D_RADIAL, &
+            CUSTOM_MPI_TYPE,sender,itag,MPI_COMM_WORLD,msg_status,ier)
+      do ipoin1D=1,NGLOB1D_RADIAL
+        array_val(iboolcorner(ipoin1D,icount_corners)) = array_val(iboolcorner(ipoin1D,icount_corners)) + &
+                 buffer_recv_chunkcorn_scalar(ipoin1D)
+      enddo
 
-! receive from worker #2 and add to local array
-  if(NCHUNKS /= 2) then
-    sender = iproc_worker2_corners(imsg)
-    call MPI_RECV(buffer_recv_chunkcorn_scalar,NGLOB1D_RADIAL, &
-          CUSTOM_MPI_TYPE,sender,itag,MPI_COMM_WORLD,msg_status,ier)
-    do ipoin1D=1,NGLOB1D_RADIAL
-      array_val(iboolcorner(ipoin1D,icount_corners)) = array_val(iboolcorner(ipoin1D,icount_corners)) + &
-               buffer_recv_chunkcorn_scalar(ipoin1D)
-    enddo
-  endif
+      ! receive from worker #2 and add to local array
+      if(NCHUNKS /= 2) then
+        sender = iproc_worker2_corners(imsg)
+        call MPI_RECV(buffer_recv_chunkcorn_scalar,NGLOB1D_RADIAL, &
+              CUSTOM_MPI_TYPE,sender,itag,MPI_COMM_WORLD,msg_status,ier)
+        do ipoin1D=1,NGLOB1D_RADIAL
+          array_val(iboolcorner(ipoin1D,icount_corners)) = array_val(iboolcorner(ipoin1D,icount_corners)) + &
+                   buffer_recv_chunkcorn_scalar(ipoin1D)
+        enddo
+      endif
 
-  endif
+    endif
 
-!---- send messages from the two workers to the master
-  if(myrank==iproc_worker1_corners(imsg) .or. &
-              (NCHUNKS /= 2 .and. myrank==iproc_worker2_corners(imsg))) then
+    !---- send messages from the two workers to the master
+    if(myrank==iproc_worker1_corners(imsg) .or. &
+                (NCHUNKS /= 2 .and. myrank==iproc_worker2_corners(imsg))) then
 
-    receiver = iproc_master_corners(imsg)
-    do ipoin1D=1,NGLOB1D_RADIAL
-      buffer_send_chunkcorn_scalar(ipoin1D) = array_val(iboolcorner(ipoin1D,icount_corners))
-    enddo
-    call MPI_SEND(buffer_send_chunkcorn_scalar,NGLOB1D_RADIAL,CUSTOM_MPI_TYPE, &
-              receiver,itag,MPI_COMM_WORLD,ier)
+      receiver = iproc_master_corners(imsg)
+      do ipoin1D=1,NGLOB1D_RADIAL
+        buffer_send_chunkcorn_scalar(ipoin1D) = array_val(iboolcorner(ipoin1D,icount_corners))
+      enddo
+      call MPI_SEND(buffer_send_chunkcorn_scalar,NGLOB1D_RADIAL,CUSTOM_MPI_TYPE, &
+                receiver,itag,MPI_COMM_WORLD,ier)
 
-  endif
+    endif
 
-! *********************************************************************
-!  transmit messages back in opposite direction (master -> two workers)
-! *********************************************************************
+    ! *********************************************************************
+    !  transmit messages back in opposite direction (master -> two workers)
+    ! *********************************************************************
 
-!---- receive messages from the master on the two workers
-  if(myrank==iproc_worker1_corners(imsg) .or. &
-              (NCHUNKS /= 2 .and. myrank==iproc_worker2_corners(imsg))) then
+    !---- receive messages from the master on the two workers
+    if(myrank==iproc_worker1_corners(imsg) .or. &
+                (NCHUNKS /= 2 .and. myrank==iproc_worker2_corners(imsg))) then
 
-! receive from master and copy to local array
-    sender = iproc_master_corners(imsg)
-    call MPI_RECV(buffer_recv_chunkcorn_scalar,NGLOB1D_RADIAL, &
-          CUSTOM_MPI_TYPE,sender,itag,MPI_COMM_WORLD,msg_status,ier)
-    do ipoin1D=1,NGLOB1D_RADIAL
-      array_val(iboolcorner(ipoin1D,icount_corners)) = buffer_recv_chunkcorn_scalar(ipoin1D)
-    enddo
+      ! receive from master and copy to local array
+      sender = iproc_master_corners(imsg)
+      call MPI_RECV(buffer_recv_chunkcorn_scalar,NGLOB1D_RADIAL, &
+            CUSTOM_MPI_TYPE,sender,itag,MPI_COMM_WORLD,msg_status,ier)
+      do ipoin1D=1,NGLOB1D_RADIAL
+        array_val(iboolcorner(ipoin1D,icount_corners)) = buffer_recv_chunkcorn_scalar(ipoin1D)
+      enddo
 
-  endif
+    endif
 
-!---- send messages from the master to the two workers
-  if(myrank==iproc_master_corners(imsg)) then
+    !---- send messages from the master to the two workers
+    if(myrank==iproc_master_corners(imsg)) then
 
-    do ipoin1D=1,NGLOB1D_RADIAL
-      buffer_send_chunkcorn_scalar(ipoin1D) = array_val(iboolcorner(ipoin1D,icount_corners))
-    enddo
+      do ipoin1D=1,NGLOB1D_RADIAL
+        buffer_send_chunkcorn_scalar(ipoin1D) = array_val(iboolcorner(ipoin1D,icount_corners))
+      enddo
 
-! send to worker #1
-    receiver = iproc_worker1_corners(imsg)
-    call MPI_SEND(buffer_send_chunkcorn_scalar,NGLOB1D_RADIAL,CUSTOM_MPI_TYPE, &
-              receiver,itag,MPI_COMM_WORLD,ier)
+      ! send to worker #1
+      receiver = iproc_worker1_corners(imsg)
+      call MPI_SEND(buffer_send_chunkcorn_scalar,NGLOB1D_RADIAL,CUSTOM_MPI_TYPE, &
+                receiver,itag,MPI_COMM_WORLD,ier)
 
-! send to worker #2
-  if(NCHUNKS /= 2) then
-    receiver = iproc_worker2_corners(imsg)
-    call MPI_SEND(buffer_send_chunkcorn_scalar,NGLOB1D_RADIAL,CUSTOM_MPI_TYPE, &
-              receiver,itag,MPI_COMM_WORLD,ier)
-  endif
+      ! send to worker #2
+      if(NCHUNKS /= 2) then
+        receiver = iproc_worker2_corners(imsg)
+        call MPI_SEND(buffer_send_chunkcorn_scalar,NGLOB1D_RADIAL,CUSTOM_MPI_TYPE, &
+                  receiver,itag,MPI_COMM_WORLD,ier)
+      endif
 
-  endif
+    endif
 
   enddo
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -32,35 +32,40 @@
 !----
 
   subroutine assemble_MPI_vector(myrank,accel_crust_mantle,accel_inner_core, &
-            iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces_vector,buffer_received_faces_vector,npoin2D_max_all_CM_IC, &
-            buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector, &
-            NUMMSGS_FACES,NCORNERSCHUNKS, &
-            NPROC_XI,NPROC_ETA,NGLOB1D_RADIAL_crust_mantle, &
-            NGLOB1D_RADIAL_inner_core,NCHUNKS,iphase)
+                iproc_xi,iproc_eta,ichunk,addressing, &
+                iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,&
+                iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+                npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+                iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+                iboolleft_xi_inner_core,iboolright_xi_inner_core,&
+                iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+                npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+                iboolfaces_inner_core,iboolcorner_inner_core, &
+                iprocfrom_faces,iprocto_faces, &
+                iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+                buffer_send_faces_vector,buffer_received_faces_vector, &
+                npoin2D_max_all_CM_IC, &
+                buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector, &
+                request_send,request_receive, &
+                request_send_array,request_receive_array, &
+                NUMMSGS_FACES,NCORNERSCHUNKS, &
+                NPROC_XI,NPROC_ETA,NGLOB1D_RADIAL_crust_mantle, &
+                NGLOB1D_RADIAL_inner_core,NCHUNKS,iphase_comm)
 
   implicit none
 
-! standard include of the MPI library
+  ! standard include of the MPI library
   include 'mpif.h'
 
   include "constants.h"
   include "precision.h"
 
-! include values created by the mesher
+  ! include values created by the mesher
   include "OUTPUT_FILES/values_from_mesher.h"
 
-  integer myrank,NCHUNKS,iphase
+  integer myrank,NCHUNKS,iphase_comm
 
-! the two arrays to assemble
+  ! the two arrays to assemble
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE), intent(inout) :: accel_crust_mantle
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE), intent(inout) :: accel_inner_core
 
@@ -74,19 +79,18 @@
   integer, intent(in) :: NGLOB1D_RADIAL_crust_mantle,NGLOB1D_RADIAL_inner_core,NPROC_XI,NPROC_ETA
   integer, intent(in) :: NUMMSGS_FACES,NCORNERSCHUNKS
 
-! for addressing of the slices
+  ! for addressing of the slices
   integer, dimension(NCHUNKS,0:NPROC_XI-1,0:NPROC_ETA-1), intent(in) :: addressing
 
-! 2-D addressing and buffers for summation between slices
+  ! 2-D addressing and buffers for summation between slices
   integer, dimension(NGLOB2DMAX_XMIN_XMAX_CM), intent(in) :: iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle
   integer, dimension(NGLOB2DMAX_YMIN_YMAX_CM), intent(in) :: iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle
   integer, dimension(NGLOB2DMAX_XMIN_XMAX_IC), intent(in) :: iboolleft_xi_inner_core,iboolright_xi_inner_core
   integer, dimension(NGLOB2DMAX_YMIN_YMAX_IC), intent(in) :: iboolleft_eta_inner_core,iboolright_eta_inner_core
 
-! indirect addressing for each corner of the chunks
+  ! indirect addressing for each corner of the chunks
   integer, dimension(NGLOB1D_RADIAL_crust_mantle,NUMCORNERS_SHARED), intent(in) :: iboolcorner_crust_mantle
   integer, dimension(NGLOB1D_RADIAL_inner_core,NUMCORNERS_SHARED), intent(in) :: iboolcorner_inner_core
-  integer icount_corners
 
   integer, intent(in) :: npoin2D_max_all_CM_IC
   integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED), intent(in) :: iboolfaces_crust_mantle
@@ -94,20 +98,27 @@
   real(kind=CUSTOM_REAL), dimension(NDIM,npoin2D_max_all_CM_IC,NUMFACES_SHARED), intent(inout) :: &
       buffer_send_faces_vector,buffer_received_faces_vector
 
-! buffers for send and receive between corners of the chunks
-! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
+  ! buffers for send and receive between corners of the chunks
+  ! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB1D_RADIAL_crust_mantle + NGLOB1D_RADIAL_inner_core), intent(inout) :: &
     buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector
 
-! ---- arrays to assemble between chunks
+  ! stored as global parameters
+  integer, intent(inout) :: request_send,request_receive
+  integer, dimension(NUMFACES_SHARED), intent(inout) :: request_send_array,request_receive_array
 
-! communication pattern for faces between chunks
+
+  ! ---- arrays to assemble between chunks
+  ! communication pattern for faces between chunks
   integer, dimension(NUMMSGS_FACES), intent(in) :: iprocfrom_faces,iprocto_faces
-
-! communication pattern for corners between chunks
+  ! communication pattern for corners between chunks
   integer, dimension(NCORNERSCHUNKS), intent(in) :: iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners
 
-! MPI status of messages to be received
+! local parameters
+
+  integer :: icount_corners
+
+  ! MPI status of messages to be received
   integer, dimension(MPI_STATUS_SIZE) :: msg_status
 
   integer :: ipoin,ipoin2D,ipoin1D
@@ -118,811 +129,1061 @@
   integer :: NGLOB1D_RADIAL_all
   integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_all,npoin2D_eta_all
 
-! do not remove the "save" statement because this routine is non blocking
-! therefore it needs to find the right value of ioffset when it re-enters
-! the routine later to perform the next communication step
+  logical :: flag_result_test
+  integer :: ier
+
+  ! daniel: TODO - comment below might be obsolete..
+  ! do not remove the "save" statement because this routine is non blocking
+  ! therefore it needs to find the right value of ioffset when it re-enters
+  ! the routine later to perform the next communication step
   integer, save :: ioffset
 
-  integer :: ier
-! do not remove the "save" statement because this routine is non blocking
-  integer, save :: request_send,request_receive
-  integer, dimension(NUMFACES_SHARED), save :: request_send_array,request_receive_array
-  logical :: flag_result_test
 
-! $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
-
-! check flag to see if we need to assemble (might be turned off when debugging)
+  ! check flag to see if we need to assemble (might be turned off when debugging)
   if (.not. ACTUALLY_ASSEMBLE_MPI_SLICES) then
-    iphase = 9999 ! this means everything is finished
+    iphase_comm = 9999 ! this means everything is finished
     return
   endif
 
-! here we have to assemble all the contributions between slices using MPI
+  ! here we have to assemble all the contributions between slices using MPI
 
-! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
+  ! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
   npoin2D_xi_all(:) = npoin2D_xi_crust_mantle(:) + npoin2D_xi_inner_core(:)
   npoin2D_eta_all(:) = npoin2D_eta_crust_mantle(:) + npoin2D_eta_inner_core(:)
 
-!----
-!---- assemble the contributions between slices using MPI
-!----
+  !----
+  !---- assemble the contributions between slices using MPI
+  !----
 
-!----
-!---- first assemble along xi using the 2-D topology
-!----
+  !----
+  !---- first assemble along xi using the 2-D topology
+  !----
+  select case( iphase_comm )
 
-  if(iphase == 1) then
+  case( 1 )
 
-! slices copy the right face into the buffer
-  do ipoin = 1,npoin2D_xi_crust_mantle(2)
-    buffer_send_faces_vector(1,ipoin,1) = accel_crust_mantle(1,iboolright_xi_crust_mantle(ipoin))
-    buffer_send_faces_vector(2,ipoin,1) = accel_crust_mantle(2,iboolright_xi_crust_mantle(ipoin))
-    buffer_send_faces_vector(3,ipoin,1) = accel_crust_mantle(3,iboolright_xi_crust_mantle(ipoin))
-  enddo
+    ! slices send out values along xi sides (right face) forward along each row
 
-! the buffer for the inner core starts right after the buffer for the crust and mantle
-  ioffset = npoin2D_xi_crust_mantle(2)
+    ! slices copy the right face into the buffer
+    do ipoin = 1,npoin2D_xi_crust_mantle(2)
+      buffer_send_faces_vector(:,ipoin,1) = accel_crust_mantle(:,iboolright_xi_crust_mantle(ipoin))
+    enddo
 
-  do ipoin = 1,npoin2D_xi_inner_core(2)
-    buffer_send_faces_vector(1,ioffset + ipoin,1) = accel_inner_core(1,iboolright_xi_inner_core(ipoin))
-    buffer_send_faces_vector(2,ioffset + ipoin,1) = accel_inner_core(2,iboolright_xi_inner_core(ipoin))
-    buffer_send_faces_vector(3,ioffset + ipoin,1) = accel_inner_core(3,iboolright_xi_inner_core(ipoin))
-  enddo
+    ! the buffer for the inner core starts right after the buffer for the crust and mantle
+    ioffset = npoin2D_xi_crust_mantle(2)
 
-! send messages forward along each row
-  if(iproc_xi == 0) then
-    sender = MPI_PROC_NULL
-  else
-    sender = addressing(ichunk,iproc_xi - 1,iproc_eta)
-  endif
-  if(iproc_xi == NPROC_XI-1) then
-    receiver = MPI_PROC_NULL
-  else
-    receiver = addressing(ichunk,iproc_xi + 1,iproc_eta)
-  endif
-  call MPI_IRECV(buffer_received_faces_vector,NDIM*npoin2D_xi_all(1),CUSTOM_MPI_TYPE,sender, &
-        itag,MPI_COMM_WORLD,request_receive,ier)
+    do ipoin = 1,npoin2D_xi_inner_core(2)
+      buffer_send_faces_vector(:,ioffset + ipoin,1) = accel_inner_core(:,iboolright_xi_inner_core(ipoin))
+    enddo
 
-  call MPI_ISEND(buffer_send_faces_vector,NDIM*npoin2D_xi_all(2),CUSTOM_MPI_TYPE,receiver, &
-        itag2,MPI_COMM_WORLD,request_send,ier)
+    ! send messages forward along each row
+    if(iproc_xi == 0) then
+      sender = MPI_PROC_NULL
+    else
+      sender = addressing(ichunk,iproc_xi - 1,iproc_eta)
+    endif
+    if(iproc_xi == NPROC_XI-1) then
+      receiver = MPI_PROC_NULL
+    else
+      receiver = addressing(ichunk,iproc_xi + 1,iproc_eta)
+    endif
+    ! requests to receive message
+    call MPI_IRECV(buffer_received_faces_vector,NDIM*npoin2D_xi_all(1),CUSTOM_MPI_TYPE,sender, &
+          itag,MPI_COMM_WORLD,request_receive,ier)
 
-  iphase = iphase + 1
-  return ! exit because we have started some communications therefore we need some time
+    ! sends out buffer
+    call MPI_ISEND(buffer_send_faces_vector,NDIM*npoin2D_xi_all(2),CUSTOM_MPI_TYPE,receiver, &
+          itag2,MPI_COMM_WORLD,request_send,ier)
 
-  endif !!!!!!!!! end of iphase 1
+    iphase_comm = iphase_comm + 1
+    return ! exit because we have started some communications therefore we need some time
 
-  if(iphase == 2) then
 
-! call MPI_WAIT(request_send,msg_status,ier)
-! call MPI_WAIT(request_receive,msg_status,ier)
-  call MPI_TEST(request_send,flag_result_test,msg_status,ier)
-  if(.not. flag_result_test) return ! exit if message not sent yet
-  call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
-  if(.not. flag_result_test) return ! exit if message not received yet
+  case( 2 )
 
-! all slices add the buffer received to the contributions on the left face
-  if(iproc_xi > 0) then
+    ! slices assemble along (left face) xi sides
+    ! and send out values along xi sides (left face) backward along each row
 
-  do ipoin = 1,npoin2D_xi_crust_mantle(1)
-    accel_crust_mantle(1,iboolleft_xi_crust_mantle(ipoin)) = accel_crust_mantle(1,iboolleft_xi_crust_mantle(ipoin)) + &
-                              buffer_received_faces_vector(1,ipoin,1)
-    accel_crust_mantle(2,iboolleft_xi_crust_mantle(ipoin)) = accel_crust_mantle(2,iboolleft_xi_crust_mantle(ipoin)) + &
-                              buffer_received_faces_vector(2,ipoin,1)
-    accel_crust_mantle(3,iboolleft_xi_crust_mantle(ipoin)) = accel_crust_mantle(3,iboolleft_xi_crust_mantle(ipoin)) + &
-                              buffer_received_faces_vector(3,ipoin,1)
-  enddo
+    ! checks if messages have been sent out and requested ones received
+    ! call MPI_WAIT(request_send,msg_status,ier)
+    ! call MPI_WAIT(request_receive,msg_status,ier)
+    call MPI_TEST(request_send,flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not sent yet
+    call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not received yet
 
-! the buffer for the inner core starts right after the buffer for the crust and mantle
-  ioffset = npoin2D_xi_crust_mantle(1)
+    ! all slices add the buffer received to the contributions on the left face
+    if(iproc_xi > 0) then
 
-  do ipoin = 1,npoin2D_xi_inner_core(1)
-    accel_inner_core(1,iboolleft_xi_inner_core(ipoin)) = accel_inner_core(1,iboolleft_xi_inner_core(ipoin)) + &
-                              buffer_received_faces_vector(1,ioffset + ipoin,1)
-    accel_inner_core(2,iboolleft_xi_inner_core(ipoin)) = accel_inner_core(2,iboolleft_xi_inner_core(ipoin)) + &
-                              buffer_received_faces_vector(2,ioffset + ipoin,1)
-    accel_inner_core(3,iboolleft_xi_inner_core(ipoin)) = accel_inner_core(3,iboolleft_xi_inner_core(ipoin)) + &
-                              buffer_received_faces_vector(3,ioffset + ipoin,1)
-  enddo
+      do ipoin = 1,npoin2D_xi_crust_mantle(1)
+        accel_crust_mantle(:,iboolleft_xi_crust_mantle(ipoin)) = &
+                accel_crust_mantle(:,iboolleft_xi_crust_mantle(ipoin)) + &
+                                  buffer_received_faces_vector(:,ipoin,1)
+      enddo
 
-  endif
+      ! the buffer for the inner core starts right after the buffer for the crust and mantle
+      ioffset = npoin2D_xi_crust_mantle(1)
 
-! the contributions are correctly assembled on the left side of each slice
-! now we have to send the result back to the sender
-! all slices copy the left face into the buffer
-  do ipoin = 1,npoin2D_xi_crust_mantle(1)
-    buffer_send_faces_vector(1,ipoin,1) = accel_crust_mantle(1,iboolleft_xi_crust_mantle(ipoin))
-    buffer_send_faces_vector(2,ipoin,1) = accel_crust_mantle(2,iboolleft_xi_crust_mantle(ipoin))
-    buffer_send_faces_vector(3,ipoin,1) = accel_crust_mantle(3,iboolleft_xi_crust_mantle(ipoin))
-  enddo
+      do ipoin = 1,npoin2D_xi_inner_core(1)
+        accel_inner_core(:,iboolleft_xi_inner_core(ipoin)) = &
+                  accel_inner_core(:,iboolleft_xi_inner_core(ipoin)) + &
+                                  buffer_received_faces_vector(:,ioffset + ipoin,1)
+      enddo
 
-! the buffer for the inner core starts right after the buffer for the crust and mantle
-  ioffset = npoin2D_xi_crust_mantle(1)
+    endif
 
-  do ipoin = 1,npoin2D_xi_inner_core(1)
-    buffer_send_faces_vector(1,ioffset + ipoin,1) = accel_inner_core(1,iboolleft_xi_inner_core(ipoin))
-    buffer_send_faces_vector(2,ioffset + ipoin,1) = accel_inner_core(2,iboolleft_xi_inner_core(ipoin))
-    buffer_send_faces_vector(3,ioffset + ipoin,1) = accel_inner_core(3,iboolleft_xi_inner_core(ipoin))
-  enddo
+    ! the contributions are correctly assembled on the left side of each slice
+    ! now we have to send the result back to the sender
+    ! all slices copy the left face into the buffer
+    do ipoin = 1,npoin2D_xi_crust_mantle(1)
+      buffer_send_faces_vector(:,ipoin,1) = accel_crust_mantle(:,iboolleft_xi_crust_mantle(ipoin))
+    enddo
 
-! send messages backward along each row
-  if(iproc_xi == NPROC_XI-1) then
-    sender = MPI_PROC_NULL
-  else
-    sender = addressing(ichunk,iproc_xi + 1,iproc_eta)
-  endif
-  if(iproc_xi == 0) then
-    receiver = MPI_PROC_NULL
-  else
-    receiver = addressing(ichunk,iproc_xi - 1,iproc_eta)
-  endif
-  call MPI_IRECV(buffer_received_faces_vector,NDIM*npoin2D_xi_all(2),CUSTOM_MPI_TYPE,sender, &
-        itag,MPI_COMM_WORLD,request_receive,ier)
+    ! the buffer for the inner core starts right after the buffer for the crust and mantle
+    ioffset = npoin2D_xi_crust_mantle(1)
 
-  call MPI_ISEND(buffer_send_faces_vector,NDIM*npoin2D_xi_all(1),CUSTOM_MPI_TYPE,receiver, &
-        itag2,MPI_COMM_WORLD,request_send,ier)
+    do ipoin = 1,npoin2D_xi_inner_core(1)
+      buffer_send_faces_vector(:,ioffset + ipoin,1) = accel_inner_core(:,iboolleft_xi_inner_core(ipoin))
+    enddo
 
-  iphase = iphase + 1
-  return ! exit because we have started some communications therefore we need some time
+    ! send messages backward along each row
+    if(iproc_xi == NPROC_XI-1) then
+      sender = MPI_PROC_NULL
+    else
+      sender = addressing(ichunk,iproc_xi + 1,iproc_eta)
+    endif
+    if(iproc_xi == 0) then
+      receiver = MPI_PROC_NULL
+    else
+      receiver = addressing(ichunk,iproc_xi - 1,iproc_eta)
+    endif
+    call MPI_IRECV(buffer_received_faces_vector,NDIM*npoin2D_xi_all(2),CUSTOM_MPI_TYPE,sender, &
+          itag,MPI_COMM_WORLD,request_receive,ier)
 
-  endif !!!!!!!!! end of iphase 2
+    call MPI_ISEND(buffer_send_faces_vector,NDIM*npoin2D_xi_all(1),CUSTOM_MPI_TYPE,receiver, &
+          itag2,MPI_COMM_WORLD,request_send,ier)
 
-  if(iphase == 3) then
+    iphase_comm = iphase_comm + 1
+    return ! exit because we have started some communications therefore we need some time
 
-! call MPI_WAIT(request_send,msg_status,ier)
-! call MPI_WAIT(request_receive,msg_status,ier)
-  call MPI_TEST(request_send,flag_result_test,msg_status,ier)
-  if(.not. flag_result_test) return ! exit if message not sent yet
-  call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
-  if(.not. flag_result_test) return ! exit if message not received yet
+  case( 3 )
 
-! all slices copy the buffer received to the contributions on the right face
-  if(iproc_xi < NPROC_XI-1) then
+    ! slices set (right face) xi sides
+    ! and send out values along eta sides (right face) forward along each column
 
-  do ipoin = 1,npoin2D_xi_crust_mantle(2)
-    accel_crust_mantle(1,iboolright_xi_crust_mantle(ipoin)) = buffer_received_faces_vector(1,ipoin,1)
-    accel_crust_mantle(2,iboolright_xi_crust_mantle(ipoin)) = buffer_received_faces_vector(2,ipoin,1)
-    accel_crust_mantle(3,iboolright_xi_crust_mantle(ipoin)) = buffer_received_faces_vector(3,ipoin,1)
-  enddo
+    ! checks if messages have been sent out and received
+    ! call MPI_WAIT(request_send,msg_status,ier)
+    ! call MPI_WAIT(request_receive,msg_status,ier)
+    call MPI_TEST(request_send,flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not sent yet
+    call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not received yet
 
-! the buffer for the inner core starts right after the buffer for the crust and mantle
-  ioffset = npoin2D_xi_crust_mantle(2)
+    ! all slices copy the buffer received to the contributions on the right face
+    if(iproc_xi < NPROC_XI-1) then
 
-  do ipoin = 1,npoin2D_xi_inner_core(2)
-    accel_inner_core(1,iboolright_xi_inner_core(ipoin)) = buffer_received_faces_vector(1,ioffset + ipoin,1)
-    accel_inner_core(2,iboolright_xi_inner_core(ipoin)) = buffer_received_faces_vector(2,ioffset + ipoin,1)
-    accel_inner_core(3,iboolright_xi_inner_core(ipoin)) = buffer_received_faces_vector(3,ioffset + ipoin,1)
-  enddo
+      do ipoin = 1,npoin2D_xi_crust_mantle(2)
+        accel_crust_mantle(:,iboolright_xi_crust_mantle(ipoin)) = buffer_received_faces_vector(:,ipoin,1)
+      enddo
 
-  endif
+      ! the buffer for the inner core starts right after the buffer for the crust and mantle
+      ioffset = npoin2D_xi_crust_mantle(2)
 
-!----
-!---- then assemble along eta using the 2-D topology
-!----
+      do ipoin = 1,npoin2D_xi_inner_core(2)
+        accel_inner_core(:,iboolright_xi_inner_core(ipoin)) = buffer_received_faces_vector(:,ioffset + ipoin,1)
+      enddo
 
-! slices copy the right face into the buffer
-  do ipoin = 1,npoin2D_eta_crust_mantle(2)
-    buffer_send_faces_vector(1,ipoin,1) = accel_crust_mantle(1,iboolright_eta_crust_mantle(ipoin))
-    buffer_send_faces_vector(2,ipoin,1) = accel_crust_mantle(2,iboolright_eta_crust_mantle(ipoin))
-    buffer_send_faces_vector(3,ipoin,1) = accel_crust_mantle(3,iboolright_eta_crust_mantle(ipoin))
-  enddo
+    endif
 
-! the buffer for the inner core starts right after the buffer for the crust and mantle
-  ioffset = npoin2D_eta_crust_mantle(2)
+    !----
+    !---- then assemble along eta using the 2-D topology
+    !----
 
-  do ipoin = 1,npoin2D_eta_inner_core(2)
-    buffer_send_faces_vector(1,ioffset + ipoin,1) = accel_inner_core(1,iboolright_eta_inner_core(ipoin))
-    buffer_send_faces_vector(2,ioffset + ipoin,1) = accel_inner_core(2,iboolright_eta_inner_core(ipoin))
-    buffer_send_faces_vector(3,ioffset + ipoin,1) = accel_inner_core(3,iboolright_eta_inner_core(ipoin))
-  enddo
+    ! slices copy the right face into the buffer
+    do ipoin = 1,npoin2D_eta_crust_mantle(2)
+      buffer_send_faces_vector(:,ipoin,1) = accel_crust_mantle(:,iboolright_eta_crust_mantle(ipoin))
+    enddo
 
-! send messages forward along each row
-  if(iproc_eta == 0) then
-    sender = MPI_PROC_NULL
-  else
-    sender = addressing(ichunk,iproc_xi,iproc_eta - 1)
-  endif
-  if(iproc_eta == NPROC_ETA-1) then
-    receiver = MPI_PROC_NULL
-  else
-    receiver = addressing(ichunk,iproc_xi,iproc_eta + 1)
-  endif
-  call MPI_IRECV(buffer_received_faces_vector,NDIM*npoin2D_eta_all(1),CUSTOM_MPI_TYPE,sender, &
-    itag,MPI_COMM_WORLD,request_receive,ier)
+    ! the buffer for the inner core starts right after the buffer for the crust and mantle
+    ioffset = npoin2D_eta_crust_mantle(2)
 
-  call MPI_ISEND(buffer_send_faces_vector,NDIM*npoin2D_eta_all(2),CUSTOM_MPI_TYPE,receiver, &
-    itag2,MPI_COMM_WORLD,request_send,ier)
+    do ipoin = 1,npoin2D_eta_inner_core(2)
+      buffer_send_faces_vector(:,ioffset + ipoin,1) = accel_inner_core(:,iboolright_eta_inner_core(ipoin))
+    enddo
 
-  iphase = iphase + 1
-  return ! exit because we have started some communications therefore we need some time
+    ! send messages forward along each row
+    if(iproc_eta == 0) then
+      sender = MPI_PROC_NULL
+    else
+      sender = addressing(ichunk,iproc_xi,iproc_eta - 1)
+    endif
+    if(iproc_eta == NPROC_ETA-1) then
+      receiver = MPI_PROC_NULL
+    else
+      receiver = addressing(ichunk,iproc_xi,iproc_eta + 1)
+    endif
+    call MPI_IRECV(buffer_received_faces_vector,NDIM*npoin2D_eta_all(1),CUSTOM_MPI_TYPE,sender, &
+      itag,MPI_COMM_WORLD,request_receive,ier)
 
-  endif !!!!!!!!! end of iphase 3
+    call MPI_ISEND(buffer_send_faces_vector,NDIM*npoin2D_eta_all(2),CUSTOM_MPI_TYPE,receiver, &
+      itag2,MPI_COMM_WORLD,request_send,ier)
 
-  if(iphase == 4) then
+    iphase_comm = iphase_comm + 1
+    return ! exit because we have started some communications therefore we need some time
 
-! call MPI_WAIT(request_send,msg_status,ier)
-! call MPI_WAIT(request_receive,msg_status,ier)
-  call MPI_TEST(request_send,flag_result_test,msg_status,ier)
-  if(.not. flag_result_test) return ! exit if message not sent yet
-  call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
-  if(.not. flag_result_test) return ! exit if message not received yet
 
-! all slices add the buffer received to the contributions on the left face
-  if(iproc_eta > 0) then
+ case( 4 )
 
-  do ipoin = 1,npoin2D_eta_crust_mantle(1)
-    accel_crust_mantle(1,iboolleft_eta_crust_mantle(ipoin)) = accel_crust_mantle(1,iboolleft_eta_crust_mantle(ipoin)) + &
-                              buffer_received_faces_vector(1,ipoin,1)
-    accel_crust_mantle(2,iboolleft_eta_crust_mantle(ipoin)) = accel_crust_mantle(2,iboolleft_eta_crust_mantle(ipoin)) + &
-                              buffer_received_faces_vector(2,ipoin,1)
-    accel_crust_mantle(3,iboolleft_eta_crust_mantle(ipoin)) = accel_crust_mantle(3,iboolleft_eta_crust_mantle(ipoin)) + &
-                              buffer_received_faces_vector(3,ipoin,1)
-  enddo
+    ! slices assemble along (left face) eta sides
+    ! and send out values along eta sides (left face) backward along each column
 
-! the buffer for the inner core starts right after the buffer for the crust and mantle
-  ioffset = npoin2D_eta_crust_mantle(1)
+    ! checks if messages have been sent out and received
+    ! call MPI_WAIT(request_send,msg_status,ier)
+    ! call MPI_WAIT(request_receive,msg_status,ier)
+    call MPI_TEST(request_send,flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not sent yet
+    call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not received yet
 
-  do ipoin = 1,npoin2D_eta_inner_core(1)
-    accel_inner_core(1,iboolleft_eta_inner_core(ipoin)) = accel_inner_core(1,iboolleft_eta_inner_core(ipoin)) + &
-                              buffer_received_faces_vector(1,ioffset + ipoin,1)
-    accel_inner_core(2,iboolleft_eta_inner_core(ipoin)) = accel_inner_core(2,iboolleft_eta_inner_core(ipoin)) + &
-                              buffer_received_faces_vector(2,ioffset + ipoin,1)
-    accel_inner_core(3,iboolleft_eta_inner_core(ipoin)) = accel_inner_core(3,iboolleft_eta_inner_core(ipoin)) + &
-                              buffer_received_faces_vector(3,ioffset + ipoin,1)
-  enddo
+    ! all slices add the buffer received to the contributions on the left face
+    if(iproc_eta > 0) then
 
-  endif
+      do ipoin = 1,npoin2D_eta_crust_mantle(1)
+        accel_crust_mantle(:,iboolleft_eta_crust_mantle(ipoin)) = &
+                  accel_crust_mantle(:,iboolleft_eta_crust_mantle(ipoin)) + &
+                                  buffer_received_faces_vector(:,ipoin,1)
+      enddo
 
-! the contributions are correctly assembled on the left side of each slice
-! now we have to send the result back to the sender
-! all slices copy the left face into the buffer
-  do ipoin = 1,npoin2D_eta_crust_mantle(1)
-    buffer_send_faces_vector(1,ipoin,1) = accel_crust_mantle(1,iboolleft_eta_crust_mantle(ipoin))
-    buffer_send_faces_vector(2,ipoin,1) = accel_crust_mantle(2,iboolleft_eta_crust_mantle(ipoin))
-    buffer_send_faces_vector(3,ipoin,1) = accel_crust_mantle(3,iboolleft_eta_crust_mantle(ipoin))
-  enddo
+      ! the buffer for the inner core starts right after the buffer for the crust and mantle
+      ioffset = npoin2D_eta_crust_mantle(1)
 
-! the buffer for the inner core starts right after the buffer for the crust and mantle
-  ioffset = npoin2D_eta_crust_mantle(1)
+      do ipoin = 1,npoin2D_eta_inner_core(1)
+        accel_inner_core(:,iboolleft_eta_inner_core(ipoin)) = &
+                  accel_inner_core(:,iboolleft_eta_inner_core(ipoin)) + &
+                                  buffer_received_faces_vector(:,ioffset + ipoin,1)
+      enddo
 
-  do ipoin = 1,npoin2D_eta_inner_core(1)
-    buffer_send_faces_vector(1,ioffset + ipoin,1) = accel_inner_core(1,iboolleft_eta_inner_core(ipoin))
-    buffer_send_faces_vector(2,ioffset + ipoin,1) = accel_inner_core(2,iboolleft_eta_inner_core(ipoin))
-    buffer_send_faces_vector(3,ioffset + ipoin,1) = accel_inner_core(3,iboolleft_eta_inner_core(ipoin))
-  enddo
+    endif
 
-! send messages backward along each row
-  if(iproc_eta == NPROC_ETA-1) then
-    sender = MPI_PROC_NULL
-  else
-    sender = addressing(ichunk,iproc_xi,iproc_eta + 1)
-  endif
-  if(iproc_eta == 0) then
-    receiver = MPI_PROC_NULL
-  else
-    receiver = addressing(ichunk,iproc_xi,iproc_eta - 1)
-  endif
-  call MPI_IRECV(buffer_received_faces_vector,NDIM*npoin2D_eta_all(2),CUSTOM_MPI_TYPE,sender, &
-    itag,MPI_COMM_WORLD,request_receive,ier)
+    ! the contributions are correctly assembled on the left side of each slice
+    ! now we have to send the result back to the sender
+    ! all slices copy the left face into the buffer
+    do ipoin = 1,npoin2D_eta_crust_mantle(1)
+      buffer_send_faces_vector(:,ipoin,1) = accel_crust_mantle(:,iboolleft_eta_crust_mantle(ipoin))
+    enddo
 
-  call MPI_ISEND(buffer_send_faces_vector,NDIM*npoin2D_eta_all(1),CUSTOM_MPI_TYPE,receiver, &
-    itag2,MPI_COMM_WORLD,request_send,ier)
+    ! the buffer for the inner core starts right after the buffer for the crust and mantle
+    ioffset = npoin2D_eta_crust_mantle(1)
 
-  iphase = iphase + 1
-  return ! exit because we have started some communications therefore we need some time
+    do ipoin = 1,npoin2D_eta_inner_core(1)
+      buffer_send_faces_vector(:,ioffset + ipoin,1) = accel_inner_core(:,iboolleft_eta_inner_core(ipoin))
+    enddo
 
-  endif !!!!!!!!! end of iphase 4
+    ! send messages backward along each row
+    if(iproc_eta == NPROC_ETA-1) then
+      sender = MPI_PROC_NULL
+    else
+      sender = addressing(ichunk,iproc_xi,iproc_eta + 1)
+    endif
+    if(iproc_eta == 0) then
+      receiver = MPI_PROC_NULL
+    else
+      receiver = addressing(ichunk,iproc_xi,iproc_eta - 1)
+    endif
+    call MPI_IRECV(buffer_received_faces_vector,NDIM*npoin2D_eta_all(2),CUSTOM_MPI_TYPE,sender, &
+      itag,MPI_COMM_WORLD,request_receive,ier)
 
-  if(iphase == 5) then
+    call MPI_ISEND(buffer_send_faces_vector,NDIM*npoin2D_eta_all(1),CUSTOM_MPI_TYPE,receiver, &
+      itag2,MPI_COMM_WORLD,request_send,ier)
 
-! call MPI_WAIT(request_send,msg_status,ier)
-! call MPI_WAIT(request_receive,msg_status,ier)
-  call MPI_TEST(request_send,flag_result_test,msg_status,ier)
-  if(.not. flag_result_test) return ! exit if message not sent yet
-  call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
-  if(.not. flag_result_test) return ! exit if message not received yet
+    iphase_comm = iphase_comm + 1
+    return ! exit because we have started some communications therefore we need some time
 
-! all slices copy the buffer received to the contributions on the right face
-  if(iproc_eta < NPROC_ETA-1) then
 
-  do ipoin = 1,npoin2D_eta_crust_mantle(2)
-    accel_crust_mantle(1,iboolright_eta_crust_mantle(ipoin)) = buffer_received_faces_vector(1,ipoin,1)
-    accel_crust_mantle(2,iboolright_eta_crust_mantle(ipoin)) = buffer_received_faces_vector(2,ipoin,1)
-    accel_crust_mantle(3,iboolright_eta_crust_mantle(ipoin)) = buffer_received_faces_vector(3,ipoin,1)
-  enddo
+  case( 5 )
 
-! the buffer for the inner core starts right after the buffer for the crust and mantle
-  ioffset = npoin2D_eta_crust_mantle(2)
+    ! slices set (right face) eta sides
+    ! and send out values for neighbor chunks (iboolfaces)
 
-  do ipoin = 1,npoin2D_eta_inner_core(2)
-    accel_inner_core(1,iboolright_eta_inner_core(ipoin)) = buffer_received_faces_vector(1,ioffset + ipoin,1)
-    accel_inner_core(2,iboolright_eta_inner_core(ipoin)) = buffer_received_faces_vector(2,ioffset + ipoin,1)
-    accel_inner_core(3,iboolright_eta_inner_core(ipoin)) = buffer_received_faces_vector(3,ioffset + ipoin,1)
-  enddo
+    ! checks if messages have been sent out and received
+    ! call MPI_WAIT(request_send,msg_status,ier)
+    ! call MPI_WAIT(request_receive,msg_status,ier)
+    call MPI_TEST(request_send,flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not sent yet
+    call MPI_TEST(request_receive,flag_result_test,msg_status,ier)
+    if(.not. flag_result_test) return ! exit if message not received yet
 
-  endif
+    ! all slices copy the buffer received to the contributions on the right face
+    if(iproc_eta < NPROC_ETA-1) then
 
-!----
-!---- start MPI assembling phase between chunks
-!----
+      do ipoin = 1,npoin2D_eta_crust_mantle(2)
+        accel_crust_mantle(:,iboolright_eta_crust_mantle(ipoin)) = buffer_received_faces_vector(:,ipoin,1)
+      enddo
 
-! check flag to see if we need to assemble (might be turned off when debugging)
-! and do not assemble if only one chunk
-  if (.not. ACTUALLY_ASSEMBLE_MPI_CHUNKS .or. NCHUNKS == 1) then
-    iphase = 9999 ! this means everything is finished
-    return
-  endif
+      ! the buffer for the inner core starts right after the buffer for the crust and mantle
+      ioffset = npoin2D_eta_crust_mantle(2)
 
-! ***************************************************************
-!  transmit messages in forward direction (iprocfrom -> iprocto)
-! ***************************************************************
+      do ipoin = 1,npoin2D_eta_inner_core(2)
+        accel_inner_core(:,iboolright_eta_inner_core(ipoin)) = buffer_received_faces_vector(:,ioffset + ipoin,1)
+      enddo
 
-!---- put slices in receive mode
-!---- a given slice can belong to at most two faces
+    endif
 
-  icount_faces = 0
-  do imsg = 1,NUMMSGS_FACES
-    if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+    !----
+    !---- start MPI assembling phase between chunks
+    !----
 
-    if(myrank==iprocto_faces(imsg)) then
-      sender = iprocfrom_faces(imsg)
+    ! check flag to see if we need to assemble (might be turned off when debugging)
+    ! and do not assemble if only one chunk
+    if (.not. ACTUALLY_ASSEMBLE_MPI_CHUNKS .or. NCHUNKS == 1) then
+      iphase_comm = 9999 ! this means everything is finished
+      return
+    endif
 
-! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
-      npoin2D_chunks_all = npoin2D_faces_crust_mantle(icount_faces) + npoin2D_faces_inner_core(icount_faces)
+    ! ***************************************************************
+    !  transmit messages in forward direction (iprocfrom -> iprocto)
+    ! ***************************************************************
 
-      call MPI_IRECV(buffer_received_faces_vector(1,1,icount_faces),NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,sender, &
-              itag,MPI_COMM_WORLD,request_receive_array(icount_faces),ier)
+    !---- put slices in receive mode
+    !---- a given slice can belong to at most two faces
 
-!   do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
-!     accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
-!        accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) + buffer_received_faces_vector(1,ipoin2D,icount_faces)
-!     accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
-!        accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) + buffer_received_faces_vector(2,ipoin2D,icount_faces)
-!     accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
-!        accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) + buffer_received_faces_vector(3,ipoin2D,icount_faces)
-!   enddo
+    icount_faces = 0
+    do imsg = 1,NUMMSGS_FACES
+      if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
 
-! the buffer for the inner core starts right after the buffer for the crust and mantle
-!   ioffset = npoin2D_faces_crust_mantle(icount_faces)
+      if(myrank==iprocto_faces(imsg)) then
+        sender = iprocfrom_faces(imsg)
 
-!   do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
-!     accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
-!        accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) + &
-!          buffer_received_faces_vector(1,ioffset + ipoin2D,icount_faces)
-!     accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
-!        accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) + &
-!          buffer_received_faces_vector(2,ioffset + ipoin2D,icount_faces)
-!     accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
-!        accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) + &
-!          buffer_received_faces_vector(3,ioffset + ipoin2D,icount_faces)
-!   enddo
+        ! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
+        npoin2D_chunks_all = npoin2D_faces_crust_mantle(icount_faces) + npoin2D_faces_inner_core(icount_faces)
 
-    endif
-  enddo
+        call MPI_IRECV(buffer_received_faces_vector(1,1,icount_faces),NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,sender, &
+                itag,MPI_COMM_WORLD,request_receive_array(icount_faces),ier)
 
-!---- put slices in send mode
-!---- a given slice can belong to at most two faces
-  icount_faces = 0
-  do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+        !   do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
+        !     accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
+        !        accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) &
+        !           + buffer_received_faces_vector(1,ipoin2D,icount_faces)
+        !     accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
+        !        accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) &
+        !           + buffer_received_faces_vector(2,ipoin2D,icount_faces)
+        !     accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
+        !        accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) &
+        !           + buffer_received_faces_vector(3,ipoin2D,icount_faces)
+        !   enddo
 
-  if(myrank==iprocfrom_faces(imsg)) then
-    receiver = iprocto_faces(imsg)
+        ! the buffer for the inner core starts right after the buffer for the crust and mantle
+        !   ioffset = npoin2D_faces_crust_mantle(icount_faces)
 
-! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
-    npoin2D_chunks_all = npoin2D_faces_crust_mantle(icount_faces) + npoin2D_faces_inner_core(icount_faces)
+        !   do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
+        !     accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+        !        accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) + &
+        !          buffer_received_faces_vector(1,ioffset + ipoin2D,icount_faces)
+        !     accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+        !        accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) + &
+        !          buffer_received_faces_vector(2,ioffset + ipoin2D,icount_faces)
+        !     accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+        !        accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) + &
+        !          buffer_received_faces_vector(3,ioffset + ipoin2D,icount_faces)
+        !   enddo
 
-    do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
-      buffer_send_faces_vector(1,ipoin2D,icount_faces) = &
-        accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces))
-      buffer_send_faces_vector(2,ipoin2D,icount_faces) = &
-        accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces))
-      buffer_send_faces_vector(3,ipoin2D,icount_faces) = &
-        accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces))
+      endif
     enddo
 
-! the buffer for the inner core starts right after the buffer for the crust and mantle
-    ioffset = npoin2D_faces_crust_mantle(icount_faces)
+    !---- put slices in send mode
+    !---- a given slice can belong to at most two faces
+    icount_faces = 0
+    do imsg = 1,NUMMSGS_FACES
+      if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
 
-    do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
-      buffer_send_faces_vector(1,ioffset + ipoin2D,icount_faces) = &
-        accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces))
-      buffer_send_faces_vector(2,ioffset + ipoin2D,icount_faces) = &
-        accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces))
-      buffer_send_faces_vector(3,ioffset + ipoin2D,icount_faces) = &
-        accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces))
-    enddo
+      if(myrank==iprocfrom_faces(imsg)) then
+        receiver = iprocto_faces(imsg)
 
-    call MPI_ISEND(buffer_send_faces_vector(1,1,icount_faces),NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,receiver,itag, &
-                     MPI_COMM_WORLD,request_send_array(icount_faces),ier)
-  endif
-  enddo
+        ! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
+        npoin2D_chunks_all = npoin2D_faces_crust_mantle(icount_faces) + npoin2D_faces_inner_core(icount_faces)
 
-  iphase = iphase + 1
-  return ! exit because we have started some communications therefore we need some time
+        do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
+          buffer_send_faces_vector(:,ipoin2D,icount_faces) = &
+            accel_crust_mantle(:,iboolfaces_crust_mantle(ipoin2D,icount_faces))
+        enddo
 
-  endif !!!!!!!!! end of iphase 5
+        ! the buffer for the inner core starts right after the buffer for the crust and mantle
+        ioffset = npoin2D_faces_crust_mantle(icount_faces)
 
-  if(iphase == 6) then
+        do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
+          buffer_send_faces_vector(:,ioffset + ipoin2D,icount_faces) = &
+            accel_inner_core(:,iboolfaces_inner_core(ipoin2D,icount_faces))
+        enddo
 
-  icount_faces = 0
-  do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocto_faces(imsg)) then
-    call MPI_TEST(request_receive_array(icount_faces),flag_result_test,msg_status,ier)
-    if(.not. flag_result_test) return ! exit if message not received yet
-  endif
-  enddo
+        call MPI_ISEND(buffer_send_faces_vector(1,1,icount_faces),NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,receiver,itag, &
+                         MPI_COMM_WORLD,request_send_array(icount_faces),ier)
+      endif
+    enddo
 
-  icount_faces = 0
-  do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocfrom_faces(imsg)) then
-    call MPI_TEST(request_send_array(icount_faces),flag_result_test,msg_status,ier)
-    if(.not. flag_result_test) return ! exit if message not sent yet
-  endif
-  enddo
+    iphase_comm = iphase_comm + 1
+    return ! exit because we have started some communications therefore we need some time
 
-  icount_faces = 0
-  do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocto_faces(imsg)) then
+  case( 6 )
 
-    do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
-      accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
-         accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) &
-         + buffer_received_faces_vector(1,ipoin2D,icount_faces)
-      accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
-         accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) &
-         + buffer_received_faces_vector(2,ipoin2D,icount_faces)
-      accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
-         accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) &
-         + buffer_received_faces_vector(3,ipoin2D,icount_faces)
+    ! receiver slices on chunk faces assemble values (iboolfaces)
+    ! and send values back to senders
+
+    ! checks if messages have been received
+    icount_faces = 0
+    do imsg = 1,NUMMSGS_FACES
+      if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+      if(myrank==iprocto_faces(imsg)) then
+        call MPI_TEST(request_receive_array(icount_faces),flag_result_test,msg_status,ier)
+        if(.not. flag_result_test) return ! exit if message not received yet
+      endif
     enddo
 
-! the buffer for the inner core starts right after the buffer for the crust and mantle
-    ioffset = npoin2D_faces_crust_mantle(icount_faces)
-
-    do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
-      accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
-         accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) + &
-           buffer_received_faces_vector(1,ioffset + ipoin2D,icount_faces)
-      accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
-         accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) + &
-           buffer_received_faces_vector(2,ioffset + ipoin2D,icount_faces)
-      accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
-         accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) + &
-           buffer_received_faces_vector(3,ioffset + ipoin2D,icount_faces)
+    ! checks if messages have been sent out
+    icount_faces = 0
+    do imsg = 1,NUMMSGS_FACES
+      if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+      if(myrank==iprocfrom_faces(imsg)) then
+        call MPI_TEST(request_send_array(icount_faces),flag_result_test,msg_status,ier)
+        if(.not. flag_result_test) return ! exit if message not sent yet
+      endif
     enddo
 
-  endif
-  enddo
+    ! assembles values on chunk faces
+    icount_faces = 0
+    do imsg = 1,NUMMSGS_FACES
+      if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+      if(myrank==iprocto_faces(imsg)) then
 
-! *********************************************************************
-!  transmit messages back in opposite direction (iprocto -> iprocfrom)
-! *********************************************************************
+        do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
+          accel_crust_mantle(:,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
+             accel_crust_mantle(:,iboolfaces_crust_mantle(ipoin2D,icount_faces)) &
+             + buffer_received_faces_vector(:,ipoin2D,icount_faces)
+        enddo
 
-!---- put slices in receive mode
-!---- a given slice can belong to at most two faces
+        ! the buffer for the inner core starts right after the buffer for the crust and mantle
+        ioffset = npoin2D_faces_crust_mantle(icount_faces)
 
-  icount_faces = 0
-  do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocfrom_faces(imsg)) then
-    sender = iprocto_faces(imsg)
+        do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
+          accel_inner_core(:,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+             accel_inner_core(:,iboolfaces_inner_core(ipoin2D,icount_faces)) + &
+               buffer_received_faces_vector(:,ioffset + ipoin2D,icount_faces)
+        enddo
 
-! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
-    npoin2D_chunks_all = npoin2D_faces_crust_mantle(icount_faces) + npoin2D_faces_inner_core(icount_faces)
+      endif
+    enddo
 
-    call MPI_IRECV(buffer_received_faces_vector(1,1,icount_faces),NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,sender, &
-              itag,MPI_COMM_WORLD,request_receive_array(icount_faces),ier)
+    ! *********************************************************************
+    !  transmit messages back in opposite direction (iprocto -> iprocfrom)
+    ! *********************************************************************
 
-!   do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
-!     accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = buffer_received_faces_vector(1,ipoin2D,icount_faces)
-!     accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = buffer_received_faces_vector(2,ipoin2D,icount_faces)
-!     accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = buffer_received_faces_vector(3,ipoin2D,icount_faces)
-!   enddo
+    !---- put slices in receive mode
+    !---- a given slice can belong to at most two faces
 
-! the buffer for the inner core starts right after the buffer for the crust and mantle
-!   ioffset = npoin2D_faces_crust_mantle(icount_faces)
+    icount_faces = 0
+    do imsg = 1,NUMMSGS_FACES
+      if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+      if(myrank==iprocfrom_faces(imsg)) then
+        sender = iprocto_faces(imsg)
 
-!   do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
-!     accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
-!       buffer_received_faces_vector(1,ioffset + ipoin2D,icount_faces)
-!     accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
-!       buffer_received_faces_vector(2,ioffset + ipoin2D,icount_faces)
-!     accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
-!       buffer_received_faces_vector(3,ioffset + ipoin2D,icount_faces)
-!   enddo
+        ! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
+        npoin2D_chunks_all = npoin2D_faces_crust_mantle(icount_faces) + npoin2D_faces_inner_core(icount_faces)
 
-  endif
-  enddo
+        call MPI_IRECV(buffer_received_faces_vector(1,1,icount_faces),NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,sender, &
+                  itag,MPI_COMM_WORLD,request_receive_array(icount_faces),ier)
 
-!---- put slices in send mode
-!---- a given slice can belong to at most two faces
-  icount_faces = 0
-  do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocto_faces(imsg)) then
-    receiver = iprocfrom_faces(imsg)
+        !   do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
+        !     accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
+        !       buffer_received_faces_vector(1,ipoin2D,icount_faces)
+        !     accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
+        !       buffer_received_faces_vector(2,ipoin2D,icount_faces)
+        !     accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
+        !       buffer_received_faces_vector(3,ipoin2D,icount_faces)
+        !   enddo
 
-! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
-    npoin2D_chunks_all = npoin2D_faces_crust_mantle(icount_faces) + npoin2D_faces_inner_core(icount_faces)
+        ! the buffer for the inner core starts right after the buffer for the crust and mantle
+        !   ioffset = npoin2D_faces_crust_mantle(icount_faces)
 
-    do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
-      buffer_send_faces_vector(1,ipoin2D,icount_faces) = &
-        accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces))
-      buffer_send_faces_vector(2,ipoin2D,icount_faces) = &
-        accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces))
-      buffer_send_faces_vector(3,ipoin2D,icount_faces) = &
-        accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces))
+        !   do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
+        !     accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+        !       buffer_received_faces_vector(1,ioffset + ipoin2D,icount_faces)
+        !     accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+        !       buffer_received_faces_vector(2,ioffset + ipoin2D,icount_faces)
+        !     accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+        !       buffer_received_faces_vector(3,ioffset + ipoin2D,icount_faces)
+        !   enddo
+
+      endif
     enddo
 
-! the buffer for the inner core starts right after the buffer for the crust and mantle
-    ioffset = npoin2D_faces_crust_mantle(icount_faces)
+    !---- put slices in send mode
+    !---- a given slice can belong to at most two faces
+    icount_faces = 0
+    do imsg = 1,NUMMSGS_FACES
+      if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+      if(myrank==iprocto_faces(imsg)) then
+        receiver = iprocfrom_faces(imsg)
 
-    do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
-      buffer_send_faces_vector(1,ioffset + ipoin2D,icount_faces) = &
-        accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces))
-      buffer_send_faces_vector(2,ioffset + ipoin2D,icount_faces) = &
-        accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces))
-      buffer_send_faces_vector(3,ioffset + ipoin2D,icount_faces) = &
-        accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces))
-    enddo
+        ! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
+        npoin2D_chunks_all = npoin2D_faces_crust_mantle(icount_faces) + npoin2D_faces_inner_core(icount_faces)
 
-    call MPI_ISEND(buffer_send_faces_vector(1,1,icount_faces),NDIM*npoin2D_chunks_all,CUSTOM_MPI_TYPE,receiver,itag, &
-                     MPI_COMM_WORLD,request_send_array(icount_faces),ier)
-  endif
-  enddo
+        do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
+          buffer_send_faces_vector(:,ipoin2D,icount_faces) = &
+            accel_crust_mantle(:,iboolfaces_crust_mantle(ipoin2D,icount_faces))
+        enddo
 
-  iphase = iphase + 1
-  return ! exit because we have started some communications therefore we need some time
+        ! the buffer for the inner core starts right after the buffer for the crust and mantle
+        ioffset = npoin2D_faces_crust_mantle(icount_faces)
 
-  endif !!!!!!!!! end of iphase 6
+        do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
+          buffer_send_faces_vector(:,ioffset + ipoin2D,icount_faces) = &
+            accel_inner_core(:,iboolfaces_inner_core(ipoin2D,icount_faces))
+        enddo
 
-  if(iphase == 7) then
+        call MPI_ISEND(buffer_send_faces_vector(1,1,icount_faces),NDIM*npoin2D_chunks_all, &
+                        CUSTOM_MPI_TYPE,receiver,itag, &
+                         MPI_COMM_WORLD,request_send_array(icount_faces),ier)
+      endif
+    enddo
 
-  icount_faces = 0
-  do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocto_faces(imsg)) then
-    call MPI_TEST(request_send_array(icount_faces),flag_result_test,msg_status,ier)
-    if(.not. flag_result_test) return ! exit if message not received yet
-  endif
-  enddo
+    iphase_comm = iphase_comm + 1
+    return ! exit because we have started some communications therefore we need some time
 
-  icount_faces = 0
-  do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocfrom_faces(imsg)) then
-    call MPI_TEST(request_receive_array(icount_faces),flag_result_test,msg_status,ier)
-    if(.not. flag_result_test) return ! exit if message not sent yet
-  endif
-  enddo
+  case( 7 )
 
-  icount_faces = 0
-  do imsg = 1,NUMMSGS_FACES
-  if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
-  if(myrank==iprocfrom_faces(imsg)) then
-    do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
-      accel_crust_mantle(1,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
-        buffer_received_faces_vector(1,ipoin2D,icount_faces)
-      accel_crust_mantle(2,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
-        buffer_received_faces_vector(2,ipoin2D,icount_faces)
-      accel_crust_mantle(3,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
-        buffer_received_faces_vector(3,ipoin2D,icount_faces)
+    ! sender slices on chunk faces set values (iboolfaces)
+    ! and slices on corners assemble values on corners with other chunks
+
+    ! checks if messages have been sent
+    icount_faces = 0
+    do imsg = 1,NUMMSGS_FACES
+      if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+      if(myrank==iprocto_faces(imsg)) then
+        call MPI_TEST(request_send_array(icount_faces),flag_result_test,msg_status,ier)
+        if(.not. flag_result_test) return ! exit if message not sent yet
+      endif
     enddo
 
-! the buffer for the inner core starts right after the buffer for the crust and mantle
-    ioffset = npoin2D_faces_crust_mantle(icount_faces)
+    ! checks if messages have been received
+    icount_faces = 0
+    do imsg = 1,NUMMSGS_FACES
+      if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+      if(myrank==iprocfrom_faces(imsg)) then
+        call MPI_TEST(request_receive_array(icount_faces),flag_result_test,msg_status,ier)
+        if(.not. flag_result_test) return ! exit if message not received yet
+      endif
+    enddo
 
-    do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
-      accel_inner_core(1,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
-        buffer_received_faces_vector(1,ioffset + ipoin2D,icount_faces)
-      accel_inner_core(2,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
-        buffer_received_faces_vector(2,ioffset + ipoin2D,icount_faces)
-      accel_inner_core(3,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
-        buffer_received_faces_vector(3,ioffset + ipoin2D,icount_faces)
+    ! sets values on faces (iboolfaces)
+    icount_faces = 0
+    do imsg = 1,NUMMSGS_FACES
+      if(myrank==iprocfrom_faces(imsg) .or. myrank==iprocto_faces(imsg)) icount_faces = icount_faces + 1
+      if(myrank==iprocfrom_faces(imsg)) then
+        do ipoin2D = 1,npoin2D_faces_crust_mantle(icount_faces)
+          accel_crust_mantle(:,iboolfaces_crust_mantle(ipoin2D,icount_faces)) = &
+            buffer_received_faces_vector(:,ipoin2D,icount_faces)
+        enddo
+
+        ! the buffer for the inner core starts right after the buffer for the crust and mantle
+        ioffset = npoin2D_faces_crust_mantle(icount_faces)
+
+        do ipoin2D = 1,npoin2D_faces_inner_core(icount_faces)
+          accel_inner_core(:,iboolfaces_inner_core(ipoin2D,icount_faces)) = &
+            buffer_received_faces_vector(:,ioffset + ipoin2D,icount_faces)
+        enddo
+      endif
     enddo
-  endif
-  enddo
 
-! this is the exit condition, to go beyond the last phase number
-  iphase = iphase + 1
+    ! this is the exit condition, to go beyond the last phase number
+    iphase_comm = iphase_comm + 1
 
-!! DK DK do the rest in blocking for now, for simplicity
+    !! DK DK do the rest in blocking for now, for simplicity
 
-!----
-!---- start MPI assembling corners
-!----
+    !----
+    !---- start MPI assembling corners
+    !----
 
-! scheme for corners cannot deadlock even if NPROC_XI = NPROC_ETA = 1
+    ! scheme for corners cannot deadlock even if NPROC_XI = NPROC_ETA = 1
 
-! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
-  NGLOB1D_RADIAL_all = NGLOB1D_RADIAL_crust_mantle + NGLOB1D_RADIAL_inner_core
+    ! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
+    NGLOB1D_RADIAL_all = NGLOB1D_RADIAL_crust_mantle + NGLOB1D_RADIAL_inner_core
 
-! the buffer for the inner core starts right after the buffer for the crust and mantle
-  ioffset = NGLOB1D_RADIAL_crust_mantle
+    ! the buffer for the inner core starts right after the buffer for the crust and mantle
+    ioffset = NGLOB1D_RADIAL_crust_mantle
 
-! ***************************************************************
-!  transmit messages in forward direction (two workers -> master)
-! ***************************************************************
+    ! ***************************************************************
+    !  transmit messages in forward direction (two workers -> master)
+    ! ***************************************************************
 
-  icount_corners = 0
+    icount_corners = 0
 
-  do imsg = 1,NCORNERSCHUNKS
+    do imsg = 1,NCORNERSCHUNKS
 
-  if(myrank == iproc_master_corners(imsg) .or. &
-     myrank == iproc_worker1_corners(imsg) .or. &
-     (NCHUNKS /= 2 .and. myrank == iproc_worker2_corners(imsg))) icount_corners = icount_corners + 1
+      if(myrank == iproc_master_corners(imsg) .or. &
+         myrank == iproc_worker1_corners(imsg) .or. &
+         (NCHUNKS /= 2 .and. myrank == iproc_worker2_corners(imsg))) icount_corners = icount_corners + 1
 
-!---- receive messages from the two workers on the master
-  if(myrank==iproc_master_corners(imsg)) then
+      !---- receive messages from the two workers on the master
+      if(myrank==iproc_master_corners(imsg)) then
 
-! receive from worker #1 and add to local array
-    sender = iproc_worker1_corners(imsg)
+        ! receive from worker #1 and add to local array
+        sender = iproc_worker1_corners(imsg)
 
-    call MPI_RECV(buffer_recv_chunkcorn_vector,NDIM*NGLOB1D_RADIAL_all, &
-          CUSTOM_MPI_TYPE,sender,itag,MPI_COMM_WORLD,msg_status,ier)
+        call MPI_RECV(buffer_recv_chunkcorn_vector,NDIM*NGLOB1D_RADIAL_all, &
+              CUSTOM_MPI_TYPE,sender,itag,MPI_COMM_WORLD,msg_status,ier)
 
-    do ipoin1D = 1,NGLOB1D_RADIAL_crust_mantle
-      accel_crust_mantle(1,iboolcorner_crust_mantle(ipoin1D,icount_corners)) = &
-               accel_crust_mantle(1,iboolcorner_crust_mantle(ipoin1D,icount_corners)) + &
-               buffer_recv_chunkcorn_vector(1,ipoin1D)
-      accel_crust_mantle(2,iboolcorner_crust_mantle(ipoin1D,icount_corners)) = &
-               accel_crust_mantle(2,iboolcorner_crust_mantle(ipoin1D,icount_corners)) + &
-               buffer_recv_chunkcorn_vector(2,ipoin1D)
-      accel_crust_mantle(3,iboolcorner_crust_mantle(ipoin1D,icount_corners)) = &
-               accel_crust_mantle(3,iboolcorner_crust_mantle(ipoin1D,icount_corners)) + &
-               buffer_recv_chunkcorn_vector(3,ipoin1D)
-    enddo
+        do ipoin1D = 1,NGLOB1D_RADIAL_crust_mantle
+          accel_crust_mantle(:,iboolcorner_crust_mantle(ipoin1D,icount_corners)) = &
+                   accel_crust_mantle(:,iboolcorner_crust_mantle(ipoin1D,icount_corners)) + &
+                   buffer_recv_chunkcorn_vector(:,ipoin1D)
+        enddo
 
-    do ipoin1D = 1,NGLOB1D_RADIAL_inner_core
-      accel_inner_core(1,iboolcorner_inner_core(ipoin1D,icount_corners)) = &
-               accel_inner_core(1,iboolcorner_inner_core(ipoin1D,icount_corners)) + &
-               buffer_recv_chunkcorn_vector(1,ioffset + ipoin1D)
-      accel_inner_core(2,iboolcorner_inner_core(ipoin1D,icount_corners)) = &
-               accel_inner_core(2,iboolcorner_inner_core(ipoin1D,icount_corners)) + &
-               buffer_recv_chunkcorn_vector(2,ioffset + ipoin1D)
-      accel_inner_core(3,iboolcorner_inner_core(ipoin1D,icount_corners)) = &
-               accel_inner_core(3,iboolcorner_inner_core(ipoin1D,icount_corners)) + &
-               buffer_recv_chunkcorn_vector(3,ioffset + ipoin1D)
-    enddo
+        do ipoin1D = 1,NGLOB1D_RADIAL_inner_core
+          accel_inner_core(:,iboolcorner_inner_core(ipoin1D,icount_corners)) = &
+                   accel_inner_core(:,iboolcorner_inner_core(ipoin1D,icount_corners)) + &
+                   buffer_recv_chunkcorn_vector(:,ioffset + ipoin1D)
+        enddo
 
-! receive from worker #2 and add to local array
-  if(NCHUNKS /= 2) then
+        ! receive from worker #2 and add to local array
+        if(NCHUNKS /= 2) then
 
-    sender = iproc_worker2_corners(imsg)
+          sender = iproc_worker2_corners(imsg)
 
-    call MPI_RECV(buffer_recv_chunkcorn_vector,NDIM*NGLOB1D_RADIAL_all, &
-          CUSTOM_MPI_TYPE,sender,itag,MPI_COMM_WORLD,msg_status,ier)
+          call MPI_RECV(buffer_recv_chunkcorn_vector,NDIM*NGLOB1D_RADIAL_all, &
+                CUSTOM_MPI_TYPE,sender,itag,MPI_COMM_WORLD,msg_status,ier)
 
-    do ipoin1D = 1,NGLOB1D_RADIAL_crust_mantle
-      accel_crust_mantle(1,iboolcorner_crust_mantle(ipoin1D,icount_corners)) = &
-               accel_crust_mantle(1,iboolcorner_crust_mantle(ipoin1D,icount_corners)) + &
-               buffer_recv_chunkcorn_vector(1,ipoin1D)
-      accel_crust_mantle(2,iboolcorner_crust_mantle(ipoin1D,icount_corners)) = &
-               accel_crust_mantle(2,iboolcorner_crust_mantle(ipoin1D,icount_corners)) + &
-               buffer_recv_chunkcorn_vector(2,ipoin1D)
-      accel_crust_mantle(3,iboolcorner_crust_mantle(ipoin1D,icount_corners)) = &
-               accel_crust_mantle(3,iboolcorner_crust_mantle(ipoin1D,icount_corners)) + &
-               buffer_recv_chunkcorn_vector(3,ipoin1D)
-    enddo
+          do ipoin1D = 1,NGLOB1D_RADIAL_crust_mantle
+            accel_crust_mantle(:,iboolcorner_crust_mantle(ipoin1D,icount_corners)) = &
+                     accel_crust_mantle(:,iboolcorner_crust_mantle(ipoin1D,icount_corners)) + &
+                     buffer_recv_chunkcorn_vector(:,ipoin1D)
+          enddo
 
-    do ipoin1D = 1,NGLOB1D_RADIAL_inner_core
-      accel_inner_core(1,iboolcorner_inner_core(ipoin1D,icount_corners)) = &
-               accel_inner_core(1,iboolcorner_inner_core(ipoin1D,icount_corners)) + &
-               buffer_recv_chunkcorn_vector(1,ioffset + ipoin1D)
-      accel_inner_core(2,iboolcorner_inner_core(ipoin1D,icount_corners)) = &
-               accel_inner_core(2,iboolcorner_inner_core(ipoin1D,icount_corners)) + &
-               buffer_recv_chunkcorn_vector(2,ioffset + ipoin1D)
-      accel_inner_core(3,iboolcorner_inner_core(ipoin1D,icount_corners)) = &
-               accel_inner_core(3,iboolcorner_inner_core(ipoin1D,icount_corners)) + &
-               buffer_recv_chunkcorn_vector(3,ioffset + ipoin1D)
+          do ipoin1D = 1,NGLOB1D_RADIAL_inner_core
+            accel_inner_core(:,iboolcorner_inner_core(ipoin1D,icount_corners)) = &
+                     accel_inner_core(:,iboolcorner_inner_core(ipoin1D,icount_corners)) + &
+                     buffer_recv_chunkcorn_vector(:,ioffset + ipoin1D)
+          enddo
+
+        endif
+
+      endif
+
+      !---- send messages from the two workers to the master
+      if(myrank==iproc_worker1_corners(imsg) .or. &
+                  (NCHUNKS /= 2 .and. myrank==iproc_worker2_corners(imsg))) then
+
+        receiver = iproc_master_corners(imsg)
+
+        do ipoin1D = 1,NGLOB1D_RADIAL_crust_mantle
+          buffer_send_chunkcorn_vector(:,ipoin1D) = &
+            accel_crust_mantle(:,iboolcorner_crust_mantle(ipoin1D,icount_corners))
+        enddo
+
+        do ipoin1D = 1,NGLOB1D_RADIAL_inner_core
+          buffer_send_chunkcorn_vector(:,ioffset + ipoin1D) = &
+            accel_inner_core(:,iboolcorner_inner_core(ipoin1D,icount_corners))
+        enddo
+
+        call MPI_SEND(buffer_send_chunkcorn_vector,NDIM*NGLOB1D_RADIAL_all, &
+                      CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,ier)
+
+      endif
+
+      ! *********************************************************************
+      !  transmit messages back in opposite direction (master -> two workers)
+      ! *********************************************************************
+
+      !---- receive messages from the master on the two workers
+      if(myrank==iproc_worker1_corners(imsg) .or. &
+                  (NCHUNKS /= 2 .and. myrank==iproc_worker2_corners(imsg))) then
+
+        ! receive from master and copy to local array
+        sender = iproc_master_corners(imsg)
+
+        call MPI_RECV(buffer_recv_chunkcorn_vector,NDIM*NGLOB1D_RADIAL_all, &
+              CUSTOM_MPI_TYPE,sender,itag,MPI_COMM_WORLD,msg_status,ier)
+
+        do ipoin1D = 1,NGLOB1D_RADIAL_crust_mantle
+          accel_crust_mantle(:,iboolcorner_crust_mantle(ipoin1D,icount_corners)) = &
+            buffer_recv_chunkcorn_vector(:,ipoin1D)
+        enddo
+
+        do ipoin1D = 1,NGLOB1D_RADIAL_inner_core
+          accel_inner_core(:,iboolcorner_inner_core(ipoin1D,icount_corners)) = &
+            buffer_recv_chunkcorn_vector(:,ioffset + ipoin1D)
+        enddo
+
+      endif
+
+      !---- send messages from the master to the two workers
+      if(myrank==iproc_master_corners(imsg)) then
+
+        do ipoin1D = 1,NGLOB1D_RADIAL_crust_mantle
+          buffer_send_chunkcorn_vector(:,ipoin1D) = &
+            accel_crust_mantle(:,iboolcorner_crust_mantle(ipoin1D,icount_corners))
+        enddo
+
+        do ipoin1D = 1,NGLOB1D_RADIAL_inner_core
+          buffer_send_chunkcorn_vector(:,ioffset + ipoin1D) = &
+            accel_inner_core(:,iboolcorner_inner_core(ipoin1D,icount_corners))
+        enddo
+
+        ! send to worker #1
+        receiver = iproc_worker1_corners(imsg)
+        call MPI_SEND(buffer_send_chunkcorn_vector,NDIM*NGLOB1D_RADIAL_all,&
+                    CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,ier)
+
+        ! send to worker #2
+        if(NCHUNKS /= 2) then
+          receiver = iproc_worker2_corners(imsg)
+          call MPI_SEND(buffer_send_chunkcorn_vector,NDIM*NGLOB1D_RADIAL_all,&
+                      CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,ier)
+        endif
+
+      endif
+
     enddo
 
+  end select
+
+  end subroutine assemble_MPI_vector
+
+!
+!-------------------------------------------------------------------------------------------------
+!-------------------------------------------------------------------------------------------------
+!-------------------------------------------------------------------------------------------------
+!
+!
+
+!daniel: todo - still local versions
+
+  subroutine assemble_MPI_vector_ext_mesh_s(NPROC,NGLOB_AB, &
+                                           array_val, &
+                                           buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, &
+                                           num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                                           nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                                           my_neighbours_ext_mesh, &
+                                           request_send_vector_ext_mesh,request_recv_vector_ext_mesh)
+
+  ! sends data
+
+  implicit none
+
+  include "constants.h"
+
+  integer :: NPROC
+  integer :: NGLOB_AB
+
+  ! array to assemble
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: array_val
+
+  integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
+
+  real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
+       buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh
+
+  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh,my_neighbours_ext_mesh
+  integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
+  integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
+
+  integer ipoin,iinterface
+
+  ! here we have to assemble all the contributions between partitions using MPI
+
+  ! assemble only if more than one partition
+  if(NPROC > 1) then
+
+     ! partition border copy into the buffer
+     do iinterface = 1, num_interfaces_ext_mesh
+        do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
+           buffer_send_vector_ext_mesh(:,ipoin,iinterface) = &
+                array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface))
+        enddo
+     enddo
+
+     ! send messages
+     do iinterface = 1, num_interfaces_ext_mesh
+        call isend_cr(buffer_send_vector_ext_mesh(1,1,iinterface), &
+             NDIM*nibool_interfaces_ext_mesh(iinterface), &
+             my_neighbours_ext_mesh(iinterface), &
+             itag, &
+             request_send_vector_ext_mesh(iinterface) &
+             )
+        call irecv_cr(buffer_recv_vector_ext_mesh(1,1,iinterface), &
+             NDIM*nibool_interfaces_ext_mesh(iinterface), &
+             my_neighbours_ext_mesh(iinterface), &
+             itag, &
+             request_recv_vector_ext_mesh(iinterface) &
+             )
+     enddo
+
   endif
 
+  end subroutine assemble_MPI_vector_ext_mesh_s
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+!daniel: TODO - cuda scalar assembly...
+
+! note: enabling interrupt-driven message handling might improve MPI performance
+! see: https://computing.llnl.gov/tutorials/mpi_performance/#Sender-ReceiverSync
+!
+! check: MP_CSS_INTERRUPT environment variable on IBM systems
+
+
+  subroutine assemble_MPI_vector_send_cuda(NPROC, &
+                                          buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, &
+                                          num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                                          nibool_interfaces_ext_mesh, &
+                                          my_neighbours_ext_mesh, &
+                                          request_send_vector_ext_mesh,request_recv_vector_ext_mesh,&
+                                          IREGION,FORWARD_OR_ADJOINT)
+
+  ! sends data
+  ! note: array to assemble already filled into buffer_send_vector_ext_mesh array
+  use constants
+  use specfem_par,only: Mesh_pointer
+
+  implicit none
+
+  integer :: NPROC
+
+  integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
+
+  real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
+       buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh
+
+  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh,my_neighbours_ext_mesh
+  integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
+  
+  integer :: IREGION
+  integer :: FORWARD_OR_ADJOINT
+  
+  ! local parameters
+  integer iinterface
+
+  ! send only if more than one partition
+  if(NPROC > 1) then
+
+    ! preparation of the contribution between partitions using MPI
+    ! transfers mpi buffers to CPU
+    call transfer_boun_accel_from_device(Mesh_pointer, &
+                                        buffer_send_vector_ext_mesh,&
+                                        IREGION,FORWARD_OR_ADJOINT)
+
+     ! send messages
+     do iinterface = 1, num_interfaces_ext_mesh
+        call isend_cr(buffer_send_vector_ext_mesh(1,1,iinterface), &
+             NDIM*nibool_interfaces_ext_mesh(iinterface), &
+             my_neighbours_ext_mesh(iinterface), &
+             itag, &
+             request_send_vector_ext_mesh(iinterface) &
+             )
+        call irecv_cr(buffer_recv_vector_ext_mesh(1,1,iinterface), &
+             NDIM*nibool_interfaces_ext_mesh(iinterface), &
+             my_neighbours_ext_mesh(iinterface), &
+             itag, &
+             request_recv_vector_ext_mesh(iinterface) &
+             )
+     enddo
+
   endif
 
-!---- send messages from the two workers to the master
-  if(myrank==iproc_worker1_corners(imsg) .or. &
-              (NCHUNKS /= 2 .and. myrank==iproc_worker2_corners(imsg))) then
+  end subroutine assemble_MPI_vector_send_cuda
 
-    receiver = iproc_master_corners(imsg)
+!
+!-------------------------------------------------------------------------------------------------
+!
 
-    do ipoin1D = 1,NGLOB1D_RADIAL_crust_mantle
-      buffer_send_chunkcorn_vector(1,ipoin1D) = &
-        accel_crust_mantle(1,iboolcorner_crust_mantle(ipoin1D,icount_corners))
-      buffer_send_chunkcorn_vector(2,ipoin1D) = &
-        accel_crust_mantle(2,iboolcorner_crust_mantle(ipoin1D,icount_corners))
-      buffer_send_chunkcorn_vector(3,ipoin1D) = &
-        accel_crust_mantle(3,iboolcorner_crust_mantle(ipoin1D,icount_corners))
+  subroutine assemble_MPI_vector_ext_mesh_w(NPROC,NGLOB_AB, &
+                                           array_val, &
+                                           buffer_recv_vector_ext_mesh, &
+                                           num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                                           nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                                           request_send_vector_ext_mesh,request_recv_vector_ext_mesh)
+
+! waits for data to receive and assembles
+
+  implicit none
+
+  include "constants.h"
+
+  integer :: NPROC
+  integer :: NGLOB_AB
+
+  ! array to assemble
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: array_val
+
+  integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
+
+  real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
+       buffer_recv_vector_ext_mesh
+
+  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
+  integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
+  integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
+
+  integer ipoin,iinterface
+
+! here we have to assemble all the contributions between partitions using MPI
+
+  ! assemble only if more than one partition
+  if(NPROC > 1) then
+
+    ! wait for communications completion (recv)
+    do iinterface = 1, num_interfaces_ext_mesh
+      call wait_req(request_recv_vector_ext_mesh(iinterface))
     enddo
 
-    do ipoin1D = 1,NGLOB1D_RADIAL_inner_core
-      buffer_send_chunkcorn_vector(1,ioffset + ipoin1D) = &
-        accel_inner_core(1,iboolcorner_inner_core(ipoin1D,icount_corners))
-      buffer_send_chunkcorn_vector(2,ioffset + ipoin1D) = &
-        accel_inner_core(2,iboolcorner_inner_core(ipoin1D,icount_corners))
-      buffer_send_chunkcorn_vector(3,ioffset + ipoin1D) = &
-        accel_inner_core(3,iboolcorner_inner_core(ipoin1D,icount_corners))
+    ! adding contributions of neighbours
+    do iinterface = 1, num_interfaces_ext_mesh
+      do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
+        array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
+             array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) &
+             + buffer_recv_vector_ext_mesh(:,ipoin,iinterface)
+      enddo
     enddo
 
-    call MPI_SEND(buffer_send_chunkcorn_vector,NDIM*NGLOB1D_RADIAL_all,CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,ier)
+    ! wait for communications completion (send)
+    do iinterface = 1, num_interfaces_ext_mesh
+      call wait_req(request_send_vector_ext_mesh(iinterface))
+    enddo
 
   endif
 
-! *********************************************************************
-!  transmit messages back in opposite direction (master -> two workers)
-! *********************************************************************
+  end subroutine assemble_MPI_vector_ext_mesh_w
 
-!---- receive messages from the master on the two workers
-  if(myrank==iproc_worker1_corners(imsg) .or. &
-              (NCHUNKS /= 2 .and. myrank==iproc_worker2_corners(imsg))) then
 
-! receive from master and copy to local array
-    sender = iproc_master_corners(imsg)
+!
+!-------------------------------------------------------------------------------------------------
+!
 
-    call MPI_RECV(buffer_recv_chunkcorn_vector,NDIM*NGLOB1D_RADIAL_all, &
-          CUSTOM_MPI_TYPE,sender,itag,MPI_COMM_WORLD,msg_status,ier)
+  subroutine assemble_MPI_vector_write_cuda(NPROC, &
+                                            buffer_recv_vector_ext_mesh, &
+                                            num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                                            request_send_vector_ext_mesh,request_recv_vector_ext_mesh, &
+                                            IREGION,FORWARD_OR_ADJOINT )
 
-    do ipoin1D = 1,NGLOB1D_RADIAL_crust_mantle
-      accel_crust_mantle(1,iboolcorner_crust_mantle(ipoin1D,icount_corners)) = &
-        buffer_recv_chunkcorn_vector(1,ipoin1D)
-      accel_crust_mantle(2,iboolcorner_crust_mantle(ipoin1D,icount_corners)) = &
-        buffer_recv_chunkcorn_vector(2,ipoin1D)
-      accel_crust_mantle(3,iboolcorner_crust_mantle(ipoin1D,icount_corners)) = &
-        buffer_recv_chunkcorn_vector(3,ipoin1D)
+! waits for data to receive and assembles
+  use constants
+  use specfem_par,only: Mesh_pointer
+  
+  implicit none
+
+  integer :: NPROC
+
+  integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
+
+  real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
+       buffer_recv_vector_ext_mesh
+  integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
+
+  integer :: IREGION
+  integer :: FORWARD_OR_ADJOINT
+
+  ! local parameters
+  
+  integer iinterface
+
+! here we have to assemble all the contributions between partitions using MPI
+
+  ! assemble only if more than one partition
+  if(NPROC > 1) then
+
+    ! wait for communications completion (recv)
+    do iinterface = 1, num_interfaces_ext_mesh
+      call wait_req(request_recv_vector_ext_mesh(iinterface))
     enddo
 
-    do ipoin1D = 1,NGLOB1D_RADIAL_inner_core
-      accel_inner_core(1,iboolcorner_inner_core(ipoin1D,icount_corners)) = &
-        buffer_recv_chunkcorn_vector(1,ioffset + ipoin1D)
-      accel_inner_core(2,iboolcorner_inner_core(ipoin1D,icount_corners)) = &
-        buffer_recv_chunkcorn_vector(2,ioffset + ipoin1D)
-      accel_inner_core(3,iboolcorner_inner_core(ipoin1D,icount_corners)) = &
-        buffer_recv_chunkcorn_vector(3,ioffset + ipoin1D)
+    ! adding contributions of neighbours
+    call transfer_asmbl_accel_to_device(Mesh_pointer, &
+                                      buffer_recv_vector_ext_mesh, &                                      
+                                      IREGION,FORWARD_OR_ADJOINT)
+
+    ! This step is done on the GPU by the preceding transfer_asmbl_accel_to_device() call
+    ! do iinterface = 1, num_interfaces_ext_mesh
+    !   do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
+    !     array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
+    !          array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_vector_ext_mesh(:,ipoin,iinterface)
+    !   enddo
+    ! enddo
+
+    ! wait for communications completion (send)
+    do iinterface = 1, num_interfaces_ext_mesh
+      call wait_req(request_send_vector_ext_mesh(iinterface))
     enddo
 
   endif
 
-!---- send messages from the master to the two workers
-  if(myrank==iproc_master_corners(imsg)) then
+  end subroutine assemble_MPI_vector_write_cuda
 
-    do ipoin1D = 1,NGLOB1D_RADIAL_crust_mantle
-      buffer_send_chunkcorn_vector(1,ipoin1D) = &
-        accel_crust_mantle(1,iboolcorner_crust_mantle(ipoin1D,icount_corners))
-      buffer_send_chunkcorn_vector(2,ipoin1D) = &
-        accel_crust_mantle(2,iboolcorner_crust_mantle(ipoin1D,icount_corners))
-      buffer_send_chunkcorn_vector(3,ipoin1D) = &
-        accel_crust_mantle(3,iboolcorner_crust_mantle(ipoin1D,icount_corners))
-    enddo
+!
+!----
+!
 
-    do ipoin1D = 1,NGLOB1D_RADIAL_inner_core
-      buffer_send_chunkcorn_vector(1,ioffset + ipoin1D) = &
-        accel_inner_core(1,iboolcorner_inner_core(ipoin1D,icount_corners))
-      buffer_send_chunkcorn_vector(2,ioffset + ipoin1D) = &
-        accel_inner_core(2,iboolcorner_inner_core(ipoin1D,icount_corners))
-      buffer_send_chunkcorn_vector(3,ioffset + ipoin1D) = &
-        accel_inner_core(3,iboolcorner_inner_core(ipoin1D,icount_corners))
-    enddo
+  subroutine irecv_cr(recvbuf, recvcount, dest, recvtag, req)
 
-! send to worker #1
-    receiver = iproc_worker1_corners(imsg)
-    call MPI_SEND(buffer_send_chunkcorn_vector,NDIM*NGLOB1D_RADIAL_all,CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,ier)
+  implicit none
 
-! send to worker #2
-  if(NCHUNKS /= 2) then
-    receiver = iproc_worker2_corners(imsg)
-    call MPI_SEND(buffer_send_chunkcorn_vector,NDIM*NGLOB1D_RADIAL_all,CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,ier)
+! standard include of the MPI library
+  include 'mpif.h'
 
-  endif
+  include "constants.h"
+  include "precision.h"
 
-  endif
+  integer recvcount, dest, recvtag, req
+  real(kind=CUSTOM_REAL), dimension(recvcount) :: recvbuf
 
-  enddo
+  integer ier
 
-  endif !!!!!!!!! end of iphase 7
+  call MPI_IRECV(recvbuf(1),recvcount,CUSTOM_MPI_TYPE,dest,recvtag, &
+                  MPI_COMM_WORLD,req,ier)
 
-  end subroutine assemble_MPI_vector
+  end subroutine irecv_cr
 
+!
+!----
+!
+
+  subroutine isend_cr(sendbuf, sendcount, dest, sendtag, req)
+
+  implicit none
+
+! standard include of the MPI library
+  include 'mpif.h'
+
+  include "constants.h"
+  include "precision.h"
+
+  integer sendcount, dest, sendtag, req
+  real(kind=CUSTOM_REAL), dimension(sendcount) :: sendbuf
+
+  integer ier
+
+  call MPI_ISEND(sendbuf(1),sendcount,CUSTOM_MPI_TYPE,dest,sendtag, &
+                  MPI_COMM_WORLD,req,ier)
+
+  end subroutine isend_cr
+
+!
+!----
+!
+
+  subroutine wait_req(req)
+
+  implicit none
+
+! standard include of the MPI library
+  include 'mpif.h'
+
+  integer :: req
+
+  integer, dimension(MPI_STATUS_SIZE) :: req_mpi_status
+
+  integer :: ier
+
+  call mpi_wait(req,req_mpi_status,ier)
+
+  end subroutine wait_req
+

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/check_simulation_stability.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/check_simulation_stability.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/check_simulation_stability.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -27,7 +27,9 @@
 
   subroutine check_simulation_stability(it,displ_crust_mantle,displ_inner_core,displ_outer_core, &
                           b_displ_crust_mantle,b_displ_inner_core,b_displ_outer_core, &
-                          eps_trace_over_3_crust_mantle,epsilondev_crust_mantle, &
+                          eps_trace_over_3_crust_mantle, &
+                          epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+                          epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
                           SIMULATION_TYPE,OUTPUT_FILES,time_start,DT,t0,NSTEP, &
                           myrank) !COMPUTE_AND_STORE_STRAIN,myrank)
 
@@ -52,9 +54,13 @@
 
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STRAIN_ONLY) :: &
     eps_trace_over_3_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_OR_ATT) ::  &
-    epsilondev_crust_mantle
 
+!  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_OR_ATT) ::  &
+!    epsilondev_crust_mantle
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_OR_ATT) ::  &
+    epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+    epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle
+
   integer SIMULATION_TYPE
   character(len=150) OUTPUT_FILES
 
@@ -124,7 +130,9 @@
 
   if (COMPUTE_AND_STORE_STRAIN) then
     Strain_norm = maxval(abs(eps_trace_over_3_crust_mantle))
-    strain2_norm= maxval(abs(epsilondev_crust_mantle))
+    strain2_norm= max( maxval(abs(epsilondev_xx_crust_mantle)),maxval(abs(epsilondev_yy_crust_mantle)), &
+                       maxval(abs(epsilondev_xy_crust_mantle)),maxval(abs(epsilondev_xz_crust_mantle)), &
+                       maxval(abs(epsilondev_yz_crust_mantle)) )
     call MPI_REDUCE(Strain_norm,Strain_norm_all,1,CUSTOM_MPI_TYPE,MPI_MAX,0, &
              MPI_COMM_WORLD,ier)
     call MPI_REDUCE(Strain2_norm,Strain2_norm_all,1,CUSTOM_MPI_TYPE,MPI_MAX,0, &

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_add_sources.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_add_sources.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_add_sources.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -274,10 +274,10 @@
             !           the file at NSTEP corresponds to -t0 + (NSTEP-1)*DT
             !
             !           the same time step is saved for the forward wavefields to reconstruct them;
-            !           however, the Newark time scheme acts at the very beginning of this time loop
+            !           however, the Newmark time scheme acts at the very beginning of this time loop
             !           such that we have the backward/reconstructed wavefield updated by
             !           a single time step into the direction -DT and b_displ(it=1) would  corresponds to -t0 + (NSTEP-1)*DT - DT
-            !           after the Newark (predictor) time step update.
+            !           after the Newmark (predictor) time step update.
             !           however, we will read the backward/reconstructed wavefield at the end of the first time loop,
             !           such that b_displ(it=1) corresponds to -t0 + (NSTEP-1)*DT (which is the one saved in the files).
             !

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_arrays_source.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_arrays_source.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_arrays_source.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -183,7 +183,7 @@
   it_end = iadjsrc(it_sub_adj,1)+NSTEP_BLOCK-1
 
 
-  ! unfortunately, things become more tricky because of the Newark time scheme at
+  ! unfortunately, things become more tricky because of the Newmark time scheme at
   ! the very beginning of the time loop. however, when we read in the backward/reconstructed
   ! wavefields at the end of the first time loop, we can use the adjoint source index from 3000 down to 1.
   !

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_element.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_element.F90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_element.F90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -30,12 +30,6 @@
 ! or compile with: -D_HANDOPT
 !#define _HANDOPT
 
-! BEWARE:
-! BEWARE: we have observed that using _HANDOPT in combination with -O3 or higher can lead to problems on some machines;
-! BEWARE: thus, be careful when using it. At the very least, run the same test simulation once with _HANDOPT and once without
-! BEWARE: and make sure that all the seismograms you get are the same down to roundoff noise.
-! BEWARE:
-
 ! note: these hand optimizations should help compilers to pipeline the code and make better use of the cache;
 !          depending on compilers, it can further decrease the computation time by ~ 30%.
 !          the original routines are commented with "! way 1", the hand-optimized routines with  "! way 2"
@@ -47,7 +41,8 @@
                     wgll_cube, &
                     kappavstore,muvstore, &
                     ibool, &
-                    R_memory,epsilon_trace_over_3, &
+                    R_xx,R_yy,R_xy,R_xz,R_yz, &
+                    epsilon_trace_over_3, &
                     one_minus_sum_beta,vx,vy,vz,vnspec, &
                     tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, &
                     dummyx_loc,dummyy_loc,dummyz_loc,epsilondev_loc,rho_s_H)
@@ -86,7 +81,11 @@
   ! memory variables for attenuation
   ! memory variables R_ij are stored at the local rather than global level
   ! to allow for optimization of cache access by compiler
-  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUAT) :: R_memory
+!  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUAT) :: R_memory
+
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUAT) :: &
+    R_xx,R_yy,R_xy,R_xz,R_yz
+
   real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: epsilon_trace_over_3
 
   integer :: vx,vy,vz,vnspec
@@ -219,8 +218,12 @@
         if(ATTENUATION_VAL .and. ( USE_ATTENUATION_MIMIC .eqv. .false. )  ) then
 
           ! note: fortran passes pointers to array location, thus R_memory(1,1,...) should be fine
-          call compute_element_att_stress( R_memory(1,1,i,j,k,ispec), &
-                    sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz)
+          call compute_element_att_stress(R_xx(1,i,j,k,ispec), &
+                                         R_yy(1,i,j,k,ispec), &
+                                         R_xy(1,i,j,k,ispec), &
+                                         R_xz(1,i,j,k,ispec), &
+                                         R_yz(1,i,j,k,ispec), &
+                                         sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz)
 
         endif ! ATTENUATION_VAL
 
@@ -370,7 +373,8 @@
                     wgll_cube, &
                     kappavstore,kappahstore,muvstore,muhstore,eta_anisostore, &
                     ibool, &
-                    R_memory,epsilon_trace_over_3, &
+                    R_xx,R_yy,R_xy,R_xz,R_yz, &
+                    epsilon_trace_over_3, &
                     one_minus_sum_beta,vx,vy,vz,vnspec, &
                     tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, &
                     dummyx_loc,dummyy_loc,dummyz_loc,epsilondev_loc,rho_s_H)
@@ -411,7 +415,10 @@
   ! memory variables for attenuation
   ! memory variables R_ij are stored at the local rather than global level
   ! to allow for optimization of cache access by compiler
-  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUAT) :: R_memory
+!  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUAT) :: R_memory
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUAT) :: &
+    R_xx,R_yy,R_xy,R_xz,R_yz
+
   real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: epsilon_trace_over_3
 
   integer vx,vy,vz,vnspec
@@ -753,8 +760,12 @@
         if(ATTENUATION_VAL .and. ( USE_ATTENUATION_MIMIC .eqv. .false. )  ) then
 
           ! note: fortran passes pointers to array location, thus R_memory(1,1,...) should be fine
-          call compute_element_att_stress( R_memory(1,1,i,j,k,ispec), &
-                    sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz)
+          call compute_element_att_stress(R_xx(1,i,j,k,ispec), &
+                                         R_yy(1,i,j,k,ispec), &
+                                         R_xy(1,i,j,k,ispec), &
+                                         R_xz(1,i,j,k,ispec), &
+                                         R_yz(1,i,j,k,ispec), &
+                                         sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz)
 
         endif ! ATTENUATION_VAL
 
@@ -903,7 +914,8 @@
                     c23store,c24store,c25store,c26store,c33store,c34store,c35store, &
                     c36store,c44store,c45store,c46store,c55store,c56store,c66store, &
                     ibool, &
-                    R_memory,epsilon_trace_over_3, &
+                    R_xx,R_yy,R_xy,R_xz,R_yz, &
+                    epsilon_trace_over_3, &
                     one_minus_sum_beta,vx,vy,vz,vnspec, &
                     tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, &
                     dummyx_loc,dummyy_loc,dummyz_loc,epsilondev_loc,rho_s_H)
@@ -944,7 +956,10 @@
   ! memory variables for attenuation
   ! memory variables R_ij are stored at the local rather than global level
   ! to allow for optimization of cache access by compiler
-  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUAT) :: R_memory
+!  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUAT) :: R_memory
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUAT) :: &
+    R_xx,R_yy,R_xy,R_xz,R_yz
+
   real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: epsilon_trace_over_3
 
   integer vx,vy,vz,vnspec
@@ -1116,8 +1131,12 @@
         if(ATTENUATION_VAL .and. ( USE_ATTENUATION_MIMIC .eqv. .false. )  ) then
 
           ! note: fortran passes pointers to array location, thus R_memory(1,1,...) should be fine
-          call compute_element_att_stress( R_memory(1,1,i,j,k,ispec), &
-                    sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz)
+          call compute_element_att_stress(R_xx(1,i,j,k,ispec), &
+                                         R_yy(1,i,j,k,ispec), &
+                                         R_xy(1,i,j,k,ispec), &
+                                         R_xz(1,i,j,k,ispec), &
+                                         R_yz(1,i,j,k,ispec), &
+                                         sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz)
 
         endif ! ATTENUATION_VAL
 
@@ -1254,8 +1273,8 @@
 !
 
 
-  subroutine compute_element_att_stress( R_memory_loc, &
-                    sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz)
+  subroutine compute_element_att_stress(R_xx_loc,R_yy_loc,R_xy_loc,R_xz_loc,R_yz_loc, &
+                                       sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz)
 
   implicit none
 
@@ -1269,19 +1288,25 @@
   ! memory variables for attenuation
   ! memory variables R_ij are stored at the local rather than global level
   ! to allow for optimization of cache access by compiler
-  real(kind=CUSTOM_REAL), dimension(5,N_SLS) :: R_memory_loc
+!  real(kind=CUSTOM_REAL), dimension(5,N_SLS) :: R_memory_loc
+  real(kind=CUSTOM_REAL), dimension(N_SLS) :: R_xx_loc
+  real(kind=CUSTOM_REAL), dimension(N_SLS) :: R_yy_loc
+  real(kind=CUSTOM_REAL), dimension(N_SLS) :: R_xy_loc
+  real(kind=CUSTOM_REAL), dimension(N_SLS) :: R_xz_loc
+  real(kind=CUSTOM_REAL), dimension(N_SLS) :: R_yz_loc
+
   real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz
 
 ! local parameters
   real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1
   integer :: i_SLS
-#ifdef _HANDOPT
+#ifdef _HANDOPT_ATT
   real(kind=CUSTOM_REAL) R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
   integer :: imodulo_N_SLS
   integer :: i_SLS1,i_SLS2
 #endif
 
-#ifdef _HANDOPT
+#ifdef _HANDOPT_ATT
 ! way 2:
 ! note: this should help compilers to pipeline the code and make better use of the cache;
 !          depending on compilers, it can further decrease the computation time by ~ 30%.
@@ -1290,61 +1315,61 @@
 
   if(imodulo_N_SLS >= 1) then
     do i_SLS = 1,imodulo_N_SLS
-      R_xx_val1 = R_memory_loc(1,i_SLS)
-      R_yy_val1 = R_memory_loc(2,i_SLS)
+      R_xx_val1 = R_xx_loc(i_SLS)
+      R_yy_val1 = R_yy_loc(i_SLS)
       sigma_xx = sigma_xx - R_xx_val1
       sigma_yy = sigma_yy - R_yy_val1
       sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-      sigma_xy = sigma_xy - R_memory_loc(3,i_SLS)
-      sigma_xz = sigma_xz - R_memory_loc(4,i_SLS)
-      sigma_yz = sigma_yz - R_memory_loc(5,i_SLS)
+      sigma_xy = sigma_xy - R_xy_loc(i_SLS)
+      sigma_xz = sigma_xz - R_xz_loc(i_SLS)
+      sigma_yz = sigma_yz - R_yz_loc(i_SLS)
     enddo
   endif
   if(N_SLS >= imodulo_N_SLS+1) then
     ! note: another possibility would be using a reduction example for this loop; was tested but it does not improve,
     ! probably since N_SLS == 3 is too small for a loop benefit
     do i_SLS = imodulo_N_SLS+1,N_SLS,3
-      R_xx_val1 = R_memory_loc(1,i_SLS)
-      R_yy_val1 = R_memory_loc(2,i_SLS)
+      R_xx_val1 = R_xx_loc(i_SLS)
+      R_yy_val1 = R_yy_loc(i_SLS)
       sigma_xx = sigma_xx - R_xx_val1
       sigma_yy = sigma_yy - R_yy_val1
       sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-      sigma_xy = sigma_xy - R_memory_loc(3,i_SLS)
-      sigma_xz = sigma_xz - R_memory_loc(4,i_SLS)
-      sigma_yz = sigma_yz - R_memory_loc(5,i_SLS)
+      sigma_xy = sigma_xy - R_xy_loc(i_SLS)
+      sigma_xz = sigma_xz - R_xz_loc(i_SLS)
+      sigma_yz = sigma_yz - R_yz_loc(i_SLS)
 
       i_SLS1=i_SLS+1
-      R_xx_val2 = R_memory_loc(1,i_SLS1)
-      R_yy_val2 = R_memory_loc(2,i_SLS1)
+      R_xx_val2 = R_xx_loc(i_SLS1)
+      R_yy_val2 = R_yy_loc(i_SLS1)
       sigma_xx = sigma_xx - R_xx_val2
       sigma_yy = sigma_yy - R_yy_val2
       sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
-      sigma_xy = sigma_xy - R_memory_loc(3,i_SLS1)
-      sigma_xz = sigma_xz - R_memory_loc(4,i_SLS1)
-      sigma_yz = sigma_yz - R_memory_loc(5,i_SLS1)
+      sigma_xy = sigma_xy - R_xy_loc(i_SLS1)
+      sigma_xz = sigma_xz - R_xz_loc(i_SLS1)
+      sigma_yz = sigma_yz - R_yz_loc(i_SLS1)
 
       i_SLS2 =i_SLS+2
-      R_xx_val3 = R_memory_loc(1,i_SLS2)
-      R_yy_val3 = R_memory_loc(2,i_SLS2)
+      R_xx_val3 = R_xx_loc(i_SLS2)
+      R_yy_val3 = R_yy_loc(i_SLS2)
       sigma_xx = sigma_xx - R_xx_val3
       sigma_yy = sigma_yy - R_yy_val3
       sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
-      sigma_xy = sigma_xy - R_memory_loc(3,i_SLS2)
-      sigma_xz = sigma_xz - R_memory_loc(4,i_SLS2)
-      sigma_yz = sigma_yz - R_memory_loc(5,i_SLS2)
+      sigma_xy = sigma_xy - R_xy_loc(i_SLS2)
+      sigma_xz = sigma_xz - R_xz_loc(i_SLS2)
+      sigma_yz = sigma_yz - R_yz_loc(i_SLS2)
     enddo
   endif
 #else
 ! way 1:
   do i_SLS = 1,N_SLS
-    R_xx_val1 = R_memory_loc(1,i_SLS) ! R_memory(1,i_SLS,i,j,k,ispec)
-    R_yy_val1 = R_memory_loc(2,i_SLS) ! R_memory(2,i_SLS,i,j,k,ispec)
+    R_xx_val1 = R_xx_loc(i_SLS) ! R_memory(1,i_SLS,i,j,k,ispec)
+    R_yy_val1 = R_yy_loc(i_SLS) ! R_memory(2,i_SLS,i,j,k,ispec)
     sigma_xx = sigma_xx - R_xx_val1
     sigma_yy = sigma_yy - R_yy_val1
     sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-    sigma_xy = sigma_xy - R_memory_loc(3,i_SLS) ! R_memory(3,i_SLS,i,j,k,ispec)
-    sigma_xz = sigma_xz - R_memory_loc(4,i_SLS) ! R_memory(4,i_SLS,i,j,k,ispec)
-    sigma_yz = sigma_yz - R_memory_loc(5,i_SLS) ! R_memory(5,i_SLS,i,j,k,ispec)
+    sigma_xy = sigma_xy - R_xy_loc(i_SLS) ! R_memory(3,i_SLS,i,j,k,ispec)
+    sigma_xz = sigma_xz - R_xz_loc(i_SLS) ! R_memory(4,i_SLS,i,j,k,ispec)
+    sigma_yz = sigma_yz - R_yz_loc(i_SLS) ! R_memory(5,i_SLS,i,j,k,ispec)
   enddo
 #endif
 
@@ -1355,11 +1380,13 @@
 !--------------------------------------------------------------------------------------------
 !
 
-  subroutine compute_element_att_memory_cr(ispec,R_memory, &
+  subroutine compute_element_att_memory_cr(ispec,R_xx,R_yy,R_xy,R_xz,R_yz, &
                                         vx,vy,vz,vnspec,factor_common, &
                                         alphaval,betaval,gammaval, &
                                         c44store,muvstore, &
-                                        epsilondev,epsilondev_loc)
+                                        epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+                                        epsilondev_xz,epsilondev_yz, &
+                                        epsilondev_loc)
 ! crust mantle
 ! update memory variables based upon the Runge-Kutta scheme
 
@@ -1388,7 +1415,9 @@
   ! element id
   integer :: ispec
 
-  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUAT) :: R_memory
+!  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUAT) :: R_memory
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUAT) :: &
+    R_xx,R_yy,R_xy,R_xz,R_yz
 
   integer :: vx,vy,vz,vnspec
   real(kind=CUSTOM_REAL), dimension(N_SLS, vx, vy, vz, vnspec) :: factor_common
@@ -1397,18 +1426,19 @@
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ANISO_MANTLE) :: c44store
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ISO_MANTLE) :: muvstore
 
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: epsilondev
+!  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: epsilondev
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: &
+    epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
+
   real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
 
 ! local parameters
   real(kind=CUSTOM_REAL), dimension(NGLLX, NGLLY, NGLLZ) :: factor_common_c44_muv
   integer :: i_SLS
 
-#ifdef _HANDOPT
+#ifdef __HANDOPT_ATT
   real(kind=CUSTOM_REAL) :: alphal,betal,gammal
   integer :: i,j,k
-#else
-  integer :: i_memory
 #endif
 
   ! use Runge-Kutta scheme to march in time
@@ -1417,7 +1447,7 @@
   ! IMPROVE we use mu_v here even if there is some anisotropy
   ! IMPROVE we should probably use an average value instead
 
-#ifdef _HANDOPT
+#ifdef __HANDOPT_ATT
 ! way 2:
   do i_SLS = 1,N_SLS
 
@@ -1438,9 +1468,31 @@
     do k=1,NGLLZ
       do j=1,NGLLY
         do i=1,NGLLX
-          R_memory(:,i_SLS,i,j,k,ispec) = alphal * R_memory(:,i_SLS,i,j,k,ispec) &
+!          R_memory(:,i_SLS,i,j,k,ispec) = alphal * R_memory(:,i_SLS,i,j,k,ispec) &
+!                  + factor_common_c44_muv(i,j,k) &
+!                  *( betal * epsilondev(:,i,j,k,ispec) + gammal * epsilondev_loc(:,i,j,k))
+
+          R_xx(i_SLS,i,j,k,ispec) = alphal * R_xx(i_SLS,i,j,k,ispec) &
                   + factor_common_c44_muv(i,j,k) &
-                  *( betal * epsilondev(:,i,j,k,ispec) + gammal * epsilondev_loc(:,i,j,k))
+                  *( betal * epsilondev_xx(i,j,k,ispec) + gammal * epsilondev_loc(1,i,j,k))
+
+          R_yy(i_SLS,i,j,k,ispec) = alphal * R_yy(i_SLS,i,j,k,ispec) &
+                  + factor_common_c44_muv(i,j,k) &
+                  *( betal * epsilondev_yy(i,j,k,ispec) + gammal * epsilondev_loc(2,i,j,k))
+
+          R_xy(i_SLS,i,j,k,ispec) = alphal * R_xy(i_SLS,i,j,k,ispec) &
+                  + factor_common_c44_muv(i,j,k) &
+                  *( betal * epsilondev_xy(i,j,k,ispec) + gammal * epsilondev_loc(3,i,j,k))
+
+          R_xz(i_SLS,i,j,k,ispec) = alphal * R_xz(i_SLS,i,j,k,ispec) &
+                  + factor_common_c44_muv(i,j,k) &
+                  *( betal * epsilondev_xz(i,j,k,ispec) + gammal * epsilondev_loc(4,i,j,k))
+
+          R_yz(i_SLS,i,j,k,ispec) = alphal * R_yz(i_SLS,i,j,k,ispec) &
+                  + factor_common_c44_muv(i,j,k) &
+                  *( betal * epsilondev_yz(i,j,k,ispec) + gammal * epsilondev_loc(5,i,j,k))
+
+
         enddo
       enddo
     enddo
@@ -1457,11 +1509,33 @@
       factor_common_c44_muv(:,:,:) = factor_common_c44_muv(:,:,:) * muvstore(:,:,:,ispec)
     endif
 
-    do i_memory = 1,5
-      R_memory(i_memory,i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_memory(i_memory,i_SLS,:,:,:,ispec) &
+!    do i_memory = 1,5
+!      R_memory(i_memory,i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_memory(i_memory,i_SLS,:,:,:,ispec) &
+!                + factor_common_c44_muv(:,:,:) &
+!                * (betaval(i_SLS) * epsilondev(i_memory,:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(i_memory,:,:,:))
+!    enddo
+
+    R_xx(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_xx(i_SLS,:,:,:,ispec) &
                 + factor_common_c44_muv(:,:,:) &
-                * (betaval(i_SLS) * epsilondev(i_memory,:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(i_memory,:,:,:))
-    enddo
+                * (betaval(i_SLS) * epsilondev_xx(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(1,:,:,:))
+
+    R_yy(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_yy(i_SLS,:,:,:,ispec) &
+                + factor_common_c44_muv(:,:,:) &
+                * (betaval(i_SLS) * epsilondev_yy(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(2,:,:,:))
+
+    R_xy(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_xy(i_SLS,:,:,:,ispec) &
+                + factor_common_c44_muv(:,:,:) &
+                * (betaval(i_SLS) * epsilondev_xy(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(3,:,:,:))
+
+    R_xz(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_xz(i_SLS,:,:,:,ispec) &
+                + factor_common_c44_muv(:,:,:) &
+                * (betaval(i_SLS) * epsilondev_xz(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(4,:,:,:))
+
+    R_yz(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_yz(i_SLS,:,:,:,ispec) &
+                + factor_common_c44_muv(:,:,:) &
+                * (betaval(i_SLS) * epsilondev_yz(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(5,:,:,:))
+
+
   enddo ! i_SLS
 #endif
 
@@ -1472,11 +1546,13 @@
 !--------------------------------------------------------------------------------------------
 !
 
-  subroutine compute_element_att_memory_ic(ispec,R_memory, &
+  subroutine compute_element_att_memory_ic(ispec,R_xx,R_yy,R_xy,R_xz,R_yz, &
                                         vx,vy,vz,vnspec,factor_common, &
                                         alphaval,betaval,gammaval, &
                                         muvstore, &
-                                        epsilondev,epsilondev_loc)
+                                        epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+                                        epsilondev_xz,epsilondev_yz, &
+                                        epsilondev_loc)
 ! inner core
 ! update memory variables based upon the Runge-Kutta scheme
 
@@ -1505,15 +1581,21 @@
   ! element id
   integer :: ispec
 
-  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ATTENUATION) :: R_memory
+!  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ATTENUATION) :: R_memory
 
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ATTENUATION) :: &
+    R_xx,R_yy,R_xy,R_xz,R_yz
+
   integer :: vx,vy,vz,vnspec
   real(kind=CUSTOM_REAL), dimension(N_SLS, vx, vy, vz, vnspec) :: factor_common
   real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
 
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: muvstore
 
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: epsilondev
+!  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: epsilondev
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: &
+    epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
+
   real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
 
 ! local parameters
@@ -1521,11 +1603,9 @@
 
   integer :: i_SLS
 
-#ifdef _HANDOPT
+#ifdef __HANDOPT_ATT
   real(kind=CUSTOM_REAL) :: alphal,betal,gammal
   integer :: i,j,k
-#else
-  integer :: i_memory
 #endif
 
   ! use Runge-Kutta scheme to march in time
@@ -1534,7 +1614,7 @@
   ! IMPROVE we use mu_v here even if there is some anisotropy
   ! IMPROVE we should probably use an average value instead
 
-#ifdef _HANDOPT
+#ifdef __HANDOPT_ATT
 ! way 2:
   do i_SLS = 1,N_SLS
 
@@ -1551,9 +1631,26 @@
     do k=1,NGLLZ
       do j=1,NGLLY
         do i=1,NGLLX
-          R_memory(:,i_SLS,i,j,k,ispec) = alphal * R_memory(:,i_SLS,i,j,k,ispec) &
+!          R_memory(:,i_SLS,i,j,k,ispec) = alphal * R_memory(:,i_SLS,i,j,k,ispec) &
+!                  + factor_common_use(i,j,k) &
+!                  *( betal * epsilondev(:,i,j,k,ispec) + gammal * epsilondev_loc(:,i,j,k))
+
+          R_xx(i_SLS,i,j,k,ispec) = alphal * R_xx(i_SLS,i,j,k,ispec) &
                   + factor_common_use(i,j,k) &
-                  *( betal * epsilondev(:,i,j,k,ispec) + gammal * epsilondev_loc(:,i,j,k))
+                  *( betal * epsilondev_xx(i,j,k,ispec) + gammal * epsilondev_loc(1,i,j,k))
+          R_yy(i_SLS,i,j,k,ispec) = alphal * R_yy(i_SLS,i,j,k,ispec) &
+                  + factor_common_use(i,j,k) &
+                  *( betal * epsilondev_yy(i,j,k,ispec) + gammal * epsilondev_loc(2,i,j,k))
+          R_xy(i_SLS,i,j,k,ispec) = alphal * R_xy(i_SLS,i,j,k,ispec) &
+                  + factor_common_use(i,j,k) &
+                  *( betal * epsilondev_xy(i,j,k,ispec) + gammal * epsilondev_loc(3,i,j,k))
+          R_xz(i_SLS,i,j,k,ispec) = alphal * R_xz(i_SLS,i,j,k,ispec) &
+                  + factor_common_use(i,j,k) &
+                  *( betal * epsilondev_xz(i,j,k,ispec) + gammal * epsilondev_loc(4,i,j,k))
+          R_yz(i_SLS,i,j,k,ispec) = alphal * R_yz(i_SLS,i,j,k,ispec) &
+                  + factor_common_use(i,j,k) &
+                  *( betal * epsilondev_yz(i,j,k,ispec) + gammal * epsilondev_loc(5,i,j,k))
+
         enddo
       enddo
     enddo
@@ -1563,11 +1660,32 @@
 ! way 1:
   do i_SLS = 1,N_SLS
     factor_common_use(:,:,:) = factor_common(i_SLS,:,:,:,ispec)
-    do i_memory = 1,5
-       R_memory(i_memory,i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_memory(i_memory,i_SLS,:,:,:,ispec) &
+!    do i_memory = 1,5
+!       R_memory(i_memory,i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_memory(i_memory,i_SLS,:,:,:,ispec) &
+!            + muvstore(:,:,:,ispec) * factor_common_use(:,:,:) * &
+!            (betaval(i_SLS) * epsilondev(i_memory,:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(i_memory,:,:,:))
+!    enddo
+
+    R_xx(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_xx(i_SLS,:,:,:,ispec) &
             + muvstore(:,:,:,ispec) * factor_common_use(:,:,:) * &
-            (betaval(i_SLS) * epsilondev(i_memory,:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(i_memory,:,:,:))
-    enddo
+            (betaval(i_SLS) * epsilondev_xx(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(1,:,:,:))
+
+    R_yy(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_yy(i_SLS,:,:,:,ispec) &
+            + muvstore(:,:,:,ispec) * factor_common_use(:,:,:) * &
+            (betaval(i_SLS) * epsilondev_yy(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(2,:,:,:))
+
+    R_xy(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_xy(i_SLS,:,:,:,ispec) &
+            + muvstore(:,:,:,ispec) * factor_common_use(:,:,:) * &
+            (betaval(i_SLS) * epsilondev_xy(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(3,:,:,:))
+
+    R_xz(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_xz(i_SLS,:,:,:,ispec) &
+            + muvstore(:,:,:,ispec) * factor_common_use(:,:,:) * &
+            (betaval(i_SLS) * epsilondev_xz(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(4,:,:,:))
+
+    R_yz(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_yz(i_SLS,:,:,:,ispec) &
+            + muvstore(:,:,:,ispec) * factor_common_use(:,:,:) * &
+            (betaval(i_SLS) * epsilondev_yz(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(5,:,:,:))
+
   enddo
 #endif
 

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_acoustic.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_acoustic.F90	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_acoustic.F90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,368 @@
+!=====================================================================
+!
+!          S p e c f e m 3 D  G l o b e  V e r s i o n  5 . 1
+!          --------------------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!                        Princeton University, USA
+!             and University of Pau / CNRS / INRIA, France
+! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+!                            April 2011
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+  subroutine compute_forces_acoustic()
+
+! Advances the acoustic (fluid outer-core) wavefield by one time step:
+! - computes internal forces in two phases (outer/MPI-interface elements
+!   first, then inner elements, overlapping computation with communication)
+! - adds Stacey absorbing boundaries and fluid/solid coupling at CMB and ICB
+! - assembles contributions between slices with non-blocking MPI
+! - applies the Newmark corrector (inverse mass matrix + velocity update)
+! Forward fields are always computed; backward/reconstructed fields are
+! added when SIMULATION_TYPE == 3. Work runs on CPU or GPU per GPU_MODE.
+
+  use specfem_par
+  use specfem_par_crustmantle,only: displ_crust_mantle,b_displ_crust_mantle, &
+                                    ibool_crust_mantle,ibelm_bottom_crust_mantle
+  use specfem_par_innercore,only: displ_inner_core,b_displ_inner_core, &
+                                  ibool_inner_core,ibelm_top_inner_core
+  use specfem_par_outercore
+  implicit none
+
+  ! local parameters
+  real(kind=CUSTOM_REAL) :: time,b_time
+  ! non blocking MPI
+  ! iphase: iphase = 1 is for computing outer elements in the outer_core,
+  !              iphase = 2 is for computing inner elements in the outer core (former icall parameter)
+  integer :: iphase
+  logical :: phase_is_inner
+
+  ! compute internal forces in the fluid region
+  ! current non-dimensionalized simulation time for the forward wavefield
+  if(CUSTOM_REAL == SIZE_REAL) then
+    time = sngl((dble(it-1)*DT-t0)*scale_t_inv)
+  else
+    time = (dble(it-1)*DT-t0)*scale_t_inv
+  endif
+  if (SIMULATION_TYPE == 3) then
+    ! note on backward/reconstructed wavefields:
+    !       b_time for b_displ( it=1 ) corresponds to (NSTEP - 1)*DT - t0  (after Newmark scheme...)
+    !       as we start with saved wavefields b_displ( 1 ) <-> displ( NSTEP ) which correspond
+    !       to a time (NSTEP - (it-1) - 1)*DT - t0
+    !       for reconstructing the rotational contributions
+    if(CUSTOM_REAL == SIZE_REAL) then
+      b_time = sngl((dble(NSTEP-it)*DT-t0)*scale_t_inv)
+    else
+      b_time = (dble(NSTEP-it)*DT-t0)*scale_t_inv
+    endif
+  endif
+
+  ! ****************************************************
+  !   big loop over all spectral elements in the fluid
+  ! ****************************************************
+
+  ! distinguishes two runs: for elements on MPI interfaces, and elements within the partitions
+  do iphase=1,2
+
+    ! first, iphase == 1 for points on MPI interfaces (thus outer elements)
+    ! second, iphase == 2 for points purely inside partition (thus inner elements)
+    !
+    ! compute all the outer elements first, then sends out non blocking MPI communication
+    ! and continues computing inner elements (overlapping)
+    if( iphase == 1 ) then
+      phase_is_inner = .false.
+    else
+      phase_is_inner = .true.
+    endif
+
+    if( .not. GPU_MODE ) then
+      ! on CPU
+      if( USE_DEVILLE_PRODUCTS_VAL ) then
+        ! uses Deville et al. (2002) routine
+        call compute_forces_outer_core_Dev(time,deltat,two_omega_earth, &
+                                      A_array_rotation,B_array_rotation, &
+                                      displ_outer_core,accel_outer_core,div_displ_outer_core, &
+                                      phase_is_inner)
+      else
+        ! div_displ_outer_core is initialized to zero in the following subroutine.
+        call compute_forces_outer_core(time,deltat,two_omega_earth, &
+                                      A_array_rotation,B_array_rotation, &
+                                      displ_outer_core,accel_outer_core,div_displ_outer_core, &
+                                      phase_is_inner)
+      endif
+
+      ! adjoint / kernel runs
+      if (SIMULATION_TYPE == 3) then
+        if( USE_DEVILLE_PRODUCTS_VAL ) then
+          ! uses Deville et al. (2002) routine
+          call compute_forces_outer_core_Dev(b_time,b_deltat,b_two_omega_earth, &
+                                      b_A_array_rotation,b_B_array_rotation, &
+                                      b_displ_outer_core,b_accel_outer_core,b_div_displ_outer_core, &
+                                      phase_is_inner)
+        else
+          call compute_forces_outer_core(b_time,b_deltat,b_two_omega_earth, &
+                                      b_A_array_rotation,b_B_array_rotation, &
+                                      b_displ_outer_core,b_accel_outer_core,b_div_displ_outer_core, &
+                                      phase_is_inner)
+        endif
+      endif
+
+    else
+      ! on GPU
+      ! NOTE(review): fields are shuttled host->device and back around every
+      ! kernel launch (see load_GPU_acoustic/load_CPU_acoustic TODOs below)
+      call load_GPU_acoustic()
+
+      ! includes both forward and adjoint/kernel simulations
+      call compute_forces_outer_core_cuda(Mesh_pointer,iphase,time,b_time)
+
+      call load_CPU_acoustic()
+    endif
+
+
+    ! computes additional contributions to acceleration field
+    if( iphase == 1 ) then
+      ! Stacey absorbing boundaries
+      if(NCHUNKS_VAL /= 6 .and. ABSORBING_CONDITIONS) call compute_stacey_outer_core()
+
+      ! ****************************************************
+      ! **********  add matching with solid part  **********
+      ! ****************************************************
+      ! only for elements in first matching layer in the fluid
+      !---
+      !--- couple with mantle at the top of the outer core
+      !---
+      if(ACTUALLY_COUPLE_FLUID_CMB) &
+        call compute_coupling_fluid_CMB(displ_crust_mantle,b_displ_crust_mantle, &
+                              ibool_crust_mantle,ibelm_bottom_crust_mantle,  &
+                              accel_outer_core,b_accel_outer_core, &
+                              normal_top_outer_core,jacobian2D_top_outer_core, &
+                              wgllwgll_xy,ibool_outer_core,ibelm_top_outer_core, &
+                              SIMULATION_TYPE,NSPEC2D_TOP(IREGION_OUTER_CORE))
+
+      !---
+      !--- couple with inner core at the bottom of the outer core
+      !---
+      if(ACTUALLY_COUPLE_FLUID_ICB) &
+        call compute_coupling_fluid_ICB(displ_inner_core,b_displ_inner_core, &
+                              ibool_inner_core,ibelm_top_inner_core,  &
+                              accel_outer_core,b_accel_outer_core, &
+                              normal_bottom_outer_core,jacobian2D_bottom_outer_core, &
+                              wgllwgll_xy,ibool_outer_core,ibelm_bottom_outer_core, &
+                              SIMULATION_TYPE,NSPEC2D_BOTTOM(IREGION_OUTER_CORE))
+    endif
+
+
+    ! assemble all the contributions between slices using MPI
+    ! in outer core
+    if( iphase == 1 ) then
+      ! sends out MPI interface data (non-blocking)
+
+      if(.NOT. GPU_MODE) then
+        ! on CPU
+        call assemble_MPI_scalar_ext_mesh_s(NPROCTOT_VAL,NGLOB_OUTER_CORE, &
+                                accel_outer_core, &
+                                buffer_send_scalar_outer_core,buffer_recv_scalar_outer_core, &
+                                num_interfaces_outer_core,max_nibool_interfaces_outer_core, &
+                                nibool_interfaces_outer_core,ibool_interfaces_outer_core,&
+                                my_neighbours_outer_core, &
+                                request_send_scalar_outer_core,request_recv_scalar_outer_core)
+      else
+        ! on GPU
+        call load_GPU_acoustic()
+        ! outer core
+        call assemble_MPI_scalar_send_cuda(NPROCTOT_VAL, &
+                                buffer_send_scalar_outer_core,buffer_recv_scalar_outer_core, &
+                                num_interfaces_outer_core,max_nibool_interfaces_outer_core, &
+                                nibool_interfaces_outer_core,&
+                                my_neighbours_outer_core, &
+                                request_send_scalar_outer_core,request_recv_scalar_outer_core, &
+                                1) ! <-- 1 == fwd accel
+      endif
+
+      ! adjoint simulations
+      if( SIMULATION_TYPE == 3 ) then
+        if(.NOT. GPU_MODE) then
+          ! on CPU
+          call assemble_MPI_scalar_ext_mesh_s(NPROCTOT_VAL,NGLOB_OUTER_CORE, &
+                                b_accel_outer_core, &
+                                b_buffer_send_scalar_outer_core,b_buffer_recv_scalar_outer_core, &
+                                num_interfaces_outer_core,max_nibool_interfaces_outer_core, &
+                                nibool_interfaces_outer_core,ibool_interfaces_outer_core,&
+                                my_neighbours_outer_core, &
+                                b_request_send_scalar_outer_core,b_request_recv_scalar_outer_core)
+        else
+          ! on GPU
+          call load_GPU_acoustic()
+          ! outer core
+          call assemble_MPI_scalar_send_cuda(NPROCTOT_VAL, &
+                                b_buffer_send_scalar_outer_core,b_buffer_recv_scalar_outer_core, &
+                                num_interfaces_outer_core,max_nibool_interfaces_outer_core, &
+                                nibool_interfaces_outer_core,&
+                                my_neighbours_outer_core, &
+                                b_request_send_scalar_outer_core,b_request_recv_scalar_outer_core, &
+                                3) ! <-- 3 == adjoint b_accel
+        endif ! GPU
+      endif ! SIMULATION_TYPE == 3
+
+    else
+      ! make sure the last communications are finished and processed
+      ! waits for send/receive requests to be completed and assembles values
+      if(.NOT. GPU_MODE) then
+        ! on CPU
+        call assemble_MPI_scalar_ext_mesh_w(NPROCTOT_VAL,NGLOB_OUTER_CORE, &
+                                accel_outer_core, &
+                                buffer_recv_scalar_outer_core,num_interfaces_outer_core,&
+                                max_nibool_interfaces_outer_core, &
+                                nibool_interfaces_outer_core,ibool_interfaces_outer_core, &
+                                request_send_scalar_outer_core,request_recv_scalar_outer_core)
+      else
+        ! on GPU
+        call assemble_MPI_scalar_write_cuda(NPROCTOT_VAL, &
+                                buffer_recv_scalar_outer_core, &
+                                num_interfaces_outer_core,max_nibool_interfaces_outer_core, &
+                                request_send_scalar_outer_core,request_recv_scalar_outer_core, &
+                                1) ! <-- 1 == fwd accel
+        call load_CPU_acoustic()
+      endif
+
+      ! adjoint simulations
+      if( SIMULATION_TYPE == 3 ) then
+        if(.NOT. GPU_MODE) then
+          ! on CPU
+          call assemble_MPI_scalar_ext_mesh_w(NPROCTOT_VAL,NGLOB_OUTER_CORE, &
+                                b_accel_outer_core, &
+                                b_buffer_recv_scalar_outer_core,num_interfaces_outer_core,&
+                                max_nibool_interfaces_outer_core, &
+                                nibool_interfaces_outer_core,ibool_interfaces_outer_core, &
+                                b_request_send_scalar_outer_core,b_request_recv_scalar_outer_core)
+        else
+          ! on GPU
+          call assemble_MPI_scalar_write_cuda(NPROCTOT_VAL, &
+                                b_buffer_recv_scalar_outer_core, &
+                                num_interfaces_outer_core,max_nibool_interfaces_outer_core, &
+                                b_request_send_scalar_outer_core,b_request_recv_scalar_outer_core, &
+                                3) ! <-- 3 == adjoint b_accel 
+          call load_CPU_acoustic()
+        endif
+      endif ! SIMULATION_TYPE == 3
+    endif ! iphase == 1
+
+  enddo ! iphase
+
+  ! Newmark time scheme:
+  ! corrector terms for fluid parts
+  ! (multiply by the inverse of the mass matrix and update velocity)
+  if(.NOT. GPU_MODE) then
+    ! on CPU
+    call compute_forces_ac_update_veloc(veloc_outer_core,accel_outer_core, &
+                                       deltatover2,rmass_outer_core)
+    ! adjoint / kernel runs
+    if (SIMULATION_TYPE == 3) &
+      call compute_forces_ac_update_veloc(b_veloc_outer_core,b_accel_outer_core, &
+                                         b_deltatover2,rmass_outer_core)
+  else
+    ! on GPU
+    call load_GPU_acoustic()
+    call kernel_3_outer_core_cuda(Mesh_pointer, &
+                                deltatover2,SIMULATION_TYPE,b_deltatover2)
+    call load_CPU_acoustic()
+  endif
+
+  end subroutine compute_forces_acoustic
+
+!=====================================================================
+
+  subroutine compute_forces_ac_update_veloc(veloc_outer_core,accel_outer_core,deltatover2,rmass_outer_core)
+
+! Newmark corrector for the fluid outer core:
+! divides the acceleration potential by the mass matrix
+! (accel = accel * rmass, where rmass holds the inverse mass matrix)
+! and updates the velocity potential: veloc = veloc + deltatover2 * accel.
+
+  use specfem_par,only: CUSTOM_REAL,NGLOB_OUTER_CORE
+
+#ifdef _HANDOPT_NEWMARK
+! note: imodulo_NGLOB_OUTER_CORE is referenced only by the hand-unrolled
+!       (way 2) loop below, which is guarded by _HANDOPT_NEWMARK; guarding
+!       this import with plain _HANDOPT instead would break compilation
+!       whenever _HANDOPT_NEWMARK is defined without _HANDOPT
+  use specfem_par,only: imodulo_NGLOB_OUTER_CORE
+#endif
+
+  implicit none
+
+  ! velocity potential
+  real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE) :: veloc_outer_core,accel_outer_core
+
+  ! mass matrix
+  real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE) :: rmass_outer_core
+
+  ! half time step of the Newmark scheme
+  real(kind=CUSTOM_REAL) :: deltatover2
+
+  ! local parameters
+  integer :: i
+
+  ! Newmark time scheme
+  ! multiply by the inverse of the mass matrix and update velocity
+
+#ifdef _HANDOPT_NEWMARK
+! way 2: loop manually unrolled by 3; the remainder
+!        (imodulo_NGLOB_OUTER_CORE = mod(NGLOB_OUTER_CORE,3)) is peeled first
+  ! outer core
+  if(imodulo_NGLOB_OUTER_CORE >= 1) then
+    do i=1,imodulo_NGLOB_OUTER_CORE
+      accel_outer_core(i) = accel_outer_core(i)*rmass_outer_core(i)
+      veloc_outer_core(i) = veloc_outer_core(i) + deltatover2*accel_outer_core(i)
+    enddo
+  endif
+  do i=imodulo_NGLOB_OUTER_CORE+1,NGLOB_OUTER_CORE,3
+    accel_outer_core(i) = accel_outer_core(i)*rmass_outer_core(i)
+    veloc_outer_core(i) = veloc_outer_core(i) + deltatover2*accel_outer_core(i)
+
+    accel_outer_core(i+1) = accel_outer_core(i+1)*rmass_outer_core(i+1)
+    veloc_outer_core(i+1) = veloc_outer_core(i+1) + deltatover2*accel_outer_core(i+1)
+
+    accel_outer_core(i+2) = accel_outer_core(i+2)*rmass_outer_core(i+2)
+    veloc_outer_core(i+2) = veloc_outer_core(i+2) + deltatover2*accel_outer_core(i+2)
+  enddo
+#else
+! way 1:
+  do i=1,NGLOB_OUTER_CORE
+    accel_outer_core(i) = accel_outer_core(i)*rmass_outer_core(i)
+    veloc_outer_core(i) = veloc_outer_core(i) + deltatover2*accel_outer_core(i)
+  enddo
+#endif
+
+  end subroutine compute_forces_ac_update_veloc
+
+!=====================================================================
+
+  subroutine load_GPU_acoustic
+
+! Copies the outer-core acoustic wavefields (displ/veloc/accel) from host
+! memory onto the GPU identified by Mesh_pointer; also copies the backward
+! fields when SIMULATION_TYPE == 3.
+
+  use specfem_par
+  use specfem_par_outercore
+  implicit none
+
+  ! daniel: TODO - temporary transfers to the GPU
+  call transfer_fields_oc_to_device(NGLOB_OUTER_CORE,displ_outer_core, &
+                                veloc_outer_core,accel_outer_core,Mesh_pointer)
+
+  if( SIMULATION_TYPE == 3 ) then
+    call transfer_b_fields_oc_to_device(NGLOB_OUTER_CORE,b_displ_outer_core, &
+                                b_veloc_outer_core,b_accel_outer_core,Mesh_pointer)
+  endif
+
+  end subroutine
+
+!=====================================================================
+
+  subroutine load_CPU_acoustic
+
+! Copies the outer-core acoustic wavefields (displ/veloc/accel) back from
+! the GPU identified by Mesh_pointer into host memory; also copies the
+! backward fields when SIMULATION_TYPE == 3.
+
+  use specfem_par
+  use specfem_par_outercore
+  implicit none
+
+  ! daniel: TODO - temporary transfers back from the GPU to the CPU
+  call transfer_fields_oc_from_device(NGLOB_OUTER_CORE,displ_outer_core, &
+                                veloc_outer_core,accel_outer_core,Mesh_pointer)
+
+  if( SIMULATION_TYPE == 3 ) then
+    call transfer_b_fields_oc_from_device(NGLOB_OUTER_CORE,b_displ_outer_core, &
+                                b_veloc_outer_core,b_accel_outer_core,Mesh_pointer)
+  endif
+
+  end subroutine

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_crust_mantle.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_crust_mantle.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_crust_mantle.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -25,133 +25,107 @@
 !
 !=====================================================================
 
-  subroutine compute_forces_crust_mantle(minus_gravity_table,density_table,minus_deriv_gravity_table, &
-          displ_crust_mantle,accel_crust_mantle,xstore,ystore,zstore, &
-          xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-!----------------------
-            is_on_a_slice_edge_crust_mantle,icall, &
-            accel_inner_core,ibool_inner_core,idoubling_inner_core, &
-            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-            buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector,iphase, &
-            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
-            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_INNER_CORE,INCLUDE_CENTRAL_CUBE,iphase_CC, &
-!----------------------
-          hprime_xx,hprime_yy,hprime_zz, &
-          hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
-          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-          kappavstore,kappahstore,muvstore,muhstore,eta_anisostore, &
-          c11store,c12store,c13store,c14store,c15store,c16store,c22store, &
-          c23store,c24store,c25store,c26store,c33store,c34store,c35store, &
-          c36store,c44store,c45store,c46store,c55store,c56store,c66store, &
-          ibool,ispec_is_tiso, &
-        !-- idoubling,
-          R_memory,epsilondev,epsilon_trace_over_3,one_minus_sum_beta, &
-          alphaval,betaval,gammaval,factor_common,vx,vy,vz,vnspec)
+  subroutine compute_forces_crust_mantle(displ_crust_mantle,accel_crust_mantle, &
+                                        phase_is_inner, &
+                                        R_xx,R_yy,R_xy,R_xz,R_yz, &
+                                        epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+                                        epsilondev_xz,epsilondev_yz, &
+                                        epsilon_trace_over_3, &
+                                        alphaval,betaval,gammaval, &
+                                        factor_common,vx,vy,vz,vnspec)
 
-  implicit none
+  use constants
 
-  include "constants.h"
+  use specfem_par,only: &
+    hprime_xx,hprime_yy,hprime_zz,hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
+    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
+    minus_gravity_table,density_table,minus_deriv_gravity_table, &
+    myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+    iprocfrom_faces,iprocto_faces, &
+    iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+    npoin2D_max_all_CM_IC,INCLUDE_CENTRAL_CUBE
 
-! include values created by the mesher
-! done for performance only using static allocation to allow for loop unrolling
-  include "OUTPUT_FILES/values_from_mesher.h"
+  use specfem_par_crustmantle,only: &
+    xstore => xstore_crust_mantle,ystore => ystore_crust_mantle,zstore => zstore_crust_mantle, &
+    xix => xix_crust_mantle,xiy => xiy_crust_mantle,xiz => xiz_crust_mantle, &
+    etax => etax_crust_mantle,etay => etay_crust_mantle,etaz => etaz_crust_mantle, &
+    gammax => gammax_crust_mantle,gammay => gammay_crust_mantle,gammaz => gammaz_crust_mantle, &
+    kappavstore => kappavstore_crust_mantle,kappahstore => kappahstore_crust_mantle, &
+    muvstore => muvstore_crust_mantle,muhstore => muhstore_crust_mantle, &
+    eta_anisostore => eta_anisostore_crust_mantle, &
+    c11store => c11store_crust_mantle,c12store => c12store_crust_mantle,c13store => c13store_crust_mantle, &
+    c14store => c14store_crust_mantle,c15store => c15store_crust_mantle,c16store => c16store_crust_mantle, &
+    c22store => c22store_crust_mantle,c23store => c23store_crust_mantle,c24store => c24store_crust_mantle, &
+    c25store => c25store_crust_mantle,c26store => c26store_crust_mantle,c33store => c33store_crust_mantle, &
+    c34store => c34store_crust_mantle,c35store => c35store_crust_mantle,c36store => c36store_crust_mantle, &
+    c44store => c44store_crust_mantle,c45store => c45store_crust_mantle,c46store => c46store_crust_mantle, &
+    c55store => c55store_crust_mantle,c56store => c56store_crust_mantle,c66store => c66store_crust_mantle, &
+    ibool => ibool_crust_mantle, &
+    ispec_is_tiso => ispec_is_tiso_crust_mantle, &
+    one_minus_sum_beta => one_minus_sum_beta_crust_mantle, &
+    iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+    npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+    iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+    phase_ispec_inner => phase_ispec_inner_crust_mantle, &
+    nspec_outer => nspec_outer_crust_mantle, &
+    nspec_inner => nspec_inner_crust_mantle
 
-! model_attenuation_variables
-!  type model_attenuation_variables
-!    sequence
-!    double precision min_period, max_period
-!    double precision                          :: QT_c_source        ! Source Frequency
-!    double precision, dimension(:), pointer   :: Qtau_s             ! tau_sigma
-!    double precision, dimension(:), pointer   :: QrDisc             ! Discontinutitues Defined
-!    double precision, dimension(:), pointer   :: Qr                 ! Radius
-!    double precision, dimension(:), pointer   :: Qmu                ! Shear Attenuation
-!    double precision, dimension(:,:), pointer :: Qtau_e             ! tau_epsilon
-!    double precision, dimension(:), pointer   :: Qomsb, Qomsb2      ! one_minus_sum_beta
-!    double precision, dimension(:,:), pointer :: Qfc, Qfc2          ! factor_common
-!    double precision, dimension(:), pointer   :: Qsf, Qsf2          ! scale_factor
-!    integer, dimension(:), pointer            :: Qrmin              ! Max and Mins of idoubling
-!    integer, dimension(:), pointer            :: Qrmax              ! Max and Mins of idoubling
-!    integer, dimension(:), pointer            :: interval_Q                 ! Steps
-!    integer                                   :: Qn                 ! Number of points
-!    integer dummy_pad ! padding 4 bytes to align the structure
-!  end type model_attenuation_variables
+  use specfem_par_innercore,only: &
+    ibool_inner_core,idoubling_inner_core, &
+    iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+    npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+    iboolfaces_inner_core,iboolcorner_inner_core, &
+    nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+    npoin2D_cube_from_slices, &
+    ibool_central_cube, &
+    receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC
 
-! array with the local to global mapping per slice
-!  integer, dimension(NSPEC_CRUST_MANTLE) :: idoubling
-  logical, dimension(NSPEC_CRUST_MANTLE) :: ispec_is_tiso
+  implicit none
 
-! displacement and acceleration
+  ! displacement and acceleration
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: displ_crust_mantle,accel_crust_mantle
 
-! memory variables for attenuation
-! memory variables R_ij are stored at the local rather than global level
-! to allow for optimization of cache access by compiler
-  integer i_SLS,i_memory
-! variable sized array variables for one_minus_sum_beta and factor_common
-  integer vx, vy, vz, vnspec
 
-  real(kind=CUSTOM_REAL) one_minus_sum_beta_use,minus_sum_beta
-  real(kind=CUSTOM_REAL), dimension(vx, vy, vz, vnspec) :: one_minus_sum_beta
+  ! memory variables for attenuation
+  ! memory variables R_ij are stored at the local rather than global level
+  ! to allow for optimization of cache access by compiler
+!  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUAT) :: R_memory
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUAT) :: &
+    R_xx,R_yy,R_xy,R_xz,R_yz
 
-! for attenuation
-  real(kind=CUSTOM_REAL) R_xx_val,R_yy_val
-  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUAT) :: R_memory
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: epsilondev
 
-! [alpha,beta,gamma]val reduced to N_SLS and factor_common to N_SLS*NUM_NODES
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
-  real(kind=CUSTOM_REAL), dimension(N_SLS, vx, vy, vz, vnspec) :: factor_common
-  real(kind=CUSTOM_REAL), dimension(NGLLX, NGLLY, NGLLZ) :: factor_common_c44_muv
-
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
-  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: epsilon_trace_over_3
-
-! arrays with mesh parameters per slice
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: ibool
+!  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: epsilondev
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: &
-        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
+    epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
 
-! array with derivatives of Lagrange polynomials and precalculated products
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprimewgll_xx
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLY) :: hprime_yy,hprimewgll_yy
-  real(kind=CUSTOM_REAL), dimension(NGLLZ,NGLLZ) :: hprime_zz,hprimewgll_zz
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
 
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
-    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
+  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: epsilon_trace_over_3
 
-! x y and z contain r theta and phi
-  real(kind=CUSTOM_REAL), dimension(NGLOB_CRUST_MANTLE) :: xstore,ystore,zstore
+  ! variable sized array variables for one_minus_sum_beta and factor_common
+  integer vx, vy, vz, vnspec
+  ! [alpha,beta,gamma]val reduced to N_SLS and factor_common to N_SLS*NUM_NODES
+  real(kind=CUSTOM_REAL), dimension(N_SLS, vx, vy, vz, vnspec) :: factor_common
+  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
 
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ISO_MANTLE) :: &
-        kappavstore,muvstore
+  ! inner/outer element run flag
+  logical :: phase_is_inner
 
-! store anisotropic properties only where needed to save memory
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_TISO_MANTLE) :: &
-        kappahstore,muhstore,eta_anisostore
+  ! local parameters
 
-! arrays for full anisotropy only when needed
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ANISO_MANTLE) :: &
-        c11store,c12store,c13store,c14store,c15store,c16store,c22store, &
-        c23store,c24store,c25store,c26store,c33store,c34store,c35store, &
-        c36store,c44store,c45store,c46store,c55store,c56store,c66store
+  ! for attenuation
+  real(kind=CUSTOM_REAL) one_minus_sum_beta_use,minus_sum_beta
+  real(kind=CUSTOM_REAL) R_xx_val,R_yy_val
+  real(kind=CUSTOM_REAL), dimension(NGLLX, NGLLY, NGLLZ) :: factor_common_c44_muv
+  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
 
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
+    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
+
   integer ispec,iglob,ispec_strain
   integer i,j,k,l
+  integer i_SLS
 
-! the 21 coefficients for an anisotropic medium in reduced notation
+  ! the 21 coefficients for an anisotropic medium in reduced notation
   real(kind=CUSTOM_REAL) c11,c22,c33,c44,c55,c66,c12,c13,c23,c14,c24,c34,c15,c25,c35,c45,c16,c26,c36,c46,c56
 
   real(kind=CUSTOM_REAL) rhovphsq,sinphifour,cosphisq,sinphisq,costhetasq,rhovsvsq,sinthetasq, &
@@ -159,8 +133,8 @@
         costwotheta,cosfourtheta,sintwophisq,costheta,sinphi,sintheta,cosphi, &
         sintwotheta,costwophi,sintwophi,costwothetasq,costwophisq,phi,theta
 
-  real(kind=CUSTOM_REAL) two_rhovpvsq,two_rhovphsq,two_rhovsvsq,two_rhovshsq
-  real(kind=CUSTOM_REAL) four_rhovpvsq,four_rhovphsq,four_rhovsvsq,four_rhovshsq
+  real(kind=CUSTOM_REAL) two_rhovsvsq,two_rhovshsq ! two_rhovpvsq,two_rhovphsq
+  real(kind=CUSTOM_REAL) four_rhovsvsq,four_rhovshsq ! four_rhovpvsq,four_rhovphsq,
 
   real(kind=CUSTOM_REAL) twoetaminone,etaminone,eta_aniso
   real(kind=CUSTOM_REAL) two_eta_aniso,four_eta_aniso,six_eta_aniso
@@ -183,7 +157,7 @@
   real(kind=CUSTOM_REAL) tempy1l,tempy2l,tempy3l
   real(kind=CUSTOM_REAL) tempz1l,tempz2l,tempz3l
 
-! for gravity
+  ! for gravity
   integer int_radius
   real(kind=CUSTOM_REAL) sigma_yx,sigma_zx,sigma_zy
   double precision radius,rho,minus_g,minus_dg
@@ -192,113 +166,31 @@
   double precision cos_theta_sq,sin_theta_sq,cos_phi_sq,sin_phi_sq
   double precision factor,sx_l,sy_l,sz_l,gxl,gyl,gzl
   double precision Hxxl,Hyyl,Hzzl,Hxyl,Hxzl,Hyzl
-  double precision, dimension(NRAD_GRAVITY) :: minus_gravity_table,density_table,minus_deriv_gravity_table
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: rho_s_H
-  double precision, dimension(NGLLX,NGLLY,NGLLZ) :: wgll_cube
 
-! this for non blocking MPI
-  integer :: iphase,icall
+!  integer :: computed_elements
+  integer :: num_elements,ispec_p
+  integer :: iphase
 
-  integer :: computed_elements
-
-  logical, dimension(NSPEC_CRUST_MANTLE) :: is_on_a_slice_edge_crust_mantle
-
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE) :: accel_inner_core
-
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: ibool_inner_core
-
-  integer, dimension(NSPEC_INNER_CORE) :: idoubling_inner_core
-
-  integer :: ichunk,iproc_xi,iproc_eta,myrank
-
-  integer, dimension(NCHUNKS_VAL,0:NPROC_XI_VAL-1,0:NPROC_ETA_VAL-1) :: addressing
-
-  integer, dimension(NGLOB2DMAX_XMIN_XMAX_CM) :: iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle
-  integer, dimension(NGLOB2DMAX_YMIN_YMAX_CM) :: iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle
-
-  integer, dimension(NGLOB2DMAX_XMIN_XMAX_IC) :: iboolleft_xi_inner_core,iboolright_xi_inner_core
-  integer, dimension(NGLOB2DMAX_YMIN_YMAX_IC) :: iboolleft_eta_inner_core,iboolright_eta_inner_core
-
-  integer npoin2D_faces_crust_mantle(NUMFACES_SHARED)
-  integer npoin2D_faces_inner_core(NUMFACES_SHARED)
-
-  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-       npoin2D_xi_inner_core,npoin2D_eta_inner_core
-
-! communication pattern for faces between chunks
-  integer, dimension(NUMMSGS_FACES_VAL) :: iprocfrom_faces,iprocto_faces
-
-! communication pattern for corners between chunks
-  integer, dimension(NCORNERSCHUNKS_VAL) :: iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners
-
-  integer, dimension(NGLOB1D_RADIAL_CM,NUMCORNERS_SHARED) :: iboolcorner_crust_mantle
-  integer, dimension(NGLOB1D_RADIAL_IC,NUMCORNERS_SHARED) :: iboolcorner_inner_core
-
-  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_crust_mantle
-  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_inner_core
-
-  integer :: npoin2D_max_all_CM_IC
-  real(kind=CUSTOM_REAL), dimension(NDIM,npoin2D_max_all_CM_IC) :: buffer_send_faces,buffer_received_faces
-
-! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB1D_RADIAL_CM + NGLOB1D_RADIAL_IC) :: &
-     buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector
-
-! for matching with central cube in inner core
-  integer nb_msgs_theor_in_cube, npoin2D_cube_from_slices,iphase_CC
-  integer, dimension(nb_msgs_theor_in_cube) :: sender_from_slices_to_cube
-  double precision, dimension(npoin2D_cube_from_slices,NDIM) :: buffer_slices
-  double precision, dimension(npoin2D_cube_from_slices,NDIM,nb_msgs_theor_in_cube) :: buffer_all_cube_from_slices
-  integer, dimension(nb_msgs_theor_in_cube,npoin2D_cube_from_slices):: ibool_central_cube
-  integer receiver_cube_from_slices
-  logical :: INCLUDE_CENTRAL_CUBE
-
-! local to global mapping
-  integer NSPEC2D_BOTTOM_INNER_CORE
-  integer, dimension(NSPEC2D_BOTTOM_INNER_CORE) :: ibelm_bottom_inner_core
-
 ! ****************************************************
 !   big loop over all spectral elements in the solid
 ! ****************************************************
 
-  computed_elements = 0
+!  computed_elements = 0
+  if( .not. phase_is_inner ) then
+    iphase = 1
+    num_elements = nspec_outer
+  else
+    iphase = 2
+    num_elements = nspec_inner
+  endif
 
-  do ispec = 1,NSPEC_CRUST_MANTLE
+  do ispec_p = 1,num_elements
 
-! hide communications by computing the edges first
-    if((icall == 2 .and. is_on_a_slice_edge_crust_mantle(ispec)) .or. &
-       (icall == 1 .and. .not. is_on_a_slice_edge_crust_mantle(ispec))) cycle
+    ispec = phase_ispec_inner(ispec_p,iphase)
 
-! process the communications every ELEMENTS_NONBLOCKING elements
-    computed_elements = computed_elements + 1
-    if (USE_NONBLOCKING_COMMS .and. icall == 2 .and. mod(computed_elements,ELEMENTS_NONBLOCKING_CM_IC) == 0) then
+    ! only compute element which belong to current phase (inner or outer elements)
 
-      if(iphase <= 7) call assemble_MPI_vector(myrank,accel_crust_mantle,accel_inner_core, &
-            iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-            buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector, &
-            NUMMSGS_FACES_VAL,NCORNERSCHUNKS_VAL, &
-            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL_CM, &
-            NGLOB1D_RADIAL_IC,NCHUNKS_VAL,iphase)
-
-      if(INCLUDE_CENTRAL_CUBE) then
-          if(iphase > 7 .and. iphase_CC <= 4) &
-            call assemble_MPI_central_cube(ichunk,nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-                   npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
-                   receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core, &
-                   ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,accel_inner_core,NDIM,iphase_CC)
-      endif
-
-    endif
-
     do k=1,NGLLZ
       do j=1,NGLLY
         do i=1,NGLLX
@@ -549,13 +441,13 @@
               four_eta_aniso = 4.*eta_aniso
               six_eta_aniso = 6.*eta_aniso
 
-              two_rhovpvsq = 2.*rhovpvsq
-              two_rhovphsq = 2.*rhovphsq
+              !two_rhovpvsq = 2.*rhovpvsq
+              !two_rhovphsq = 2.*rhovphsq
               two_rhovsvsq = 2.*rhovsvsq
               two_rhovshsq = 2.*rhovshsq
 
-              four_rhovpvsq = 4.*rhovpvsq
-              four_rhovphsq = 4.*rhovphsq
+              !four_rhovpvsq = 4.*rhovpvsq
+              !four_rhovphsq = 4.*rhovphsq
               four_rhovsvsq = 4.*rhovsvsq
               four_rhovshsq = 4.*rhovshsq
 
@@ -693,14 +585,14 @@
         ! subtract memory variables if attenuation
           if(ATTENUATION_VAL .and. ( USE_ATTENUATION_MIMIC .eqv. .false. ) ) then
             do i_SLS = 1,N_SLS
-              R_xx_val = R_memory(1,i_SLS,i,j,k,ispec)
-              R_yy_val = R_memory(2,i_SLS,i,j,k,ispec)
+              R_xx_val = R_xx(i_SLS,i,j,k,ispec)
+              R_yy_val = R_yy(i_SLS,i,j,k,ispec)
               sigma_xx = sigma_xx - R_xx_val
               sigma_yy = sigma_yy - R_yy_val
               sigma_zz = sigma_zz + R_xx_val + R_yy_val
-              sigma_xy = sigma_xy - R_memory(3,i_SLS,i,j,k,ispec)
-              sigma_xz = sigma_xz - R_memory(4,i_SLS,i,j,k,ispec)
-              sigma_yz = sigma_yz - R_memory(5,i_SLS,i,j,k,ispec)
+              sigma_xy = sigma_xy - R_xy(i_SLS,i,j,k,ispec)
+              sigma_xz = sigma_xz - R_xz(i_SLS,i,j,k,ispec)
+              sigma_yz = sigma_yz - R_yz(i_SLS,i,j,k,ispec)
             enddo
           endif
 
@@ -890,9 +782,7 @@
       do j=1,NGLLY
         do i=1,NGLLX
           iglob = ibool(i,j,k,ispec)
-          accel_crust_mantle(1,iglob) = accel_crust_mantle(1,iglob) + sum_terms(1,i,j,k)
-          accel_crust_mantle(2,iglob) = accel_crust_mantle(2,iglob) + sum_terms(2,i,j,k)
-          accel_crust_mantle(3,iglob) = accel_crust_mantle(3,iglob) + sum_terms(3,i,j,k)
+          accel_crust_mantle(:,iglob) = accel_crust_mantle(:,iglob) + sum_terms(:,i,j,k)
         enddo
       enddo
     enddo
@@ -916,26 +806,42 @@
 
 ! use Runge-Kutta scheme to march in time
       do i_SLS = 1,N_SLS
-        do i_memory = 1,5
 
 ! get coefficients for that standard linear solid
 ! IMPROVE we use mu_v here even if there is some anisotropy
 ! IMPROVE we should probably use an average value instead
 
-          ! reformatted R_memory to handle large factor_common and reduced [alpha,beta,gamma]val
-          factor_common_c44_muv = factor_common(i_SLS,:,:,:,ispec)
-          if(ANISOTROPIC_3D_MANTLE_VAL) then
-            factor_common_c44_muv = factor_common_c44_muv * c44store(:,:,:,ispec)
-          else
-            factor_common_c44_muv = factor_common_c44_muv * muvstore(:,:,:,ispec)
-          endif
+        ! reformatted R_memory to handle large factor_common and reduced [alpha,beta,gamma]val
+        factor_common_c44_muv = factor_common(i_SLS,:,:,:,ispec)
+        if(ANISOTROPIC_3D_MANTLE_VAL) then
+          factor_common_c44_muv = factor_common_c44_muv * c44store(:,:,:,ispec)
+        else
+          factor_common_c44_muv = factor_common_c44_muv * muvstore(:,:,:,ispec)
+        endif
 
-          R_memory(i_memory,i_SLS,:,:,:,ispec) = alphaval(i_SLS) * &
-                    R_memory(i_memory,i_SLS,:,:,:,ispec) + &
-                    factor_common_c44_muv * &
-                    (betaval(i_SLS) * epsilondev(i_memory,:,:,:,ispec) + &
-                    gammaval(i_SLS) * epsilondev_loc(i_memory,:,:,:))
-        enddo
+!        do i_memory = 1,5
+!          R_memory(i_memory,i_SLS,:,:,:,ispec) = alphaval(i_SLS) * &
+!                    R_memory(i_memory,i_SLS,:,:,:,ispec) + &
+!                    factor_common_c44_muv * &
+!                    (betaval(i_SLS) * epsilondev(i_memory,:,:,:,ispec) + &
+!                    gammaval(i_SLS) * epsilondev_loc(i_memory,:,:,:))
+!        enddo
+
+        R_xx(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_xx(i_SLS,:,:,:,ispec) + factor_common_c44_muv * &
+                    (betaval(i_SLS) * epsilondev_xx(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(1,:,:,:))
+
+        R_yy(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_yy(i_SLS,:,:,:,ispec) + factor_common_c44_muv * &
+                    (betaval(i_SLS) * epsilondev_yy(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(2,:,:,:))
+
+        R_xy(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_xy(i_SLS,:,:,:,ispec) + factor_common_c44_muv * &
+                    (betaval(i_SLS) * epsilondev_xy(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(3,:,:,:))
+
+        R_xz(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_xz(i_SLS,:,:,:,ispec) + factor_common_c44_muv * &
+                    (betaval(i_SLS) * epsilondev_xz(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(4,:,:,:))
+
+        R_yz(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_yz(i_SLS,:,:,:,ispec) + factor_common_c44_muv * &
+                    (betaval(i_SLS) * epsilondev_yz(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(5,:,:,:))
+
       enddo
 
     endif
@@ -946,7 +852,11 @@
       do k=1,NGLLZ
         do j=1,NGLLY
           do i=1,NGLLX
-            epsilondev(:,i,j,k,ispec) = epsilondev_loc(:,i,j,k)
+            epsilondev_xx(i,j,k,ispec) = epsilondev_loc(1,i,j,k)
+            epsilondev_yy(i,j,k,ispec) = epsilondev_loc(2,i,j,k)
+            epsilondev_xy(i,j,k,ispec) = epsilondev_loc(3,i,j,k)
+            epsilondev_xz(i,j,k,ispec) = epsilondev_loc(4,i,j,k)
+            epsilondev_yz(i,j,k,ispec) = epsilondev_loc(5,i,j,k)
           enddo
         enddo
       enddo

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_crust_mantle_Dev.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_crust_mantle_Dev.F90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_crust_mantle_Dev.F90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -30,112 +30,96 @@
 ! or compile with: -D_HANDOPT
 !#define _HANDOPT
 
-! BEWARE:
-! BEWARE: we have observed that using _HANDOPT in combination with -O3 or higher can lead to problems on some machines;
-! BEWARE: thus, be careful when using it. At the very least, run the same test simulation once with _HANDOPT and once without
-! BEWARE: and make sure that all the seismograms you get are the same down to roundoff noise.
-! BEWARE:
-
 ! note: these hand optimizations should help compilers to pipeline the code and make better use of the cache;
 !          depending on compilers, it can further decrease the computation time by ~ 30%.
 !          the original routines are commented with "! way 1", the hand-optimized routines with  "! way 2"
 
-  subroutine compute_forces_crust_mantle_Dev(minus_gravity_table,density_table,minus_deriv_gravity_table, &
-          displ_crust_mantle,accel_crust_mantle,xstore,ystore,zstore, &
-          xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-!----------------------
-            is_on_a_slice_edge_crust_mantle,icall, &
-            accel_inner_core,ibool_inner_core,idoubling_inner_core, &
-            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-            buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector,iphase, &
-            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
-            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_INNER_CORE,INCLUDE_CENTRAL_CUBE,iphase_CC, &
-!----------------------
-          hprime_xx,hprime_xxT, &
-          hprimewgll_xx,hprimewgll_xxT, &
-          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-          kappavstore,kappahstore,muvstore,muhstore,eta_anisostore, &
-          c11store,c12store,c13store,c14store,c15store,c16store,c22store, &
-          c23store,c24store,c25store,c26store,c33store,c34store,c35store, &
-          c36store,c44store,c45store,c46store,c55store,c56store,c66store, &
-          ibool,ispec_is_tiso, &
-        ! --idoubling,
-          R_memory,epsilondev,epsilon_trace_over_3,one_minus_sum_beta, &
-          alphaval,betaval,gammaval,factor_common,vx,vy,vz,vnspec)
+  subroutine compute_forces_crust_mantle_Dev( displ_crust_mantle,accel_crust_mantle, &
+                                              phase_is_inner, &
+                                              R_xx,R_yy,R_xy,R_xz,R_yz, &
+                                              epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+                                              epsilondev_xz,epsilondev_yz, &
+                                              epsilon_trace_over_3, &
+                                              alphaval,betaval,gammaval, &
+                                              factor_common,vx,vy,vz,vnspec)
 
 ! this routine is optimized for NGLLX = NGLLY = NGLLZ = 5 using the Deville et al. (2002) inlined matrix-matrix products
 
-  implicit none
+  use constants
 
-  include "constants.h"
+  use specfem_par,only: &
+    hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT,wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
+    minus_gravity_table,density_table,minus_deriv_gravity_table, &
+    myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+    iprocfrom_faces,iprocto_faces, &
+    iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+    npoin2D_max_all_CM_IC,INCLUDE_CENTRAL_CUBE
 
-  ! include values created by the mesher
-  ! done for performance only using static allocation to allow for loop unrolling
-  include "OUTPUT_FILES/values_from_mesher.h"
+  use specfem_par_crustmantle,only: &
+    xstore => xstore_crust_mantle,ystore => ystore_crust_mantle,zstore => zstore_crust_mantle, &
+    xix => xix_crust_mantle,xiy => xiy_crust_mantle,xiz => xiz_crust_mantle, &
+    etax => etax_crust_mantle,etay => etay_crust_mantle,etaz => etaz_crust_mantle, &
+    gammax => gammax_crust_mantle,gammay => gammay_crust_mantle,gammaz => gammaz_crust_mantle, &
+    kappavstore => kappavstore_crust_mantle,kappahstore => kappahstore_crust_mantle, &
+    muvstore => muvstore_crust_mantle,muhstore => muhstore_crust_mantle, &
+    eta_anisostore => eta_anisostore_crust_mantle, &
+    c11store => c11store_crust_mantle,c12store => c12store_crust_mantle,c13store => c13store_crust_mantle, &
+    c14store => c14store_crust_mantle,c15store => c15store_crust_mantle,c16store => c16store_crust_mantle, &
+    c22store => c22store_crust_mantle,c23store => c23store_crust_mantle,c24store => c24store_crust_mantle, &
+    c25store => c25store_crust_mantle,c26store => c26store_crust_mantle,c33store => c33store_crust_mantle, &
+    c34store => c34store_crust_mantle,c35store => c35store_crust_mantle,c36store => c36store_crust_mantle, &
+    c44store => c44store_crust_mantle,c45store => c45store_crust_mantle,c46store => c46store_crust_mantle, &
+    c55store => c55store_crust_mantle,c56store => c56store_crust_mantle,c66store => c66store_crust_mantle, &
+    ibool => ibool_crust_mantle, &
+    ispec_is_tiso => ispec_is_tiso_crust_mantle, &
+    one_minus_sum_beta => one_minus_sum_beta_crust_mantle, &
+    iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+    npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+    iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+    phase_ispec_inner => phase_ispec_inner_crust_mantle, &
+    nspec_outer => nspec_outer_crust_mantle, &
+    nspec_inner => nspec_inner_crust_mantle
 
+  use specfem_par_innercore,only: &
+    ibool_inner_core,idoubling_inner_core, &
+    iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+    npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+    iboolfaces_inner_core,iboolcorner_inner_core, &
+    nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+    npoin2D_cube_from_slices, &
+    ibool_central_cube, &
+    receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC
+
+  implicit none
+
   ! displacement and acceleration
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: displ_crust_mantle,accel_crust_mantle
-  ! arrays with mesh parameters per slice
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: ibool
 
-  ! x y and z contain r theta and phi
-  real(kind=CUSTOM_REAL), dimension(NGLOB_CRUST_MANTLE) :: xstore,ystore,zstore
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: &
-        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
-
-  ! array with derivatives of Lagrange polynomials and precalculated products
-  double precision, dimension(NGLLX,NGLLY,NGLLZ) :: wgll_cube
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprimewgll_xx
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xxT,hprimewgll_xxT
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
-
-  ! store anisotropic properties only where needed to save memory
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_TISO_MANTLE) :: &
-        kappahstore,muhstore,eta_anisostore
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ISO_MANTLE) :: &
-        kappavstore,muvstore
-
-  ! arrays for full anisotropy only when needed
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ANISO_MANTLE) :: &
-        c11store,c12store,c13store,c14store,c15store,c16store,c22store, &
-        c23store,c24store,c25store,c26store,c33store,c34store,c35store, &
-        c36store,c44store,c45store,c46store,c55store,c56store,c66store
-
   ! attenuation
   ! memory variables for attenuation
   ! memory variables R_ij are stored at the local rather than global level
   ! to allow for optimization of cache access by compiler
-  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUAT) :: R_memory
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: epsilondev
+!  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUAT) :: R_memory
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUAT) :: &
+    R_xx,R_yy,R_xy,R_xz,R_yz
+
+!  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: epsilondev
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: &
+    epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
+
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: epsilon_trace_over_3
 
   integer :: vx,vy,vz,vnspec
 
   ! [alpha,beta,gamma]val reduced to N_SLS and factor_common to N_SLS*NUM_NODES
   real(kind=CUSTOM_REAL), dimension(N_SLS, vx, vy, vz, vnspec) :: factor_common
-  real(kind=CUSTOM_REAL), dimension(vx, vy, vz, vnspec) :: one_minus_sum_beta
   real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
 
-  ! array with the local to global mapping per slice
-  logical, dimension(NSPEC_CRUST_MANTLE) :: ispec_is_tiso
+  ! inner/outer element run flag
+  logical :: phase_is_inner
 
-  ! gravity
-  double precision, dimension(NRAD_GRAVITY) :: minus_gravity_table,density_table,minus_deriv_gravity_table
+  ! local parameters
 
-! local parameters
   ! Deville
   ! manually inline the calls to the Deville et al. (2002) routines
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
@@ -181,72 +165,11 @@
   ! for gravity
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: rho_s_H
 
-  integer :: ispec
-  integer :: i,j,k
-  integer :: iglob1
+  integer :: ispec,i,j,k,iglob1
+!  integer :: computed_elements
+  integer :: num_elements,ispec_p
+  integer :: iphase
 
-
-! this for non blocking MPI
-  integer :: iphase,icall
-
-  integer :: computed_elements
-
-  logical, dimension(NSPEC_CRUST_MANTLE) :: is_on_a_slice_edge_crust_mantle
-
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE) :: accel_inner_core
-
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: ibool_inner_core
-
-  integer, dimension(NSPEC_INNER_CORE) :: idoubling_inner_core
-
-  integer :: ichunk,iproc_xi,iproc_eta,myrank
-
-  integer, dimension(NCHUNKS_VAL,0:NPROC_XI_VAL-1,0:NPROC_ETA_VAL-1) :: addressing
-
-  integer, dimension(NGLOB2DMAX_XMIN_XMAX_CM) :: iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle
-  integer, dimension(NGLOB2DMAX_YMIN_YMAX_CM) :: iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle
-
-  integer, dimension(NGLOB2DMAX_XMIN_XMAX_IC) :: iboolleft_xi_inner_core,iboolright_xi_inner_core
-  integer, dimension(NGLOB2DMAX_YMIN_YMAX_IC) :: iboolleft_eta_inner_core,iboolright_eta_inner_core
-
-  integer npoin2D_faces_crust_mantle(NUMFACES_SHARED)
-  integer npoin2D_faces_inner_core(NUMFACES_SHARED)
-
-  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-       npoin2D_xi_inner_core,npoin2D_eta_inner_core
-
-! communication pattern for faces between chunks
-  integer, dimension(NUMMSGS_FACES_VAL) :: iprocfrom_faces,iprocto_faces
-
-! communication pattern for corners between chunks
-  integer, dimension(NCORNERSCHUNKS_VAL) :: iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners
-
-  integer, dimension(NGLOB1D_RADIAL_CM,NUMCORNERS_SHARED) :: iboolcorner_crust_mantle
-  integer, dimension(NGLOB1D_RADIAL_IC,NUMCORNERS_SHARED) :: iboolcorner_inner_core
-
-  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_crust_mantle
-  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_inner_core
-
-  integer :: npoin2D_max_all_CM_IC
-  real(kind=CUSTOM_REAL), dimension(NDIM,npoin2D_max_all_CM_IC) :: buffer_send_faces,buffer_received_faces
-
-! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB1D_RADIAL_CM + NGLOB1D_RADIAL_IC) :: &
-     buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector
-
-! for matching with central cube in inner core
-  integer nb_msgs_theor_in_cube, npoin2D_cube_from_slices,iphase_CC
-  integer, dimension(nb_msgs_theor_in_cube) :: sender_from_slices_to_cube
-  double precision, dimension(npoin2D_cube_from_slices,NDIM) :: buffer_slices
-  double precision, dimension(npoin2D_cube_from_slices,NDIM,nb_msgs_theor_in_cube) :: buffer_all_cube_from_slices
-  integer, dimension(nb_msgs_theor_in_cube,npoin2D_cube_from_slices):: ibool_central_cube
-  integer receiver_cube_from_slices
-  logical :: INCLUDE_CENTRAL_CUBE
-
-! local to global mapping
-  integer NSPEC2D_BOTTOM_INNER_CORE
-  integer, dimension(NSPEC2D_BOTTOM_INNER_CORE) :: ibelm_bottom_inner_core
-
 #ifdef _HANDOPT
   integer, dimension(5) :: iglobv5
 #endif
@@ -255,44 +178,21 @@
 !   big loop over all spectral elements in the solid
 ! ****************************************************
 
-  computed_elements = 0
+!  computed_elements = 0
+  if( .not. phase_is_inner ) then
+    iphase = 1
+    num_elements = nspec_outer
+  else
+    iphase = 2
+    num_elements = nspec_inner
+  endif
 
-  do ispec = 1,NSPEC_CRUST_MANTLE
+  do ispec_p = 1,num_elements
 
-! hide communications by computing the edges first
-    if((icall == 2 .and. is_on_a_slice_edge_crust_mantle(ispec)) .or. &
-       (icall == 1 .and. .not. is_on_a_slice_edge_crust_mantle(ispec))) cycle
+    ispec = phase_ispec_inner(ispec_p,iphase)
 
-! process the communications every ELEMENTS_NONBLOCKING elements
-    computed_elements = computed_elements + 1
-    if (USE_NONBLOCKING_COMMS .and. icall == 2 .and. mod(computed_elements,ELEMENTS_NONBLOCKING_CM_IC) == 0) then
+    ! only compute elements which belong to current phase (inner or outer elements)
 
-      if(iphase <= 7) call assemble_MPI_vector(myrank,accel_crust_mantle,accel_inner_core, &
-            iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-            buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector, &
-            NUMMSGS_FACES_VAL,NCORNERSCHUNKS_VAL, &
-            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL_CM, &
-            NGLOB1D_RADIAL_IC,NCHUNKS_VAL,iphase)
-
-      if(INCLUDE_CENTRAL_CUBE) then
-          if(iphase > 7 .and. iphase_CC <= 4) &
-            call assemble_MPI_central_cube(ichunk,nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-                   npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
-                   receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core, &
-                   ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,accel_inner_core,NDIM,iphase_CC)
-      endif
-
-    endif
-
     ! subroutines adapted from Deville, Fischer and Mund, High-order methods
     ! for incompressible fluid flow, Cambridge University Press (2002),
     ! pages 386 and 389 and Figure 8.3.1
@@ -416,7 +316,8 @@
                     c23store,c24store,c25store,c26store,c33store,c34store,c35store, &
                     c36store,c44store,c45store,c46store,c55store,c56store,c66store, &
                     ibool, &
-                    R_memory,epsilon_trace_over_3, &
+                    R_xx,R_yy,R_xy,R_xz,R_yz, &
+                    epsilon_trace_over_3, &
                     one_minus_sum_beta,vx,vy,vz,vnspec, &
                     tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, &
                     dummyx_loc,dummyy_loc,dummyz_loc,epsilondev_loc,rho_s_H)
@@ -430,7 +331,8 @@
                     wgll_cube, &
                     kappavstore,muvstore, &
                     ibool, &
-                    R_memory,epsilon_trace_over_3, &
+                    R_xx,R_yy,R_xy,R_xz,R_yz, &
+                    epsilon_trace_over_3, &
                     one_minus_sum_beta,vx,vy,vz,vnspec, &
                     tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, &
                     dummyx_loc,dummyy_loc,dummyz_loc,epsilondev_loc,rho_s_H)
@@ -443,7 +345,8 @@
                     wgll_cube, &
                     kappavstore,kappahstore,muvstore,muhstore,eta_anisostore, &
                     ibool, &
-                    R_memory,epsilon_trace_over_3, &
+                    R_xx,R_yy,R_xy,R_xz,R_yz, &
+                    epsilon_trace_over_3, &
                     one_minus_sum_beta,vx,vy,vz,vnspec, &
                     tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, &
                     dummyx_loc,dummyy_loc,dummyz_loc,epsilondev_loc,rho_s_H)
@@ -581,17 +484,23 @@
     if(ATTENUATION_VAL .and. ( USE_ATTENUATION_MIMIC .eqv. .false. ) ) then
 
       ! updates R_memory
-      call compute_element_att_memory_cr(ispec,R_memory, &
+      call compute_element_att_memory_cr(ispec,R_xx,R_yy,R_xy,R_xz,R_yz, &
                                       vx,vy,vz,vnspec,factor_common, &
                                       alphaval,betaval,gammaval, &
                                       c44store,muvstore, &
-                                      epsilondev,epsilondev_loc)
+                                      epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+                                      epsilondev_xz,epsilondev_yz, &
+                                      epsilondev_loc)
 
     endif
 
     ! save deviatoric strain for Runge-Kutta scheme
     if(COMPUTE_AND_STORE_STRAIN) then
-      epsilondev(:,:,:,:,ispec) = epsilondev_loc(:,:,:,:)
+      epsilondev_xx(:,:,:,ispec) = epsilondev_loc(1,:,:,:)
+      epsilondev_yy(:,:,:,ispec) = epsilondev_loc(2,:,:,:)
+      epsilondev_xy(:,:,:,ispec) = epsilondev_loc(3,:,:,:)
+      epsilondev_xz(:,:,:,ispec) = epsilondev_loc(4,:,:,:)
+      epsilondev_yz(:,:,:,ispec) = epsilondev_loc(5,:,:,:)
     endif
 
   enddo   ! spectral element loop NSPEC_CRUST_MANTLE

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,767 @@
+!=====================================================================
+!
+!          S p e c f e m 3 D  G l o b e  V e r s i o n  5 . 1
+!          --------------------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!                        Princeton University, USA
+!             and University of Pau / CNRS / INRIA, France
+! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+!                            April 2011
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+  subroutine compute_forces_elastic()
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore,only: accel_outer_core,b_accel_outer_core, &
+                                  normal_top_outer_core,jacobian2D_top_outer_core, &
+                                  normal_bottom_outer_core,jacobian2D_bottom_outer_core, &
+                                  ibelm_top_outer_core,ibelm_bottom_outer_core, &
+                                  ibool_outer_core
+  use specfem_par_movie
+  implicit none
+
+  ! local parameters
+  ! non blocking MPI
+  ! iphase: iphase = 1 is for computing outer elements in the crust_mantle and inner_core regions,
+  !              iphase = 2 is for computing inner elements (former icall parameter)
+  integer :: iphase
+  logical :: phase_is_inner
+
+  ! ****************************************************
+  !   big loop over all spectral elements in the solid
+  ! ****************************************************
+
+  ! compute internal forces in the solid regions
+
+  ! for anisotropy and gravity, x y and z contain r theta and phi
+
+  ! distinguishes two runs: for elements on MPI interfaces, and elements within the partitions
+  do iphase = 1,2
+
+    ! first, iphase == 1 for points on MPI interfaces (thus outer elements)
+    ! second, iphase == 2 for points purely inside partition (thus inner elements)
+    !
+    ! compute all the outer elements first, then sends out non blocking MPI communication
+    ! and continues computing inner elements (overlapping)
+    if( iphase == 1 ) then
+      phase_is_inner = .false.
+    else
+      phase_is_inner = .true.
+    endif
+
+
+    if( .NOT. GPU_MODE ) then
+      ! on CPU
+
+      ! compute internal forces in the solid regions
+      ! note: for anisotropy and gravity, x y and z contain r theta and phi
+      if( USE_DEVILLE_PRODUCTS_VAL ) then
+        ! uses Deville (2002) optimizations
+        ! crust/mantle region
+        call compute_forces_crust_mantle_Dev( displ_crust_mantle,accel_crust_mantle, &
+                                      phase_is_inner, &
+                                      R_xx_crust_mantle,R_yy_crust_mantle,R_xy_crust_mantle, &
+                                      R_xz_crust_mantle,R_yz_crust_mantle, &
+                                      epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+                                      epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
+                                      eps_trace_over_3_crust_mantle, &
+                                      alphaval,betaval,gammaval,factor_common_crust_mantle, &
+                                      size(factor_common_crust_mantle,2), size(factor_common_crust_mantle,3), &
+                                      size(factor_common_crust_mantle,4), size(factor_common_crust_mantle,5) )
+        ! inner core region
+        call compute_forces_inner_core_Dev( displ_inner_core,accel_inner_core, &
+                                      phase_is_inner, &
+                                      R_xx_inner_core,R_yy_inner_core,R_xy_inner_core,R_xz_inner_core,R_yz_inner_core, &
+                                      epsilondev_xx_inner_core,epsilondev_yy_inner_core,epsilondev_xy_inner_core, &
+                                      epsilondev_xz_inner_core,epsilondev_yz_inner_core, &
+                                      eps_trace_over_3_inner_core,&
+                                      alphaval,betaval,gammaval, &
+                                      factor_common_inner_core, &
+                                      size(factor_common_inner_core,2), size(factor_common_inner_core,3), &
+                                      size(factor_common_inner_core,4), size(factor_common_inner_core,5) )
+
+      else
+        ! no Deville optimization
+        ! crust/mantle region
+        call compute_forces_crust_mantle( displ_crust_mantle,accel_crust_mantle, &
+                                      phase_is_inner, &
+                                      R_xx_crust_mantle,R_yy_crust_mantle,R_xy_crust_mantle, &
+                                      R_xz_crust_mantle,R_yz_crust_mantle, &
+                                      epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+                                      epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
+                                      eps_trace_over_3_crust_mantle, &
+                                      alphaval,betaval,gammaval,factor_common_crust_mantle, &
+                                      size(factor_common_crust_mantle,2), size(factor_common_crust_mantle,3), &
+                                      size(factor_common_crust_mantle,4), size(factor_common_crust_mantle,5) )
+        ! inner core region
+        call compute_forces_inner_core( displ_inner_core,accel_inner_core, &
+                                      phase_is_inner, &
+                                      R_xx_inner_core,R_yy_inner_core,R_xy_inner_core,R_xz_inner_core,R_yz_inner_core, &
+                                      epsilondev_xx_inner_core,epsilondev_yy_inner_core,epsilondev_xy_inner_core, &
+                                      epsilondev_xz_inner_core,epsilondev_yz_inner_core, &
+                                      eps_trace_over_3_inner_core,&
+                                      alphaval,betaval,gammaval, &
+                                      factor_common_inner_core, &
+                                      size(factor_common_inner_core,2), size(factor_common_inner_core,3), &
+                                      size(factor_common_inner_core,4), size(factor_common_inner_core,5) )
+
+      endif
+
+      ! adjoint / kernel runs
+      if (SIMULATION_TYPE == 3 ) then
+        if( USE_DEVILLE_PRODUCTS_VAL ) then
+          ! uses Deville (2002) optimizations
+          ! crust/mantle region
+          call compute_forces_crust_mantle_Dev( b_displ_crust_mantle,b_accel_crust_mantle, &
+                                      phase_is_inner, &
+                                      b_R_xx_crust_mantle,b_R_yy_crust_mantle,b_R_xy_crust_mantle, &
+                                      b_R_xz_crust_mantle,b_R_yz_crust_mantle, &
+                                      b_epsilondev_xx_crust_mantle,b_epsilondev_yy_crust_mantle,&
+                                      b_epsilondev_xy_crust_mantle, &
+                                      b_epsilondev_xz_crust_mantle,b_epsilondev_yz_crust_mantle, &
+                                      b_eps_trace_over_3_crust_mantle, &
+                                      b_alphaval,b_betaval,b_gammaval,factor_common_crust_mantle, &
+                                      size(factor_common_crust_mantle,2), size(factor_common_crust_mantle,3), &
+                                      size(factor_common_crust_mantle,4), size(factor_common_crust_mantle,5) )
+          ! inner core region
+          call compute_forces_inner_core_Dev( b_displ_inner_core,b_accel_inner_core, &
+                                        phase_is_inner, &
+                                        b_R_xx_inner_core,b_R_yy_inner_core,b_R_xy_inner_core, &
+                                        b_R_xz_inner_core,b_R_yz_inner_core, &
+                                        b_epsilondev_xx_inner_core,b_epsilondev_yy_inner_core,b_epsilondev_xy_inner_core, &
+                                        b_epsilondev_xz_inner_core,b_epsilondev_yz_inner_core, &
+                                        b_eps_trace_over_3_inner_core,&
+                                        b_alphaval,b_betaval,b_gammaval, &
+                                        factor_common_inner_core, &
+                                        size(factor_common_inner_core,2), size(factor_common_inner_core,3), &
+                                        size(factor_common_inner_core,4), size(factor_common_inner_core,5) )
+
+        else
+          ! no Deville optimization
+          ! crust/mantle region
+          call compute_forces_crust_mantle( b_displ_crust_mantle,b_accel_crust_mantle, &
+                                        phase_is_inner, &
+                                        b_R_xx_crust_mantle,b_R_yy_crust_mantle,b_R_xy_crust_mantle, &
+                                        b_R_xz_crust_mantle,b_R_yz_crust_mantle, &
+                                        b_epsilondev_xx_crust_mantle,b_epsilondev_yy_crust_mantle,&
+                                        b_epsilondev_xy_crust_mantle, &
+                                        b_epsilondev_xz_crust_mantle,b_epsilondev_yz_crust_mantle, &
+                                        b_eps_trace_over_3_crust_mantle, &
+                                        b_alphaval,b_betaval,b_gammaval,factor_common_crust_mantle, &
+                                        size(factor_common_crust_mantle,2), size(factor_common_crust_mantle,3), &
+                                        size(factor_common_crust_mantle,4), size(factor_common_crust_mantle,5) )
+
+          ! inner core region
+          call compute_forces_inner_core( b_displ_inner_core,b_accel_inner_core, &
+                                        phase_is_inner, &
+                                        b_R_xx_inner_core,b_R_yy_inner_core,b_R_xy_inner_core, &
+                                        b_R_xz_inner_core,b_R_yz_inner_core, &
+                                        b_epsilondev_xx_inner_core,b_epsilondev_yy_inner_core,b_epsilondev_xy_inner_core, &
+                                        b_epsilondev_xz_inner_core,b_epsilondev_yz_inner_core, &
+                                        b_eps_trace_over_3_inner_core,&
+                                        b_alphaval,b_betaval,b_gammaval, &
+                                        factor_common_inner_core, &
+                                        size(factor_common_inner_core,2), size(factor_common_inner_core,3), &
+                                        size(factor_common_inner_core,4), size(factor_common_inner_core,5) )
+        endif
+      endif !SIMULATION_TYPE == 3
+
+    else
+      ! on GPU      
+      call load_GPU_elastic()      
+      
+      ! contains both forward SIM_TYPE==1 and backward SIM_TYPE==3 simulations
+      ! for crust/mantle
+      call compute_forces_crust_mantle_cuda(Mesh_pointer,iphase)
+      ! for inner core
+      call compute_forces_inner_core_cuda(Mesh_pointer,iphase)
+            
+      call load_CPU_elastic()      
+    endif ! GPU_MODE
+
+
+    
+    ! computes additional contributions to acceleration field
+    if( iphase == 1 ) then
+
+      ! absorbing boundaries
+      ! Stacey
+      if(NCHUNKS_VAL /= 6 .and. ABSORBING_CONDITIONS) call compute_stacey_crust_mantle()
+
+      ! add the sources
+      if (SIMULATION_TYPE == 1) &
+        call compute_add_sources(myrank,NSOURCES, &
+                                  accel_crust_mantle,sourcearrays, &
+                                  DT,t0,tshift_cmt,hdur_gaussian,ibool_crust_mantle, &
+                                  islice_selected_source,ispec_selected_source,it, &
+                                  hdur,xi_source,eta_source,gamma_source,nu_source)
+
+      ! add adjoint sources
+      if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then
+        if( nadj_rec_local > 0 ) &
+          call compute_add_sources_adjoint(myrank,nrec, &
+                                  nadj_rec_local,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC, &
+                                  accel_crust_mantle,adj_sourcearrays, &
+                                  nu,xi_receiver,eta_receiver,gamma_receiver, &
+                                  xigll,yigll,zigll,ibool_crust_mantle, &
+                                  islice_selected_rec,ispec_selected_rec, &
+                                  NSTEP_SUB_ADJ,iadjsrc_len,iadjsrc,iadj_vec, &
+                                  it,it_begin,station_name,network_name,DT)
+      endif
+
+      ! add sources for backward/reconstructed wavefield
+      if (SIMULATION_TYPE == 3) &
+        call compute_add_sources_backward(myrank,NSOURCES,NSTEP, &
+                                  b_accel_crust_mantle,sourcearrays, &
+                                  DT,t0,tshift_cmt,hdur_gaussian,ibool_crust_mantle, &
+                                  islice_selected_source,ispec_selected_source,it, &
+                                  hdur,xi_source,eta_source,gamma_source,nu_source)
+
+
+      ! NOISE_TOMOGRAPHY
+      if ( NOISE_TOMOGRAPHY == 1 ) then
+         ! the first step of noise tomography is to use |S(\omega)|^2 as a point force source at one of the receivers.
+         ! hence, instead of a moment tensor 'sourcearrays', a 'noise_sourcearray' for a point force is needed.
+         ! furthermore, the CMTSOLUTION needs to be zero, i.e., no earthquakes.
+         ! now this must be manually set in DATA/CMTSOLUTION, by USERS.
+         call add_source_master_rec_noise(myrank,nrec, &
+                                  NSTEP,accel_crust_mantle,noise_sourcearray, &
+                                  ibool_crust_mantle,islice_selected_rec,ispec_selected_rec, &
+                                  it,irec_master_noise)
+      elseif ( NOISE_TOMOGRAPHY == 2 ) then
+         ! second step of noise tomography, i.e., read the surface movie saved at every timestep
+         ! use the movie to drive the ensemble forward wavefield
+         call noise_read_add_surface_movie(nmovie_points,accel_crust_mantle, &
+                                normal_x_noise,normal_y_noise,normal_z_noise,mask_noise, &
+                                ibelm_top_crust_mantle,ibool_crust_mantle, &
+                                NSPEC2D_TOP(IREGION_CRUST_MANTLE),noise_surface_movie, &
+                                NSTEP-it+1,jacobian2D_top_crust_mantle,wgllwgll_xy)
+          ! be careful, since ensemble forward sources are reversals of generating wavefield "eta"
+          ! hence the "NSTEP-it+1", i.e., start reading from the last timestep
+          ! note the ensemble forward sources are generally distributed on the surface of the earth
+          ! that's to say, the ensemble forward source is kind of a surface force density, not a body force density
+          ! therefore, we must add it here, before applying the inverse of mass matrix
+      elseif ( NOISE_TOMOGRAPHY == 3 ) then
+          ! third step of noise tomography, i.e., read the surface movie saved at every timestep
+          ! use the movie to reconstruct the ensemble forward wavefield
+          ! the ensemble adjoint wavefield is done as usual
+          ! note instead of "NSTEP-it+1", now we use "it", since reconstruction is a reversal of reversal
+          call noise_read_add_surface_movie(nmovie_points,b_accel_crust_mantle, &
+                                normal_x_noise,normal_y_noise,normal_z_noise,mask_noise, &
+                                ibelm_top_crust_mantle,ibool_crust_mantle, &
+                                NSPEC2D_TOP(IREGION_CRUST_MANTLE),noise_surface_movie, &
+                                it,jacobian2D_top_crust_mantle,wgllwgll_xy)
+      endif
+
+
+      ! ****************************************************
+      ! **********  add matching with fluid part  **********
+      ! ****************************************************
+
+      ! only for elements in first matching layer in the solid
+
+      !---
+      !--- couple with outer core at the bottom of the mantle
+      !---
+      if(ACTUALLY_COUPLE_FLUID_CMB) &
+        call compute_coupling_CMB_fluid(displ_crust_mantle,b_displ_crust_mantle, &
+                              accel_crust_mantle,b_accel_crust_mantle, &
+                              ibool_crust_mantle,ibelm_bottom_crust_mantle,  &
+                              accel_outer_core,b_accel_outer_core, &
+                              normal_top_outer_core,jacobian2D_top_outer_core, &
+                              wgllwgll_xy,ibool_outer_core,ibelm_top_outer_core, &
+                              RHO_TOP_OC,minus_g_cmb, &
+                              SIMULATION_TYPE,NSPEC2D_BOTTOM(IREGION_CRUST_MANTLE))
+
+      !---
+      !--- couple with outer core at the top of the inner core
+      !---
+      if(ACTUALLY_COUPLE_FLUID_ICB) &
+        call compute_coupling_ICB_fluid(displ_inner_core,b_displ_inner_core, &
+                              accel_inner_core,b_accel_inner_core, &
+                              ibool_inner_core,ibelm_top_inner_core,  &
+                              accel_outer_core,b_accel_outer_core, &
+                              normal_bottom_outer_core,jacobian2D_bottom_outer_core, &
+                              wgllwgll_xy,ibool_outer_core,ibelm_bottom_outer_core, &
+                              RHO_BOTTOM_OC,minus_g_icb, &
+                              SIMULATION_TYPE,NSPEC2D_TOP(IREGION_INNER_CORE))
+
+    endif ! iphase == 1
+
+    
+    ! assemble all the contributions between slices using MPI
+
+    ! crust/mantle and inner core handled in the same call
+    ! in order to reduce the number of MPI messages by 2
+
+    if( iphase == 1 ) then
+      ! sends out MPI interface data      
+      if(.NOT. GPU_MODE) then
+        ! on CPU      
+        ! sends accel values to corresponding MPI interface neighbors
+        ! crust mantle
+        call assemble_MPI_vector_ext_mesh_s(NPROCTOT_VAL,NGLOB_CRUST_MANTLE, &
+                      accel_crust_mantle, &
+                      buffer_send_vector_crust_mantle,buffer_recv_vector_crust_mantle, &
+                      num_interfaces_crust_mantle,max_nibool_interfaces_crust_mantle, &
+                      nibool_interfaces_crust_mantle,ibool_interfaces_crust_mantle,&
+                      my_neighbours_crust_mantle, &
+                      request_send_vector_crust_mantle,request_recv_vector_crust_mantle)
+        ! inner core
+        call assemble_MPI_vector_ext_mesh_s(NPROCTOT_VAL,NGLOB_INNER_CORE, &
+                      accel_inner_core, &
+                      buffer_send_vector_inner_core,buffer_recv_vector_inner_core, &
+                      num_interfaces_inner_core,max_nibool_interfaces_inner_core, &
+                      nibool_interfaces_inner_core,ibool_interfaces_inner_core,&
+                      my_neighbours_inner_core, &
+                      request_send_vector_inner_core,request_recv_vector_inner_core)
+      else
+        ! on GPU    
+        call load_GPU_elastic()
+        ! crust mantle
+        call assemble_MPI_vector_send_cuda(NPROCTOT_VAL, &
+                      buffer_send_vector_crust_mantle,buffer_recv_vector_crust_mantle, &
+                      num_interfaces_crust_mantle,max_nibool_interfaces_crust_mantle, &
+                      nibool_interfaces_crust_mantle,&
+                      my_neighbours_crust_mantle, &
+                      request_send_vector_crust_mantle,request_recv_vector_crust_mantle, &
+                      IREGION_CRUST_MANTLE, &
+                      1) ! <-- 1 == fwd accel
+        ! inner core
+        call assemble_MPI_vector_send_cuda(NPROCTOT_VAL, &
+                      buffer_send_vector_inner_core,buffer_recv_vector_inner_core, &
+                      num_interfaces_inner_core,max_nibool_interfaces_inner_core, &
+                      nibool_interfaces_inner_core,&
+                      my_neighbours_inner_core, &
+                      request_send_vector_inner_core,request_recv_vector_inner_core, &
+                      IREGION_INNER_CORE, &
+                      1)
+      endif ! GPU_MODE
+
+      ! adjoint / kernel runs
+      if (SIMULATION_TYPE == 3) then
+        if(.NOT. GPU_MODE) then
+          ! on CPU          
+          ! sends accel values to corresponding MPI interface neighbors
+          ! crust mantle
+          call assemble_MPI_vector_ext_mesh_s(NPROCTOT_VAL,NGLOB_CRUST_MANTLE, &
+                        b_accel_crust_mantle, &
+                        b_buffer_send_vector_crust_mantle,b_buffer_recv_vector_crust_mantle, &
+                        num_interfaces_crust_mantle,max_nibool_interfaces_crust_mantle, &
+                        nibool_interfaces_crust_mantle,ibool_interfaces_crust_mantle,&
+                        my_neighbours_crust_mantle, &
+                        b_request_send_vector_crust_mantle,b_request_recv_vector_crust_mantle)
+          ! inner core
+          call assemble_MPI_vector_ext_mesh_s(NPROCTOT_VAL,NGLOB_INNER_CORE, &
+                        b_accel_inner_core, &
+                        b_buffer_send_vector_inner_core,b_buffer_recv_vector_inner_core, &
+                        num_interfaces_inner_core,max_nibool_interfaces_inner_core, &
+                        nibool_interfaces_inner_core,ibool_interfaces_inner_core,&
+                        my_neighbours_inner_core, &
+                        b_request_send_vector_inner_core,b_request_recv_vector_inner_core)
+        else 
+          ! on GPU
+          call load_GPU_elastic()
+          ! crust mantle
+          call assemble_MPI_vector_send_cuda(NPROCTOT_VAL, &
+                      b_buffer_send_vector_crust_mantle,b_buffer_recv_vector_crust_mantle, &
+                      num_interfaces_crust_mantle,max_nibool_interfaces_crust_mantle, &
+                      nibool_interfaces_crust_mantle,&
+                      my_neighbours_crust_mantle, &
+                      b_request_send_vector_crust_mantle,b_request_recv_vector_crust_mantle, &
+                      IREGION_CRUST_MANTLE, &
+                      3) ! <-- 3 == adjoint b_accel 
+          ! inner core
+          call assemble_MPI_vector_send_cuda(NPROCTOT_VAL, &
+                      b_buffer_send_vector_inner_core,b_buffer_recv_vector_inner_core, &
+                      num_interfaces_inner_core,max_nibool_interfaces_inner_core, &
+                      nibool_interfaces_inner_core,&
+                      my_neighbours_inner_core, &
+                      b_request_send_vector_inner_core,b_request_recv_vector_inner_core, &
+                      IREGION_INNER_CORE, &
+                      3)
+        endif ! GPU
+      endif ! SIMULATION_TYPE == 3
+
+    else
+      ! waits for send/receive requests to be completed and assembles values      
+      if(.NOT. GPU_MODE) then
+        ! on CPU
+        ! crust mantle
+        call assemble_MPI_vector_ext_mesh_w(NPROCTOT_VAL,NGLOB_CRUST_MANTLE, &
+                              accel_crust_mantle, &
+                              buffer_recv_vector_crust_mantle,num_interfaces_crust_mantle,&
+                              max_nibool_interfaces_crust_mantle, &
+                              nibool_interfaces_crust_mantle,ibool_interfaces_crust_mantle, &
+                              request_send_vector_crust_mantle,request_recv_vector_crust_mantle)
+        ! inner core
+        call assemble_MPI_vector_ext_mesh_w(NPROCTOT_VAL,NGLOB_INNER_CORE, &
+                              accel_inner_core, &
+                              buffer_recv_vector_inner_core,num_interfaces_inner_core,&
+                              max_nibool_interfaces_inner_core, &
+                              nibool_interfaces_inner_core,ibool_interfaces_inner_core, &
+                              request_send_vector_inner_core,request_recv_vector_inner_core)
+      else
+        ! on GPU
+        ! crust mantle  
+        call assemble_MPI_vector_write_cuda(NPROCTOT_VAL, &
+                            buffer_recv_vector_crust_mantle, &
+                            num_interfaces_crust_mantle,max_nibool_interfaces_crust_mantle, &
+                            request_send_vector_crust_mantle,request_recv_vector_crust_mantle, &
+                            IREGION_CRUST_MANTLE, &
+                            1) ! <-- 1 == fwd accel
+        ! inner core
+        call assemble_MPI_vector_write_cuda(NPROCTOT_VAL, &
+                            buffer_recv_vector_inner_core, &
+                            num_interfaces_inner_core,max_nibool_interfaces_inner_core, &
+                            request_send_vector_inner_core,request_recv_vector_inner_core, &
+                            IREGION_INNER_CORE, &
+                            1) 
+        call load_CPU_elastic()
+      endif
+      
+
+      ! adjoint / kernel runs
+      if (SIMULATION_TYPE == 3) then
+        ! waits for send/receive requests to be completed and assembles values      
+        if(.NOT. GPU_MODE) then
+          ! on CPU
+          ! crust mantle
+          call assemble_MPI_vector_ext_mesh_w(NPROCTOT_VAL,NGLOB_CRUST_MANTLE, &
+                              b_accel_crust_mantle, &
+                              b_buffer_recv_vector_crust_mantle,num_interfaces_crust_mantle,&
+                              max_nibool_interfaces_crust_mantle, &
+                              nibool_interfaces_crust_mantle,ibool_interfaces_crust_mantle, &
+                              b_request_send_vector_crust_mantle,b_request_recv_vector_crust_mantle)
+          ! inner core
+          call assemble_MPI_vector_ext_mesh_w(NPROCTOT_VAL,NGLOB_INNER_CORE, &
+                              b_accel_inner_core, &
+                              b_buffer_recv_vector_inner_core,num_interfaces_inner_core,&
+                              max_nibool_interfaces_inner_core, &
+                              nibool_interfaces_inner_core,ibool_interfaces_inner_core, &
+                              b_request_send_vector_inner_core,b_request_recv_vector_inner_core)
+                            
+        else
+          ! on GPU
+          ! crust mantle
+          call assemble_MPI_vector_write_cuda(NPROCTOT_VAL, &
+                            b_buffer_recv_vector_crust_mantle, &
+                            num_interfaces_crust_mantle,max_nibool_interfaces_crust_mantle, &
+                            b_request_send_vector_crust_mantle,b_request_recv_vector_crust_mantle, &
+                            IREGION_CRUST_MANTLE, &
+                            3) ! <-- 3 == adjoint b_accel 
+          ! inner core
+          call assemble_MPI_vector_write_cuda(NPROCTOT_VAL,&
+                            b_buffer_recv_vector_inner_core, &
+                            num_interfaces_inner_core,max_nibool_interfaces_inner_core, &
+                            b_request_send_vector_inner_core,b_request_recv_vector_inner_core, &
+                            IREGION_INNER_CORE, &
+                            3) 
+          call load_CPU_elastic()
+        endif                            
+      endif ! SIMULATION_TYPE == 3
+    endif ! iphase == 1
+
+  enddo ! iphase
+
+  ! updates (only) acceleration w/ rotation in the crust/mantle region (touches oceans)
+  if(.NOT. GPU_MODE) then
+    ! on CPU
+    call compute_forces_el_update_accel(veloc_crust_mantle,accel_crust_mantle, &
+                                      two_omega_earth,rmass_crust_mantle)
+    ! adjoint / kernel runs
+    if (SIMULATION_TYPE == 3) &
+      call compute_forces_el_update_accel(b_veloc_crust_mantle,b_accel_crust_mantle, &
+                                      b_two_omega_earth,rmass_crust_mantle)
+  else
+    ! on GPU
+    call load_GPU_elastic()    
+    call kernel_3_a_cuda(Mesh_pointer, &
+                        deltatover2,SIMULATION_TYPE,b_deltatover2,OCEANS_VAL)
+    call load_CPU_elastic()
+  endif
+  
+  ! couples ocean with crust mantle
+  ! (updates acceleration with ocean load approximation)
+  if(OCEANS_VAL) &
+    call compute_coupling_ocean(accel_crust_mantle,b_accel_crust_mantle, &
+                          rmass_crust_mantle,rmass_ocean_load,normal_top_crust_mantle, &
+                          ibool_crust_mantle,ibelm_top_crust_mantle, &
+                          updated_dof_ocean_load, &
+                          SIMULATION_TYPE,NSPEC2D_TOP(IREGION_CRUST_MANTLE))
+
+  ! Newmark time scheme:
+  ! corrector terms for elastic parts
+  ! (updates velocity)
+  if(.NOT. GPU_MODE ) then
+    ! on CPU
+    call compute_forces_el_update_veloc(veloc_crust_mantle,accel_crust_mantle, &
+                                      veloc_inner_core,accel_inner_core, &
+                                      deltatover2,two_omega_earth,rmass_inner_core)
+    ! adjoint / kernel runs
+    if (SIMULATION_TYPE == 3) &
+      call compute_forces_el_update_veloc(b_veloc_crust_mantle,b_accel_crust_mantle, &
+                                        b_veloc_inner_core,b_accel_inner_core, &
+                                        b_deltatover2,b_two_omega_earth,rmass_inner_core)
+  else
+    ! on GPU
+    call load_GPU_elastic()
+    call kernel_3_b_cuda(Mesh_pointer, &
+                        deltatover2,SIMULATION_TYPE,b_deltatover2,OCEANS_VAL)
+    call load_CPU_elastic()    
+  endif
+  
+  end subroutine compute_forces_elastic
+
+
+!=====================================================================
+
+  subroutine compute_forces_el_update_accel(veloc_crust_mantle,accel_crust_mantle, &
+                                           two_omega_earth,rmass_crust_mantle)
+
+! multiplies the assembled acceleration by the (inverse) mass matrix and adds
+! the rotation (Coriolis-type) term two_omega_earth x veloc to the horizontal
+! components, for the crust/mantle region only
+
+  use specfem_par,only: CUSTOM_REAL,NGLOB_CRUST_MANTLE,NDIM
+
+#ifdef _HANDOPT_NEWMARK
+! note: guard must match the code below, which is compiled under
+!       _HANDOPT_NEWMARK (previously _HANDOPT, which left
+!       imodulo_NGLOB_CRUST_MANTLE4 undefined if only _HANDOPT_NEWMARK was set)
+  use specfem_par,only: imodulo_NGLOB_CRUST_MANTLE4
+#endif
+
+  implicit none
+
+  ! velocity & acceleration
+  ! crust/mantle region
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: veloc_crust_mantle,accel_crust_mantle
+
+  ! mass matrix (presumably already inverted by the mesher - TODO confirm)
+  real(kind=CUSTOM_REAL), dimension(NGLOB_CRUST_MANTLE) :: rmass_crust_mantle
+
+  ! 2*Omega for the Earth's rotation correction
+  real(kind=CUSTOM_REAL) :: two_omega_earth
+
+  ! local parameters
+  integer :: i
+
+  ! updates acceleration w/ rotation in crust/mantle region only
+
+#ifdef _HANDOPT_NEWMARK
+! way 2:
+  ! hand-optimized version: peel off the first (NGLOB mod 4) points, then
+  ! process the remainder with a manually 4-way unrolled loop
+  if(imodulo_NGLOB_CRUST_MANTLE4 >= 1) then
+    do i=1,imodulo_NGLOB_CRUST_MANTLE4
+      accel_crust_mantle(1,i) = accel_crust_mantle(1,i)*rmass_crust_mantle(i) &
+             + two_omega_earth*veloc_crust_mantle(2,i)
+      accel_crust_mantle(2,i) = accel_crust_mantle(2,i)*rmass_crust_mantle(i) &
+             - two_omega_earth*veloc_crust_mantle(1,i)
+      accel_crust_mantle(3,i) = accel_crust_mantle(3,i)*rmass_crust_mantle(i)
+    enddo
+  endif
+  do i=imodulo_NGLOB_CRUST_MANTLE4+1,NGLOB_CRUST_MANTLE,4
+    accel_crust_mantle(1,i) = accel_crust_mantle(1,i)*rmass_crust_mantle(i) &
+             + two_omega_earth*veloc_crust_mantle(2,i)
+    accel_crust_mantle(2,i) = accel_crust_mantle(2,i)*rmass_crust_mantle(i) &
+             - two_omega_earth*veloc_crust_mantle(1,i)
+    accel_crust_mantle(3,i) = accel_crust_mantle(3,i)*rmass_crust_mantle(i)
+
+    accel_crust_mantle(1,i+1) = accel_crust_mantle(1,i+1)*rmass_crust_mantle(i+1) &
+             + two_omega_earth*veloc_crust_mantle(2,i+1)
+    accel_crust_mantle(2,i+1) = accel_crust_mantle(2,i+1)*rmass_crust_mantle(i+1) &
+             - two_omega_earth*veloc_crust_mantle(1,i+1)
+    accel_crust_mantle(3,i+1) = accel_crust_mantle(3,i+1)*rmass_crust_mantle(i+1)
+
+    accel_crust_mantle(1,i+2) = accel_crust_mantle(1,i+2)*rmass_crust_mantle(i+2) &
+             + two_omega_earth*veloc_crust_mantle(2,i+2)
+    accel_crust_mantle(2,i+2) = accel_crust_mantle(2,i+2)*rmass_crust_mantle(i+2) &
+             - two_omega_earth*veloc_crust_mantle(1,i+2)
+    accel_crust_mantle(3,i+2) = accel_crust_mantle(3,i+2)*rmass_crust_mantle(i+2)
+
+    accel_crust_mantle(1,i+3) = accel_crust_mantle(1,i+3)*rmass_crust_mantle(i+3) &
+             + two_omega_earth*veloc_crust_mantle(2,i+3)
+    accel_crust_mantle(2,i+3) = accel_crust_mantle(2,i+3)*rmass_crust_mantle(i+3) &
+             - two_omega_earth*veloc_crust_mantle(1,i+3)
+    accel_crust_mantle(3,i+3) = accel_crust_mantle(3,i+3)*rmass_crust_mantle(i+3)
+  enddo
+#else
+! way 1:
+  ! straightforward loop over all global points of the crust/mantle
+  do i=1,NGLOB_CRUST_MANTLE
+    accel_crust_mantle(1,i) = accel_crust_mantle(1,i)*rmass_crust_mantle(i) &
+             + two_omega_earth*veloc_crust_mantle(2,i)
+    accel_crust_mantle(2,i) = accel_crust_mantle(2,i)*rmass_crust_mantle(i) &
+             - two_omega_earth*veloc_crust_mantle(1,i)
+    accel_crust_mantle(3,i) = accel_crust_mantle(3,i)*rmass_crust_mantle(i)
+  enddo
+#endif
+
+  end subroutine compute_forces_el_update_accel
+
+
+!=====================================================================
+
+  subroutine compute_forces_el_update_veloc(veloc_crust_mantle,accel_crust_mantle, &
+                                            veloc_inner_core,accel_inner_core, &
+                                            deltatover2,two_omega_earth,rmass_inner_core)
+
+! Newmark corrector step for the elastic regions:
+!   - crust/mantle: velocity update only (acceleration already finalized
+!     in compute_forces_el_update_accel)
+!   - inner core: mass-matrix/rotation update of acceleration, then velocity update
+
+  use specfem_par,only: CUSTOM_REAL,NGLOB_CRUST_MANTLE,NGLOB_INNER_CORE,NDIM
+
+#ifdef _HANDOPT_NEWMARK
+! note: guard must match the code below, which is compiled under
+!       _HANDOPT_NEWMARK (previously _HANDOPT, which left these moduli
+!       undefined if only _HANDOPT_NEWMARK was set)
+  use specfem_par,only: imodulo_NGLOB_CRUST_MANTLE4,imodulo_NGLOB_INNER_CORE
+#endif
+
+  implicit none
+
+  ! acceleration & velocity
+  ! crust/mantle region
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: veloc_crust_mantle,accel_crust_mantle
+  ! inner core
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE) :: veloc_inner_core,accel_inner_core
+
+  ! mass matrix (inner core only; crust/mantle mass matrix was applied earlier)
+  real(kind=CUSTOM_REAL), dimension(NGLOB_INNER_CORE) :: rmass_inner_core
+
+  ! half time step (corrector) and 2*Omega rotation term
+  real(kind=CUSTOM_REAL) :: deltatover2,two_omega_earth
+
+  ! local parameters
+  integer :: i
+
+  ! Newmark time scheme:
+  !
+  ! note:
+  !   - crust/mantle region
+  !         needs only velocity corrector terms
+  !         (acceleration already updated before)
+  !   - inner core region
+  !         needs both, acceleration update & velocity corrector terms
+
+#ifdef _HANDOPT_NEWMARK
+! way 2:
+  ! crust/mantle region
+  ! peel off (NGLOB mod 4) points, then 4-way unrolled remainder loop
+  if( imodulo_NGLOB_CRUST_MANTLE4 >= 1 ) then
+    do i=1,imodulo_NGLOB_CRUST_MANTLE4
+      veloc_crust_mantle(:,i) = veloc_crust_mantle(:,i) + deltatover2*accel_crust_mantle(:,i)
+    enddo
+  endif
+  do i=imodulo_NGLOB_CRUST_MANTLE4+1,NGLOB_CRUST_MANTLE,4
+    veloc_crust_mantle(:,i) = veloc_crust_mantle(:,i) + deltatover2*accel_crust_mantle(:,i)
+    veloc_crust_mantle(:,i+1) = veloc_crust_mantle(:,i+1) + deltatover2*accel_crust_mantle(:,i+1)
+    veloc_crust_mantle(:,i+2) = veloc_crust_mantle(:,i+2) + deltatover2*accel_crust_mantle(:,i+2)
+    veloc_crust_mantle(:,i+3) = veloc_crust_mantle(:,i+3) + deltatover2*accel_crust_mantle(:,i+3)
+  enddo
+
+  ! inner core region
+  ! peel off (NGLOB mod 3) points, then 3-way unrolled remainder loop;
+  ! acceleration must be finalized before the velocity corrector uses it
+  if(imodulo_NGLOB_INNER_CORE >= 1) then
+    do i=1,imodulo_NGLOB_INNER_CORE
+      accel_inner_core(1,i) = accel_inner_core(1,i)*rmass_inner_core(i) &
+             + two_omega_earth*veloc_inner_core(2,i)
+      accel_inner_core(2,i) = accel_inner_core(2,i)*rmass_inner_core(i) &
+             - two_omega_earth*veloc_inner_core(1,i)
+      accel_inner_core(3,i) = accel_inner_core(3,i)*rmass_inner_core(i)
+
+      veloc_inner_core(:,i) = veloc_inner_core(:,i) + deltatover2*accel_inner_core(:,i)
+    enddo
+  endif
+  do i=imodulo_NGLOB_INNER_CORE+1,NGLOB_INNER_CORE,3
+    accel_inner_core(1,i) = accel_inner_core(1,i)*rmass_inner_core(i) &
+           + two_omega_earth*veloc_inner_core(2,i)
+    accel_inner_core(2,i) = accel_inner_core(2,i)*rmass_inner_core(i) &
+           - two_omega_earth*veloc_inner_core(1,i)
+    accel_inner_core(3,i) = accel_inner_core(3,i)*rmass_inner_core(i)
+
+    veloc_inner_core(:,i) = veloc_inner_core(:,i) + deltatover2*accel_inner_core(:,i)
+
+    accel_inner_core(1,i+1) = accel_inner_core(1,i+1)*rmass_inner_core(i+1) &
+           + two_omega_earth*veloc_inner_core(2,i+1)
+    accel_inner_core(2,i+1) = accel_inner_core(2,i+1)*rmass_inner_core(i+1) &
+           - two_omega_earth*veloc_inner_core(1,i+1)
+    accel_inner_core(3,i+1) = accel_inner_core(3,i+1)*rmass_inner_core(i+1)
+
+    veloc_inner_core(:,i+1) = veloc_inner_core(:,i+1) + deltatover2*accel_inner_core(:,i+1)
+
+    accel_inner_core(1,i+2) = accel_inner_core(1,i+2)*rmass_inner_core(i+2) &
+           + two_omega_earth*veloc_inner_core(2,i+2)
+    accel_inner_core(2,i+2) = accel_inner_core(2,i+2)*rmass_inner_core(i+2) &
+           - two_omega_earth*veloc_inner_core(1,i+2)
+    accel_inner_core(3,i+2) = accel_inner_core(3,i+2)*rmass_inner_core(i+2)
+
+    veloc_inner_core(:,i+2) = veloc_inner_core(:,i+2) + deltatover2*accel_inner_core(:,i+2)
+  enddo
+#else
+! way 1:
+  ! mantle
+  do i=1,NGLOB_CRUST_MANTLE
+    veloc_crust_mantle(:,i) = veloc_crust_mantle(:,i) + deltatover2*accel_crust_mantle(:,i)
+  enddo
+  ! inner core
+  do i=1,NGLOB_INNER_CORE
+    accel_inner_core(1,i) = accel_inner_core(1,i)*rmass_inner_core(i) &
+           + two_omega_earth*veloc_inner_core(2,i)
+    accel_inner_core(2,i) = accel_inner_core(2,i)*rmass_inner_core(i) &
+           - two_omega_earth*veloc_inner_core(1,i)
+    accel_inner_core(3,i) = accel_inner_core(3,i)*rmass_inner_core(i)
+
+    veloc_inner_core(:,i) = veloc_inner_core(:,i) + deltatover2*accel_inner_core(:,i)
+  enddo
+#endif
+
+  end subroutine compute_forces_el_update_veloc
+
+!=====================================================================
+
+  subroutine load_GPU_elastic
+  
+! copies the current elastic wavefields (displacement, velocity, acceleration)
+! for the crust/mantle and inner-core regions from host arrays to the GPU
+! device referenced by Mesh_pointer; for kernel runs (SIMULATION_TYPE == 3)
+! the backward/reconstructed b_* fields are transferred as well
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  implicit none
+  
+  ! daniel: TODO - temporary transfers to the GPU
+  ! NOTE(review): full-field host->device copies every call are expensive;
+  ! presumably a stopgap until fields live on the device permanently - confirm
+  call transfer_fields_cm_to_device(NDIM*NGLOB_CRUST_MANTLE,displ_crust_mantle, &
+                                  veloc_crust_mantle,accel_crust_mantle,Mesh_pointer)
+  call transfer_fields_ic_to_device(NDIM*NGLOB_INNER_CORE,displ_inner_core, &
+                                  veloc_inner_core,accel_inner_core,Mesh_pointer)
+
+  ! adjoint / kernel runs: also transfer the backward fields
+  if( SIMULATION_TYPE == 3 ) then
+    call transfer_b_fields_cm_to_device(NDIM*NGLOB_CRUST_MANTLE,b_displ_crust_mantle, &
+                                  b_veloc_crust_mantle,b_accel_crust_mantle,Mesh_pointer)
+    call transfer_b_fields_ic_to_device(NDIM*NGLOB_INNER_CORE,b_displ_inner_core, &
+                                  b_veloc_inner_core,b_accel_inner_core,Mesh_pointer)
+  endif
+    
+  end subroutine 
+
+!=====================================================================
+
+  subroutine load_CPU_elastic
+  
+! copies the elastic wavefields (displacement, velocity, acceleration)
+! for the crust/mantle and inner-core regions back from the GPU device
+! referenced by Mesh_pointer into the host arrays; for kernel runs
+! (SIMULATION_TYPE == 3) the backward/reconstructed b_* fields are
+! transferred as well — the device->host counterpart of load_GPU_elastic
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  implicit none
+  
+  ! daniel: TODO - temporary transfers back to the CPU
+  ! NOTE(review): mirrors load_GPU_elastic; any change to the field set
+  ! transferred there must be reflected here, or host/device state diverges
+  call transfer_fields_cm_from_device(NDIM*NGLOB_CRUST_MANTLE,displ_crust_mantle, &
+                                  veloc_crust_mantle,accel_crust_mantle,Mesh_pointer)                                  
+  call transfer_fields_ic_from_device(NDIM*NGLOB_INNER_CORE,displ_inner_core, &
+                                  veloc_inner_core,accel_inner_core,Mesh_pointer)
+
+  ! adjoint / kernel runs: also retrieve the backward fields
+  if( SIMULATION_TYPE == 3 ) then
+    call transfer_b_fields_cm_from_device(NDIM*NGLOB_CRUST_MANTLE,b_displ_crust_mantle, &
+                                  b_veloc_crust_mantle,b_accel_crust_mantle,Mesh_pointer)                                  
+    call transfer_b_fields_ic_from_device(NDIM*NGLOB_INNER_CORE,b_displ_inner_core, &
+                                  b_veloc_inner_core,b_accel_inner_core,Mesh_pointer)  
+  endif
+    
+  end subroutine

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -25,92 +25,91 @@
 !
 !=====================================================================
 
-  subroutine compute_forces_inner_core(minus_gravity_table,density_table,minus_deriv_gravity_table, &
-          displ_inner_core,accel_inner_core,xstore,ystore,zstore, &
-          xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-!----------------------
-            is_on_a_slice_edge_inner_core,icall, &
-            accel_crust_mantle,ibool_inner_core,idoubling_inner_core, &
-            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-            buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector,iphase, &
-            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
-            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_INNER_CORE,INCLUDE_CENTRAL_CUBE,iphase_CC, &
-!----------------------
-          hprime_xx,hprime_yy,hprime_zz, &
-          hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
-          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-          kappavstore,muvstore,ibool,idoubling, &
-          c11store,c33store,c12store,c13store,c44store,R_memory,epsilondev,epsilon_trace_over_3,&
-          one_minus_sum_beta,alphaval,betaval,gammaval,factor_common, &
-          vx,vy,vz,vnspec)
+  subroutine compute_forces_inner_core( displ_inner_core,accel_inner_core, &
+                                        phase_is_inner, &
+                                        R_xx,R_yy,R_xy,R_xz,R_yz, &
+                                        epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+                                        epsilondev_xz,epsilondev_yz, &
+                                        epsilon_trace_over_3,&
+                                        alphaval,betaval,gammaval,factor_common, &
+                                        vx,vy,vz,vnspec)
 
-  implicit none
+  use constants
 
-  include "constants.h"
+  use specfem_par,only: &
+    hprime_xx,hprime_yy,hprime_zz,hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
+    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
+    minus_gravity_table,density_table,minus_deriv_gravity_table, &
+    myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+    iprocfrom_faces,iprocto_faces, &
+    iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+    npoin2D_max_all_CM_IC,INCLUDE_CENTRAL_CUBE
 
-! include values created by the mesher
-! done for performance only using static allocation to allow for loop unrolling
-  include "OUTPUT_FILES/values_from_mesher.h"
+  use specfem_par_innercore,only: &
+    xstore => xstore_inner_core,ystore => ystore_inner_core,zstore => zstore_inner_core, &
+    xix => xix_inner_core,xiy => xiy_inner_core,xiz => xiz_inner_core, &
+    etax => etax_inner_core,etay => etay_inner_core,etaz => etaz_inner_core, &
+    gammax => gammax_inner_core,gammay => gammay_inner_core,gammaz => gammaz_inner_core, &
+    kappavstore => kappavstore_inner_core, &
+    muvstore => muvstore_inner_core, &
+    c11store => c11store_inner_core,c12store => c12store_inner_core,c13store => c13store_inner_core, &
+    c33store => c33store_inner_core,c44store => c44store_inner_core, &
+    ibool => ibool_inner_core,idoubling => idoubling_inner_core, &
+    one_minus_sum_beta => one_minus_sum_beta_inner_core, &
+    ibool_inner_core,idoubling_inner_core, &
+    iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+    npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+    iboolfaces_inner_core,iboolcorner_inner_core, &
+    nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+    npoin2D_cube_from_slices, &
+    ibool_central_cube, &
+    receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC, &
+    phase_ispec_inner => phase_ispec_inner_inner_core, &
+    nspec_outer => nspec_outer_inner_core, &
+    nspec_inner => nspec_inner_inner_core
 
-! displacement and acceleration
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE) :: displ_inner_core,accel_inner_core
+  use specfem_par_crustmantle,only: &
+    iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+    npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+    iboolfaces_crust_mantle,iboolcorner_crust_mantle
 
-! for attenuation
-! memory variables R_ij are stored at the local rather than global level
-! to allow for optimization of cache access by compiler
-  integer i_SLS,i_memory
-  real(kind=CUSTOM_REAL) R_xx_val,R_yy_val
+  implicit none
 
-! variable lengths for factor_common and one_minus_sum_beta
-  integer vx, vy, vz, vnspec
+  ! displacement and acceleration
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE) :: displ_inner_core,accel_inner_core
 
-  real(kind=CUSTOM_REAL), dimension(vx, vy, vz, vnspec) :: one_minus_sum_beta
-
+  ! for attenuation
+  ! memory variables R_ij are stored at the local rather than global level
+  ! to allow for optimization of cache access by compiler
+  ! variable lengths for factor_common and one_minus_sum_beta
+  integer vx, vy, vz, vnspec
   real(kind=CUSTOM_REAL), dimension(N_SLS, vx, vy, vz, vnspec) :: factor_common
   real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
-  real(kind=CUSTOM_REAL), dimension(NGLLX, NGLLY, NGLLZ) :: factor_common_use
 
-  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ATTENUATION) :: R_memory
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: epsilondev
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: epsilon_trace_over_3
+!  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ATTENUATION) :: R_memory
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ATTENUATION) :: &
+    R_xx,R_yy,R_xy,R_xz,R_yz
 
-! array with the local to global mapping per slice
-  integer, dimension(NSPEC_INNER_CORE) :: idoubling
 
-! arrays with mesh parameters per slice
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: ibool
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: xix,xiy,xiz, &
-                      etax,etay,etaz,gammax,gammay,gammaz
+!  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: epsilondev
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: &
+    epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
 
-! array with derivatives of Lagrange polynomials and precalculated products
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprimewgll_xx
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLY) :: hprime_yy,hprimewgll_yy
-  real(kind=CUSTOM_REAL), dimension(NGLLZ,NGLLZ) :: hprime_zz,hprimewgll_zz
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: epsilon_trace_over_3
 
+  ! inner/outer element run flag
+  logical :: phase_is_inner
+
+  ! local parameters
+  
+  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
+  real(kind=CUSTOM_REAL), dimension(NGLLX, NGLLY, NGLLZ) :: factor_common_use
+  real(kind=CUSTOM_REAL) R_xx_val,R_yy_val
+  integer i_SLS
+
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
     tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
 
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: kappavstore,muvstore
-
-!  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: &
-!    c11store,c33store,c12store,c13store,c44store
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ANISO_IC) :: &
-    c11store,c33store,c12store,c13store,c44store
-
   integer ispec,iglob,ispec_strain
   integer i,j,k,l
 
@@ -135,126 +134,43 @@
   real(kind=CUSTOM_REAL) tempy1l,tempy2l,tempy3l
   real(kind=CUSTOM_REAL) tempz1l,tempz2l,tempz3l
 
-! for gravity
-  integer int_radius
-  real(kind=CUSTOM_REAL) sigma_yx,sigma_zx,sigma_zy
+  ! for gravity
   double precision radius,rho,minus_g,minus_dg
   double precision minus_g_over_radius,minus_dg_plus_g_over_radius
   double precision cos_theta,sin_theta,cos_phi,sin_phi
   double precision cos_theta_sq,sin_theta_sq,cos_phi_sq,sin_phi_sq
   double precision theta,phi,factor,gxl,gyl,gzl,sx_l,sy_l,sz_l
   double precision Hxxl,Hyyl,Hzzl,Hxyl,Hxzl,Hyzl
-  double precision, dimension(NRAD_GRAVITY) :: minus_gravity_table,density_table,minus_deriv_gravity_table
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: rho_s_H
-  double precision, dimension(NGLLX,NGLLY,NGLLZ) :: wgll_cube
-  real(kind=CUSTOM_REAL), dimension(NGLOB_INNER_CORE) :: xstore,ystore,zstore
+  real(kind=CUSTOM_REAL) sigma_yx,sigma_zx,sigma_zy
+  integer :: int_radius
 
-! this for non blocking MPI
-  integer :: iphase,icall
+!  integer :: computed_elements
+  integer :: num_elements,ispec_p
+  integer :: iphase
 
-  integer :: computed_elements
-
-  logical, dimension(NSPEC_INNER_CORE) :: is_on_a_slice_edge_inner_core
-
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: accel_crust_mantle
-
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: ibool_inner_core
-
-  integer, dimension(NSPEC_INNER_CORE) :: idoubling_inner_core
-
-  integer :: ichunk,iproc_xi,iproc_eta,myrank
-
-  integer, dimension(NCHUNKS_VAL,0:NPROC_XI_VAL-1,0:NPROC_ETA_VAL-1) :: addressing
-
-  integer, dimension(NGLOB2DMAX_XMIN_XMAX_CM) :: iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle
-  integer, dimension(NGLOB2DMAX_YMIN_YMAX_CM) :: iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle
-
-  integer, dimension(NGLOB2DMAX_XMIN_XMAX_IC) :: iboolleft_xi_inner_core,iboolright_xi_inner_core
-  integer, dimension(NGLOB2DMAX_YMIN_YMAX_IC) :: iboolleft_eta_inner_core,iboolright_eta_inner_core
-
-  integer npoin2D_faces_crust_mantle(NUMFACES_SHARED)
-  integer npoin2D_faces_inner_core(NUMFACES_SHARED)
-
-  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-       npoin2D_xi_inner_core,npoin2D_eta_inner_core
-
-! communication pattern for faces between chunks
-  integer, dimension(NUMMSGS_FACES_VAL) :: iprocfrom_faces,iprocto_faces
-
-! communication pattern for corners between chunks
-  integer, dimension(NCORNERSCHUNKS_VAL) :: iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners
-
-  integer, dimension(NGLOB1D_RADIAL_CM,NUMCORNERS_SHARED) :: iboolcorner_crust_mantle
-  integer, dimension(NGLOB1D_RADIAL_IC,NUMCORNERS_SHARED) :: iboolcorner_inner_core
-
-  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_crust_mantle
-  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_inner_core
-
-  integer :: npoin2D_max_all_CM_IC
-  real(kind=CUSTOM_REAL), dimension(NDIM,npoin2D_max_all_CM_IC) :: buffer_send_faces,buffer_received_faces
-
-! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB1D_RADIAL_CM + NGLOB1D_RADIAL_IC) :: &
-     buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector
-
-! for matching with central cube in inner core
-  integer nb_msgs_theor_in_cube, npoin2D_cube_from_slices,iphase_CC
-  integer, dimension(nb_msgs_theor_in_cube) :: sender_from_slices_to_cube
-  double precision, dimension(npoin2D_cube_from_slices,NDIM) :: buffer_slices
-  double precision, dimension(npoin2D_cube_from_slices,NDIM,nb_msgs_theor_in_cube) :: buffer_all_cube_from_slices
-  integer, dimension(nb_msgs_theor_in_cube,npoin2D_cube_from_slices):: ibool_central_cube
-  integer receiver_cube_from_slices
-  logical :: INCLUDE_CENTRAL_CUBE
-
-! local to global mapping
-  integer NSPEC2D_BOTTOM_INNER_CORE
-  integer, dimension(NSPEC2D_BOTTOM_INNER_CORE) :: ibelm_bottom_inner_core
-
 ! ****************************************************
 !   big loop over all spectral elements in the solid
 ! ****************************************************
 
-  computed_elements = 0
+!  computed_elements = 0
+  if( .not. phase_is_inner ) then
+    iphase = 1
+    num_elements = nspec_outer
+  else
+    iphase = 2
+    num_elements = nspec_inner
+  endif
 
-  do ispec = 1,NSPEC_INNER_CORE
+  do ispec_p = 1,num_elements
 
-! hide communications by computing the edges first
-    if((icall == 2 .and. is_on_a_slice_edge_inner_core(ispec)) .or. &
-       (icall == 1 .and. .not. is_on_a_slice_edge_inner_core(ispec))) cycle
+    ispec = phase_ispec_inner(ispec_p,iphase)
 
-! exclude fictitious elements in central cube
+    ! only compute element which belong to current phase (inner or outer elements)
+
+    ! exclude fictitious elements in central cube
     if(idoubling(ispec) /= IFLAG_IN_FICTITIOUS_CUBE) then
 
-! process the communications every ELEMENTS_NONBLOCKING elements
-    computed_elements = computed_elements + 1
-    if (USE_NONBLOCKING_COMMS .and. icall == 2 .and. mod(computed_elements,ELEMENTS_NONBLOCKING_CM_IC) == 0) then
-
-      if(iphase <= 7) call assemble_MPI_vector(myrank,accel_crust_mantle,accel_inner_core, &
-            iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-            buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector, &
-            NUMMSGS_FACES_VAL,NCORNERSCHUNKS_VAL, &
-            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL_CM, &
-            NGLOB1D_RADIAL_IC,NCHUNKS_VAL,iphase)
-
-      if(INCLUDE_CENTRAL_CUBE) then
-          if(iphase > 7 .and. iphase_CC <= 4) &
-            call assemble_MPI_central_cube(ichunk,nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-                   npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
-                   receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core, &
-                   ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,accel_inner_core,NDIM,iphase_CC)
-      endif
-
-    endif
-
     do k=1,NGLLZ
       do j=1,NGLLY
         do i=1,NGLLX
@@ -422,14 +338,14 @@
 ! subtract memory variables if attenuation
           if(ATTENUATION_VAL .and. ( USE_ATTENUATION_MIMIC .eqv. .false. ) ) then
             do i_SLS = 1,N_SLS
-              R_xx_val = R_memory(1,i_SLS,i,j,k,ispec)
-              R_yy_val = R_memory(2,i_SLS,i,j,k,ispec)
+              R_xx_val = R_xx(i_SLS,i,j,k,ispec)
+              R_yy_val = R_yy(i_SLS,i,j,k,ispec)
               sigma_xx = sigma_xx - R_xx_val
               sigma_yy = sigma_yy - R_yy_val
               sigma_zz = sigma_zz + R_xx_val + R_yy_val
-              sigma_xy = sigma_xy - R_memory(3,i_SLS,i,j,k,ispec)
-              sigma_xz = sigma_xz - R_memory(4,i_SLS,i,j,k,ispec)
-              sigma_yz = sigma_yz - R_memory(5,i_SLS,i,j,k,ispec)
+              sigma_xy = sigma_xy - R_xy(i_SLS,i,j,k,ispec)
+              sigma_xz = sigma_xz - R_xz(i_SLS,i,j,k,ispec)
+              sigma_yz = sigma_yz - R_yz(i_SLS,i,j,k,ispec)
             enddo
           endif
 
@@ -649,14 +565,36 @@
 
       do i_SLS = 1,N_SLS
         factor_common_use = factor_common(i_SLS,:,:,:,ispec)
-        do i_memory = 1,5
-          R_memory(i_memory,i_SLS,:,:,:,ispec) = &
-                  alphaval(i_SLS) * &
-                  R_memory(i_memory,i_SLS,:,:,:,ispec) + muvstore(:,:,:,ispec) * &
-                  factor_common_use * &
-                  (betaval(i_SLS) * &
-                  epsilondev(i_memory,:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(i_memory,:,:,:))
-        enddo
+!        do i_memory = 1,5
+!          R_memory(i_memory,i_SLS,:,:,:,ispec) = &
+!                  alphaval(i_SLS) * &
+!                  R_memory(i_memory,i_SLS,:,:,:,ispec) + muvstore(:,:,:,ispec) * &
+!                  factor_common_use * &
+!                  (betaval(i_SLS) * &
+!                  epsilondev(i_memory,:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(i_memory,:,:,:))
+!        enddo
+
+        R_xx(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_xx(i_SLS,:,:,:,ispec) &
+                  + muvstore(:,:,:,ispec) * factor_common_use * &
+                  (betaval(i_SLS) * epsilondev_xx(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(1,:,:,:))
+
+        R_yy(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_yy(i_SLS,:,:,:,ispec) &
+                  + muvstore(:,:,:,ispec) * factor_common_use * &
+                  (betaval(i_SLS) * epsilondev_yy(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(2,:,:,:))
+
+        R_xy(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_xy(i_SLS,:,:,:,ispec) &
+                  + muvstore(:,:,:,ispec) * factor_common_use * &
+                  (betaval(i_SLS) * epsilondev_xy(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(3,:,:,:))
+
+        R_xz(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_xz(i_SLS,:,:,:,ispec) &
+                  + muvstore(:,:,:,ispec) * factor_common_use * &
+                  (betaval(i_SLS) * epsilondev_xz(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(4,:,:,:))
+
+        R_yz(i_SLS,:,:,:,ispec) = alphaval(i_SLS) * R_yz(i_SLS,:,:,:,ispec) &
+                  + muvstore(:,:,:,ispec) * factor_common_use * &
+                  (betaval(i_SLS) * epsilondev_yz(:,:,:,ispec) + gammaval(i_SLS) * epsilondev_loc(5,:,:,:))
+
+
       enddo
 
     endif
@@ -667,7 +605,11 @@
       do k=1,NGLLZ
         do j=1,NGLLY
           do i=1,NGLLX
-            epsilondev(:,i,j,k,ispec) = epsilondev_loc(:,i,j,k)
+            epsilondev_xx(i,j,k,ispec) = epsilondev_loc(1,i,j,k)
+            epsilondev_yy(i,j,k,ispec) = epsilondev_loc(2,i,j,k)
+            epsilondev_xy(i,j,k,ispec) = epsilondev_loc(3,i,j,k)
+            epsilondev_xz(i,j,k,ispec) = epsilondev_loc(4,i,j,k)
+            epsilondev_yz(i,j,k,ispec) = epsilondev_loc(5,i,j,k)
           enddo
         enddo
       enddo

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core_Dev.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core_Dev.F90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core_Dev.F90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -30,99 +30,88 @@
 ! or compile with: -D_HANDOPT
 !#define _HANDOPT
 
-! BEWARE:
-! BEWARE: we have observed that using _HANDOPT in combination with -O3 or higher can lead to problems on some machines;
-! BEWARE: thus, be careful when using it. At the very least, run the same test simulation once with _HANDOPT and once without
-! BEWARE: and make sure that all the seismograms you get are the same down to roundoff noise.
-! BEWARE:
-
 ! note: these hand optimizations should help compilers to pipeline the code and make better use of the cache;
 !          depending on compilers, it can further decrease the computation time by ~ 30%.
 !          the original routines are commented with "! way 1", the hand-optimized routines with  "! way 2"
 
-  subroutine compute_forces_inner_core_Dev(minus_gravity_table,density_table,minus_deriv_gravity_table, &
-          displ_inner_core,accel_inner_core,xstore,ystore,zstore, &
-          xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-!----------------------
-            is_on_a_slice_edge_inner_core,icall, &
-            accel_crust_mantle,ibool_inner_core,idoubling_inner_core, &
-            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-            buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector,iphase, &
-            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
-            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_INNER_CORE,INCLUDE_CENTRAL_CUBE,iphase_CC, &
-!----------------------
-          hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
-          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-          kappavstore,muvstore,ibool,idoubling, &
-          c11store,c33store,c12store,c13store,c44store,R_memory,epsilondev,epsilon_trace_over_3,&
-          one_minus_sum_beta,alphaval,betaval,gammaval,factor_common, &
-          vx,vy,vz,vnspec)
+  subroutine compute_forces_inner_core_Dev( displ_inner_core,accel_inner_core, &
+                                            phase_is_inner, &
+                                            R_xx,R_yy,R_xy,R_xz,R_yz, &
+                                            epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+                                            epsilondev_xz,epsilondev_yz, &
+                                            epsilon_trace_over_3,&
+                                            alphaval,betaval,gammaval,factor_common, &
+                                            vx,vy,vz,vnspec)
 
 ! this routine is optimized for NGLLX = NGLLY = NGLLZ = 5 using the Deville et al. (2002) inlined matrix-matrix products
 
-  implicit none
+  use constants
 
-  include "constants.h"
+  use specfem_par,only: &
+    hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT,wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
+    minus_gravity_table,density_table,minus_deriv_gravity_table, &
+    myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+    iprocfrom_faces,iprocto_faces, &
+    iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+    npoin2D_max_all_CM_IC,INCLUDE_CENTRAL_CUBE
 
-! include values created by the mesher
-! done for performance only using static allocation to allow for loop unrolling
-  include "OUTPUT_FILES/values_from_mesher.h"
+  use specfem_par_innercore,only: &
+    xstore => xstore_inner_core,ystore => ystore_inner_core,zstore => zstore_inner_core, &
+    xix => xix_inner_core,xiy => xiy_inner_core,xiz => xiz_inner_core, &
+    etax => etax_inner_core,etay => etay_inner_core,etaz => etaz_inner_core, &
+    gammax => gammax_inner_core,gammay => gammay_inner_core,gammaz => gammaz_inner_core, &
+    kappavstore => kappavstore_inner_core, &
+    muvstore => muvstore_inner_core, &
+    c11store => c11store_inner_core,c12store => c12store_inner_core,c13store => c13store_inner_core, &
+    c33store => c33store_inner_core,c44store => c44store_inner_core, &
+    ibool => ibool_inner_core,idoubling => idoubling_inner_core, &
+    one_minus_sum_beta => one_minus_sum_beta_inner_core, &
+    ibool_inner_core,idoubling_inner_core, &
+    iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+    npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+    iboolfaces_inner_core,iboolcorner_inner_core, &
+    nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
+    npoin2D_cube_from_slices, &
+    ibool_central_cube, &
+    receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC, &
+    phase_ispec_inner => phase_ispec_inner_inner_core, &
+    nspec_outer => nspec_outer_inner_core, &
+    nspec_inner => nspec_inner_inner_core
 
-  ! displacement and acceleration
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE) :: displ_inner_core,accel_inner_core
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: ibool
 
-  real(kind=CUSTOM_REAL), dimension(NGLOB_INNER_CORE) :: xstore,ystore,zstore
+  use specfem_par_crustmantle,only: &
+    iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+    npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+    iboolfaces_crust_mantle,iboolcorner_crust_mantle
 
-  ! arrays with mesh parameters per slice
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: xix,xiy,xiz, &
-                      etax,etay,etaz,gammax,gammay,gammaz
+  implicit none
 
+  ! displacement and acceleration
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE) :: displ_inner_core,accel_inner_core
+
   ! for attenuation
   ! memory variables R_ij are stored at the local rather than global level
   ! to allow for optimization of cache access by compiler
   ! variable lengths for factor_common and one_minus_sum_beta
   integer vx, vy, vz, vnspec
   real(kind=CUSTOM_REAL), dimension(N_SLS, vx, vy, vz, vnspec) :: factor_common
-  real(kind=CUSTOM_REAL), dimension(vx, vy, vz, vnspec) :: one_minus_sum_beta
-
   real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
 
-  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ATTENUATION) :: R_memory
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: epsilondev
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: epsilon_trace_over_3
+!  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ATTENUATION) :: R_memory
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ATTENUATION) :: &
+    R_xx,R_yy,R_xy,R_xz,R_yz
 
-  ! array with derivatives of Lagrange polynomials and precalculated products
-  double precision, dimension(NGLLX,NGLLY,NGLLZ) :: wgll_cube
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprimewgll_xx
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xxT,hprimewgll_xxT
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
+!  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: epsilondev
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: &
+    epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
 
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: kappavstore,muvstore
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: epsilon_trace_over_3
 
-!  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: &
-!    c11store,c33store,c12store,c13store,c44store
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ANISO_IC) :: &
-    c11store,c33store,c12store,c13store,c44store
+  ! inner/outer element run flag
+  logical :: phase_is_inner
 
-  ! array with the local to global mapping per slice
-  integer, dimension(NSPEC_INNER_CORE) :: idoubling
+  ! local parameters
 
-  double precision, dimension(NRAD_GRAVITY) :: minus_gravity_table,density_table,minus_deriv_gravity_table
-
-! local parameters
   ! Deville
   ! manually inline the calls to the Deville et al. (2002) routines
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
@@ -192,68 +181,10 @@
   integer :: ispec,ispec_strain
   integer :: i,j,k
   integer :: iglob1
+!  integer :: computed_elements
+  integer :: num_elements,ispec_p
+  integer :: iphase
 
-! this for non blocking MPI
-  integer :: iphase,icall
-
-  integer :: computed_elements
-
-  logical, dimension(NSPEC_INNER_CORE) :: is_on_a_slice_edge_inner_core
-
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: accel_crust_mantle
-
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: ibool_inner_core
-
-  integer, dimension(NSPEC_INNER_CORE) :: idoubling_inner_core
-
-  integer :: ichunk,iproc_xi,iproc_eta,myrank
-
-  integer, dimension(NCHUNKS_VAL,0:NPROC_XI_VAL-1,0:NPROC_ETA_VAL-1) :: addressing
-
-  integer, dimension(NGLOB2DMAX_XMIN_XMAX_CM) :: iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle
-  integer, dimension(NGLOB2DMAX_YMIN_YMAX_CM) :: iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle
-
-  integer, dimension(NGLOB2DMAX_XMIN_XMAX_IC) :: iboolleft_xi_inner_core,iboolright_xi_inner_core
-  integer, dimension(NGLOB2DMAX_YMIN_YMAX_IC) :: iboolleft_eta_inner_core,iboolright_eta_inner_core
-
-  integer npoin2D_faces_crust_mantle(NUMFACES_SHARED)
-  integer npoin2D_faces_inner_core(NUMFACES_SHARED)
-
-  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-       npoin2D_xi_inner_core,npoin2D_eta_inner_core
-
-! communication pattern for faces between chunks
-  integer, dimension(NUMMSGS_FACES_VAL) :: iprocfrom_faces,iprocto_faces
-
-! communication pattern for corners between chunks
-  integer, dimension(NCORNERSCHUNKS_VAL) :: iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners
-
-  integer, dimension(NGLOB1D_RADIAL_CM,NUMCORNERS_SHARED) :: iboolcorner_crust_mantle
-  integer, dimension(NGLOB1D_RADIAL_IC,NUMCORNERS_SHARED) :: iboolcorner_inner_core
-
-  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_crust_mantle
-  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_inner_core
-
-  integer :: npoin2D_max_all_CM_IC
-  real(kind=CUSTOM_REAL), dimension(NDIM,npoin2D_max_all_CM_IC) :: buffer_send_faces,buffer_received_faces
-
-! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB1D_RADIAL_CM + NGLOB1D_RADIAL_IC) :: &
-     buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector
-
-! for matching with central cube in inner core
-  integer nb_msgs_theor_in_cube, npoin2D_cube_from_slices,iphase_CC
-  integer, dimension(nb_msgs_theor_in_cube) :: sender_from_slices_to_cube
-  double precision, dimension(npoin2D_cube_from_slices,NDIM) :: buffer_slices
-  double precision, dimension(npoin2D_cube_from_slices,NDIM,nb_msgs_theor_in_cube) :: buffer_all_cube_from_slices
-  integer, dimension(nb_msgs_theor_in_cube,npoin2D_cube_from_slices):: ibool_central_cube
-  integer receiver_cube_from_slices
-  logical :: INCLUDE_CENTRAL_CUBE
-
-! local to global mapping
-  integer NSPEC2D_BOTTOM_INNER_CORE
-  integer, dimension(NSPEC2D_BOTTOM_INNER_CORE) :: ibelm_bottom_inner_core
-
 #ifdef _HANDOPT
   integer, dimension(5) :: iglobv5
 #endif
@@ -262,47 +193,24 @@
 !   big loop over all spectral elements in the solid
 ! ****************************************************
 
-  computed_elements = 0
+!  computed_elements = 0
+  if( .not. phase_is_inner ) then
+    iphase = 1
+    num_elements = nspec_outer
+  else
+    iphase = 2
+    num_elements = nspec_inner
+  endif
 
-  do ispec = 1,NSPEC_INNER_CORE
+  do ispec_p = 1,num_elements
 
-! hide communications by computing the edges first
-    if((icall == 2 .and. is_on_a_slice_edge_inner_core(ispec)) .or. &
-       (icall == 1 .and. .not. is_on_a_slice_edge_inner_core(ispec))) cycle
+    ispec = phase_ispec_inner(ispec_p,iphase)
 
+    ! only compute element which belong to current phase (inner or outer elements)
+
     ! exclude fictitious elements in central cube
     if(idoubling(ispec) /= IFLAG_IN_FICTITIOUS_CUBE) then
 
-! process the communications every ELEMENTS_NONBLOCKING elements
-      computed_elements = computed_elements + 1
-      if (USE_NONBLOCKING_COMMS .and. icall == 2 .and. mod(computed_elements,ELEMENTS_NONBLOCKING_CM_IC) == 0) then
-
-        if(iphase <= 7) call assemble_MPI_vector(myrank,accel_crust_mantle,accel_inner_core, &
-            iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-            buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector, &
-            NUMMSGS_FACES_VAL,NCORNERSCHUNKS_VAL, &
-            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL_CM, &
-            NGLOB1D_RADIAL_IC,NCHUNKS_VAL,iphase)
-
-        if(INCLUDE_CENTRAL_CUBE) then
-          if(iphase > 7 .and. iphase_CC <= 4) &
-            call assemble_MPI_central_cube(ichunk,nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-                   npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
-                   receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core, &
-                   ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,accel_inner_core,NDIM,iphase_CC)
-        endif
-
-      endif
-
       ! subroutines adapted from Deville, Fischer and Mund, High-order methods
       ! for incompressible fluid flow, Cambridge University Press (2002),
       ! pages 386 and 389 and Figure 8.3.1
@@ -542,8 +450,12 @@
             if(ATTENUATION_VAL .and. ( USE_ATTENUATION_MIMIC .eqv. .false. ) ) then
 
               ! note: fortran passes pointers to array location, thus R_memory(1,1,...) should be fine
-              call compute_element_att_stress( R_memory(1,1,i,j,k,ispec), &
-                    sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz)
+              call compute_element_att_stress(R_xx(1,i,j,k,ispec), &
+                                             R_yy(1,i,j,k,ispec), &
+                                             R_xy(1,i,j,k,ispec), &
+                                             R_xz(1,i,j,k,ispec), &
+                                             R_yz(1,i,j,k,ispec), &
+                                             sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz)
 
             endif
 
@@ -808,17 +720,23 @@
       if(ATTENUATION_VAL .and. ( USE_ATTENUATION_MIMIC .eqv. .false. ) ) then
 
         ! updates R_memory
-        call compute_element_att_memory_ic(ispec,R_memory, &
+        call compute_element_att_memory_ic(ispec,R_xx,R_yy,R_xy,R_xz,R_yz, &
                                       vx,vy,vz,vnspec,factor_common, &
                                       alphaval,betaval,gammaval, &
                                       muvstore, &
-                                      epsilondev,epsilondev_loc)
+                                      epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+                                      epsilondev_xz,epsilondev_yz, &
+                                      epsilondev_loc)
 
       endif
 
       ! save deviatoric strain for Runge-Kutta scheme
       if(COMPUTE_AND_STORE_STRAIN) then
-        epsilondev(:,:,:,:,ispec) = epsilondev_loc(:,:,:,:)
+        epsilondev_xx(:,:,:,ispec) = epsilondev_loc(1,:,:,:)
+        epsilondev_yy(:,:,:,ispec) = epsilondev_loc(2,:,:,:)
+        epsilondev_xy(:,:,:,ispec) = epsilondev_loc(3,:,:,:)
+        epsilondev_xz(:,:,:,ispec) = epsilondev_loc(4,:,:,:)
+        epsilondev_yz(:,:,:,ispec) = epsilondev_loc(5,:,:,:)
       endif
 
     endif   ! end test to exclude fictitious elements in central cube

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_outer_core.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_outer_core.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_outer_core.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -26,72 +26,60 @@
 !=====================================================================
 
   subroutine compute_forces_outer_core(time,deltat,two_omega_earth, &
-                          A_array_rotation,B_array_rotation, &
-                          d_ln_density_dr_table, &
-                          minus_rho_g_over_kappa_fluid,displfluid,accelfluid, &
-                          div_displfluid, &
-                          xstore,ystore,zstore, &
-                          xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-          is_on_a_slice_edge_outer_core, &
-          myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-          iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
-          npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
-          iboolfaces_outer_core,iboolcorner_outer_core, &
-          iprocfrom_faces,iprocto_faces, &
-          iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-          buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-          buffer_send_chunkcorn_scalar,buffer_recv_chunkcorn_scalar,iphase,icall, &
-                          hprime_xx,hprime_yy,hprime_zz, &
-                          hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
-                          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-                          ibool,MOVIE_VOLUME)
+                                      A_array_rotation,B_array_rotation, &
+                                      displfluid,accelfluid,div_displfluid, &
+                                      phase_is_inner)
 
-  implicit none
+  use constants
 
-  include "constants.h"
+  use specfem_par,only: &
+    hprime_xx,hprime_yy,hprime_zz,hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
+    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
+    minus_rho_g_over_kappa_fluid,d_ln_density_dr_table, &
+    MOVIE_VOLUME, &
+    myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+    iprocfrom_faces,iprocto_faces, &
+    iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+    npoin2D_max_all_CM_IC
 
-! include values created by the mesher
-! done for performance only using static allocation to allow for loop unrolling
-  include "OUTPUT_FILES/values_from_mesher.h"
+  use specfem_par_outercore,only: &
+    xstore => xstore_outer_core,ystore => ystore_outer_core,zstore => zstore_outer_core, &
+    xix => xix_outer_core,xiy => xiy_outer_core,xiz => xiz_outer_core, &
+    etax => etax_outer_core,etay => etay_outer_core,etaz => etaz_outer_core, &
+    gammax => gammax_outer_core,gammay => gammay_outer_core,gammaz => gammaz_outer_core, &
+    ibool => ibool_outer_core, &
+    iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
+    npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
+    iboolfaces_outer_core,iboolcorner_outer_core, &
+    phase_ispec_inner => phase_ispec_inner_outer_core, &
+    nspec_outer => nspec_outer_outer_core, &
+    nspec_inner => nspec_inner_outer_core
 
-! displacement and acceleration
+  implicit none
+
+  ! for the Euler scheme for rotation
+  real(kind=CUSTOM_REAL) time,deltat,two_omega_earth
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_ROTATION) :: &
+    A_array_rotation,B_array_rotation
+
+  ! displacement and acceleration
   real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE) :: displfluid,accelfluid
 
-! divergence of displacement
+  ! divergence of displacement
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_ADJOINT) :: div_displfluid
 
-! arrays with mesh parameters per slice
-  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec_outer_core) :: ibool
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec_outer_core) :: xix,xiy,xiz, &
-                      etax,etay,etaz,gammax,gammay,gammaz
+  ! inner/outer element run flag
+  logical :: phase_is_inner
 
-! array with derivatives of Lagrange polynomials and precalculated products
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprimewgll_xx
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLY) :: hprime_yy,hprimewgll_yy
-  real(kind=CUSTOM_REAL), dimension(NGLLZ,NGLLZ) :: hprime_zz,hprimewgll_zz
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
-  double precision, dimension(NGLLX,NGLLY,NGLLZ) :: wgll_cube
+  ! local parameters
 
-  logical MOVIE_VOLUME
-
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: tempx1,tempx2,tempx3
-
-! for gravity
+  ! for gravity
   integer int_radius
   double precision radius,theta,phi,gxl,gyl,gzl
   double precision cos_theta,sin_theta,cos_phi,sin_phi
-  double precision, dimension(NRAD_GRAVITY) :: minus_rho_g_over_kappa_fluid
-  double precision, dimension(NRAD_GRAVITY) :: d_ln_density_dr_table
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: gravity_term
-  real(kind=CUSTOM_REAL), dimension(nglob_outer_core) :: xstore,ystore,zstore
-
-! for the Euler scheme for rotation
-  real(kind=CUSTOM_REAL) time,deltat,two_omega_earth
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_ROTATION) :: &
-    A_array_rotation,B_array_rotation
-
+  ! for the Euler scheme for rotation
   real(kind=CUSTOM_REAL) two_omega_deltat,cos_two_omega_t,sin_two_omega_t,A_rotation,B_rotation, &
        ux_rotation,uy_rotation,dpotentialdx_with_rot,dpotentialdy_with_rot
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: source_euler_A,source_euler_B
@@ -105,74 +93,30 @@
 
   double precision grad_x_ln_rho,grad_y_ln_rho,grad_z_ln_rho
 
-! this for non blocking MPI
-  integer :: ichunk,iproc_xi,iproc_eta,myrank
+!  integer :: computed_elements
+  integer :: num_elements,ispec_p
+  integer :: iphase
 
-  integer, dimension(NCHUNKS_VAL,0:NPROC_XI_VAL-1,0:NPROC_ETA_VAL-1) :: addressing
-
-  integer, dimension(NGLOB2DMAX_XMIN_XMAX_OC) :: iboolleft_xi_outer_core,iboolright_xi_outer_core
-  integer, dimension(NGLOB2DMAX_YMIN_YMAX_OC) :: iboolleft_eta_outer_core,iboolright_eta_outer_core
-
-  integer npoin2D_faces_outer_core(NUMFACES_SHARED)
-  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_outer_core,npoin2D_eta_outer_core
-
-! communication pattern for faces between chunks
-  integer, dimension(NUMMSGS_FACES_VAL) :: iprocfrom_faces,iprocto_faces
-
-! communication pattern for corners between chunks
-  integer, dimension(NCORNERSCHUNKS_VAL) :: iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners
-
-! indirect addressing for each message for faces and corners of the chunks
-! a given slice can belong to at most one corner and at most two faces
-  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_outer_core
-
-! buffers for send and receive between faces of the slices and the chunks
-! we use the same buffers to assemble scalars and vectors because vectors are
-! always three times bigger and therefore scalars can use the first part
-! of the vector buffer in memory even if it has an additional index here
-! allocate these automatic arrays in the memory stack to avoid memory fragmentation with "allocate()"
-  integer :: npoin2D_max_all_CM_IC
-  real(kind=CUSTOM_REAL), dimension(NDIM,npoin2D_max_all_CM_IC,NUMFACES_SHARED) :: buffer_send_faces,buffer_received_faces
-
-  integer, dimension(NGLOB1D_RADIAL_OC,NUMCORNERS_SHARED) :: iboolcorner_outer_core
-
-  real(kind=CUSTOM_REAL), dimension(NGLOB1D_RADIAL_OC) :: buffer_send_chunkcorn_scalar,buffer_recv_chunkcorn_scalar
-
-  logical, dimension(NSPEC_OUTER_CORE) :: is_on_a_slice_edge_outer_core
-
-  integer :: iphase,icall
-
-  integer :: computed_elements
-
 ! ****************************************************
 !   big loop over all spectral elements in the fluid
 ! ****************************************************
 
-  if (NSPEC_OUTER_CORE_ADJOINT /= 1 .and. icall == 1) div_displfluid(:,:,:,:) = 0._CUSTOM_REAL
+  if (NSPEC_OUTER_CORE_ADJOINT /= 1 .and. ( .not. phase_is_inner )) div_displfluid(:,:,:,:) = 0._CUSTOM_REAL
 
-  computed_elements = 0
+!  computed_elements = 0
+  if( .not. phase_is_inner ) then
+    iphase = 1
+    num_elements = nspec_outer
+  else
+    iphase = 2
+    num_elements = nspec_inner
+  endif
 
-  do ispec = 1,NSPEC_OUTER_CORE
+  do ispec_p = 1,num_elements
 
-! hide communications by computing the edges first
-    if((icall == 2 .and. is_on_a_slice_edge_outer_core(ispec)) .or. &
-       (icall == 1 .and. .not. is_on_a_slice_edge_outer_core(ispec))) cycle
+    ispec = phase_ispec_inner(ispec_p,iphase)
 
-! process the communications every ELEMENTS_NONBLOCKING elements
-    computed_elements = computed_elements + 1
-    if (USE_NONBLOCKING_COMMS .and. icall == 2 .and. mod(computed_elements,ELEMENTS_NONBLOCKING_OC) == 0 .and. iphase <= 7) &
-      call assemble_MPI_scalar(myrank,accelfluid,NGLOB_OUTER_CORE, &
-            iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
-            npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
-            iboolfaces_outer_core,iboolcorner_outer_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-            buffer_send_chunkcorn_scalar,buffer_recv_chunkcorn_scalar, &
-            NUMMSGS_FACES_VAL,NCORNERSCHUNKS_VAL, &
-            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL_OC, &
-            NGLOB2DMAX_XMIN_XMAX_OC,NGLOB2DMAX_YMIN_YMAX_OC,NGLOB2DMAX_XY_VAL,NCHUNKS_VAL,iphase)
+    ! only compute elements that belong to the current phase (inner or outer elements)
 
     do k=1,NGLLZ
       do j=1,NGLLY

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_outer_core_Dev.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_outer_core_Dev.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_outer_core_Dev.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -26,81 +26,69 @@
 !=====================================================================
 
   subroutine compute_forces_outer_core_Dev(time,deltat,two_omega_earth, &
-                            A_array_rotation,B_array_rotation, &
-                            d_ln_density_dr_table, &
-                            minus_rho_g_over_kappa_fluid,displfluid,accelfluid, &
-                            div_displfluid, &
-                            xstore,ystore,zstore, &
-                            xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-          is_on_a_slice_edge_outer_core, &
-          myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-          iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
-          npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
-          iboolfaces_outer_core,iboolcorner_outer_core, &
-          iprocfrom_faces,iprocto_faces, &
-          iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-          buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-          buffer_send_chunkcorn_scalar,buffer_recv_chunkcorn_scalar,iphase,icall, &
-                            hprime_xx,hprime_xxT, &
-                            hprimewgll_xx,hprimewgll_xxT, &
-                            wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-                            ibool,MOVIE_VOLUME)
+                                          A_array_rotation,B_array_rotation, &
+                                          displfluid,accelfluid,div_displfluid, &
+                                          phase_is_inner)
 
 ! this routine is optimized for NGLLX = NGLLY = NGLLZ = 5 using the Deville et al. (2002) inlined matrix-matrix products
 
-  implicit none
+  use constants
 
-  include "constants.h"
+  use specfem_par,only: &
+    hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT,wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
+    minus_rho_g_over_kappa_fluid,d_ln_density_dr_table, &
+    MOVIE_VOLUME, &
+    myrank,iproc_xi,iproc_eta,ichunk,addressing, &
+    iprocfrom_faces,iprocto_faces, &
+    iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+    npoin2D_max_all_CM_IC
 
-! include values created by the mesher
-! done for performance only using static allocation to allow for loop unrolling
-  include "OUTPUT_FILES/values_from_mesher.h"
+  use specfem_par_outercore,only: &
+    xstore => xstore_outer_core,ystore => ystore_outer_core,zstore => zstore_outer_core, &
+    xix => xix_outer_core,xiy => xiy_outer_core,xiz => xiz_outer_core, &
+    etax => etax_outer_core,etay => etay_outer_core,etaz => etaz_outer_core, &
+    gammax => gammax_outer_core,gammay => gammay_outer_core,gammaz => gammaz_outer_core, &
+    ibool => ibool_outer_core, &
+    iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
+    npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
+    iboolfaces_outer_core,iboolcorner_outer_core, &
+    phase_ispec_inner => phase_ispec_inner_outer_core, &
+    nspec_outer => nspec_outer_outer_core, &
+    nspec_inner => nspec_inner_outer_core
 
-! displacement and acceleration
+
+  implicit none
+
+  ! for the Euler scheme for rotation
+  real(kind=CUSTOM_REAL) time,deltat,two_omega_earth
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_ROTATION) :: &
+    A_array_rotation,B_array_rotation
+
+  ! displacement and acceleration
   real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE) :: displfluid,accelfluid
 
-! divergence of displacement
+  ! divergence of displacement
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_ADJOINT) :: div_displfluid
 
-! arrays with mesh parameters per slice
-  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec_outer_core) :: ibool
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec_outer_core) :: xix,xiy,xiz, &
-                      etax,etay,etaz,gammax,gammay,gammaz
+  ! inner/outer element run flag
+  logical :: phase_is_inner
 
-! array with derivatives of Lagrange polynomials and precalculated products
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprimewgll_xx
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xxT,hprimewgll_xxT
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
-  double precision, dimension(NGLLX,NGLLY,NGLLZ) :: wgll_cube
+  ! local parameters
 
-  logical MOVIE_VOLUME
-
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: tempx1,tempx2,tempx3
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: newtempx1,newtempx2,newtempx3
-
-! for gravity
+  ! for gravity
   integer int_radius
   double precision radius,theta,phi,gxl,gyl,gzl
   double precision cos_theta,sin_theta,cos_phi,sin_phi
-  double precision, dimension(NRAD_GRAVITY) :: minus_rho_g_over_kappa_fluid
-  double precision, dimension(NRAD_GRAVITY) :: d_ln_density_dr_table
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: gravity_term
-  real(kind=CUSTOM_REAL), dimension(nglob_outer_core) :: xstore,ystore,zstore
-
-! for the Euler scheme for rotation
-  real(kind=CUSTOM_REAL) time,deltat,two_omega_earth
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_ROTATION) :: &
-    A_array_rotation,B_array_rotation
-
+  ! for the Euler scheme for rotation
   real(kind=CUSTOM_REAL) two_omega_deltat,cos_two_omega_t,sin_two_omega_t,A_rotation,B_rotation, &
        ux_rotation,uy_rotation,dpotentialdx_with_rot,dpotentialdy_with_rot
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: source_euler_A,source_euler_B
 
   integer ispec,iglob
   integer i,j,k
-
   real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
   real(kind=CUSTOM_REAL) dpotentialdxl,dpotentialdyl,dpotentialdzl
   real(kind=CUSTOM_REAL) sum_terms
@@ -128,74 +116,30 @@
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
     displ_times_grad_x_ln_rho,displ_times_grad_y_ln_rho,displ_times_grad_z_ln_rho
 
-! this for non blocking MPI
-  integer :: ichunk,iproc_xi,iproc_eta,myrank
+!  integer :: computed_elements
+  integer :: num_elements,ispec_p
+  integer :: iphase
 
-  integer, dimension(NCHUNKS_VAL,0:NPROC_XI_VAL-1,0:NPROC_ETA_VAL-1) :: addressing
-
-  integer, dimension(NGLOB2DMAX_XMIN_XMAX_OC) :: iboolleft_xi_outer_core,iboolright_xi_outer_core
-  integer, dimension(NGLOB2DMAX_YMIN_YMAX_OC) :: iboolleft_eta_outer_core,iboolright_eta_outer_core
-
-  integer npoin2D_faces_outer_core(NUMFACES_SHARED)
-  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_outer_core,npoin2D_eta_outer_core
-
-! communication pattern for faces between chunks
-  integer, dimension(NUMMSGS_FACES_VAL) :: iprocfrom_faces,iprocto_faces
-
-! communication pattern for corners between chunks
-  integer, dimension(NCORNERSCHUNKS_VAL) :: iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners
-
-! indirect addressing for each message for faces and corners of the chunks
-! a given slice can belong to at most one corner and at most two faces
-  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_outer_core
-
-! buffers for send and receive between faces of the slices and the chunks
-! we use the same buffers to assemble scalars and vectors because vectors are
-! always three times bigger and therefore scalars can use the first part
-! of the vector buffer in memory even if it has an additional index here
-! allocate these automatic arrays in the memory stack to avoid memory fragmentation with "allocate()"
-  integer :: npoin2D_max_all_CM_IC
-  real(kind=CUSTOM_REAL), dimension(NDIM,npoin2D_max_all_CM_IC,NUMFACES_SHARED) :: buffer_send_faces,buffer_received_faces
-
-  integer, dimension(NGLOB1D_RADIAL_OC,NUMCORNERS_SHARED) :: iboolcorner_outer_core
-
-  real(kind=CUSTOM_REAL), dimension(NGLOB1D_RADIAL_OC) :: buffer_send_chunkcorn_scalar,buffer_recv_chunkcorn_scalar
-
-  logical, dimension(NSPEC_OUTER_CORE) :: is_on_a_slice_edge_outer_core
-
-  integer :: iphase,icall
-
-  integer :: computed_elements
-
 ! ****************************************************
 !   big loop over all spectral elements in the fluid
 ! ****************************************************
 
-  if (NSPEC_OUTER_CORE_ADJOINT /= 1 .and. icall == 1) div_displfluid(:,:,:,:) = 0._CUSTOM_REAL
+  if (NSPEC_OUTER_CORE_ADJOINT /= 1 .and. (.not. phase_is_inner) ) div_displfluid(:,:,:,:) = 0._CUSTOM_REAL
 
-  computed_elements = 0
+!  computed_elements = 0
+  if( .not. phase_is_inner ) then
+    iphase = 1
+    num_elements = nspec_outer
+  else
+    iphase = 2
+    num_elements = nspec_inner
+  endif
 
-  do ispec = 1,NSPEC_OUTER_CORE
+  do ispec_p = 1,num_elements
 
-! hide communications by computing the edges first
-    if((icall == 2 .and. is_on_a_slice_edge_outer_core(ispec)) .or. &
-       (icall == 1 .and. .not. is_on_a_slice_edge_outer_core(ispec))) cycle
+    ispec = phase_ispec_inner(ispec_p,iphase)
 
-! process the communications every ELEMENTS_NONBLOCKING elements
-    computed_elements = computed_elements + 1
-    if (USE_NONBLOCKING_COMMS .and. icall == 2 .and. mod(computed_elements,ELEMENTS_NONBLOCKING_OC) == 0 .and. iphase <= 7) &
-      call assemble_MPI_scalar(myrank,accelfluid,NGLOB_OUTER_CORE, &
-            iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
-            npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
-            iboolfaces_outer_core,iboolcorner_outer_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-            buffer_send_chunkcorn_scalar,buffer_recv_chunkcorn_scalar, &
-            NUMMSGS_FACES_VAL,NCORNERSCHUNKS_VAL, &
-            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL_OC, &
-            NGLOB2DMAX_XMIN_XMAX_OC,NGLOB2DMAX_YMIN_YMAX_OC,NGLOB2DMAX_XY_VAL,NCHUNKS_VAL,iphase)
+    ! only compute elements that belong to the current phase (inner or outer elements)
 
     do k=1,NGLLZ
       do j=1,NGLLY

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_kernels.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_kernels.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_kernels.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -26,11 +26,306 @@
 !=====================================================================
 
 
+  subroutine compute_kernels()
+
+! kernel calculations
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
+  use specfem_par_movie
+  implicit none
+
+  ! local parameters
+  ! dummy array that does not need to be actually read
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,1) :: dummy_array
+  logical,dimension(:),allocatable:: dummy_ispec_is_tiso
+
+  ! crust mantle
+  call compute_kernels_crust_mantle(ibool_crust_mantle, &
+                        rho_kl_crust_mantle,beta_kl_crust_mantle, &
+                        alpha_kl_crust_mantle,cijkl_kl_crust_mantle, &
+                        accel_crust_mantle,b_displ_crust_mantle, &
+                        epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+                        epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
+                        b_epsilondev_xx_crust_mantle,b_epsilondev_yy_crust_mantle,b_epsilondev_xy_crust_mantle, &
+                        b_epsilondev_xz_crust_mantle,b_epsilondev_yz_crust_mantle, &
+                        eps_trace_over_3_crust_mantle,b_eps_trace_over_3_crust_mantle, &
+                        deltat)
+
+  ! outer core
+  call compute_kernels_outer_core(ibool_outer_core, &
+                      xix_outer_core,xiy_outer_core,xiz_outer_core, &
+                      etax_outer_core,etay_outer_core,etaz_outer_core, &
+                      gammax_outer_core,gammay_outer_core,gammaz_outer_core, &
+                      hprime_xx,hprime_yy,hprime_zz, &
+                      displ_outer_core,accel_outer_core, &
+                      b_displ_outer_core,b_accel_outer_core, &
+                      vector_accel_outer_core,vector_displ_outer_core, &
+                      b_vector_displ_outer_core, &
+                      div_displ_outer_core,b_div_displ_outer_core, &
+                      rhostore_outer_core,kappavstore_outer_core, &
+                      rho_kl_outer_core,alpha_kl_outer_core, &
+                      deviatoric_outercore,nspec_beta_kl_outer_core,beta_kl_outer_core, &
+                      deltat)
+
+  ! inner core
+  call compute_kernels_inner_core(ibool_inner_core, &
+                        rho_kl_inner_core,beta_kl_inner_core, &
+                        alpha_kl_inner_core, &
+                        accel_inner_core,b_displ_inner_core, &
+                        epsilondev_xx_inner_core,epsilondev_yy_inner_core,epsilondev_xy_inner_core, &
+                        epsilondev_xz_inner_core,epsilondev_yz_inner_core, &
+                        b_epsilondev_xx_inner_core,b_epsilondev_yy_inner_core,b_epsilondev_xy_inner_core, &
+                        b_epsilondev_xz_inner_core,b_epsilondev_yz_inner_core, &
+                        eps_trace_over_3_inner_core,b_eps_trace_over_3_inner_core, &
+                        deltat)
+
+!<YANGL
+  ! NOISE TOMOGRAPHY --- source strength kernel
+  if (NOISE_TOMOGRAPHY == 3)  &
+     call compute_kernels_strength_noise(nmovie_points,ibool_crust_mantle, &
+                        Sigma_kl_crust_mantle,displ_crust_mantle,deltat,it, &
+                        normal_x_noise,normal_y_noise,normal_z_noise, &
+                        NSPEC2D_TOP(IREGION_CRUST_MANTLE),noise_surface_movie, &
+                        ibelm_top_crust_mantle)
+!>YANGL
+
+  ! --- boundary kernels ------
+  if (SAVE_BOUNDARY_MESH) then
+    fluid_solid_boundary = .false.
+    iregion_code = IREGION_CRUST_MANTLE
+
+    ! Moho
+    if (.not. SUPPRESS_CRUSTAL_MESH .and. HONOR_1D_SPHERICAL_MOHO) then
+      call compute_boundary_kernel(displ_crust_mantle,accel_crust_mantle, &
+               b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
+               ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
+          ! -- idoubling_crust_mantle, &
+               xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
+               etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
+               gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle,hprime_xx,hprime_yy,hprime_zz, &
+               rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
+               kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
+               c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
+               c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
+               c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
+               c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
+               c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
+               c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
+               k_top,ibelm_moho_top,normal_moho,moho_kl_top,fluid_solid_boundary,NSPEC2D_MOHO)
+
+      call compute_boundary_kernel(displ_crust_mantle,accel_crust_mantle, &
+               b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
+               ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
+            ! --idoubling_crust_mantle, &
+               xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
+               etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
+               gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle,hprime_xx,hprime_yy,hprime_zz, &
+               rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
+               kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
+               c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
+               c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
+               c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
+               c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
+               c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
+               c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
+               k_bot,ibelm_moho_bot,normal_moho,moho_kl_bot,fluid_solid_boundary,NSPEC2D_MOHO)
+
+      moho_kl = moho_kl + (moho_kl_top - moho_kl_bot) * deltat
+    endif
+
+    ! 400
+    call compute_boundary_kernel(displ_crust_mantle,accel_crust_mantle, &
+               b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
+               ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
+          ! --idoubling_crust_mantle, &
+               xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
+               etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
+               gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle,hprime_xx,hprime_yy,hprime_zz, &
+               rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
+               kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
+               c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
+               c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
+               c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
+               c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
+               c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
+               c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
+               k_top,ibelm_400_top,normal_400,d400_kl_top,fluid_solid_boundary,NSPEC2D_400)
+
+    call compute_boundary_kernel(displ_crust_mantle,accel_crust_mantle, &
+               b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
+               ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
+            ! --idoubling_crust_mantle, &
+               xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
+               etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
+               gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle,hprime_xx,hprime_yy,hprime_zz, &
+               rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
+               kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
+               c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
+               c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
+               c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
+               c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
+               c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
+               c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
+               k_bot,ibelm_400_bot,normal_400,d400_kl_bot,fluid_solid_boundary,NSPEC2D_400)
+
+    d400_kl = d400_kl + (d400_kl_top - d400_kl_bot) * deltat
+
+    ! 670
+    call compute_boundary_kernel(displ_crust_mantle,accel_crust_mantle, &
+               b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
+               ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
+            ! --idoubling_crust_mantle, &
+               xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
+               etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
+               gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle,hprime_xx,hprime_yy,hprime_zz, &
+               rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
+               kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
+               c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
+               c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
+               c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
+               c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
+               c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
+               c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
+               k_top,ibelm_670_top,normal_670,d670_kl_top,fluid_solid_boundary,NSPEC2D_670)
+
+    call compute_boundary_kernel(displ_crust_mantle,accel_crust_mantle, &
+               b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
+               ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
+            ! --idoubling_crust_mantle, &
+               xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
+               etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
+               gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle,hprime_xx,hprime_yy,hprime_zz, &
+               rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
+               kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
+               c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
+               c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
+               c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
+               c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
+               c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
+               c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
+               k_bot,ibelm_670_bot,normal_670,d670_kl_bot,fluid_solid_boundary,NSPEC2D_670)
+
+    d670_kl = d670_kl + (d670_kl_top - d670_kl_bot) * deltat
+
+    ! CMB
+    fluid_solid_boundary = .true.
+    iregion_code = IREGION_CRUST_MANTLE
+    call compute_boundary_kernel(displ_crust_mantle,accel_crust_mantle, &
+               b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
+               ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
+            ! -- idoubling_crust_mantle, &
+               xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
+               etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
+               gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle,hprime_xx,hprime_yy,hprime_zz, &
+               rhostore_crust_mantle,kappavstore_crust_mantle, muvstore_crust_mantle, &
+               kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
+               c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
+               c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
+               c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
+               c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
+               c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
+               c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
+               k_top,ibelm_bottom_crust_mantle,normal_top_outer_core, &
+               cmb_kl_top,fluid_solid_boundary,NSPEC2D_CMB)
+
+    iregion_code = IREGION_OUTER_CORE
+    ! dummy allocation
+    allocate(dummy_ispec_is_tiso(NSPEC_OUTER_CORE))
+    dummy_ispec_is_tiso(:) = .false.
+    call compute_boundary_kernel(vector_displ_outer_core,vector_accel_outer_core, &
+               b_vector_displ_outer_core,nspec_outer_core, &
+               iregion_code,ystore_outer_core,zstore_outer_core,ibool_outer_core,dummy_ispec_is_tiso, &
+            ! --idoubling_outer_core, &
+               xix_outer_core,xiy_outer_core,xiz_outer_core, &
+               etax_outer_core,etay_outer_core,etaz_outer_core,&
+               gammax_outer_core,gammay_outer_core,gammaz_outer_core,hprime_xx,hprime_yy,hprime_zz, &
+               rhostore_outer_core,kappavstore_outer_core,dummy_array, &
+               dummy_array,dummy_array,dummy_array, &
+               dummy_array,dummy_array,dummy_array,dummy_array, &
+               dummy_array,dummy_array,dummy_array, &
+               dummy_array,dummy_array,dummy_array,dummy_array, &
+               dummy_array,dummy_array,dummy_array, &
+               dummy_array,dummy_array,dummy_array,dummy_array, &
+               dummy_array,dummy_array,dummy_array, &
+               k_bot,ibelm_top_outer_core,normal_top_outer_core, &
+               cmb_kl_bot,fluid_solid_boundary,NSPEC2D_CMB)
+
+    cmb_kl = cmb_kl + (cmb_kl_top - cmb_kl_bot) * deltat
+
+    ! ICB
+    fluid_solid_boundary = .true.
+    call compute_boundary_kernel(vector_displ_outer_core,vector_accel_outer_core, &
+               b_vector_displ_outer_core,nspec_outer_core, &
+               iregion_code,ystore_outer_core,zstore_outer_core,ibool_outer_core,dummy_ispec_is_tiso, &
+            ! --idoubling_outer_core, &
+               xix_outer_core,xiy_outer_core,xiz_outer_core, &
+               etax_outer_core,etay_outer_core,etaz_outer_core,&
+               gammax_outer_core,gammay_outer_core,gammaz_outer_core,hprime_xx,hprime_yy,hprime_zz, &
+               rhostore_outer_core,kappavstore_outer_core,dummy_array, &
+               dummy_array,dummy_array,dummy_array, &
+               dummy_array,dummy_array,dummy_array,dummy_array, &
+               dummy_array,dummy_array,dummy_array, &
+               dummy_array,dummy_array,dummy_array,dummy_array, &
+               dummy_array,dummy_array,dummy_array, &
+               dummy_array,dummy_array,dummy_array,dummy_array, &
+               dummy_array,dummy_array,dummy_array, &
+               k_top,ibelm_bottom_outer_core,normal_bottom_outer_core, &
+               icb_kl_top,fluid_solid_boundary,NSPEC2D_ICB)
+
+    deallocate(dummy_ispec_is_tiso)
+
+    ! ICB
+    iregion_code = IREGION_INNER_CORE
+    ! dummy allocation
+    allocate(dummy_ispec_is_tiso(NSPEC_INNER_CORE))
+    dummy_ispec_is_tiso(:) = .false.
+    call compute_boundary_kernel(displ_inner_core,accel_inner_core, &
+               b_displ_inner_core,nspec_inner_core,iregion_code, &
+               ystore_inner_core,zstore_inner_core,ibool_inner_core,dummy_ispec_is_tiso, &
+            ! -- idoubling_inner_core, &
+               xix_inner_core,xiy_inner_core,xiz_inner_core, &
+               etax_inner_core,etay_inner_core,etaz_inner_core,&
+               gammax_inner_core,gammay_inner_core,gammaz_inner_core,hprime_xx,hprime_yy,hprime_zz, &
+               rhostore_inner_core,kappavstore_inner_core,muvstore_inner_core, &
+               dummy_array,dummy_array,dummy_array, &
+               c11store_inner_core,c12store_inner_core,c13store_inner_core,dummy_array, &
+               dummy_array,dummy_array,dummy_array, &
+               dummy_array,dummy_array,dummy_array,dummy_array, &
+               c33store_inner_core,dummy_array,dummy_array, &
+               dummy_array,c44store_inner_core,dummy_array,dummy_array, &
+               dummy_array,dummy_array,dummy_array, &
+               k_bot,ibelm_top_inner_core,normal_bottom_outer_core, &
+               icb_kl_bot,fluid_solid_boundary,NSPEC2D_ICB)
+    deallocate(dummy_ispec_is_tiso)
+    icb_kl = icb_kl + (icb_kl_top - icb_kl_bot) * deltat
+  endif
+
+  ! approximate hessian
+  if( APPROXIMATE_HESS_KL ) then
+    call compute_kernels_hessian(ibool_crust_mantle, &
+                        hess_kl_crust_mantle,&
+                        accel_crust_mantle,b_accel_crust_mantle, &
+                        deltat)
+  endif
+
+
+  end subroutine compute_kernels
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
   subroutine compute_kernels_crust_mantle(ibool_crust_mantle, &
                           rho_kl_crust_mantle,beta_kl_crust_mantle, &
                           alpha_kl_crust_mantle,cijkl_kl_crust_mantle, &
                           accel_crust_mantle,b_displ_crust_mantle, &
-                          epsilondev_crust_mantle,b_epsilondev_crust_mantle, &
+                          epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+                          epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
+                          b_epsilondev_xx_crust_mantle,b_epsilondev_yy_crust_mantle,b_epsilondev_xy_crust_mantle, &
+                          b_epsilondev_xz_crust_mantle,b_epsilondev_yz_crust_mantle, &
                           eps_trace_over_3_crust_mantle,b_eps_trace_over_3_crust_mantle, &
                           deltat)
 
@@ -53,11 +348,18 @@
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE_ADJOINT) :: &
     b_displ_crust_mantle
 
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_OR_ATT) :: &
-    epsilondev_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ADJOINT) :: &
-    b_epsilondev_crust_mantle
+!  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_OR_ATT) :: &
+!    epsilondev_crust_mantle
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_OR_ATT) :: &
+    epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+    epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle
 
+!  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ADJOINT) :: &
+!    b_epsilondev_crust_mantle
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ADJOINT) :: &
+    b_epsilondev_xx_crust_mantle,b_epsilondev_yy_crust_mantle,b_epsilondev_xy_crust_mantle, &
+    b_epsilondev_xz_crust_mantle,b_epsilondev_yz_crust_mantle
+
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STRAIN_ONLY) :: &
     eps_trace_over_3_crust_mantle
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ADJOINT) :: &
@@ -95,9 +397,18 @@
              + accel_crust_mantle(2,iglob) * b_displ_crust_mantle(2,iglob) &
              + accel_crust_mantle(3,iglob) * b_displ_crust_mantle(3,iglob) )
 
-          epsilondev_loc(:) = epsilondev_crust_mantle(:,i,j,k,ispec)
-          b_epsilondev_loc(:) = b_epsilondev_crust_mantle(:,i,j,k,ispec)
+          epsilondev_loc(1) = epsilondev_xx_crust_mantle(i,j,k,ispec)
+          epsilondev_loc(2) = epsilondev_yy_crust_mantle(i,j,k,ispec)
+          epsilondev_loc(3) = epsilondev_xy_crust_mantle(i,j,k,ispec)
+          epsilondev_loc(4) = epsilondev_xz_crust_mantle(i,j,k,ispec)
+          epsilondev_loc(5) = epsilondev_yz_crust_mantle(i,j,k,ispec)
 
+          b_epsilondev_loc(1) = b_epsilondev_xx_crust_mantle(i,j,k,ispec)
+          b_epsilondev_loc(2) = b_epsilondev_yy_crust_mantle(i,j,k,ispec)
+          b_epsilondev_loc(3) = b_epsilondev_xy_crust_mantle(i,j,k,ispec)
+          b_epsilondev_loc(4) = b_epsilondev_xz_crust_mantle(i,j,k,ispec)
+          b_epsilondev_loc(5) = b_epsilondev_yz_crust_mantle(i,j,k,ispec)
+
           ! For anisotropic kernels
           if (ANISOTROPIC_KL) then
 
@@ -446,7 +757,10 @@
                           rho_kl_inner_core,beta_kl_inner_core, &
                           alpha_kl_inner_core, &
                           accel_inner_core,b_displ_inner_core, &
-                          epsilondev_inner_core,b_epsilondev_inner_core, &
+                          epsilondev_xx_inner_core,epsilondev_yy_inner_core,epsilondev_xy_inner_core, &
+                          epsilondev_xz_inner_core,epsilondev_yz_inner_core, &
+                          b_epsilondev_xx_inner_core,b_epsilondev_yy_inner_core,b_epsilondev_xy_inner_core, &
+                          b_epsilondev_xz_inner_core,b_epsilondev_yz_inner_core, &
                           eps_trace_over_3_inner_core,b_eps_trace_over_3_inner_core, &
                           deltat)
 
@@ -466,11 +780,20 @@
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE_ADJOINT) :: &
     b_displ_inner_core
 
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STR_OR_ATT) :: &
-    epsilondev_inner_core
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ADJOINT) :: &
-    b_epsilondev_inner_core
+!  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STR_OR_ATT) :: &
+!    epsilondev_inner_core
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STR_OR_ATT) :: &
+    epsilondev_xx_inner_core,epsilondev_yy_inner_core,epsilondev_xy_inner_core, &
+    epsilondev_xz_inner_core,epsilondev_yz_inner_core
 
+
+!  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ADJOINT) :: &
+!    b_epsilondev_inner_core
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ADJOINT) :: &
+    b_epsilondev_xx_inner_core,b_epsilondev_yy_inner_core,b_epsilondev_xy_inner_core, &
+    b_epsilondev_xz_inner_core,b_epsilondev_yz_inner_core
+
+
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STRAIN_ONLY) :: &
     eps_trace_over_3_inner_core
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ADJOINT) :: &
@@ -497,8 +820,18 @@
              + accel_inner_core(2,iglob) * b_displ_inner_core(2,iglob) &
              + accel_inner_core(3,iglob) * b_displ_inner_core(3,iglob) )
 
-          epsilondev_loc(:) = epsilondev_inner_core(:,i,j,k,ispec)
-          b_epsilondev_loc(:) = b_epsilondev_inner_core(:,i,j,k,ispec)
+          epsilondev_loc(1) = epsilondev_xx_inner_core(i,j,k,ispec)
+          epsilondev_loc(2) = epsilondev_yy_inner_core(i,j,k,ispec)
+          epsilondev_loc(3) = epsilondev_xy_inner_core(i,j,k,ispec)
+          epsilondev_loc(4) = epsilondev_xz_inner_core(i,j,k,ispec)
+          epsilondev_loc(5) = epsilondev_yz_inner_core(i,j,k,ispec)
+
+          b_epsilondev_loc(1) = b_epsilondev_xx_inner_core(i,j,k,ispec)
+          b_epsilondev_loc(2) = b_epsilondev_yy_inner_core(i,j,k,ispec)
+          b_epsilondev_loc(3) = b_epsilondev_xy_inner_core(i,j,k,ispec)
+          b_epsilondev_loc(4) = b_epsilondev_xz_inner_core(i,j,k,ispec)
+          b_epsilondev_loc(5) = b_epsilondev_yz_inner_core(i,j,k,ispec)
+
           beta_kl_inner_core(i,j,k,ispec) =  beta_kl_inner_core(i,j,k,ispec) &
              + deltat * (epsilondev_loc(1)*b_epsilondev_loc(1) + epsilondev_loc(2)*b_epsilondev_loc(2) &
                 + (epsilondev_loc(1)+epsilondev_loc(2)) * (b_epsilondev_loc(1)+b_epsilondev_loc(2)) &

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_seismograms.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_seismograms.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_seismograms.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -191,7 +191,9 @@
 !
 
   subroutine compute_seismograms_adjoint(NSOURCES,nrec_local,displ_crust_mantle, &
-                    eps_trace_over_3_crust_mantle,epsilondev_crust_mantle, &
+                    eps_trace_over_3_crust_mantle, &
+                    epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+                    epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
                     nu_source,Mxx,Myy,Mzz,Mxy,Mxz,Myz, &
                     hxir_store,hetar_store,hgammar_store, &
                     hpxir_store,hpetar_store,hpgammar_store, &
@@ -215,9 +217,13 @@
     displ_crust_mantle
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STRAIN_ONLY) :: &
     eps_trace_over_3_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_OR_ATT) :: &
-    epsilondev_crust_mantle
 
+!  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_OR_ATT) :: &
+!    epsilondev_crust_mantle
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_OR_ATT) :: &
+    epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+    epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle
+
   double precision, dimension(NDIM,NDIM,NSOURCES) :: nu_source
   double precision, dimension(NSOURCES) :: Mxx,Myy,Mzz,Mxy,Mxz,Myz
 
@@ -297,11 +303,11 @@
           uzd = uzd + dble(displ_crust_mantle(3,iglob))*hlagrange
 
           eps_trace = eps_trace + dble(eps_trace_over_3_crust_mantle(i,j,k,ispec_selected_source(irec)))*hlagrange
-          dxx = dxx + dble(epsilondev_crust_mantle(1,i,j,k,ispec_selected_source(irec)))*hlagrange
-          dyy = dyy + dble(epsilondev_crust_mantle(2,i,j,k,ispec_selected_source(irec)))*hlagrange
-          dxy = dxy + dble(epsilondev_crust_mantle(3,i,j,k,ispec_selected_source(irec)))*hlagrange
-          dxz = dxz + dble(epsilondev_crust_mantle(4,i,j,k,ispec_selected_source(irec)))*hlagrange
-          dyz = dyz + dble(epsilondev_crust_mantle(5,i,j,k,ispec_selected_source(irec)))*hlagrange
+          dxx = dxx + dble(epsilondev_xx_crust_mantle(i,j,k,ispec_selected_source(irec)))*hlagrange
+          dyy = dyy + dble(epsilondev_yy_crust_mantle(i,j,k,ispec_selected_source(irec)))*hlagrange
+          dxy = dxy + dble(epsilondev_xy_crust_mantle(i,j,k,ispec_selected_source(irec)))*hlagrange
+          dxz = dxz + dble(epsilondev_xz_crust_mantle(i,j,k,ispec_selected_source(irec)))*hlagrange
+          dyz = dyz + dble(epsilondev_yz_crust_mantle(i,j,k,ispec_selected_source(irec)))*hlagrange
 
           displ_s(:,i,j,k) = displ_crust_mantle(:,iglob)
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_crust_mantle.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_crust_mantle.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_crust_mantle.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -25,81 +25,37 @@
 !
 !=====================================================================
 
-  subroutine compute_stacey_crust_mantle(ichunk,SIMULATION_TYPE, &
-                              NSTEP,it,SAVE_FORWARD,ibool_crust_mantle, &
-                              veloc_crust_mantle,accel_crust_mantle,b_accel_crust_mantle, &
-                              jacobian2D_xmin_crust_mantle,jacobian2D_xmax_crust_mantle, &
-                              jacobian2D_ymin_crust_mantle,jacobian2D_ymax_crust_mantle, &
-                              wgllwgll_xz,wgllwgll_yz, &
-                              normal_xmin_crust_mantle,normal_xmax_crust_mantle, &
-                              normal_ymin_crust_mantle,normal_ymax_crust_mantle, &
-                              rho_vp_crust_mantle,rho_vs_crust_mantle, &
-                              ibelm_xmin_crust_mantle,ibelm_xmax_crust_mantle, &
-                              ibelm_ymin_crust_mantle,ibelm_ymax_crust_mantle, &
-                              nimin_crust_mantle,nimax_crust_mantle, &
-                              njmin_crust_mantle,njmax_crust_mantle, &
-                              nkmin_xi_crust_mantle,nkmin_eta_crust_mantle, &
-                              nspec2D_xmin_crust_mantle,nspec2D_xmax_crust_mantle, &
-                              nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle, &
-                              reclen_xmin_crust_mantle,reclen_xmax_crust_mantle, &
-                              reclen_ymin_crust_mantle,reclen_ymax_crust_mantle, &
-                              nabs_xmin_cm,nabs_xmax_cm,nabs_ymin_cm,nabs_ymax_cm, &
-                              absorb_xmin_crust_mantle,absorb_xmax_crust_mantle, &
-                              absorb_ymin_crust_mantle,absorb_ymax_crust_mantle)
+  subroutine compute_stacey_crust_mantle()
 
-  implicit none
+  use constants
 
-  include "constants.h"
-  include "OUTPUT_FILES/values_from_mesher.h"
+  use specfem_par,only: &
+    ichunk,SIMULATION_TYPE,SAVE_FORWARD,NSTEP,it, &
+    wgllwgll_xz,wgllwgll_yz,wgllwgll_xy
 
-  integer ichunk,SIMULATION_TYPE
-  integer NSTEP,it
-  logical SAVE_FORWARD
+  use specfem_par_crustmantle, only: &
+    veloc_crust_mantle,accel_crust_mantle,b_accel_crust_mantle, &
+    ibool_crust_mantle, &
+    jacobian2D_xmin_crust_mantle,jacobian2D_xmax_crust_mantle, &
+    jacobian2D_ymin_crust_mantle,jacobian2D_ymax_crust_mantle, &
+    normal_xmin_crust_mantle,normal_xmax_crust_mantle, &
+    normal_ymin_crust_mantle,normal_ymax_crust_mantle, &
+    rho_vp_crust_mantle,rho_vs_crust_mantle, &
+    ibelm_xmin_crust_mantle,ibelm_xmax_crust_mantle, &
+    ibelm_ymin_crust_mantle,ibelm_ymax_crust_mantle, &
+    nimin_crust_mantle,nimax_crust_mantle, &
+    njmin_crust_mantle,njmax_crust_mantle, &
+    nkmin_xi_crust_mantle,nkmin_eta_crust_mantle, &
+    nspec2D_xmin_crust_mantle,nspec2D_xmax_crust_mantle, &
+    nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle, &
+    reclen_xmin_crust_mantle,reclen_xmax_crust_mantle, &
+    reclen_ymin_crust_mantle,reclen_ymax_crust_mantle, &
+    nabs_xmin_cm,nabs_xmax_cm,nabs_ymin_cm,nabs_ymax_cm, &
+    absorb_xmin_crust_mantle,absorb_xmax_crust_mantle, &
+    absorb_ymin_crust_mantle,absorb_ymax_crust_mantle
 
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: ibool_crust_mantle
+  implicit none
 
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: &
-    veloc_crust_mantle,accel_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE_ADJOINT) :: &
-    b_accel_crust_mantle
-
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ,NSPEC2DMAX_XMIN_XMAX_CM) :: &
-    jacobian2D_xmin_crust_mantle,jacobian2D_xmax_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ,NSPEC2DMAX_YMIN_YMAX_CM) :: &
-    jacobian2D_ymin_crust_mantle,jacobian2D_ymax_crust_mantle
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
-
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLY,NGLLZ,NSPEC2DMAX_XMIN_XMAX_CM) :: &
-  normal_xmin_crust_mantle,normal_xmax_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2DMAX_YMIN_YMAX_CM) :: &
-  normal_ymin_crust_mantle,normal_ymax_crust_mantle
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STACEY) :: &
-    rho_vp_crust_mantle,rho_vs_crust_mantle
-
-  integer, dimension(NSPEC2DMAX_XMIN_XMAX_CM) :: ibelm_xmin_crust_mantle,ibelm_xmax_crust_mantle
-  integer, dimension(NSPEC2DMAX_YMIN_YMAX_CM) :: ibelm_ymin_crust_mantle,ibelm_ymax_crust_mantle
-
-  integer, dimension(2,NSPEC2DMAX_YMIN_YMAX_CM) :: &
-    nimin_crust_mantle,nimax_crust_mantle,nkmin_eta_crust_mantle
-  integer, dimension(2,NSPEC2DMAX_XMIN_XMAX_CM) :: &
-    njmin_crust_mantle,njmax_crust_mantle,nkmin_xi_crust_mantle
-
-  integer nspec2D_xmin_crust_mantle,nspec2D_xmax_crust_mantle, &
-    nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle
-
-  integer reclen_xmin_crust_mantle,reclen_xmax_crust_mantle,&
-    reclen_ymin_crust_mantle,reclen_ymax_crust_mantle
-
-  integer nabs_xmin_cm,nabs_xmax_cm,nabs_ymin_cm,nabs_ymax_cm
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLY,NGLLZ,nabs_xmin_cm) :: absorb_xmin_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLY,NGLLZ,nabs_xmax_cm) :: absorb_xmax_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLZ,nabs_ymin_cm) :: absorb_ymin_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLZ,nabs_ymax_cm) :: absorb_ymax_crust_mantle
-
-
   ! local parameters
   real(kind=CUSTOM_REAL) :: weight
   real(kind=CUSTOM_REAL) :: vn,vx,vy,vz,nx,ny,nz,tx,ty,tz

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -26,86 +26,39 @@
 !=====================================================================
 
 
-  subroutine compute_stacey_outer_core(ichunk,SIMULATION_TYPE,SAVE_FORWARD, &
-                              NSTEP,it,ibool_outer_core, &
-                              veloc_outer_core,accel_outer_core,b_accel_outer_core, &
-                              vp_outer_core,wgllwgll_xz,wgllwgll_yz,wgllwgll_xy, &
-                              jacobian2D_bottom_outer_core, &
-                              jacobian2D_xmin_outer_core,jacobian2D_xmax_outer_core, &
-                              jacobian2D_ymin_outer_core,jacobian2D_ymax_outer_core, &
-                              ibelm_bottom_outer_core, &
-                              ibelm_xmin_outer_core,ibelm_xmax_outer_core, &
-                              ibelm_ymin_outer_core,ibelm_ymax_outer_core, &
-                              nimin_outer_core,nimax_outer_core, &
-                              njmin_outer_core,njmax_outer_core, &
-                              nkmin_xi_outer_core,nkmin_eta_outer_core, &
-                              NSPEC2D_BOTTOM, &
-                              nspec2D_xmin_outer_core,nspec2D_xmax_outer_core, &
-                              nspec2D_ymin_outer_core,nspec2D_ymax_outer_core, &
-                              reclen_zmin, &
-                              reclen_xmin_outer_core,reclen_xmax_outer_core, &
-                              reclen_ymin_outer_core,reclen_ymax_outer_core, &
-                              nabs_zmin_oc, &
-                              nabs_xmin_oc,nabs_xmax_oc,nabs_ymin_oc,nabs_ymax_oc, &
-                              absorb_zmin_outer_core, &
-                              absorb_xmin_outer_core,absorb_xmax_outer_core, &
-                              absorb_ymin_outer_core,absorb_ymax_outer_core)
+  subroutine compute_stacey_outer_core()
 
-  implicit none
+  use constants
 
-  include "constants.h"
-  include "OUTPUT_FILES/values_from_mesher.h"
+  use specfem_par,only: &
+    ichunk,SIMULATION_TYPE,SAVE_FORWARD,NSTEP,it, &
+    wgllwgll_xz,wgllwgll_yz,wgllwgll_xy, &
+    NSPEC2D_BOTTOM
 
-  integer ichunk,SIMULATION_TYPE
-  integer NSTEP,it
-  logical SAVE_FORWARD
+  use specfem_par_outercore,only: &
+    veloc_outer_core,accel_outer_core,b_accel_outer_core, &
+    ibool_outer_core, &
+    jacobian2D_xmin_outer_core,jacobian2D_xmax_outer_core, &
+    jacobian2D_ymin_outer_core,jacobian2D_ymax_outer_core, &
+    jacobian2D_bottom_outer_core, &
+    nspec2D_xmin_outer_core,nspec2D_xmax_outer_core, &
+    nspec2D_ymin_outer_core,nspec2D_ymax_outer_core, &
+    vp_outer_core, &
+    nimin_outer_core,nimax_outer_core,nkmin_eta_outer_core, &
+    njmin_outer_core,njmax_outer_core,nkmin_xi_outer_core, &
+    absorb_xmin_outer_core,absorb_xmax_outer_core, &
+    absorb_ymin_outer_core,absorb_ymax_outer_core, &
+    absorb_zmin_outer_core, &
+    reclen_xmin_outer_core,reclen_xmax_outer_core, &
+    reclen_ymin_outer_core,reclen_ymax_outer_core, &
+    reclen_zmin, &
+    ibelm_xmin_outer_core,ibelm_xmax_outer_core, &
+    ibelm_ymin_outer_core,ibelm_ymax_outer_core, &
+    ibelm_bottom_outer_core
+  implicit none
 
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE) :: ibool_outer_core
-
-  real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE) :: &
-    veloc_outer_core,accel_outer_core
-  real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE_ADJOINT) :: &
-    b_accel_outer_core
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_STACEY) :: vp_outer_core
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_BOTTOM_OC) :: &
-    jacobian2D_bottom_outer_core
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ,NSPEC2DMAX_XMIN_XMAX_OC) :: &
-    jacobian2D_xmin_outer_core,jacobian2D_xmax_outer_core
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ,NSPEC2DMAX_YMIN_YMAX_OC) :: &
-    jacobian2D_ymin_outer_core,jacobian2D_ymax_outer_core
-
-
-  integer, dimension(NSPEC2D_BOTTOM_OC) :: ibelm_bottom_outer_core
-  integer, dimension(NSPEC2DMAX_XMIN_XMAX_OC) :: ibelm_xmin_outer_core,ibelm_xmax_outer_core
-  integer, dimension(NSPEC2DMAX_YMIN_YMAX_OC) :: ibelm_ymin_outer_core,ibelm_ymax_outer_core
-
-  integer, dimension(2,NSPEC2DMAX_YMIN_YMAX_OC) :: &
-    nimin_outer_core,nimax_outer_core,nkmin_eta_outer_core
-  integer, dimension(2,NSPEC2DMAX_XMIN_XMAX_OC) :: &
-    njmin_outer_core,njmax_outer_core,nkmin_xi_outer_core
-
-  integer, dimension(MAX_NUM_REGIONS) :: NSPEC2D_BOTTOM
-  integer nspec2D_xmin_outer_core,nspec2D_xmax_outer_core, &
-    nspec2D_ymin_outer_core,nspec2D_ymax_outer_core
-
-  integer reclen_zmin,reclen_xmin_outer_core,reclen_xmax_outer_core,&
-    reclen_ymin_outer_core,reclen_ymax_outer_core
-
-  integer nabs_xmin_oc,nabs_xmax_oc,nabs_ymin_oc,nabs_ymax_oc,nabs_zmin_oc
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ,nabs_xmin_oc) :: absorb_xmin_outer_core
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ,nabs_xmax_oc) :: absorb_xmax_outer_core
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ,nabs_ymin_oc) :: absorb_ymin_outer_core
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ,nabs_ymax_oc) :: absorb_ymax_outer_core
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,nabs_zmin_oc) :: absorb_zmin_outer_core
-
   ! local parameters
   real(kind=CUSTOM_REAL) :: sn,weight
-  !integer :: reclen1,reclen2
   integer :: i,j,k,ispec2D,ispec,iglob
 
   ! note: we use c functions for I/O as they still have a better performance than
@@ -124,14 +77,7 @@
     if (SIMULATION_TYPE == 3 .and. nspec2D_xmin_outer_core > 0)  then
       ! note: backward/reconstructed wavefields are read in after the Newmark time scheme in the first time loop
       !          this leads to a corresponding boundary condition at time index NSTEP - (it-1) = NSTEP - it + 1
-
       call read_abs(4,absorb_xmin_outer_core,reclen_xmin_outer_core,NSTEP-it+1)
-
-!      read(61,rec=NSTEP-it+1) reclen1,absorb_xmin_outer_core,reclen2
-!      if (reclen1 /= reclen_xmin_outer_core .or. reclen1 /= reclen2)  &
-!         call exit_MPI(myrank,'Error reading absorbing contribution absorb_xmin_outer_core')
-
-
     endif
 
     do ispec2D=1,nspec2D_xmin_outer_core
@@ -163,10 +109,7 @@
 
     ! writes absorbing boundary values
     if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_xmin_outer_core > 0 ) then
-
       call write_abs(4,absorb_xmin_outer_core,reclen_xmin_outer_core,it)
-
-!      write(61,rec=it) reclen_xmin_outer_core,absorb_xmin_outer_core,reclen_xmin_outer_core
     endif
 
   endif
@@ -176,12 +119,7 @@
   if(NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AB) then
 
     if (SIMULATION_TYPE == 3 .and. nspec2D_xmax_outer_core > 0)  then
-
       call read_abs(5,absorb_xmax_outer_core,reclen_xmax_outer_core,NSTEP-it+1)
-
-!      read(62,rec=NSTEP-it+1) reclen1,absorb_xmax_outer_core,reclen2
-!      if (reclen1 /= reclen_xmax_outer_core .or. reclen1 /= reclen2)  &
-!         call exit_MPI(myrank,'Error reading absorbing contribution absorb_xmax_outer_core')
     endif
 
     do ispec2D=1,nspec2D_xmax_outer_core
@@ -214,20 +152,13 @@
 
     if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_xmax_outer_core > 0 ) then
       call write_abs(5,absorb_xmax_outer_core,reclen_xmax_outer_core,it)
-
-!      write(62,rec=it) reclen_xmax_outer_core,absorb_xmax_outer_core,reclen_xmax_outer_core
     endif
 
   endif
 
   !   ymin
   if (SIMULATION_TYPE == 3 .and. nspec2D_ymin_outer_core > 0)  then
-
     call read_abs(6,absorb_ymin_outer_core,reclen_ymin_outer_core,NSTEP-it+1)
-
-!    read(63,rec=NSTEP-it+1) reclen1,absorb_ymin_outer_core,reclen2
-!    if (reclen1 /= reclen_ymin_outer_core .or. reclen1 /= reclen2)  &
-!       call exit_MPI(myrank,'Error reading absorbing contribution absorb_ymin_outer_core')
   endif
 
   do ispec2D=1,nspec2D_ymin_outer_core
@@ -260,19 +191,13 @@
 
   if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_ymin_outer_core > 0 ) then
     call write_abs(6,absorb_ymin_outer_core,reclen_ymin_outer_core,it)
-
-!    write(63,rec=it) reclen_ymin_outer_core,absorb_ymin_outer_core,reclen_ymin_outer_core
   endif
 
   !   ymax
   if (SIMULATION_TYPE == 3 .and. nspec2D_ymax_outer_core > 0)  then
-
     call read_abs(7,absorb_ymax_outer_core,reclen_ymax_outer_core,NSTEP-it+1)
-
-!    read(64,rec=NSTEP-it+1) reclen1,absorb_ymax_outer_core,reclen2
-!    if (reclen1 /= reclen_ymax_outer_core .or. reclen1 /= reclen2)  &
-!       call exit_MPI(myrank,'Error reading absorbing contribution absorb_ymax_outer_core')
   endif
+
   do ispec2D=1,nspec2D_ymax_outer_core
 
     ispec=ibelm_ymax_outer_core(ispec2D)
@@ -303,18 +228,11 @@
 
   if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_ymax_outer_core > 0 ) then
     call write_abs(7,absorb_ymax_outer_core,reclen_ymax_outer_core,it)
-
-!    write(64,rec=it) reclen_ymax_outer_core,absorb_ymax_outer_core,reclen_ymax_outer_core
   endif
 
   ! for surface elements exactly on the ICB
   if (SIMULATION_TYPE == 3 .and. NSPEC2D_BOTTOM(IREGION_OUTER_CORE)> 0)  then
-
     call read_abs(8,absorb_zmin_outer_core,reclen_zmin,NSTEP-it+1)
-
-!    read(65,rec=NSTEP-it+1) reclen1,absorb_zmin_outer_core,reclen2
-!    if (reclen1 /= reclen_zmin .or. reclen1 /= reclen2)  &
-!       call exit_MPI(myrank,'Error reading absorbing contribution absorb_zmin_outer_core')
   endif
 
   do ispec2D = 1,NSPEC2D_BOTTOM(IREGION_OUTER_CORE)
@@ -344,8 +262,6 @@
 
   if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. NSPEC2D_BOTTOM(IREGION_OUTER_CORE) > 0 ) then
     call write_abs(8,absorb_zmin_outer_core,reclen_zmin,it)
-
-!    write(65,rec=it) reclen_zmin,absorb_zmin_outer_core,reclen_zmin
   endif
 
   end subroutine compute_stacey_outer_core

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/finalize_simulation.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/finalize_simulation.f90	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/finalize_simulation.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,283 @@
+!=====================================================================
+!
+!          S p e c f e m 3 D  G l o b e  V e r s i o n  5 . 1
+!          --------------------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!                        Princeton University, USA
+!             and University of Pau / CNRS / INRIA, France
+! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+!                            April 2011
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+  subroutine finalize_simulation()
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
+  use specfem_par_movie
+  implicit none
+
+  ! synchronize all processes, waits until all processes have written their seismograms
+  call sync_all()
+
+  ! closes Stacey absorbing boundary snapshots
+  if( ABSORBING_CONDITIONS ) then
+    ! crust mantle
+    if (nspec2D_xmin_crust_mantle > 0 .and. (SIMULATION_TYPE == 3 &
+      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
+      call close_file_abs(0)
+    endif
+
+    if (nspec2D_xmax_crust_mantle > 0 .and. (SIMULATION_TYPE == 3 &
+      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
+      call close_file_abs(1)
+    endif
+
+    if (nspec2D_ymin_crust_mantle > 0 .and. (SIMULATION_TYPE == 3 &
+      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
+      call close_file_abs(2)
+    endif
+
+    if (nspec2D_ymax_crust_mantle > 0 .and. (SIMULATION_TYPE == 3 &
+      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
+      call close_file_abs(3)
+    endif
+
+    ! outer core
+    if (nspec2D_xmin_outer_core > 0 .and. (SIMULATION_TYPE == 3 &
+      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
+      call close_file_abs(4)
+    endif
+
+    if (nspec2D_xmax_outer_core > 0 .and. (SIMULATION_TYPE == 3 &
+      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
+      call close_file_abs(5)
+    endif
+
+    if (nspec2D_ymin_outer_core > 0 .and. (SIMULATION_TYPE == 3 &
+      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
+      call close_file_abs(6)
+    endif
+
+    if (nspec2D_ymax_outer_core > 0 .and. (SIMULATION_TYPE == 3 &
+      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
+      call close_file_abs(7)
+    endif
+
+    if (NSPEC2D_BOTTOM(IREGION_OUTER_CORE) > 0 .and. (SIMULATION_TYPE == 3 &
+      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
+      call close_file_abs(8)
+    endif
+
+    ! frees memory
+    deallocate(absorb_xmin_crust_mantle, &
+              absorb_xmax_crust_mantle, &
+              absorb_ymin_crust_mantle, &
+              absorb_ymax_crust_mantle, &
+              absorb_xmin_outer_core, &
+              absorb_xmax_outer_core, &
+              absorb_ymin_outer_core, &
+              absorb_ymax_outer_core, &
+              absorb_zmin_outer_core)
+  endif
+
+  ! save/read the surface movie using the same c routine as we do for absorbing boundaries (file ID is 9)
+  if (NOISE_TOMOGRAPHY/=0) then
+    call close_file_abs(9)
+  endif
+
+  ! synchronize all processes
+  call sync_all()
+
+  ! save files to local disk or tape system if restart file
+  call save_forward_arrays()
+
+  ! synchronize all processes
+  call sync_all()
+
+  ! dump kernel arrays
+  if (SIMULATION_TYPE == 3) then
+    ! crust mantle
+    call save_kernels_crust_mantle(myrank,scale_t,scale_displ, &
+                  cijkl_kl_crust_mantle,rho_kl_crust_mantle, &
+                  alpha_kl_crust_mantle,beta_kl_crust_mantle, &
+                  ystore_crust_mantle,zstore_crust_mantle, &
+                  rhostore_crust_mantle,muvstore_crust_mantle, &
+                  kappavstore_crust_mantle,ibool_crust_mantle, &
+                  kappahstore_crust_mantle,muhstore_crust_mantle, &
+                  eta_anisostore_crust_mantle,ispec_is_tiso_crust_mantle, &
+              ! --idoubling_crust_mantle, &
+                  LOCAL_PATH)
+
+    ! noise strength kernel
+    if (NOISE_TOMOGRAPHY == 3) then
+       call save_kernels_strength_noise(myrank,LOCAL_PATH,Sigma_kl_crust_mantle)
+    endif
+
+    ! outer core
+    call save_kernels_outer_core(myrank,scale_t,scale_displ, &
+                        rho_kl_outer_core,alpha_kl_outer_core, &
+                        rhostore_outer_core,kappavstore_outer_core, &
+                        deviatoric_outercore,nspec_beta_kl_outer_core,beta_kl_outer_core, &
+                        LOCAL_PATH)
+
+    ! inner core
+    call save_kernels_inner_core(myrank,scale_t,scale_displ, &
+                          rho_kl_inner_core,beta_kl_inner_core,alpha_kl_inner_core, &
+                          rhostore_inner_core,muvstore_inner_core,kappavstore_inner_core, &
+                          LOCAL_PATH)
+
+    ! boundary kernel
+    if (SAVE_BOUNDARY_MESH) then
+      call save_kernels_boundary_kl(myrank,scale_t,scale_displ, &
+                                  moho_kl,d400_kl,d670_kl,cmb_kl,icb_kl, &
+                                  LOCAL_PATH,HONOR_1D_SPHERICAL_MOHO)
+    endif
+
+    ! approximate hessian
+    if( APPROXIMATE_HESS_KL ) then
+      call save_kernels_hessian(myrank,scale_t,scale_displ, &
+                                            hess_kl_crust_mantle,LOCAL_PATH)
+    endif
+  endif
+
+  ! save source derivatives for adjoint simulations
+  if (SIMULATION_TYPE == 2 .and. nrec_local > 0) then
+    call save_kernels_source_derivatives(nrec_local,NSOURCES,scale_displ,scale_t, &
+                                nu_source,moment_der,sloc_der,stshift_der,shdur_der,number_receiver_global)
+  endif
+
+  ! frees dynamically allocated memory
+  ! mpi buffers
+  deallocate(buffer_send_faces, &
+            buffer_received_faces, &
+            b_buffer_send_faces, &
+            b_buffer_received_faces)
+
+  ! central cube buffers
+  deallocate(sender_from_slices_to_cube, &
+            buffer_all_cube_from_slices, &
+            b_buffer_all_cube_from_slices, &
+            buffer_slices, &
+            b_buffer_slices, &
+            buffer_slices2, &
+            ibool_central_cube)
+
+  ! sources
+  deallocate(islice_selected_source, &
+          ispec_selected_source, &
+          Mxx, &
+          Myy, &
+          Mzz, &
+          Mxy, &
+          Mxz, &
+          Myz, &
+          xi_source, &
+          eta_source, &
+          gamma_source, &
+          tshift_cmt, &
+          hdur, &
+          hdur_gaussian, &
+          theta_source, &
+          phi_source, &
+          nu_source)
+  if (SIMULATION_TYPE == 1  .or. SIMULATION_TYPE == 3) deallocate(sourcearrays)
+  if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then
+    deallocate(iadj_vec)
+    if(nadj_rec_local > 0) then
+      deallocate(adj_sourcearrays)
+      deallocate(iadjsrc,iadjsrc_len)
+    endif
+  endif
+
+  ! receivers
+  deallocate(islice_selected_rec, &
+          ispec_selected_rec, &
+          xi_receiver, &
+          eta_receiver, &
+          gamma_receiver, &
+          station_name, &
+          network_name, &
+          stlat, &
+          stlon, &
+          stele, &
+          stbur, &
+          nu, &
+          number_receiver_global)
+  if( nrec_local > 0 ) then
+    deallocate(hxir_store, &
+              hetar_store, &
+              hgammar_store)
+    if( SIMULATION_TYPE == 2 ) then
+      deallocate(moment_der,stshift_der)
+    endif
+  endif
+  deallocate(seismograms)
+
+  if (SIMULATION_TYPE == 3) then
+    if( APPROXIMATE_HESS_KL ) then
+      deallocate(hess_kl_crust_mantle)
+    endif
+    deallocate(beta_kl_outer_core)
+  endif
+
+  ! movies
+  if(MOVIE_SURFACE .or. NOISE_TOMOGRAPHY /= 0 ) then
+    deallocate(store_val_x, &
+              store_val_y, &
+              store_val_z, &
+              store_val_ux, &
+              store_val_uy, &
+              store_val_uz)
+    if (MOVIE_SURFACE) then
+      deallocate(store_val_x_all, &
+            store_val_y_all, &
+            store_val_z_all, &
+            store_val_ux_all, &
+            store_val_uy_all, &
+            store_val_uz_all)
+    endif
+  endif
+  if(MOVIE_VOLUME) then
+    deallocate(nu_3dmovie)
+  endif
+
+  ! noise simulations
+  if ( NOISE_TOMOGRAPHY /= 0 ) then
+    deallocate(noise_sourcearray, &
+            normal_x_noise, &
+            normal_y_noise, &
+            normal_z_noise, &
+            mask_noise, &
+            noise_surface_movie)
+  endif
+
+  ! close the main output file
+  if(myrank == 0) then
+    write(IMAIN,*)
+    write(IMAIN,*) 'End of the simulation'
+    write(IMAIN,*)
+    close(IMAIN)
+  endif
+
+  ! synchronize all the processes to make sure everybody has finished
+  call sync_all()
+
+  end subroutine finalize_simulation

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/fix_non_blocking_flags.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/fix_non_blocking_flags.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/fix_non_blocking_flags.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -102,7 +102,8 @@
 ! a full face are missing, therefore let us add them
   subroutine fix_non_blocking_central_cube(is_on_a_slice_edge, &
          ibool,nspec,nglob,nb_msgs_theor_in_cube,ibelm_bottom_inner_core, &
-         idoubling_inner_core,npoin2D_cube_from_slices,ibool_central_cube,NSPEC2D_BOTTOM_INNER_CORE,ichunk)
+         idoubling_inner_core,npoin2D_cube_from_slices, &
+         ibool_central_cube,NSPEC2D_BOTTOM_INNER_CORE,ichunk)
 
   implicit none
 
@@ -144,8 +145,8 @@
 
   if(ichunk == CHUNK_AB .or. ichunk == CHUNK_AB_ANTIPODE) then
 
-! clean the mask
-  mask_ibool(:) = .false.
+    ! clean the mask
+    mask_ibool(:) = .false.
 
     do imsg = 1,nb_msgs_theor_in_cube
       do ipoin = 1,npoin2D_cube_from_slices
@@ -153,24 +154,24 @@
       enddo
     enddo
 
-! now label all the elements that have at least one corner belonging
-! to any of these buffers as elements that must contribute to the
-! first step of the calculations (performed on the edges before starting
-! the non blocking communications); there is no need to examine the inside
-! of the elements, checking their eight corners is sufficient
-  do ispec = 1,nspec
-    do k = 1,NGLLZ,NGLLZ-1
-      do j  = 1,NGLLY,NGLLY-1
-        do i = 1,NGLLX,NGLLX-1
-          if(mask_ibool(ibool(i,j,k,ispec))) then
-            is_on_a_slice_edge(ispec) = .true.
-            goto 888
-          endif
+    ! now label all the elements that have at least one corner belonging
+    ! to any of these buffers as elements that must contribute to the
+    ! first step of the calculations (performed on the edges before starting
+    ! the non blocking communications); there is no need to examine the inside
+    ! of the elements, checking their eight corners is sufficient
+    do ispec = 1,nspec
+      do k = 1,NGLLZ,NGLLZ-1
+        do j  = 1,NGLLY,NGLLY-1
+          do i = 1,NGLLX,NGLLX-1
+            if(mask_ibool(ibool(i,j,k,ispec))) then
+              is_on_a_slice_edge(ispec) = .true.
+              goto 888
+            endif
+          enddo
         enddo
       enddo
+    888 continue
     enddo
-  888 continue
-  enddo
 
   endif
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/initialize_simulation.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/initialize_simulation.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/initialize_simulation.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -26,129 +26,33 @@
 !=====================================================================
 
 
-  subroutine initialize_simulation(myrank,MIN_ATTENUATION_PERIOD,MAX_ATTENUATION_PERIOD,NER_CRUST, &
-                NER_80_MOHO,NER_220_80,NER_400_220,NER_600_400,NER_670_600,NER_771_670, &
-                NER_TOPDDOUBLEPRIME_771,NER_CMB_TOPDDOUBLEPRIME,NER_OUTER_CORE, &
-                NER_TOP_CENTRAL_CUBE_ICB,ANGULAR_WIDTH_XI_IN_DEGREES,NEX_XI,NEX_ETA, &
-                NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
-                NTSTEP_BETWEEN_READ_ADJSRC,NSTEP,NSOURCES,NTSTEP_BETWEEN_FRAMES, &
-                NTSTEP_BETWEEN_OUTPUT_INFO,NUMBER_OF_RUNS,NUMBER_OF_THIS_RUN,SIMULATION_TYPE, &
-                DT,ROCEAN,RMIDDLE_CRUST,RMOHO,R80,R220,R400,R600,R670,R771,&
-                RTOPDDOUBLEPRIME,RCMB,RICB, &
-                RHO_TOP_OC,RHO_BOTTOM_OC,RHO_OCEANS, &
-                MOVIE_VOLUME_TYPE,MOVIE_START,MOVIE_STOP, &
-                HDUR_MOVIE,MOVIE_TOP,MOVIE_BOTTOM,MOVIE_WEST,MOVIE_EAST, &
-                MOVIE_NORTH,MOVIE_SOUTH,MOVIE_SURFACE,MOVIE_VOLUME, &
-                RECEIVERS_CAN_BE_BURIED,PRINT_SOURCE_TIME_FUNCTION, &
-                SAVE_MESH_FILES,ABSORBING_CONDITIONS,INCLUDE_CENTRAL_CUBE,SAVE_FORWARD, &
-                SAVE_ALL_SEISMOS_IN_ONE_FILE,MOVIE_COARSE,OUTPUT_SEISMOS_ASCII_TEXT, &
-                OUTPUT_SEISMOS_SAC_ALPHANUM,OUTPUT_SEISMOS_SAC_BINARY, &
-                ROTATE_SEISMOGRAMS_RT,WRITE_SEISMOGRAMS_BY_MASTER,USE_BINARY_FOR_LARGE_FILE, &
-                LOCAL_PATH,MODEL,OUTPUT_FILES, &
-                ratio_sampling_array, ner, doubling_index,r_bottom,r_top, &
-                this_region_has_a_doubling,rmins,rmaxs, &
-                TOPOGRAPHY,HONOR_1D_SPHERICAL_MOHO,ONE_CRUST, &
-                nspl,rspl,espl,espl2,ibathy_topo, &
-                NSPEC2DMAX_XMIN_XMAX,NSPEC2DMAX_YMIN_YMAX,NSPEC2D_BOTTOM,NSPEC2D_TOP, &
-                NGLOB1D_RADIAL,NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX, &
-                xigll,yigll,zigll,wxgll,wygll,wzgll,wgll_cube, &
-                hprime_xx,hprime_yy,hprime_zz,hprime_xxT, &
-                hprimewgll_xx,hprimewgll_yy,hprimewgll_zz,hprimewgll_xxT, &
-                wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                rec_filename,STATIONS,nrec,NOISE_TOMOGRAPHY)
+  subroutine initialize_simulation()
 
+  use specfem_par
   implicit none
 
   include 'mpif.h'
-  include "constants.h"
-  include "OUTPUT_FILES/values_from_mesher.h"
 
-  integer myrank
-
-  ! parameters read from parameter file
-  integer MIN_ATTENUATION_PERIOD,MAX_ATTENUATION_PERIOD,NER_CRUST, &
-          NER_80_MOHO,NER_220_80,NER_400_220,NER_600_400,NER_670_600,NER_771_670, &
-          NER_TOPDDOUBLEPRIME_771,NER_CMB_TOPDDOUBLEPRIME,NER_OUTER_CORE, &
-          NER_TOP_CENTRAL_CUBE_ICB,NEX_XI,NEX_ETA, &
-          NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
-          NTSTEP_BETWEEN_READ_ADJSRC,NSTEP,NSOURCES,NTSTEP_BETWEEN_FRAMES, &
-          NTSTEP_BETWEEN_OUTPUT_INFO,NUMBER_OF_RUNS,NUMBER_OF_THIS_RUN,SIMULATION_TYPE, &
-          MOVIE_VOLUME_TYPE,MOVIE_START,MOVIE_STOP,NOISE_TOMOGRAPHY
-
-  double precision DT,ROCEAN,RMIDDLE_CRUST, &
-          RMOHO,R80,R220,R400,R600,R670,R771,RTOPDDOUBLEPRIME,RCMB,RICB, &
-          RHO_TOP_OC,RHO_BOTTOM_OC,RHO_OCEANS,HDUR_MOVIE, &
-          MOVIE_TOP,MOVIE_BOTTOM,MOVIE_WEST,MOVIE_EAST,MOVIE_NORTH,MOVIE_SOUTH
-
-  logical   MOVIE_SURFACE,MOVIE_VOLUME,RECEIVERS_CAN_BE_BURIED,PRINT_SOURCE_TIME_FUNCTION, &
-          SAVE_MESH_FILES,ABSORBING_CONDITIONS,INCLUDE_CENTRAL_CUBE,SAVE_FORWARD, &
-          SAVE_ALL_SEISMOS_IN_ONE_FILE,MOVIE_COARSE,OUTPUT_SEISMOS_ASCII_TEXT,&
-          OUTPUT_SEISMOS_SAC_ALPHANUM,OUTPUT_SEISMOS_SAC_BINARY,&
-          ROTATE_SEISMOGRAMS_RT,WRITE_SEISMOGRAMS_BY_MASTER,USE_BINARY_FOR_LARGE_FILE
-
-  character(len=150) LOCAL_PATH,OUTPUT_FILES
-
-  integer, dimension(MAX_NUMBER_OF_MESH_LAYERS) :: ratio_sampling_array,ner
-  integer, dimension(MAX_NUMBER_OF_MESH_LAYERS) :: doubling_index
-
-  double precision, dimension(MAX_NUMBER_OF_MESH_LAYERS) :: r_bottom,r_top
-
-  logical, dimension(MAX_NUMBER_OF_MESH_LAYERS) :: this_region_has_a_doubling
-  double precision, dimension(MAX_NUMBER_OF_MESH_LAYERS) :: rmins,rmaxs
-
-
-  ! mesh model parameters
-  logical TOPOGRAPHY,HONOR_1D_SPHERICAL_MOHO,ONE_CRUST
-  !logical COMPUTE_AND_STORE_STRAIN
-
-  ! for ellipticity
-  integer nspl
-  double precision rspl(NR),espl(NR),espl2(NR)
-
-  ! use integer array to store values
-  integer, dimension(NX_BATHY,NY_BATHY) :: ibathy_topo
-
-  integer, dimension(MAX_NUM_REGIONS) :: NSPEC2DMAX_XMIN_XMAX,NSPEC2DMAX_YMIN_YMAX, &
-               NSPEC2D_BOTTOM,NSPEC2D_TOP, &
-               NGLOB1D_RADIAL,NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX
-
-  ! Gauss-Lobatto-Legendre points of integration and weights
-  double precision, dimension(NGLLX) :: xigll,wxgll
-  double precision, dimension(NGLLY) :: yigll,wygll
-  double precision, dimension(NGLLZ) :: zigll,wzgll
-  ! product of weights for gravity term
-  double precision, dimension(NGLLX,NGLLY,NGLLZ) :: wgll_cube
-  ! array with derivatives of Lagrange polynomials and precalculated products
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprimewgll_xx
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLY) :: hprime_yy,hprimewgll_yy
-  real(kind=CUSTOM_REAL), dimension(NGLLZ,NGLLZ) :: hprime_zz,hprimewgll_zz
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xxT,hprimewgll_xxT
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
-
-  character(len=150) rec_filename,STATIONS
-  integer nrec
-
   ! local parameters
   integer, dimension(MAX_NUM_REGIONS) :: NSPEC_computed,NGLOB_computed, &
-               NSPEC2D_XI,NSPEC2D_ETA,NSPEC1D_RADIAL
+    NSPEC2D_XI,NSPEC2D_ETA,NSPEC1D_RADIAL
   logical :: CUT_SUPERBRICK_XI,CUT_SUPERBRICK_ETA
   integer, dimension(NB_SQUARE_CORNERS,NB_CUT_CASE) :: DIFF_NSPEC1D_RADIAL
   integer, dimension(NB_SQUARE_EDGES_ONEDIR,NB_CUT_CASE) :: DIFF_NSPEC2D_XI,DIFF_NSPEC2D_ETA
   integer :: ratio_divide_central_cube
   integer :: sizeprocs
-  integer :: ier,i,j,ios
+  integer :: ier,ios
   integer :: NPROC,NPROCTOT,NEX_PER_PROC_XI,NEX_PER_PROC_ETA,NCHUNKS,NPROC_XI,NPROC_ETA
   double precision :: RMOHO_FICTITIOUS_IN_MESHER,R120,R_CENTRAL_CUBE,CENTER_LONGITUDE_IN_DEGREES,&
-   CENTER_LATITUDE_IN_DEGREES,ANGULAR_WIDTH_ETA_IN_DEGREES,ANGULAR_WIDTH_XI_IN_DEGREES,&
-   GAMMA_ROTATION_AZIMUTH
+    CENTER_LATITUDE_IN_DEGREES,ANGULAR_WIDTH_ETA_IN_DEGREES,&
+    GAMMA_ROTATION_AZIMUTH
   integer :: REFERENCE_1D_MODEL,THREE_D_MODEL
   logical :: TRANSVERSE_ISOTROPY,ANISOTROPIC_3D_MANTLE,ANISOTROPIC_INNER_CORE,OCEANS, &
     ATTENUATION,ATTENUATION_3D,ROTATION,ELLIPTICITY,GRAVITY,CASE_3D,ISOTROPIC_3D_MANTLE, &
     HETEROGEN_3D_MANTLE,CRUSTAL,INFLATE_CENTRAL_CUBE
-  character(len=150) :: MODEL,dummystring
+  character(len=150) :: dummystring
   integer, external :: err_occurred
+
   ! sizeprocs returns number of processes started (should be equal to NPROCTOT).
   ! myrank is the rank of each process, between 0 and sizeprocs-1.
   ! as usual in MPI, process 0 is in charge of coordinating everything
@@ -196,6 +100,10 @@
       call exit_MPI(myrank,'an error occurred while reading the parameter file')
     endif
 
+    ! GPU_MODE is in par_file
+    ! parameter is optional, may not be in the Par_file
+    call read_gpu_mode(GPU_MODE)
+
   endif
 
   ! distributes parameters from master to all processes
@@ -232,6 +140,10 @@
                 HONOR_1D_SPHERICAL_MOHO,CRUSTAL,ONE_CRUST,CASE_3D,TRANSVERSE_ISOTROPY, &
                 ISOTROPIC_3D_MANTLE,ANISOTROPIC_3D_MANTLE,HETEROGEN_3D_MANTLE, &
                 ATTENUATION,ATTENUATION_3D,ANISOTROPIC_INNER_CORE,NOISE_TOMOGRAPHY)
+
+  ! broadcasts GPU_MODE
+  call broadcast_gpu_parameters(myrank,GPU_MODE)
+
   ! get the base pathname for output files
   call get_value_string(OUTPUT_FILES, 'OUTPUT_FILES', 'OUTPUT_FILES')
 
@@ -326,6 +238,59 @@
 
   endif
 
+  ! checks flags
+  call initialize_simulation_check(sizeprocs,NPROCTOT,NSPEC_COMPUTED, &
+                                  ATTENUATION,ATTENUATION_3D,NCHUNKS,GRAVITY,ROTATION, &
+                                  ELLIPTICITY,OCEANS,NPROC_XI,NPROC_ETA, &
+                                  TRANSVERSE_ISOTROPY,ANISOTROPIC_3D_MANTLE, &
+                                  ANISOTROPIC_INNER_CORE)
+
+  ! counts receiver stations
+  if (SIMULATION_TYPE == 1) then
+    rec_filename = 'DATA/STATIONS'
+  else
+    rec_filename = 'DATA/STATIONS_ADJOINT'
+  endif
+  call get_value_string(STATIONS, 'solver.STATIONS', rec_filename)
+
+  ! get total number of receivers
+  if(myrank == 0) then
+    open(unit=IIN,file=STATIONS,iostat=ios,status='old',action='read')
+    nrec = 0
+    do while(ios == 0)
+      read(IIN,"(a)",iostat=ios) dummystring
+      if(ios == 0) nrec = nrec + 1
+    enddo
+    close(IIN)
+  endif
+
+  ! broadcast the information read on the master to the nodes
+  call MPI_BCAST(nrec,1,MPI_INTEGER,0,MPI_COMM_WORLD,ier)
+  if(nrec < 1) call exit_MPI(myrank,trim(STATIONS)//': need at least one receiver')
+
+  end subroutine initialize_simulation
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine initialize_simulation_check(sizeprocs,NPROCTOT,NSPEC_COMPUTED, &
+                                        ATTENUATION,ATTENUATION_3D,NCHUNKS,GRAVITY,ROTATION, &
+                                        ELLIPTICITY,OCEANS,NPROC_XI,NPROC_ETA, &
+                                        TRANSVERSE_ISOTROPY,ANISOTROPIC_3D_MANTLE, &
+                                        ANISOTROPIC_INNER_CORE)
+
+  use specfem_par
+  implicit none
+
+  integer :: sizeprocs
+  integer :: NPROCTOT,NCHUNKS,NPROC_XI,NPROC_ETA
+  integer, dimension(MAX_NUM_REGIONS) :: NSPEC_computed
+
+  logical :: TRANSVERSE_ISOTROPY,ANISOTROPIC_3D_MANTLE,ANISOTROPIC_INNER_CORE,OCEANS, &
+    ATTENUATION,ATTENUATION_3D,ROTATION,ELLIPTICITY,GRAVITY
+
+
   ! check that the code is running with the requested nb of processes
   if(sizeprocs /= NPROCTOT) call exit_MPI(myrank,'wrong number of MPI processes(initialization specfem)')
 
@@ -336,71 +301,71 @@
   endif
   if (NSPEC_computed(IREGION_OUTER_CORE) /= NSPEC_OUTER_CORE) then
       write(IMAIN,*) 'NSPEC_OUTER_CORE:',NSPEC_computed(IREGION_OUTER_CORE),NSPEC_OUTER_CORE
-       call exit_MPI(myrank,'error in compiled parameters, please recompile solver 2')
+      call exit_MPI(myrank,'error in compiled parameters, please recompile solver 2')
   endif
   if (NSPEC_computed(IREGION_INNER_CORE) /= NSPEC_INNER_CORE) then
       write(IMAIN,*) 'NSPEC_INNER_CORE:',NSPEC_computed(IREGION_INNER_CORE),NSPEC_INNER_CORE
-       call exit_MPI(myrank,'error in compiled parameters, please recompile solver 3')
+      call exit_MPI(myrank,'error in compiled parameters, please recompile solver 3')
   endif
   if (ATTENUATION_3D .NEQV. ATTENUATION_3D_VAL) then
       write(IMAIN,*) 'ATTENUATION_3D:',ATTENUATION_3D,ATTENUATION_3D_VAL
-       call exit_MPI(myrank,'error in compiled parameters, please recompile solver 4')
+      call exit_MPI(myrank,'error in compiled parameters ATTENUATION_3D, please recompile solver')
   endif
   if (NCHUNKS /= NCHUNKS_VAL) then
       write(IMAIN,*) 'NCHUNKS:',NCHUNKS,NCHUNKS_VAL
-       call exit_MPI(myrank,'error in compiled parameters, please recompile solver 6')
+      call exit_MPI(myrank,'error in compiled parameters NCHUNKS, please recompile solver')
   endif
   if (GRAVITY .NEQV. GRAVITY_VAL) then
       write(IMAIN,*) 'GRAVITY:',GRAVITY,GRAVITY_VAL
-       call exit_MPI(myrank,'error in compiled parameters, please recompile solver 7')
+      call exit_MPI(myrank,'error in compiled parameters GRAVITY, please recompile solver')
   endif
   if (ROTATION .NEQV. ROTATION_VAL) then
       write(IMAIN,*) 'ROTATION:',ROTATION,ROTATION_VAL
-       call exit_MPI(myrank,'error in compiled parameters, please recompile solver 8')
+      call exit_MPI(myrank,'error in compiled parameters ROTATION, please recompile solver')
   endif
   if (ATTENUATION .NEQV. ATTENUATION_VAL) then
       write(IMAIN,*) 'ATTENUATION:',ATTENUATION,ATTENUATION_VAL
-       call exit_MPI(myrank,'error in compiled parameters, please recompile solver 9')
+      call exit_MPI(myrank,'error in compiled parameters ATTENUATION, please recompile solver')
   endif
   if (ELLIPTICITY .NEQV. ELLIPTICITY_VAL) then
       write(IMAIN,*) 'ELLIPTICITY:',ELLIPTICITY,ELLIPTICITY_VAL
-       call exit_MPI(myrank,'error in compiled parameters, please recompile solver 10')
+      call exit_MPI(myrank,'error in compiled parameters ELLIPTICITY, please recompile solver')
   endif
   if (OCEANS .NEQV. OCEANS_VAL) then
       write(IMAIN,*) 'OCEANS:',OCEANS,OCEANS_VAL
-      call exit_MPI(myrank,'error in compiled parameters, please recompile solver 10')
+      call exit_MPI(myrank,'error in compiled parameters OCEANS, please recompile solver')
   endif
-  if (NPROCTOT /= NPROCTOT_VAL) then
-      write(IMAIN,*) 'NPROCTOT:',NPROCTOT,NPROCTOT_VAL
-       call exit_MPI(myrank,'error in compiled parameters, please recompile solver 11')
-  endif
   if (NPROC_XI /= NPROC_XI_VAL) then
       write(IMAIN,*) 'NPROC_XI:',NPROC_XI,NPROC_XI_VAL
-       call exit_MPI(myrank,'error in compiled parameters, please recompile solver 11')
+      call exit_MPI(myrank,'error in compiled parameters NPROC_XI, please recompile solver')
   endif
   if (NPROC_ETA /= NPROC_ETA_VAL) then
       write(IMAIN,*) 'NPROC_ETA:',NPROC_ETA,NPROC_ETA_VAL
-       call exit_MPI(myrank,'error in compiled parameters, please recompile solver 11')
+      call exit_MPI(myrank,'error in compiled parameters NPROC_ETA, please recompile solver')
   endif
+  if (NPROCTOT /= NPROCTOT_VAL) then
+      write(IMAIN,*) 'NPROCTOT:',NPROCTOT,NPROCTOT_VAL
+      call exit_MPI(myrank,'error in compiled parameters NPROCTOT, please recompile solver')
+  endif
   if (NEX_XI /= NEX_XI_VAL) then
       write(IMAIN,*) 'NEX_XI:',NEX_XI,NEX_XI_VAL
-       call exit_MPI(myrank,'error in compiled parameters, please recompile solver 12')
+      call exit_MPI(myrank,'error in compiled parameters NEX_XI, please recompile solver')
   endif
   if (NEX_ETA /= NEX_ETA_VAL) then
       write(IMAIN,*) 'NEX_ETA:',NEX_ETA,NEX_ETA_VAL
-       call exit_MPI(myrank,'error in compiled parameters, please recompile solver 13')
+      call exit_MPI(myrank,'error in compiled parameters NEX_ETA, please recompile solver')
   endif
   if (TRANSVERSE_ISOTROPY .NEQV. TRANSVERSE_ISOTROPY_VAL) then
       write(IMAIN,*) 'TRANSVERSE_ISOTROPY:',TRANSVERSE_ISOTROPY,TRANSVERSE_ISOTROPY_VAL
-       call exit_MPI(myrank,'error in compiled parameters, please recompile solver 14')
+      call exit_MPI(myrank,'error in compiled parameters, please recompile solver 14')
   endif
   if (ANISOTROPIC_3D_MANTLE .NEQV. ANISOTROPIC_3D_MANTLE_VAL) then
       write(IMAIN,*) 'ANISOTROPIC_3D_MANTLE:',ANISOTROPIC_3D_MANTLE,ANISOTROPIC_3D_MANTLE_VAL
-       call exit_MPI(myrank,'error in compiled parameters, please recompile solver 15')
+      call exit_MPI(myrank,'error in compiled parameters, please recompile solver 15')
   endif
   if (ANISOTROPIC_INNER_CORE .NEQV. ANISOTROPIC_INNER_CORE_VAL) then
       write(IMAIN,*) 'ANISOTROPIC_INNER_CORE:',ANISOTROPIC_INNER_CORE,ANISOTROPIC_INNER_CORE_VAL
-       call exit_MPI(myrank,'error in compiled parameters, please recompile solver 16')
+      call exit_MPI(myrank,'error in compiled parameters, please recompile solver 16')
   endif
 
   ! check simulation parameters
@@ -464,57 +429,21 @@
     endif
   endif
 
-  ! make ellipticity
-  if(ELLIPTICITY_VAL) call make_ellipticity(nspl,rspl,espl,espl2,ONE_CRUST)
-
-  ! read topography and bathymetry file
-  if(myrank == 0 .and. (TOPOGRAPHY .or. OCEANS_VAL)) call read_topo_bathy_file(ibathy_topo)
-  ! broadcast the information read on the master to the nodes
-  call MPI_BCAST(ibathy_topo,NX_BATHY*NY_BATHY,MPI_INTEGER,0,MPI_COMM_WORLD,ier)
-
-  ! set up GLL points, weights and derivation matrices
-  call define_derivation_matrices(xigll,yigll,zigll,wxgll,wygll,wzgll, &
-         hprime_xx,hprime_yy,hprime_zz, &
-         hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
-         wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube)
-
-  if( USE_DEVILLE_PRODUCTS_VAL ) then
-
-  ! check that optimized routines from Deville et al. (2002) can be used
-    if(NGLLX /= 5 .or. NGLLY /= 5 .or. NGLLZ /= 5) &
-      stop 'Deville et al. (2002) routines can only be used if NGLLX = NGLLY = NGLLZ = 5'
-
-    ! define transpose of derivation matrix
-    do j = 1,NGLLY
-      do i = 1,NGLLX
-        hprime_xxT(j,i) = hprime_xx(i,j)
-        hprimewgll_xxT(j,i) = hprimewgll_xx(i,j)
-      enddo
-    enddo
+  ! check that buffer size is valid when defining/using buffer_send_chunkcorn_scalar
+  if( NGLOB1D_RADIAL_CM < NGLOB1D_RADIAL_OC .or. NGLOB1D_RADIAL_CM < NGLOB1D_RADIAL_IC ) then
+    call exit_MPI(myrank, 'NGLOB1D_RADIAL_CM must be larger than for outer core or inner core, please check mesh')
   endif
 
-  ! counts receiver stations
-  if (SIMULATION_TYPE == 1) then
-    rec_filename = 'DATA/STATIONS'
-  else
-    rec_filename = 'DATA/STATIONS_ADJOINT'
+  ! check for GPU runs
+  if( GPU_MODE ) then
+    if( NGLLX /= 5 .or. NGLLY /= 5 .or. NGLLZ /= 5 ) &
+      call exit_mpi(myrank,'GPU mode can only be used if NGLLX == NGLLY == NGLLZ == 5')
+    if( CUSTOM_REAL /= 4 ) &
+      call exit_mpi(myrank,'GPU mode runs only with CUSTOM_REAL == 4')
+    if( ATTENUATION_VAL ) then
+      if( N_SLS /= 3 ) &
+        call exit_mpi(myrank,'GPU mode does not support N_SLS /= 3 yet')
+    endif
   endif
-  call get_value_string(STATIONS, 'solver.STATIONS', rec_filename)
-  ! get total number of receivers
-  if(myrank == 0) then
-    open(unit=IIN,file=STATIONS,iostat=ios,status='old',action='read')
-    nrec = 0
-    do while(ios == 0)
-      read(IIN,"(a)",iostat=ios) dummystring
-      if(ios == 0) nrec = nrec + 1
-    enddo
-    close(IIN)
-  endif
-  ! broadcast the information read on the master to the nodes
-  call MPI_BCAST(nrec,1,MPI_INTEGER,0,MPI_COMM_WORLD,ier)
-  if(nrec < 1) call exit_MPI(myrank,trim(STATIONS)//': need at least one receiver')
 
-
-  end subroutine initialize_simulation
-
-
+  end subroutine initialize_simulation_check

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/iterate_time.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/iterate_time.F90	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/iterate_time.F90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,484 @@
+!=====================================================================
+!
+!          S p e c f e m 3 D  G l o b e  V e r s i o n  5 . 1
+!          --------------------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!                        Princeton University, USA
+!             and University of Pau / CNRS / INRIA, France
+! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+!                            April 2011
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+  subroutine iterate_time()
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
+  use specfem_par_movie
+  implicit none
+
+  include 'mpif.h'
+
+!
+!   s t a r t   t i m e   i t e r a t i o n s
+!
+
+  ! synchronize all processes to make sure everybody is ready to start time loop
+  call sync_all()
+
+  if(myrank == 0) then
+    write(IMAIN,*)
+    write(IMAIN,*) 'Starting time iteration loop...'
+    write(IMAIN,*)
+  endif
+
+  ! create an empty file to monitor the start of the simulation
+  if(myrank == 0) then
+    open(unit=IOUT,file=trim(OUTPUT_FILES)//'/starttimeloop.txt',status='unknown',action='write')
+    write(IOUT,*) 'hello, starting time loop'
+    close(IOUT)
+  endif
+
+  ! initialize variables for writing seismograms
+  seismo_offset = it_begin-1
+  seismo_current = 0
+
+#ifdef _HANDOPT
+  imodulo_NGLOB_CRUST_MANTLE = mod(NGLOB_CRUST_MANTLE,3)
+  imodulo_NGLOB_CRUST_MANTLE4 = mod(NGLOB_CRUST_MANTLE,4)
+  imodulo_NGLOB_INNER_CORE = mod(NGLOB_INNER_CORE,3)
+  imodulo_NGLOB_OUTER_CORE = mod(NGLOB_OUTER_CORE,3)
+#endif
+
+! get MPI starting time
+  time_start = MPI_WTIME()
+
+! *********************************************************
+! ************* MAIN LOOP OVER THE TIME STEPS *************
+! *********************************************************
+
+  do it = it_begin,it_end
+
+    ! simulation status output and stability check
+    if(mod(it,NTSTEP_BETWEEN_OUTPUT_INFO) == 0 .or. it == 5 .or. it == NSTEP) then
+      call it_check_stability()
+    endif
+
+    ! update displacement using Newmark time scheme
+    call it_update_displacement_scheme()
+
+    ! acoustic solver for outer core
+    ! (needs to be done first, before elastic one)
+    call compute_forces_acoustic()
+
+    ! elastic solver for crust/mantle and inner core
+    call compute_forces_elastic()
+
+    ! restores last time snapshot saved for backward/reconstruction of wavefields
+    ! note: this is done here after the Newmark time scheme, otherwise the indexing for sources
+    !          and adjoint sources will become more complicated
+    !          that is, index it for adjoint sources will match index NSTEP - 1 for backward/reconstructed wavefields
+    if( SIMULATION_TYPE == 3 .and. it == 1 ) then
+      call read_forward_arrays()
+    endif
+
+    ! write the seismograms with time shift
+    call write_seismograms()
+
+    ! adjoint simulations: kernels
+    if( SIMULATION_TYPE == 3 ) then
+      call compute_kernels()
+    endif
+
+    ! outputs movie files
+    if( MOVIE_SURFACE .or. MOVIE_VOLUME ) then
+      call write_movie_output()
+    endif
+
+    ! first step of noise tomography, i.e., save a surface movie at every time step
+    ! modified from the subroutine 'write_movie_surface'
+    if ( NOISE_TOMOGRAPHY == 1 ) then
+      call noise_save_surface_movie(displ_crust_mantle, &
+                              ibelm_top_crust_mantle,ibool_crust_mantle, &
+                              NSPEC2D_TOP(IREGION_CRUST_MANTLE),noise_surface_movie,it)
+    endif
+
+  enddo   ! end of main time loop
+
+!
+!---- end of time iteration loop
+!
+
+  ! Transfer fields from GPU card to host for further analysis
+  if(GPU_MODE) call it_transfer_from_GPU()
+
+  end subroutine iterate_time
+
+!=====================================================================
+
+  subroutine it_update_displacement_scheme()
+
+! explicit Newmark time scheme with acoustic & elastic domains:
+! (see e.g. Hughes, 1987; Chaljub et al., 2003)
+!
+! chi(t+delta_t) = chi(t) + delta_t chi_dot(t) + 1/2 delta_t**2 chi_dot_dot(t)
+! chi_dot(t+delta_t) = chi_dot(t) + 1/2 delta_t chi_dot_dot(t) + 1/2 delta_t chi_dot_dot(t+delta_t)
+! chi_dot_dot(t+delta_t) = 1/M_acoustic( -K_acoustic chi(t+delta) + B_acoustic u(t+delta_t) + f(t+delta_t) )
+!
+! u(t+delta_t) = u(t) + delta_t  v(t) + 1/2  delta_t**2 a(t)
+! v(t+delta_t) = v(t) + 1/2 delta_t a(t) + 1/2 delta_t a(t+delta_t)
+! a(t+delta_t) = 1/M_elastic ( -K_elastic u(t+delta) + B_elastic chi_dot_dot(t+delta_t) + f( t+delta_t) )
+!
+! where
+!   chi, chi_dot, chi_dot_dot are acoustic (fluid) potentials ( dotted with respect to time)
+!   u, v, a are displacement,velocity & acceleration
+!   M is mass matrix, K stiffness matrix and B boundary term for acoustic/elastic domains
+!   f denotes a source term (acoustic/elastic)
+!
+! note that this stage calculates the predictor terms
+!
+!   for
+!   potential chi_dot(t+delta) requires + 1/2 delta_t chi_dot_dot(t+delta_t)
+!                                   at a later stage (corrector) once chi_dot_dot(t+delta) is calculated
+!   and similar,
+!   velocity v(t+delta_t) requires  + 1/2 delta_t a(t+delta_t)
+!                                   at a later stage once a(t+delta) is calculated
+! also:
+!   boundary term B_elastic requires chi_dot_dot(t+delta)
+!                                   thus chi_dot_dot has to be updated first before the elastic boundary term is considered
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
+  use specfem_par_movie
+  implicit none
+
+  ! local parameters
+  integer :: i
+
+  ! Newmark time scheme update
+#ifdef _HANDOPT_NEWMARK
+! way 2:
+! One common technique in computational science to help enhance pipelining is loop unrolling
+!
+! we're accessing NDIM=3 components at each line,
+! that is, for an iteration, the register must contain
+! NDIM * displ_ + NDIM * veloc_ + NDIM * accel + deltat + deltatsq..
+! in most cases a real (CUSTOM_REAL) value will have 4 bytes,
+! assuming a default cache size of about 128 bytes, we unroll here in steps of 3, thus 29 reals or 118 bytes,
+! rather than with steps of 4
+  ! mantle
+  if(imodulo_NGLOB_CRUST_MANTLE >= 1) then
+    do i = 1,imodulo_NGLOB_CRUST_MANTLE
+      displ_crust_mantle(:,i) = displ_crust_mantle(:,i) &
+        + deltat*veloc_crust_mantle(:,i) + deltatsqover2*accel_crust_mantle(:,i)
+
+      veloc_crust_mantle(:,i) = veloc_crust_mantle(:,i) &
+        + deltatover2*accel_crust_mantle(:,i)
+
+      accel_crust_mantle(:,i) = 0._CUSTOM_REAL
+    enddo
+  endif
+  do i = imodulo_NGLOB_CRUST_MANTLE+1,NGLOB_CRUST_MANTLE, 3 ! in steps of 3
+    displ_crust_mantle(:,i) = displ_crust_mantle(:,i) &
+      + deltat*veloc_crust_mantle(:,i) + deltatsqover2*accel_crust_mantle(:,i)
+    displ_crust_mantle(:,i+1) = displ_crust_mantle(:,i+1) &
+      + deltat*veloc_crust_mantle(:,i+1) + deltatsqover2*accel_crust_mantle(:,i+1)
+    displ_crust_mantle(:,i+2) = displ_crust_mantle(:,i+2) &
+      + deltat*veloc_crust_mantle(:,i+2) + deltatsqover2*accel_crust_mantle(:,i+2)
+
+
+    veloc_crust_mantle(:,i) = veloc_crust_mantle(:,i) &
+      + deltatover2*accel_crust_mantle(:,i)
+    veloc_crust_mantle(:,i+1) = veloc_crust_mantle(:,i+1) &
+      + deltatover2*accel_crust_mantle(:,i+1)
+    veloc_crust_mantle(:,i+2) = veloc_crust_mantle(:,i+2) &
+      + deltatover2*accel_crust_mantle(:,i+2)
+
+    ! set acceleration to zero
+    ! note: we do initialize acceleration in this loop since it is read already into the cache,
+    !           otherwise it would have to be read in again for this explicitly,
+    !           which would make this step more expensive
+    accel_crust_mantle(:,i) = 0._CUSTOM_REAL
+    accel_crust_mantle(:,i+1) = 0._CUSTOM_REAL
+    accel_crust_mantle(:,i+2) = 0._CUSTOM_REAL
+  enddo
+
+  ! outer core
+  do i=1,NGLOB_OUTER_CORE
+    displ_outer_core(i) = displ_outer_core(i) &
+      + deltat*veloc_outer_core(i) + deltatsqover2*accel_outer_core(i)
+
+    veloc_outer_core(i) = veloc_outer_core(i) &
+      + deltatover2*accel_outer_core(i)
+
+    accel_outer_core(i) = 0._CUSTOM_REAL
+  enddo
+
+  ! inner core
+  if(imodulo_NGLOB_INNER_CORE >= 1) then
+    do i = 1,imodulo_NGLOB_INNER_CORE
+      displ_inner_core(:,i) = displ_inner_core(:,i) &
+        + deltat*veloc_inner_core(:,i) + deltatsqover2*accel_inner_core(:,i)
+
+      veloc_inner_core(:,i) = veloc_inner_core(:,i) &
+        + deltatover2*accel_inner_core(:,i)
+
+      accel_inner_core(:,i) = 0._CUSTOM_REAL
+    enddo
+  endif
+  do i = imodulo_NGLOB_INNER_CORE+1,NGLOB_INNER_CORE, 3 ! in steps of 3
+    displ_inner_core(:,i) = displ_inner_core(:,i) &
+      + deltat*veloc_inner_core(:,i) + deltatsqover2*accel_inner_core(:,i)
+    displ_inner_core(:,i+1) = displ_inner_core(:,i+1) &
+      + deltat*veloc_inner_core(:,i+1) + deltatsqover2*accel_inner_core(:,i+1)
+    displ_inner_core(:,i+2) = displ_inner_core(:,i+2) &
+      + deltat*veloc_inner_core(:,i+2) + deltatsqover2*accel_inner_core(:,i+2)
+
+
+    veloc_inner_core(:,i) = veloc_inner_core(:,i) &
+      + deltatover2*accel_inner_core(:,i)
+    veloc_inner_core(:,i+1) = veloc_inner_core(:,i+1) &
+      + deltatover2*accel_inner_core(:,i+1)
+    veloc_inner_core(:,i+2) = veloc_inner_core(:,i+2) &
+      + deltatover2*accel_inner_core(:,i+2)
+
+    accel_inner_core(:,i) = 0._CUSTOM_REAL
+    accel_inner_core(:,i+1) = 0._CUSTOM_REAL
+    accel_inner_core(:,i+2) = 0._CUSTOM_REAL
+  enddo
+
+#else
+! way 1:
+  ! mantle
+  do i=1,NGLOB_CRUST_MANTLE
+    displ_crust_mantle(:,i) = displ_crust_mantle(:,i) &
+      + deltat*veloc_crust_mantle(:,i) + deltatsqover2*accel_crust_mantle(:,i)
+    veloc_crust_mantle(:,i) = veloc_crust_mantle(:,i) &
+      + deltatover2*accel_crust_mantle(:,i)
+    accel_crust_mantle(:,i) = 0._CUSTOM_REAL
+  enddo
+  ! outer core
+  do i=1,NGLOB_OUTER_CORE
+    displ_outer_core(i) = displ_outer_core(i) &
+      + deltat*veloc_outer_core(i) + deltatsqover2*accel_outer_core(i)
+    veloc_outer_core(i) = veloc_outer_core(i) &
+      + deltatover2*accel_outer_core(i)
+    accel_outer_core(i) = 0._CUSTOM_REAL
+  enddo
+  ! inner core
+  do i=1,NGLOB_INNER_CORE
+    displ_inner_core(:,i) = displ_inner_core(:,i) &
+      + deltat*veloc_inner_core(:,i) + deltatsqover2*accel_inner_core(:,i)
+    veloc_inner_core(:,i) = veloc_inner_core(:,i) &
+      + deltatover2*accel_inner_core(:,i)
+    accel_inner_core(:,i) = 0._CUSTOM_REAL
+  enddo
+#endif
+
+
+
+
+  ! backward field
+  if (SIMULATION_TYPE == 3) then
+
+#ifdef _HANDOPT_NEWMARK
+! way 2:
+    ! mantle
+    if(imodulo_NGLOB_CRUST_MANTLE >= 1) then
+      do i=1,imodulo_NGLOB_CRUST_MANTLE
+        b_displ_crust_mantle(:,i) = b_displ_crust_mantle(:,i) &
+          + b_deltat*b_veloc_crust_mantle(:,i) + b_deltatsqover2*b_accel_crust_mantle(:,i)
+        b_veloc_crust_mantle(:,i) = b_veloc_crust_mantle(:,i) &
+          + b_deltatover2*b_accel_crust_mantle(:,i)
+        b_accel_crust_mantle(:,i) = 0._CUSTOM_REAL
+      enddo
+    endif
+    do i=imodulo_NGLOB_CRUST_MANTLE+1,NGLOB_CRUST_MANTLE,3
+      b_displ_crust_mantle(:,i) = b_displ_crust_mantle(:,i) &
+        + b_deltat*b_veloc_crust_mantle(:,i) + b_deltatsqover2*b_accel_crust_mantle(:,i)
+      b_displ_crust_mantle(:,i+1) = b_displ_crust_mantle(:,i+1) &
+        + b_deltat*b_veloc_crust_mantle(:,i+1) + b_deltatsqover2*b_accel_crust_mantle(:,i+1)
+      b_displ_crust_mantle(:,i+2) = b_displ_crust_mantle(:,i+2) &
+        + b_deltat*b_veloc_crust_mantle(:,i+2) + b_deltatsqover2*b_accel_crust_mantle(:,i+2)
+
+
+      b_veloc_crust_mantle(:,i) = b_veloc_crust_mantle(:,i) &
+        + b_deltatover2*b_accel_crust_mantle(:,i)
+      b_veloc_crust_mantle(:,i+1) = b_veloc_crust_mantle(:,i+1) &
+        + b_deltatover2*b_accel_crust_mantle(:,i+1)
+      b_veloc_crust_mantle(:,i+2) = b_veloc_crust_mantle(:,i+2) &
+        + b_deltatover2*b_accel_crust_mantle(:,i+2)
+
+      b_accel_crust_mantle(:,i) = 0._CUSTOM_REAL
+      b_accel_crust_mantle(:,i+1) = 0._CUSTOM_REAL
+      b_accel_crust_mantle(:,i+2) = 0._CUSTOM_REAL
+    enddo
+
+    ! outer core
+    do i=1,NGLOB_OUTER_CORE
+      b_displ_outer_core(i) = b_displ_outer_core(i) &
+        + b_deltat*b_veloc_outer_core(i) + b_deltatsqover2*b_accel_outer_core(i)
+      b_veloc_outer_core(i) = b_veloc_outer_core(i) &
+        + b_deltatover2*b_accel_outer_core(i)
+      b_accel_outer_core(i) = 0._CUSTOM_REAL
+    enddo
+
+    ! inner core
+    if(imodulo_NGLOB_INNER_CORE >= 1) then
+      do i=1,imodulo_NGLOB_INNER_CORE
+        b_displ_inner_core(:,i) = b_displ_inner_core(:,i) &
+          + b_deltat*b_veloc_inner_core(:,i) + b_deltatsqover2*b_accel_inner_core(:,i)
+        b_veloc_inner_core(:,i) = b_veloc_inner_core(:,i) &
+          + b_deltatover2*b_accel_inner_core(:,i)
+        b_accel_inner_core(:,i) = 0._CUSTOM_REAL
+      enddo
+    endif
+    do i=imodulo_NGLOB_INNER_CORE+1,NGLOB_INNER_CORE,3
+      b_displ_inner_core(:,i) = b_displ_inner_core(:,i) &
+        + b_deltat*b_veloc_inner_core(:,i) + b_deltatsqover2*b_accel_inner_core(:,i)
+      b_displ_inner_core(:,i+1) = b_displ_inner_core(:,i+1) &
+        + b_deltat*b_veloc_inner_core(:,i+1) + b_deltatsqover2*b_accel_inner_core(:,i+1)
+      b_displ_inner_core(:,i+2) = b_displ_inner_core(:,i+2) &
+        + b_deltat*b_veloc_inner_core(:,i+2) + b_deltatsqover2*b_accel_inner_core(:,i+2)
+
+      b_veloc_inner_core(:,i) = b_veloc_inner_core(:,i) &
+        + b_deltatover2*b_accel_inner_core(:,i)
+      b_veloc_inner_core(:,i+1) = b_veloc_inner_core(:,i+1) &
+        + b_deltatover2*b_accel_inner_core(:,i+1)
+      b_veloc_inner_core(:,i+2) = b_veloc_inner_core(:,i+2) &
+        + b_deltatover2*b_accel_inner_core(:,i+2)
+
+      b_accel_inner_core(:,i) = 0._CUSTOM_REAL
+      b_accel_inner_core(:,i+1) = 0._CUSTOM_REAL
+      b_accel_inner_core(:,i+2) = 0._CUSTOM_REAL
+    enddo
+#else
+! way 1:
+    ! mantle
+    do i=1,NGLOB_CRUST_MANTLE
+      b_displ_crust_mantle(:,i) = b_displ_crust_mantle(:,i) &
+        + b_deltat*b_veloc_crust_mantle(:,i) + b_deltatsqover2*b_accel_crust_mantle(:,i)
+      b_veloc_crust_mantle(:,i) = b_veloc_crust_mantle(:,i) &
+        + b_deltatover2*b_accel_crust_mantle(:,i)
+      b_accel_crust_mantle(:,i) = 0._CUSTOM_REAL
+    enddo
+    ! outer core
+    do i=1,NGLOB_OUTER_CORE
+      b_displ_outer_core(i) = b_displ_outer_core(i) &
+        + b_deltat*b_veloc_outer_core(i) + b_deltatsqover2*b_accel_outer_core(i)
+      b_veloc_outer_core(i) = b_veloc_outer_core(i) &
+        + b_deltatover2*b_accel_outer_core(i)
+      b_accel_outer_core(i) = 0._CUSTOM_REAL
+    enddo
+    ! inner core
+    do i=1,NGLOB_INNER_CORE
+      b_displ_inner_core(:,i) = b_displ_inner_core(:,i) &
+        + b_deltat*b_veloc_inner_core(:,i) + b_deltatsqover2*b_accel_inner_core(:,i)
+      b_veloc_inner_core(:,i) = b_veloc_inner_core(:,i) &
+        + b_deltatover2*b_accel_inner_core(:,i)
+      b_accel_inner_core(:,i) = 0._CUSTOM_REAL
+    enddo
+#endif
+  endif ! SIMULATION_TYPE == 3
+
+  ! integral of strain for adjoint movie volume
+  if(MOVIE_VOLUME .and. (MOVIE_VOLUME_TYPE == 2 .or. MOVIE_VOLUME_TYPE == 3) ) then
+!    Iepsilondev_crust_mantle(:,:,:,:,:) = Iepsilondev_crust_mantle(:,:,:,:,:)  &
+!                                            + deltat*epsilondev_crust_mantle(:,:,:,:,:)
+    Iepsilondev_crust_mantle(1,:,:,:,:) = Iepsilondev_crust_mantle(1,:,:,:,:)  &
+                                            + deltat*epsilondev_xx_crust_mantle(:,:,:,:)
+    Iepsilondev_crust_mantle(2,:,:,:,:) = Iepsilondev_crust_mantle(2,:,:,:,:)  &
+                                            + deltat*epsilondev_yy_crust_mantle(:,:,:,:)
+    Iepsilondev_crust_mantle(3,:,:,:,:) = Iepsilondev_crust_mantle(3,:,:,:,:)  &
+                                            + deltat*epsilondev_xy_crust_mantle(:,:,:,:)
+    Iepsilondev_crust_mantle(4,:,:,:,:) = Iepsilondev_crust_mantle(4,:,:,:,:)  &
+                                            + deltat*epsilondev_xz_crust_mantle(:,:,:,:)
+    Iepsilondev_crust_mantle(5,:,:,:,:) = Iepsilondev_crust_mantle(5,:,:,:,:)  &
+                                            + deltat*epsilondev_yz_crust_mantle(:,:,:,:)
+
+    Ieps_trace_over_3_crust_mantle(:,:,:,:) = Ieps_trace_over_3_crust_mantle(:,:,:,:) &
+                                            + deltat*eps_trace_over_3_crust_mantle(:,:,:,:)
+  endif
+
+  end subroutine it_update_displacement_scheme
+
+
+!=====================================================================
+
+  subroutine it_check_stability()
+
+! computes the maximum of the norm of the displacement
+! in all the slices using an MPI reduction
+! and output timestamp file to check that simulation is running fine
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
+  use specfem_par_movie
+  implicit none
+
+  ! note: this routine takes no arguments; all wavefields and run
+  ! parameters are accessed through the specfem_par* modules above
+
+  ! compute the maximum of the norm of the displacement
+  ! in all the slices using an MPI reduction
+  ! and output timestamp file to check that simulation is running fine
+  ! (presumably aborts the run when the norm indicates a blow-up --
+  !  TODO(review): confirm inside check_simulation_stability)
+  call check_simulation_stability(it,displ_crust_mantle,displ_inner_core,displ_outer_core, &
+                        b_displ_crust_mantle,b_displ_inner_core,b_displ_outer_core, &
+                        eps_trace_over_3_crust_mantle, &
+                        epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+                        epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
+                        SIMULATION_TYPE,OUTPUT_FILES,time_start,DT,t0,NSTEP, &
+                        myrank)
+
+  ! daniel: debugging
+  ! (left commented out on purpose: locates the slice and grid point where
+  !  the crust/mantle displacement norm exceeds a hard-coded threshold)
+  !if( maxval(displ_crust_mantle(1,:)**2 + &
+  !                displ_crust_mantle(2,:)**2 + displ_crust_mantle(3,:)**2) > 1.e4 ) then
+  !  print*,'slice',myrank
+  !  print*,'  crust_mantle displ:', maxval(displ_crust_mantle(1,:)), &
+  !           maxval(displ_crust_mantle(2,:)),maxval(displ_crust_mantle(3,:))
+  !  print*,'  indxs: ',maxloc( displ_crust_mantle(1,:)),maxloc( displ_crust_mantle(2,:)),maxloc( displ_crust_mantle(3,:))
+  !  indx = maxloc( displ_crust_mantle(3,:) )
+  !  rval = xstore_crust_mantle(indx(1))
+  !  thetaval = ystore_crust_mantle(indx(1))
+  !  phival = zstore_crust_mantle(indx(1))
+  !  !thetaval = PI/2.0d0-datan(1.006760466d0*dcos(dble(thetaval))/dmax1(TINYVAL,dsin(dble(thetaval))))
+  !  print*,'r/lat/lon:',rval*R_EARTH_KM,90.0-thetaval*180./PI,phival*180./PI
+  !  call rthetaphi_2_xyz(rval,thetaval,phival,xstore_crust_mantle(indx(1)),&
+  !                     ystore_crust_mantle(indx(1)),zstore_crust_mantle(indx(1)))
+  !  print*,'x/y/z:',rval,thetaval,phival
+  !  call exit_MPI(myrank,'error stability')
+  !endif
+
+  end subroutine it_check_stability
+
+!=====================================================================
+
+  subroutine it_transfer_from_GPU()
+
+! finalizes GPU usage after the time loop
+!
+! NOTE(review): despite the name, this routine currently only frees the
+! allocated device memory via prepare_cleanup_device(); no transfer of
+! fields from GPU back onto the CPU is visible here -- confirm whether
+! the transfers happen elsewhere or still need to be added
+
+  use specfem_par
+  implicit none
+
+  ! frees allocated memory on GPU
+  call prepare_cleanup_device(Mesh_pointer)
+
+  end subroutine it_transfer_from_GPU

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/locate_receivers.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/locate_receivers.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/locate_receivers.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -703,8 +703,7 @@
 
 ! main process broadcasts the results to all the slices
   call MPI_BCAST(nrec,1,MPI_INTEGER,0,MPI_COMM_WORLD,ier)
-  call MPI_BARRIER(MPI_COMM_WORLD,ier)
-
+  
   call MPI_BCAST(islice_selected_rec,nrec,MPI_INTEGER,0,MPI_COMM_WORLD,ier)
   call MPI_BCAST(ispec_selected_rec,nrec,MPI_INTEGER,0,MPI_COMM_WORLD,ier)
   call MPI_BCAST(xi_receiver,nrec,MPI_DOUBLE_PRECISION,0,MPI_COMM_WORLD,ier)

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -25,94 +25,194 @@
 !
 !=====================================================================
 
-  subroutine prepare_timerun_rmass(myrank,rmass_ocean_load,rmass_crust_mantle, &
-                      rmass_outer_core,rmass_inner_core, &
-                      iproc_xi,iproc_eta,ichunk,addressing, &
-                      iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle, &
-                      iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-                      npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-                      iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-                      iboolleft_xi_outer_core,iboolright_xi_outer_core, &
-                      iboolleft_eta_outer_core,iboolright_eta_outer_core, &
-                      npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
-                      iboolfaces_outer_core,iboolcorner_outer_core, &
-                      iboolleft_xi_inner_core,iboolright_xi_inner_core, &
-                      iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-                      npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-                      iboolfaces_inner_core,iboolcorner_inner_core, &
-                      iprocfrom_faces,iprocto_faces,imsg_type, &
-                      iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-                      buffer_send_faces_scalar,buffer_received_faces_scalar, &
-                      buffer_send_chunkcorn_scalar,buffer_recv_chunkcorn_scalar, &
-                      NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
-                      NGLOB1D_RADIAL,NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX,NGLOB2DMAX_XY,npoin2D_max_all_CM_IC)
+  subroutine prepare_timerun()
 
+  use specfem_par
+  use specfem_par_movie
   implicit none
-
+  
   include 'mpif.h'
-  include "constants.h"
-  include "OUTPUT_FILES/values_from_mesher.h"
 
-  integer myrank,npoin2D_max_all_CM_IC
+  ! get MPI starting time
+  time_start = MPI_WTIME()
+  
+  ! user output infos
+  call prepare_timerun_user_output()
 
-  real(kind=CUSTOM_REAL), dimension(NGLOB_CRUST_MANTLE_OCEANS) :: rmass_ocean_load
-  real(kind=CUSTOM_REAL), dimension(NGLOB_CRUST_MANTLE) :: rmass_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE) :: rmass_outer_core
-  real(kind=CUSTOM_REAL), dimension(NGLOB_INNER_CORE) :: rmass_inner_core
+  ! sets up mass matrices
+  call prepare_timerun_mass_matrices()
 
-  integer ichunk,iproc_xi,iproc_eta
-  integer, dimension(NCHUNKS_VAL,0:NPROC_XI_VAL-1,0:NPROC_ETA_VAL-1) :: addressing
+  ! convert x/y/z into r/theta/phi spherical coordinates
+  call prepare_timerun_convert_coord()
+  
+  ! allocate files to save movies
+  ! for noise tomography, store_val_x/y/z/ux/uy/uz needed for 'surface movie'
+  if(MOVIE_SURFACE .or. NOISE_TOMOGRAPHY /= 0 ) then
+    call prepare_timerun_movie_surface()
+  endif
 
-  ! 2-D addressing and buffers for summation between slices
-  integer, dimension(NGLOB2DMAX_XMIN_XMAX_CM) :: iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle
-  integer, dimension(NGLOB2DMAX_YMIN_YMAX_CM) :: iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle
+  ! output point and element information for 3D movies
+  if(MOVIE_VOLUME) call prepare_timerun_movie_volume()
 
-  integer, dimension(NGLOB2DMAX_XMIN_XMAX_OC) :: iboolleft_xi_outer_core,iboolright_xi_outer_core
-  integer, dimension(NGLOB2DMAX_YMIN_YMAX_OC) :: iboolleft_eta_outer_core,iboolright_eta_outer_core
-  integer, dimension(NGLOB2DMAX_XMIN_XMAX_IC) :: iboolleft_xi_inner_core,iboolright_xi_inner_core
-  integer, dimension(NGLOB2DMAX_YMIN_YMAX_IC) :: iboolleft_eta_inner_core,iboolright_eta_inner_core
+  ! sets up time increments and rotation constants
+  call prepare_timerun_constants()
 
-  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle
+  ! precomputes gravity factors
+  call prepare_timerun_gravity()
 
-  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_outer_core,npoin2D_eta_outer_core
-  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_inner_core,npoin2D_eta_inner_core
+  ! precomputes attenuation factors
+  if(ATTENUATION_VAL) call prepare_timerun_attenuation()
 
-  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_crust_mantle, &
-      iboolfaces_outer_core,iboolfaces_inner_core
+  ! initializes arrays
+  call prepare_timerun_init_wavefield()
 
-  integer npoin2D_faces_crust_mantle(NUMFACES_SHARED)
-  integer npoin2D_faces_outer_core(NUMFACES_SHARED)
-  integer npoin2D_faces_inner_core(NUMFACES_SHARED)
+  ! reads files back from local disk or MT tape system if restart file
+  ! note: for SIMULATION_TYPE 3 simulations, the stored wavefields
+  !          will be read in the time loop after the Newmark time scheme update.
+  !          this makes indexing and timing easier to match with adjoint wavefields indexing.
+  call read_forward_arrays_startrun()
 
-  ! indirect addressing for each corner of the chunks
-  integer, dimension(NGLOB1D_RADIAL_CM,NUMCORNERS_SHARED) :: iboolcorner_crust_mantle
-  integer, dimension(NGLOB1D_RADIAL_OC,NUMCORNERS_SHARED) :: iboolcorner_outer_core
-  integer, dimension(NGLOB1D_RADIAL_IC,NUMCORNERS_SHARED) :: iboolcorner_inner_core
+  ! prepares noise simulations
+  call prepare_timerun_noise()
 
-  ! communication pattern for faces between chunks
-  integer, dimension(NUMMSGS_FACES_VAL) :: iprocfrom_faces,iprocto_faces,imsg_type
-  ! communication pattern for corners between chunks
-  integer, dimension(NCORNERSCHUNKS_VAL) :: iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners
+  ! prepares GPU arrays
+  if(GPU_MODE) call prepare_timerun_GPU()
 
-  ! buffers for send and receive between faces of the slices and the chunks
-  real(kind=CUSTOM_REAL), dimension(NGLOB2DMAX_XY_VAL) ::  &
-    buffer_send_faces_scalar,buffer_received_faces_scalar
+  ! user output
+  if( myrank == 0 ) then
+    ! elapsed time since beginning of mesh generation
+    tCPU = MPI_WTIME() - time_start
+    write(IMAIN,*)
+    write(IMAIN,*) 'Elapsed time for preparing timerun in seconds = ',sngl(tCPU)
+    write(IMAIN,*)
+  endif
 
-  ! buffers for send and receive between corners of the chunks
-  real(kind=CUSTOM_REAL), dimension(NGLOB1D_RADIAL_CM) :: &
-    buffer_send_chunkcorn_scalar,buffer_recv_chunkcorn_scalar
+  end subroutine prepare_timerun
 
-  integer NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS
+!
+!-------------------------------------------------------------------------------------------------
+!
 
-  integer, dimension(MAX_NUM_REGIONS) :: NGLOB1D_RADIAL,NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX
-  integer NGLOB2DMAX_XY
+  subroutine prepare_timerun_user_output()
+
+  use specfem_par
+  implicit none
+
+  ! only the master process (rank 0) prints the simulation setup summary (oceans,
+  ! ellipticity, topography, gravity, rotation, attenuation) to the main output file IMAIN
+  if(myrank == 0) then
+
+    write(IMAIN,*)
+    write(IMAIN,*) 'Reference radius of the Earth used is ',R_EARTH_KM,' km'
+    write(IMAIN,*)
+
+    write(IMAIN,*)
+    if(OCEANS_VAL) then
+      write(IMAIN,*) 'incorporating the oceans using equivalent load'
+    else
+      write(IMAIN,*) 'no oceans'
+    endif
+
+    write(IMAIN,*)
+    if(ELLIPTICITY_VAL) then
+      write(IMAIN,*) 'incorporating ellipticity'
+    else
+      write(IMAIN,*) 'no ellipticity'
+    endif
+
+    write(IMAIN,*)
+    if(TOPOGRAPHY) then
+      write(IMAIN,*) 'incorporating surface topography'
+    else
+      write(IMAIN,*) 'no surface topography'
+    endif
+
+    write(IMAIN,*)
+    if(GRAVITY_VAL) then
+      write(IMAIN,*) 'incorporating self-gravitation (Cowling approximation)'
+    else
+      write(IMAIN,*) 'no self-gravitation'
+    endif
+
+    write(IMAIN,*)
+    if(ROTATION_VAL) then
+      write(IMAIN,*) 'incorporating rotation'
+    else
+      write(IMAIN,*) 'no rotation'
+    endif
+
+    write(IMAIN,*)
+    if(ATTENUATION_VAL) then
+      write(IMAIN,*) 'incorporating attenuation using ',N_SLS,' standard linear solids'
+
+      if(ATTENUATION_3D_VAL) write(IMAIN,*) 'using 3D attenuation'
+
+      if(USE_ATTENUATION_MIMIC ) write(IMAIN,*) 'mimicking effects on velocity only'
+    else
+      write(IMAIN,*) 'no attenuation'
+    endif
+
+    write(IMAIN,*)
+    write(IMAIN,*)
+    write(IMAIN,*)
+
+  endif
+
+  end subroutine prepare_timerun_user_output
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine prepare_timerun_mass_matrices()
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
+  implicit none
+
+  ! mass matrices need to be assembled with MPI across slice boundaries here, once and for all
+  call prepare_timerun_rmass_assembly()
+
+  ! sanity check: after assembly every mass-matrix entry must be strictly positive
+  if(OCEANS_VAL) then
+    if(minval(rmass_ocean_load) <= 0.) call exit_MPI(myrank,'negative mass matrix term for the oceans')
+  endif
+  if(minval(rmass_crust_mantle) <= 0.) call exit_MPI(myrank,'negative mass matrix term for the crust_mantle')
+  if(minval(rmass_inner_core) <= 0.) call exit_MPI(myrank,'negative mass matrix term for the inner core')
+  if(minval(rmass_outer_core) <= 0.) call exit_MPI(myrank,'negative mass matrix term for the outer core')
+
+  ! for efficiency, invert the final mass matrices once and for all on each slice (entries become 1/mass)
+  if(OCEANS_VAL) rmass_ocean_load = 1._CUSTOM_REAL / rmass_ocean_load
+
+  rmass_crust_mantle = 1._CUSTOM_REAL / rmass_crust_mantle
+  rmass_outer_core = 1._CUSTOM_REAL / rmass_outer_core
+  rmass_inner_core = 1._CUSTOM_REAL / rmass_inner_core
+
+  end subroutine prepare_timerun_mass_matrices
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine prepare_timerun_rmass_assembly()
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
+  implicit none
+
   ! local parameters
-  integer :: ier
+  integer :: ndim_assemble
 
+  ! temporary buffers for send and receive between faces of the slices and the chunks
+  real(kind=CUSTOM_REAL), dimension(npoin2D_max_all_CM_IC) ::  &
+    buffer_send_faces_scalar,buffer_received_faces_scalar
+
   ! synchronize all the processes before assembling the mass matrix
   ! to make sure all the nodes have finished to read their databases
-  call MPI_BARRIER(MPI_COMM_WORLD,ier)
+  call sync_all()
 
   ! the mass matrix needs to be assembled with MPI here once and for all
 
@@ -174,141 +274,208 @@
             NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL(IREGION_INNER_CORE), &
             NGLOB2DMAX_XMIN_XMAX(IREGION_INNER_CORE),NGLOB2DMAX_YMIN_YMAX(IREGION_INNER_CORE),NGLOB2DMAX_XY,NCHUNKS_VAL)
 
+
+  ! mass matrix including central cube
+  if(INCLUDE_CENTRAL_CUBE) then
+    ! the mass matrix to assemble is a scalar, not a vector
+    ndim_assemble = 1
+
+    ! use central cube buffers to assemble the inner core mass matrix with the central cube
+    call assemble_MPI_central_cube_block(ichunk,nb_msgs_theor_in_cube, sender_from_slices_to_cube, &
+                 npoin2D_cube_from_slices, buffer_all_cube_from_slices, &
+                 buffer_slices, buffer_slices2, ibool_central_cube, &
+                 receiver_cube_from_slices, ibool_inner_core, &
+                 idoubling_inner_core, NSPEC_INNER_CORE, &
+                 ibelm_bottom_inner_core, NSPEC2D_BOTTOM(IREGION_INNER_CORE), &
+                 NGLOB_INNER_CORE, &
+                 rmass_inner_core,ndim_assemble)
+
+    ! suppress fictitious mass matrix elements in central cube
+    ! because the slices do not compute all their spectral elements in the cube
+    where(rmass_inner_core(:) <= 0.) rmass_inner_core = 1.
+  endif
+
   if(myrank == 0) write(IMAIN,*) 'end assembling MPI mass matrix'
 
-  end subroutine prepare_timerun_rmass
+  end subroutine prepare_timerun_rmass_assembly
 
 !
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine prepare_timerun_centralcube(myrank,rmass_inner_core, &
-                      iproc_xi,iproc_eta,ichunk, &
-                      NSPEC2DMAX_XMIN_XMAX,NSPEC2DMAX_YMIN_YMAX,NSPEC2D_BOTTOM, &
-                      addressing,ibool_inner_core,idoubling_inner_core, &
-                      xstore_inner_core,ystore_inner_core,zstore_inner_core, &
-                      nspec2D_xmin_inner_core,nspec2D_xmax_inner_core, &
-                      nspec2D_ymin_inner_core,nspec2D_ymax_inner_core, &
-                      ibelm_xmin_inner_core,ibelm_xmax_inner_core, &
-                      ibelm_ymin_inner_core,ibelm_ymax_inner_core,ibelm_bottom_inner_core, &
-                      nb_msgs_theor_in_cube,non_zero_nb_msgs_theor_in_cube, &
-                      npoin2D_cube_from_slices,receiver_cube_from_slices, &
-                      sender_from_slices_to_cube,ibool_central_cube, &
-                      buffer_slices,buffer_slices2,buffer_all_cube_from_slices)
+  subroutine prepare_timerun_convert_coord()
 
+! converts x/y/z into r/theta/phi spherical coordinates
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore  
   implicit none
 
-  include 'mpif.h'
-  include "constants.h"
-  include "OUTPUT_FILES/values_from_mesher.h"
+  ! local parameters
+  integer :: i
+  real(kind=CUSTOM_REAL) :: rval,thetaval,phival
+  
+  ! change x, y, z to r, theta and phi once and for all
+  ! IMPROVE dangerous: old name kept (xstore ystore zstore) for new values
 
-  integer myrank
+  ! convert in the crust and mantle
+  do i = 1,NGLOB_CRUST_MANTLE
+    call xyz_2_rthetaphi(xstore_crust_mantle(i), &
+                        ystore_crust_mantle(i), &
+                        zstore_crust_mantle(i),rval,thetaval,phival)
+    xstore_crust_mantle(i) = rval
+    ystore_crust_mantle(i) = thetaval
+    zstore_crust_mantle(i) = phival
+  enddo
 
-  real(kind=CUSTOM_REAL), dimension(NGLOB_INNER_CORE) :: rmass_inner_core
+  ! convert in the outer core
+  do i = 1,NGLOB_OUTER_CORE
+    call xyz_2_rthetaphi(xstore_outer_core(i), &
+                        ystore_outer_core(i), &
+                        zstore_outer_core(i),rval,thetaval,phival)
+    xstore_outer_core(i) = rval
+    ystore_outer_core(i) = thetaval
+    zstore_outer_core(i) = phival
+  enddo
 
-  integer ichunk,iproc_xi,iproc_eta
+  ! convert in the inner core
+  do i = 1,NGLOB_INNER_CORE
+    call xyz_2_rthetaphi(xstore_inner_core(i), &
+                        ystore_inner_core(i), &
+                        zstore_inner_core(i),rval,thetaval,phival)
+    xstore_inner_core(i) = rval
+    ystore_inner_core(i) = thetaval
+    zstore_inner_core(i) = phival
+  enddo
 
-  integer, dimension(MAX_NUM_REGIONS) :: NSPEC2DMAX_XMIN_XMAX,NSPEC2DMAX_YMIN_YMAX,NSPEC2D_BOTTOM
-  integer, dimension(NCHUNKS_VAL,0:NPROC_XI_VAL-1,0:NPROC_ETA_VAL-1) :: addressing
+  end subroutine prepare_timerun_convert_coord
 
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: ibool_inner_core
-  integer, dimension(NSPEC_INNER_CORE) :: idoubling_inner_core
-  real(kind=CUSTOM_REAL), dimension(NGLOB_INNER_CORE) :: &
-        xstore_inner_core,ystore_inner_core,zstore_inner_core
+!
+!-------------------------------------------------------------------------------------------------
+!
 
-  integer nspec2D_xmin_inner_core,nspec2D_xmax_inner_core, &
-    nspec2D_ymin_inner_core,nspec2D_ymax_inner_core
+  subroutine prepare_timerun_movie_surface()
 
-  integer, dimension(NSPEC2DMAX_XMIN_XMAX_IC) :: ibelm_xmin_inner_core,ibelm_xmax_inner_core
-  integer, dimension(NSPEC2DMAX_YMIN_YMAX_IC) :: ibelm_ymin_inner_core,ibelm_ymax_inner_core
-  integer, dimension(NSPEC2D_BOTTOM_IC) :: ibelm_bottom_inner_core
-
-  integer nb_msgs_theor_in_cube,non_zero_nb_msgs_theor_in_cube, &
-    npoin2D_cube_from_slices,receiver_cube_from_slices
-
-  integer, dimension(non_zero_nb_msgs_theor_in_cube) :: sender_from_slices_to_cube
-  integer, dimension(non_zero_nb_msgs_theor_in_cube,npoin2D_cube_from_slices) :: ibool_central_cube
-  double precision, dimension(npoin2D_cube_from_slices,NDIM) :: buffer_slices,buffer_slices2
-  double precision, dimension(non_zero_nb_msgs_theor_in_cube,npoin2D_cube_from_slices,NDIM) :: &
-    buffer_all_cube_from_slices
-
+  use specfem_par
+  use specfem_par_movie
+  implicit none
+  
   ! local parameters
-  integer :: ndim_assemble
+  integer :: ier
 
-  ! create buffers to assemble with the central cube
-  call create_central_cube_buffers(myrank,iproc_xi,iproc_eta,ichunk, &
-               NPROC_XI_VAL,NPROC_ETA_VAL,NCHUNKS_VAL,NSPEC_INNER_CORE,NGLOB_INNER_CORE, &
-               NSPEC2DMAX_XMIN_XMAX(IREGION_INNER_CORE),NSPEC2DMAX_YMIN_YMAX(IREGION_INNER_CORE), &
-               NSPEC2D_BOTTOM(IREGION_INNER_CORE), &
-               addressing,ibool_inner_core,idoubling_inner_core, &
-               xstore_inner_core,ystore_inner_core,zstore_inner_core, &
-               nspec2D_xmin_inner_core,nspec2D_xmax_inner_core, &
-               nspec2D_ymin_inner_core,nspec2D_ymax_inner_core, &
-               ibelm_xmin_inner_core,ibelm_xmax_inner_core, &
-               ibelm_ymin_inner_core,ibelm_ymax_inner_core,ibelm_bottom_inner_core, &
-               nb_msgs_theor_in_cube,non_zero_nb_msgs_theor_in_cube,npoin2D_cube_from_slices, &
-               receiver_cube_from_slices,sender_from_slices_to_cube,ibool_central_cube, &
-               buffer_slices,buffer_slices2,buffer_all_cube_from_slices)
+  if(MOVIE_COARSE .and. NOISE_TOMOGRAPHY ==0) then  ! only output corners !for noise tomography, must NOT be coarse
+     nmovie_points = 2 * 2 * NSPEC2D_TOP(IREGION_CRUST_MANTLE)
+     if(NGLLX /= NGLLY) &
+      call exit_MPI(myrank,'MOVIE_COARSE together with MOVIE_SURFACE requires NGLLX=NGLLY')
+     NIT = NGLLX - 1
+  else
+     nmovie_points = NGLLX * NGLLY * NSPEC2D_TOP(IREGION_CRUST_MANTLE)
+     NIT = 1
+  endif
+  allocate(store_val_x(nmovie_points), &
+          store_val_y(nmovie_points), &
+          store_val_z(nmovie_points), &
+          store_val_ux(nmovie_points), &
+          store_val_uy(nmovie_points), &
+          store_val_uz(nmovie_points),stat=ier)
+  if( ier /= 0 ) call exit_MPI(myrank,'error allocating movie surface arrays')
 
-  if(myrank == 0) write(IMAIN,*) 'done including central cube'
+  if (MOVIE_SURFACE) then  ! those arrays are not neccessary for noise tomography, so only allocate them in MOVIE_SURFACE case
+     allocate(store_val_x_all(nmovie_points,0:NPROCTOT_VAL-1), &
+            store_val_y_all(nmovie_points,0:NPROCTOT_VAL-1), &
+            store_val_z_all(nmovie_points,0:NPROCTOT_VAL-1), &
+            store_val_ux_all(nmovie_points,0:NPROCTOT_VAL-1), &
+            store_val_uy_all(nmovie_points,0:NPROCTOT_VAL-1), &
+            store_val_uz_all(nmovie_points,0:NPROCTOT_VAL-1),stat=ier)
+     if( ier /= 0 ) call exit_MPI(myrank,'error allocating movie surface all arrays')
+  endif
+  if(myrank == 0) then
+    write(IMAIN,*)
+    write(IMAIN,*) 'Movie surface:'
+    write(IMAIN,*) '  Writing to moviedata*** files in output directory'
+    if(MOVIE_VOLUME_TYPE == 5) then
+      write(IMAIN,*) '  movie output: displacement'
+    else
+      write(IMAIN,*) '  movie output: velocity'
+    endif
+    write(IMAIN,*) '  time steps every: ',NTSTEP_BETWEEN_FRAMES
+  endif
 
-  ! the mass matrix to assemble is a scalar, not a vector
-  ndim_assemble = 1
+  end subroutine prepare_timerun_movie_surface
 
-  ! use these buffers to assemble the inner core mass matrix with the central cube
-  call assemble_MPI_central_cube_block(ichunk,nb_msgs_theor_in_cube, sender_from_slices_to_cube, &
-               npoin2D_cube_from_slices, buffer_all_cube_from_slices, &
-               buffer_slices, buffer_slices2, ibool_central_cube, &
-               receiver_cube_from_slices, ibool_inner_core, &
-               idoubling_inner_core, NSPEC_INNER_CORE, &
-               ibelm_bottom_inner_core, NSPEC2D_BOTTOM(IREGION_INNER_CORE), &
-               NGLOB_INNER_CORE,rmass_inner_core,ndim_assemble)
-
-  ! suppress fictitious mass matrix elements in central cube
-  ! because the slices do not compute all their spectral elements in the cube
-  where(rmass_inner_core(:) <= 0.) rmass_inner_core = 1.
-
-  end subroutine prepare_timerun_centralcube
-
 !
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine prepare_timerun_constants(myrank,NSTEP, &
-                    DT,t0,scale_t,scale_t_inv,scale_displ,scale_veloc, &
-                    deltat,deltatover2,deltatsqover2, &
-                    b_deltat,b_deltatover2,b_deltatsqover2, &
-                    two_omega_earth,A_array_rotation,B_array_rotation, &
-                    b_two_omega_earth, SIMULATION_TYPE)
+  subroutine prepare_timerun_movie_volume()
 
-! precomputes constants for time integration
-
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_movie
   implicit none
+  
+  ! local parameters
+  integer :: ier
 
-  include "constants.h"
-  include "OUTPUT_FILES/values_from_mesher.h"
+  ! the following has to be true for the the array dimensions of eps to match with those of xstore etc..
+  ! note that epsilondev and eps_trace_over_3 don't have the same dimensions.. could cause trouble
+  if (NSPEC_CRUST_MANTLE_STR_OR_ATT /= NSPEC_CRUST_MANTLE) &
+    stop 'NSPEC_CRUST_MANTLE_STRAINS_ATT /= NSPEC_CRUST_MANTLE'
+  if (NSPEC_CRUST_MANTLE_STRAIN_ONLY /= NSPEC_CRUST_MANTLE) &
+    stop 'NSPEC_CRUST_MANTLE_STRAIN_ONLY /= NSPEC_CRUST_MANTLE'
 
-  integer myrank,NSTEP
+  write(prname,'(a,i6.6,a)') trim(LOCAL_PATH)//'/'//'proc',myrank,'_'
+  call count_points_movie_volume(prname,ibool_crust_mantle, xstore_crust_mantle,ystore_crust_mantle, &
+              zstore_crust_mantle,MOVIE_TOP,MOVIE_BOTTOM,MOVIE_WEST,MOVIE_EAST,MOVIE_NORTH,MOVIE_SOUTH, &
+              MOVIE_COARSE,npoints_3dmovie,nspecel_3dmovie,num_ibool_3dmovie,mask_ibool,mask_3dmovie)
 
-  double precision DT
-  double precision t0
+  allocate(nu_3dmovie(3,3,npoints_3dmovie),stat=ier)
+  if( ier /= 0 ) call exit_MPI(myrank,'error allocating nu for 3d movie')
 
+  call write_movie_volume_mesh(npoints_3dmovie,prname,ibool_crust_mantle,xstore_crust_mantle, &
+                         ystore_crust_mantle,zstore_crust_mantle, muvstore_crust_mantle_3dmovie, &
+                         mask_3dmovie,mask_ibool,num_ibool_3dmovie,nu_3dmovie,MOVIE_COARSE)
 
-  double precision scale_t,scale_t_inv,scale_displ,scale_veloc
+  if(myrank == 0) then
+    write(IMAIN,*)
+    write(IMAIN,*) 'Movie volume:'
+    write(IMAIN,*) '  Writing to movie3D*** files on local disk databases directory'
+    if(MOVIE_VOLUME_TYPE == 1) then
+      write(IMAIN,*) '  movie output: strain'
+    else if(MOVIE_VOLUME_TYPE == 2) then
+      write(IMAIN,*) '  movie output: time integral of strain'
+    else if(MOVIE_VOLUME_TYPE == 3) then
+      write(IMAIN,*) '  movie output: potency or integral of strain'
+    else if(MOVIE_VOLUME_TYPE == 4) then
+      write(IMAIN,*) '  movie output: divergence and curl'
+    else if(MOVIE_VOLUME_TYPE == 5) then
+      write(IMAIN,*) '  movie output: displacement'
+    else if(MOVIE_VOLUME_TYPE == 6) then
+      write(IMAIN,*) '  movie output: velocity'
+    endif
+    write(IMAIN,*) '  depth(T,B):',MOVIE_TOP,MOVIE_BOTTOM
+    write(IMAIN,*) '  lon(W,E)  :',MOVIE_WEST,MOVIE_EAST
+    write(IMAIN,*) '  lat(S,N)  :',MOVIE_SOUTH,MOVIE_NORTH
+    write(IMAIN,*) '  Starting at time step:',MOVIE_START, 'ending at:',MOVIE_STOP,'every: ',NTSTEP_BETWEEN_FRAMES
+  endif
 
-  real(kind=CUSTOM_REAL) deltat,deltatover2,deltatsqover2
-  real(kind=CUSTOM_REAL) b_deltat,b_deltatover2,b_deltatsqover2
+  if( MOVIE_VOLUME_TYPE < 1 .or. MOVIE_VOLUME_TYPE > 6) &
+      call exit_MPI(myrank, 'MOVIE_VOLUME_TYPE has to be 1,2,3,4,5 or 6')
 
-  real(kind=CUSTOM_REAL) two_omega_earth
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_ROTATION) :: &
-    A_array_rotation,B_array_rotation
+  end subroutine prepare_timerun_movie_volume
 
-  real(kind=CUSTOM_REAL) b_two_omega_earth
+!
+!-------------------------------------------------------------------------------------------------
+!
 
-  integer SIMULATION_TYPE
+  subroutine prepare_timerun_constants()
 
-  ! local parameters
+! precomputes constants for time integration
 
+  use specfem_par
+  implicit none
 
   if(myrank == 0) then
     write(IMAIN,*)
@@ -386,34 +553,13 @@
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine prepare_timerun_gravity(myrank, &
-                    minus_g_cmb,minus_g_icb, &
-                    minus_gravity_table,minus_deriv_gravity_table, &
-                    density_table,d_ln_density_dr_table,minus_rho_g_over_kappa_fluid, &
-                    ONE_CRUST,RICB,RCMB,RTOPDDOUBLEPRIME, &
-                    R600,R670,R220,R771,R400,R80,RMOHO,RMIDDLE_CRUST,ROCEAN)
+  subroutine prepare_timerun_gravity()
 
 ! precomputes gravity factors
 
+  use specfem_par
   implicit none
 
-  include "constants.h"
-  include "OUTPUT_FILES/values_from_mesher.h"
-
-  integer myrank
-
-  real(kind=CUSTOM_REAL) minus_g_cmb,minus_g_icb
-
-  ! lookup table every km for gravity
-  double precision, dimension(NRAD_GRAVITY) :: minus_gravity_table, &
-    minus_deriv_gravity_table,density_table, &
-    d_ln_density_dr_table,minus_rho_g_over_kappa_fluid
-
-  logical ONE_CRUST
-
-  double precision RICB,RCMB,RTOPDDOUBLEPRIME, &
-    R80,R220,R400,R600,R670,R771,RMOHO,RMIDDLE_CRUST,ROCEAN
-
   ! local parameters
   double precision :: rspl_gravity(NR),gspl(NR),gspl2(NR)
   double precision :: radius,radius_km,g,dg
@@ -495,70 +641,16 @@
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine prepare_timerun_attenuation(myrank, &
-                factor_scale_crust_mantle,one_minus_sum_beta_crust_mantle,factor_common_crust_mantle, &
-                factor_scale_inner_core,one_minus_sum_beta_inner_core,factor_common_inner_core, &
-                c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle, &
-                c22store_crust_mantle,c23store_crust_mantle, &
-                c33store_crust_mantle,c44store_crust_mantle, &
-                c55store_crust_mantle,c66store_crust_mantle, &
-                muvstore_crust_mantle,muhstore_crust_mantle,ispec_is_tiso_crust_mantle, &
- !----- idoubling_crust_mantle, &
-                muvstore_inner_core, &
-                SIMULATION_TYPE,MOVIE_VOLUME,muvstore_crust_mantle_3dmovie, &
-                c11store_inner_core,c12store_inner_core,c13store_inner_core, &
-                c33store_inner_core,c44store_inner_core, &
-                alphaval,betaval,gammaval,b_alphaval,b_betaval,b_gammaval, &
-                deltat,b_deltat,LOCAL_PATH)
+  subroutine prepare_timerun_attenuation()
 
   ! precomputes attenuation factors
 
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_movie
   implicit none
 
-  include "constants.h"
-  include "OUTPUT_FILES/values_from_mesher.h"
-
-  integer myrank
-
-  ! memory variables and standard linear solids for attenuation
-  real(kind=CUSTOM_REAL), dimension(ATT1,ATT2,ATT3,ATT4) :: one_minus_sum_beta_crust_mantle, factor_scale_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(ATT1,ATT2,ATT3,ATT5) :: one_minus_sum_beta_inner_core, factor_scale_inner_core
-  real(kind=CUSTOM_REAL), dimension(N_SLS,ATT1,ATT2,ATT3,ATT4) :: factor_common_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(N_SLS,ATT1,ATT2,ATT3,ATT5) :: factor_common_inner_core
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ANISO_MANTLE) :: &
-      c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle, &
-      c22store_crust_mantle,c23store_crust_mantle, &
-      c33store_crust_mantle,c44store_crust_mantle, &
-      c55store_crust_mantle,c66store_crust_mantle
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ISO_MANTLE) :: &
-        muvstore_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_TISO_MANTLE) :: &
-        muhstore_crust_mantle
-!  integer, dimension(NSPEC_CRUST_MANTLE) :: idoubling_crust_mantle
-  logical, dimension(NSPEC_CRUST_MANTLE) :: ispec_is_tiso_crust_mantle
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: &
-        muvstore_inner_core
-
-
-  integer SIMULATION_TYPE
-  logical MOVIE_VOLUME
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_OR_ATT) :: muvstore_crust_mantle_3dmovie
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ANISO_IC) :: &
-        c11store_inner_core,c33store_inner_core,c12store_inner_core, &
-        c13store_inner_core,c44store_inner_core
-
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval, betaval, gammaval
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: b_alphaval, b_betaval, b_gammaval
-
-  real(kind=CUSTOM_REAL) deltat,b_deltat
-
-  character(len=150) LOCAL_PATH
-
   ! local parameters
   double precision, dimension(ATT1,ATT2,ATT3,ATT4) :: omsb_crust_mantle_dble, factor_scale_crust_mantle_dble
   double precision, dimension(ATT1,ATT2,ATT3,ATT5) :: omsb_inner_core_dble, factor_scale_inner_core_dble
@@ -570,18 +662,18 @@
   double precision :: scale_factor,scale_factor_minus_one
   real(kind=CUSTOM_REAL) :: mul
   integer :: ispec,i,j,k
-  character(len=150) :: prname
+  character(len=150) :: prnamel
 
   ! get and store PREM attenuation model
 
   ! CRUST_MANTLE ATTENUATION
-  call create_name_database(prname, myrank, IREGION_CRUST_MANTLE, LOCAL_PATH)
-  call get_attenuation_model_3D(myrank, prname, omsb_crust_mantle_dble, &
+  call create_name_database(prnamel, myrank, IREGION_CRUST_MANTLE, LOCAL_PATH)
+  call get_attenuation_model_3D(myrank, prnamel, omsb_crust_mantle_dble, &
            factor_common_crust_mantle_dble,factor_scale_crust_mantle_dble,tau_sigma_dble,NSPEC_CRUST_MANTLE)
 
   ! INNER_CORE ATTENUATION
-  call create_name_database(prname, myrank, IREGION_INNER_CORE, LOCAL_PATH)
-  call get_attenuation_model_3D(myrank, prname, omsb_inner_core_dble, &
+  call create_name_database(prnamel, myrank, IREGION_INNER_CORE, LOCAL_PATH)
+  call get_attenuation_model_3D(myrank, prnamel, omsb_inner_core_dble, &
            factor_common_inner_core_dble,factor_scale_inner_core_dble,tau_sigma_dble,NSPEC_INNER_CORE)
 
   if(CUSTOM_REAL == SIZE_REAL) then
@@ -715,3 +807,484 @@
   endif
 
   end subroutine prepare_timerun_attenuation
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine prepare_timerun_init_wavefield()
+
+! initializes arrays
+ 
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
+  use specfem_par_movie
+  implicit none
+
+  ! local parameters
+  integer :: ier
+
+  ! initialize arrays to zero
+  displ_crust_mantle(:,:) = 0._CUSTOM_REAL
+  veloc_crust_mantle(:,:) = 0._CUSTOM_REAL
+  accel_crust_mantle(:,:) = 0._CUSTOM_REAL
+
+  displ_outer_core(:) = 0._CUSTOM_REAL
+  veloc_outer_core(:) = 0._CUSTOM_REAL
+  accel_outer_core(:) = 0._CUSTOM_REAL
+
+  displ_inner_core(:,:) = 0._CUSTOM_REAL
+  veloc_inner_core(:,:) = 0._CUSTOM_REAL
+  accel_inner_core(:,:) = 0._CUSTOM_REAL
+
+  ! put negligible initial value to avoid very slow underflow trapping
+  if(FIX_UNDERFLOW_PROBLEM) then
+    displ_crust_mantle(:,:) = VERYSMALLVAL
+    displ_outer_core(:) = VERYSMALLVAL
+    displ_inner_core(:,:) = VERYSMALLVAL
+  endif
+
+  ! if doing benchmark runs to measure scaling of the code,
+  ! set the initial field to 1 to make sure gradual underflow trapping does not slow down the code
+  if (DO_BENCHMARK_RUN_ONLY .and. SET_INITIAL_FIELD_TO_1_IN_BENCH) then
+    displ_crust_mantle(:,:) = 1._CUSTOM_REAL
+    veloc_crust_mantle(:,:) = 1._CUSTOM_REAL
+    accel_crust_mantle(:,:) = 1._CUSTOM_REAL
+
+    displ_outer_core(:) = 1._CUSTOM_REAL
+    veloc_outer_core(:) = 1._CUSTOM_REAL
+    accel_outer_core(:) = 1._CUSTOM_REAL
+
+    displ_inner_core(:,:) = 1._CUSTOM_REAL
+    veloc_inner_core(:,:) = 1._CUSTOM_REAL
+    accel_inner_core(:,:) = 1._CUSTOM_REAL
+  endif
+
+  if (SIMULATION_TYPE == 3) then
+    rho_kl_crust_mantle(:,:,:,:) = 0._CUSTOM_REAL
+    beta_kl_crust_mantle(:,:,:,:) = 0._CUSTOM_REAL
+    alpha_kl_crust_mantle(:,:,:,:) = 0._CUSTOM_REAL
+    if (NOISE_TOMOGRAPHY == 3) Sigma_kl_crust_mantle(:,:,:,:) = 0._CUSTOM_REAL
+
+    ! approximate hessian
+    if( APPROXIMATE_HESS_KL ) then
+      allocate( hess_kl_crust_mantle(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ADJOINT),stat=ier)
+      if( ier /= 0 ) call exit_MPI(myrank,'error allocating hessian')
+      hess_kl_crust_mantle(:,:,:,:) = 0._CUSTOM_REAL
+    endif
+
+    ! For anisotropic kernels (in crust_mantle only)
+    cijkl_kl_crust_mantle(:,:,:,:,:) = 0._CUSTOM_REAL
+
+    rho_kl_outer_core(:,:,:,:) = 0._CUSTOM_REAL
+    alpha_kl_outer_core(:,:,:,:) = 0._CUSTOM_REAL
+
+    rho_kl_inner_core(:,:,:,:) = 0._CUSTOM_REAL
+    beta_kl_inner_core(:,:,:,:) = 0._CUSTOM_REAL
+    alpha_kl_inner_core(:,:,:,:) = 0._CUSTOM_REAL
+
+    div_displ_outer_core(:,:,:,:) = 0._CUSTOM_REAL
+    b_div_displ_outer_core(:,:,:,:) = 0._CUSTOM_REAL
+
+    ! deviatoric kernel check
+    if( deviatoric_outercore) then
+      nspec_beta_kl_outer_core = NSPEC_OUTER_CORE_ADJOINT
+    else
+      nspec_beta_kl_outer_core = 1
+    endif
+    allocate(beta_kl_outer_core(NGLLX,NGLLY,NGLLZ,nspec_beta_kl_outer_core),stat=ier)
+    if( ier /= 0 ) call exit_MPI(myrank,'error allocating beta outercore')
+    beta_kl_outer_core(:,:,:,:) = 0._CUSTOM_REAL
+  endif
+
+  ! initialize to be on the safe side for adjoint runs SIMULATION_TYPE==2
+  eps_trace_over_3_crust_mantle(:,:,:,:) = 0._CUSTOM_REAL
+
+  epsilondev_xx_crust_mantle(:,:,:,:) = 0._CUSTOM_REAL
+  epsilondev_yy_crust_mantle(:,:,:,:) = 0._CUSTOM_REAL
+  epsilondev_xy_crust_mantle(:,:,:,:) = 0._CUSTOM_REAL
+  epsilondev_xz_crust_mantle(:,:,:,:) = 0._CUSTOM_REAL
+  epsilondev_yz_crust_mantle(:,:,:,:) = 0._CUSTOM_REAL
+
+  eps_trace_over_3_inner_core(:,:,:,:) = 0._CUSTOM_REAL
+
+  epsilondev_xx_inner_core(:,:,:,:) = 0._CUSTOM_REAL
+  epsilondev_yy_inner_core(:,:,:,:) = 0._CUSTOM_REAL
+  epsilondev_xy_inner_core(:,:,:,:) = 0._CUSTOM_REAL
+  epsilondev_xz_inner_core(:,:,:,:) = 0._CUSTOM_REAL
+  epsilondev_yz_inner_core(:,:,:,:) = 0._CUSTOM_REAL
+
+  if(FIX_UNDERFLOW_PROBLEM) then
+    eps_trace_over_3_crust_mantle(:,:,:,:) = VERYSMALLVAL
+
+    epsilondev_xx_crust_mantle(:,:,:,:) = VERYSMALLVAL
+    epsilondev_yy_crust_mantle(:,:,:,:) = VERYSMALLVAL
+    epsilondev_xy_crust_mantle(:,:,:,:) = VERYSMALLVAL
+    epsilondev_xz_crust_mantle(:,:,:,:) = VERYSMALLVAL
+    epsilondev_yz_crust_mantle(:,:,:,:) = VERYSMALLVAL
+
+    eps_trace_over_3_inner_core(:,:,:,:) = VERYSMALLVAL
+
+    epsilondev_xx_inner_core(:,:,:,:) = VERYSMALLVAL
+    epsilondev_yy_inner_core(:,:,:,:) = VERYSMALLVAL
+    epsilondev_xy_inner_core(:,:,:,:) = VERYSMALLVAL
+    epsilondev_xz_inner_core(:,:,:,:) = VERYSMALLVAL
+    epsilondev_yz_inner_core(:,:,:,:) = VERYSMALLVAL
+
+  endif
+
+  if (COMPUTE_AND_STORE_STRAIN) then
+    if(MOVIE_VOLUME .and. (MOVIE_VOLUME_TYPE == 2 .or. MOVIE_VOLUME_TYPE == 3)) then
+      Iepsilondev_crust_mantle(:,:,:,:,:) = 0._CUSTOM_REAL
+      Ieps_trace_over_3_crust_mantle(:,:,:,:)=0._CUSTOM_REAL
+    endif
+  endif
+
+  ! clear memory variables if attenuation
+  if(ATTENUATION_VAL) then
+    R_xx_crust_mantle(:,:,:,:,:) = 0._CUSTOM_REAL
+    R_yy_crust_mantle(:,:,:,:,:) = 0._CUSTOM_REAL
+    R_xy_crust_mantle(:,:,:,:,:) = 0._CUSTOM_REAL
+    R_xz_crust_mantle(:,:,:,:,:) = 0._CUSTOM_REAL
+    R_yz_crust_mantle(:,:,:,:,:) = 0._CUSTOM_REAL
+
+    R_xx_inner_core(:,:,:,:,:) = 0._CUSTOM_REAL
+    R_yy_inner_core(:,:,:,:,:) = 0._CUSTOM_REAL
+    R_xy_inner_core(:,:,:,:,:) = 0._CUSTOM_REAL
+    R_xz_inner_core(:,:,:,:,:) = 0._CUSTOM_REAL
+    R_yz_inner_core(:,:,:,:,:) = 0._CUSTOM_REAL
+
+    if(FIX_UNDERFLOW_PROBLEM) then
+      R_xx_crust_mantle(:,:,:,:,:) = VERYSMALLVAL
+      R_yy_crust_mantle(:,:,:,:,:) = VERYSMALLVAL
+      R_xy_crust_mantle(:,:,:,:,:) = VERYSMALLVAL
+      R_xz_crust_mantle(:,:,:,:,:) = VERYSMALLVAL
+      R_yz_crust_mantle(:,:,:,:,:) = VERYSMALLVAL
+
+      R_xx_inner_core(:,:,:,:,:) = VERYSMALLVAL
+      R_yy_inner_core(:,:,:,:,:) = VERYSMALLVAL
+      R_xy_inner_core(:,:,:,:,:) = VERYSMALLVAL
+      R_xz_inner_core(:,:,:,:,:) = VERYSMALLVAL
+      R_yz_inner_core(:,:,:,:,:) = VERYSMALLVAL
+    endif
+  endif
+
+  end subroutine prepare_timerun_init_wavefield
+
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine prepare_timerun_noise()
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_movie
+  implicit none
+  ! local parameters
+  integer :: ier
+
+  ! NOISE TOMOGRAPHY
+  if ( NOISE_TOMOGRAPHY /= 0 ) then
+    allocate(noise_sourcearray(NDIM,NGLLX,NGLLY,NGLLZ,NSTEP), &
+            normal_x_noise(nmovie_points), &
+            normal_y_noise(nmovie_points), &
+            normal_z_noise(nmovie_points), &
+            mask_noise(nmovie_points), &
+            noise_surface_movie(NDIM,NGLLX,NGLLY,NSPEC2D_TOP(IREGION_CRUST_MANTLE)),stat=ier)
+    if( ier /= 0 ) call exit_MPI(myrank,'error allocating noise arrays')
+
+    noise_sourcearray(:,:,:,:,:) = 0._CUSTOM_REAL
+    normal_x_noise(:)            = 0._CUSTOM_REAL
+    normal_y_noise(:)            = 0._CUSTOM_REAL
+    normal_z_noise(:)            = 0._CUSTOM_REAL
+    mask_noise(:)                = 0._CUSTOM_REAL
+    noise_surface_movie(:,:,:,:) = 0._CUSTOM_REAL
+
+    call read_parameters_noise(myrank,nrec,NSTEP,nmovie_points, &
+                              islice_selected_rec,xi_receiver,eta_receiver,gamma_receiver,nu, &
+                              noise_sourcearray,xigll,yigll,zigll,NSPEC2D_TOP(IREGION_CRUST_MANTLE), &
+                              NIT, ibool_crust_mantle, ibelm_top_crust_mantle, &
+                              xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
+                              irec_master_noise,normal_x_noise,normal_y_noise,normal_z_noise,mask_noise)
+
+    call check_parameters_noise(myrank,NOISE_TOMOGRAPHY,SIMULATION_TYPE,SAVE_FORWARD, &
+                              NUMBER_OF_RUNS, NUMBER_OF_THIS_RUN,ROTATE_SEISMOGRAMS_RT, &
+                              SAVE_ALL_SEISMOS_IN_ONE_FILE, USE_BINARY_FOR_LARGE_FILE, &
+                              MOVIE_COARSE,LOCAL_PATH,NSPEC2D_TOP(IREGION_CRUST_MANTLE),NSTEP)
+  endif
+
+  end subroutine prepare_timerun_noise
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine prepare_timerun_GPU()
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
+  use specfem_par_movie
+  implicit none
+  include 'mpif.h'
+
+  ! local parameters
+  integer :: ier
+  real :: free_mb,used_mb,total_mb
+  integer :: ncuda_devices,ncuda_devices_min,ncuda_devices_max
+  ! dummy custom_real variables to convert from double precision
+  real(kind=CUSTOM_REAL),dimension(:,:,:),allocatable:: cr_wgll_cube
+  real(kind=CUSTOM_REAL),dimension(:),allocatable:: &
+    cr_d_ln_density_dr_table,cr_minus_rho_g_over_kappa_fluid, &
+    cr_minus_gravity_table,cr_minus_deriv_gravity_table, &
+    cr_density_table
+
+  ! GPU_MODE now defined in Par_file
+  if(myrank == 0 ) then
+    write(IMAIN,*)
+    write(IMAIN,*) "GPU_MODE Active. Preparing Fields and Constants on Device."
+    write(IMAIN,*)
+  endif
+
+  ! initializes GPU and outputs info to files for all processes
+  call prepare_cuda_device(myrank,ncuda_devices)
+
+  ! collects min/max of local devices found for statistics
+  call MPI_REDUCE(ncuda_devices,ncuda_devices_min,1,MPI_INTEGER,MPI_MIN,0,MPI_COMM_WORLD,ier)
+  call MPI_REDUCE(ncuda_devices,ncuda_devices_max,1,MPI_INTEGER,MPI_MAX,0,MPI_COMM_WORLD,ier)
+
+  ! prepares general fields on GPU
+  call prepare_constants_device(Mesh_pointer,NGLLX, &
+                                  hprime_xx, hprime_yy, hprime_zz, &
+                                  hprimewgll_xx, hprimewgll_yy, hprimewgll_zz, &
+                                  wgllwgll_xy, wgllwgll_xz, wgllwgll_yz, &
+                                  NSOURCES, nsources_local, &
+                                  sourcearrays, islice_selected_source, ispec_selected_source, &
+                                  number_receiver_global, ispec_selected_rec, &
+                                  nrec, nrec_local, &
+                                  NSPEC_CRUST_MANTLE,NGLOB_CRUST_MANTLE, &
+                                  NSPEC_OUTER_CORE,NGLOB_OUTER_CORE, &
+                                  NSPEC_INNER_CORE,NGLOB_INNER_CORE, &
+                                  SIMULATION_TYPE, &
+                                  SAVE_FORWARD, &
+                                  ABSORBING_CONDITIONS, &
+                                  GRAVITY_VAL,ROTATION_VAL, &
+                                  ATTENUATION_VAL,USE_ATTENUATION_MIMIC, &
+                                  COMPUTE_AND_STORE_STRAIN, &
+                                  ANISOTROPIC_3D_MANTLE_VAL, &
+                                  ANISOTROPIC_INNER_CORE_VAL, &
+                                  SAVE_BOUNDARY_MESH, &
+                                  USE_MESH_COLORING_GPU)
+  call sync_all()
+
+  ! prepares rotation arrays
+  if( ROTATION_VAL ) then
+    if(myrank == 0 ) write(IMAIN,*) "  loading rotation arrays"
+
+    call prepare_fields_rotation_device(Mesh_pointer, &
+                                  two_omega_earth,deltat, &
+                                  A_array_rotation,B_array_rotation, &
+                                  b_two_omega_earth,b_deltat, &
+                                  b_A_array_rotation,b_B_array_rotation, &
+                                  NSPEC_OUTER_CORE_ROTATION)
+  endif
+  call sync_all()
+
+  ! prepares arrays related to gravity
+  ! note: GPU will use only single-precision (or double precision) for all calculations
+  !          we convert wgll_cube to custom real (by default single-precision),
+  !          using implicit conversion
+  if(myrank == 0 ) write(IMAIN,*) "  loading non-gravity/gravity arrays"
+
+  allocate(cr_d_ln_density_dr_table(NRAD_GRAVITY), &
+          cr_minus_rho_g_over_kappa_fluid(NRAD_GRAVITY), &
+          cr_minus_gravity_table(NRAD_GRAVITY), &
+          cr_minus_deriv_gravity_table(NRAD_GRAVITY), &
+          cr_density_table(NRAD_GRAVITY), &
+          stat=ier)
+  if( ier /= 0 ) stop 'error allocating cr_minus_rho_g_over_kappa_fluid, etc...'
+  ! d_ln_density_dr_table needed for no gravity case
+  cr_d_ln_density_dr_table(:) = d_ln_density_dr_table(:)
+  ! these are needed for gravity cases only
+  cr_minus_rho_g_over_kappa_fluid(:) = minus_rho_g_over_kappa_fluid(:)
+  cr_minus_gravity_table(:) = minus_gravity_table(:)
+  cr_minus_deriv_gravity_table(:) = minus_deriv_gravity_table(:)
+  cr_density_table(:) = density_table(:)
+
+  allocate(cr_wgll_cube(NGLLX,NGLLY,NGLLZ),stat=ier)
+  if( ier /= 0 ) stop 'error allocating cr_wgll_cube'
+  cr_wgll_cube(:,:,:) = wgll_cube(:,:,:)
+
+  ! prepares on GPU
+  call prepare_fields_gravity_device(Mesh_pointer, &
+                                    cr_d_ln_density_dr_table, &
+                                    cr_minus_rho_g_over_kappa_fluid, &
+                                    cr_minus_gravity_table, &
+                                    cr_minus_deriv_gravity_table, &
+                                    cr_density_table, &
+                                    cr_wgll_cube, &
+                                    NRAD_GRAVITY)
+  deallocate(cr_d_ln_density_dr_table,cr_minus_rho_g_over_kappa_fluid, &
+            cr_minus_gravity_table,cr_minus_deriv_gravity_table, &
+            cr_density_table)
+  deallocate(cr_wgll_cube)
+  call sync_all()
+
+  ! prepares attenuation arrays
+  if( ATTENUATION_VAL ) then
+    if(myrank == 0 ) write(IMAIN,*) "  loading attenuation"
+
+    call prepare_fields_attenuat_device(Mesh_pointer, &
+                                        R_xx_crust_mantle,R_yy_crust_mantle,R_xy_crust_mantle, &
+                                        R_xz_crust_mantle,R_yz_crust_mantle, &
+                                        factor_common_crust_mantle, &
+                                        one_minus_sum_beta_crust_mantle, &
+                                        R_xx_inner_core,R_yy_inner_core,R_xy_inner_core, &
+                                        R_xz_inner_core,R_yz_inner_core, &
+                                        factor_common_inner_core, &
+                                        one_minus_sum_beta_inner_core, &
+                                        alphaval,betaval,gammaval, &
+                                        b_alphaval,b_betaval,b_gammaval)
+  endif
+  call sync_all()
+
+
+  ! prepares strain arrays
+  if( COMPUTE_AND_STORE_STRAIN ) then
+    if(myrank == 0 ) write(IMAIN,*) "  loading strain"
+
+    call prepare_fields_strain_device(Mesh_pointer, &
+                                    epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+                                    epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
+                                    b_epsilondev_xx_crust_mantle,b_epsilondev_yy_crust_mantle,b_epsilondev_xy_crust_mantle, &
+                                    b_epsilondev_xz_crust_mantle,b_epsilondev_yz_crust_mantle, &
+                                    eps_trace_over_3_crust_mantle, &
+                                    b_eps_trace_over_3_crust_mantle, &
+                                    epsilondev_xx_inner_core,epsilondev_yy_inner_core,epsilondev_xy_inner_core, &
+                                    epsilondev_xz_inner_core,epsilondev_yz_inner_core, &
+                                    b_epsilondev_xx_inner_core,b_epsilondev_yy_inner_core,b_epsilondev_xy_inner_core, &
+                                    b_epsilondev_xz_inner_core,b_epsilondev_yz_inner_core, &
+                                    eps_trace_over_3_inner_core, &
+                                    b_eps_trace_over_3_inner_core)
+  endif
+  call sync_all()
+
+
+  ! prepares MPI interfaces
+  if(myrank == 0 ) write(IMAIN,*) "  loading mpi interfaces"
+  
+  call prepare_mpi_buffers_device(Mesh_pointer, &
+                                num_interfaces_crust_mantle,max_nibool_interfaces_crust_mantle, &
+                                nibool_interfaces_crust_mantle,ibool_interfaces_crust_mantle, &
+                                num_interfaces_inner_core,max_nibool_interfaces_inner_core, &
+                                nibool_interfaces_inner_core,ibool_interfaces_inner_core, &
+                                num_interfaces_outer_core,max_nibool_interfaces_outer_core, &
+                                nibool_interfaces_outer_core,ibool_interfaces_outer_core)
+
+  ! crust/mantle region
+  if(myrank == 0 ) write(IMAIN,*) "  loading crust/mantle region"
+  call prepare_crust_mantle_device(Mesh_pointer, &
+                                  xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
+                                  etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
+                                  gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
+                                  rhostore_crust_mantle, &
+                                  kappavstore_crust_mantle,muvstore_crust_mantle, &
+                                  kappahstore_crust_mantle,muhstore_crust_mantle, &
+                                  eta_anisostore_crust_mantle, &
+                                  rmass_crust_mantle, &
+                                  ibool_crust_mantle, &
+                                  xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
+                                  ispec_is_tiso_crust_mantle, &
+                                  c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle, &
+                                  c14store_crust_mantle,c15store_crust_mantle,c16store_crust_mantle, &
+                                  c22store_crust_mantle,c23store_crust_mantle,c24store_crust_mantle, &
+                                  c25store_crust_mantle,c26store_crust_mantle,c33store_crust_mantle, &
+                                  c34store_crust_mantle,c35store_crust_mantle,c36store_crust_mantle, &
+                                  c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
+                                  c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
+                                  num_phase_ispec_crust_mantle,phase_ispec_inner_crust_mantle, &
+                                  nspec_outer_crust_mantle,nspec_inner_crust_mantle)
+  call sync_all()
+
+
+  ! outer core region
+  if(myrank == 0 ) write(IMAIN,*) "  loading outer core region"
+  call prepare_outer_core_device(Mesh_pointer, &
+                                  xix_outer_core,xiy_outer_core,xiz_outer_core, &
+                                  etax_outer_core,etay_outer_core,etaz_outer_core, &
+                                  gammax_outer_core,gammay_outer_core,gammaz_outer_core, &
+                                  rhostore_outer_core,kappavstore_outer_core, &
+                                  rmass_outer_core, &
+                                  ibool_outer_core, &
+                                  xstore_outer_core,ystore_outer_core,zstore_outer_core, &
+                                  num_phase_ispec_outer_core,phase_ispec_inner_outer_core, &
+                                  nspec_outer_outer_core,nspec_inner_outer_core)
+  call sync_all()
+
+
+  ! inner core region
+  if(myrank == 0 ) write(IMAIN,*) "  loading inner core region"
+  call prepare_inner_core_device(Mesh_pointer, &
+                                  xix_inner_core,xiy_inner_core,xiz_inner_core, &
+                                  etax_inner_core,etay_inner_core,etaz_inner_core, &
+                                  gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
+                                  rhostore_inner_core,kappavstore_inner_core,muvstore_inner_core, &
+                                  rmass_inner_core, &
+                                  ibool_inner_core, &
+                                  xstore_inner_core,ystore_inner_core,zstore_inner_core, &
+                                  c11store_inner_core,c12store_inner_core,c13store_inner_core, &
+                                  c33store_inner_core,c44store_inner_core, &
+                                  idoubling_inner_core, &
+                                  num_phase_ispec_inner_core,phase_ispec_inner_inner_core, &
+                                  nspec_outer_inner_core,nspec_inner_inner_core)
+  call sync_all()
+
+
+  ! transfer forward and backward fields to device with initial values
+  if(myrank == 0 ) write(IMAIN,*) "  transfering initial wavefield"
+  call transfer_fields_cm_to_device(NDIM*NGLOB_CRUST_MANTLE,displ_crust_mantle,veloc_crust_mantle,accel_crust_mantle, &
+                                   Mesh_pointer)
+
+  call transfer_fields_ic_to_device(NDIM*NGLOB_INNER_CORE,displ_inner_core,veloc_inner_core,accel_inner_core, &
+                                   Mesh_pointer)
+
+  call transfer_fields_oc_to_device(NGLOB_OUTER_CORE,displ_outer_core,veloc_outer_core,accel_outer_core, &
+                                   Mesh_pointer)
+
+  if(SIMULATION_TYPE == 3) then
+    call transfer_b_fields_cm_to_device(NDIM*NGLOB_CRUST_MANTLE, &
+                                    b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle, &
+                                    Mesh_pointer)
+
+    call transfer_b_fields_ic_to_device(NDIM*NGLOB_INNER_CORE, &
+                                    b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core, &
+                                    Mesh_pointer)
+
+    call transfer_b_fields_oc_to_device(NGLOB_OUTER_CORE, &
+                                    b_displ_outer_core,b_veloc_outer_core,b_accel_outer_core, &
+                                    Mesh_pointer)
+  endif
+
+
+  ! outputs GPU usage to files for all processes
+  call output_free_device_memory(myrank)
+
+  ! outputs usage for main process
+  if( myrank == 0 ) then
+    write(IMAIN,*)"  GPU number of devices per node: min =",ncuda_devices_min
+    write(IMAIN,*)"                                  max =",ncuda_devices_max
+    write(IMAIN,*)
+
+    call get_free_device_memory(free_mb,used_mb,total_mb)
+    write(IMAIN,*)"  GPU usage: free  =",free_mb," MB",nint(free_mb/total_mb*100.0),"%"
+    write(IMAIN,*)"             used  =",used_mb," MB",nint(used_mb/total_mb*100.0),"%"
+    write(IMAIN,*)"             total =",total_mb," MB",nint(total_mb/total_mb*100.0),"%"
+    write(IMAIN,*)
+  endif
+
+  end subroutine prepare_timerun_GPU

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_arrays_buffers_solver.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_arrays_buffers_solver.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_arrays_buffers_solver.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -26,13 +26,13 @@
 !=====================================================================
 
   subroutine read_arrays_buffers_solver(iregion_code,myrank, &
-     iboolleft_xi,iboolright_xi,iboolleft_eta,iboolright_eta, &
-     npoin2D_xi,npoin2D_eta, &
-     iprocfrom_faces,iprocto_faces,imsg_type, &
-     iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-     iboolfaces,npoin2D_faces,iboolcorner, &
-     NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX,NGLOB2DMAX_XY,NGLOB1D_RADIAL, &
-     NUMMSGS_FACES,NCORNERSCHUNKS,NPROCTOT,NPROC_XI,NPROC_ETA,LOCAL_PATH,NCHUNKS)
+                               iboolleft_xi,iboolright_xi,iboolleft_eta,iboolright_eta, &
+                               npoin2D_xi,npoin2D_eta, &
+                               iprocfrom_faces,iprocto_faces,imsg_type, &
+                               iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+                               iboolfaces,npoin2D_faces,iboolcorner, &
+                               NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX,NGLOB2DMAX_XY,NGLOB1D_RADIAL, &
+                               NUMMSGS_FACES,NCORNERSCHUNKS,NPROCTOT,NPROC_XI,NPROC_ETA,LOCAL_PATH,NCHUNKS)
 
   implicit none
 
@@ -117,10 +117,9 @@
   close(IIN)
 
   if(myrank == 0) then
-    write(IMAIN,*)
-    write(IMAIN,*) '# max of points in MPI buffers along xi npoin2D_xi = ', &
+    write(IMAIN,*) '  #max of points in MPI buffers along xi npoin2D_xi = ', &
                                 maxval(npoin2D_xi(:))
-    write(IMAIN,*) '# max of array elements transferred npoin2D_xi*NDIM = ', &
+    write(IMAIN,*) '  #max of array elements transferred npoin2D_xi*NDIM = ', &
                                 maxval(npoin2D_xi(:))*NDIM
     write(IMAIN,*)
   endif
@@ -164,10 +163,9 @@
   close(IIN)
 
   if(myrank == 0) then
-    write(IMAIN,*)
-    write(IMAIN,*) '#max of points in MPI buffers along eta npoin2D_eta = ', &
+    write(IMAIN,*) '  #max of points in MPI buffers along eta npoin2D_eta = ', &
                                 maxval(npoin2D_eta(:))
-    write(IMAIN,*) '#max of array elements transferred npoin2D_eta*NDIM = ', &
+    write(IMAIN,*) '  #max of array elements transferred npoin2D_eta*NDIM = ', &
                                 maxval(npoin2D_eta(:))*NDIM
     write(IMAIN,*)
   endif
@@ -182,34 +180,34 @@
 
   if(myrank == 0) then
 
-! file with the list of processors for each message for faces
-  open(unit=IIN,file=trim(OUTPUT_FILES)//'/list_messages_faces.txt',status='old',action='read')
-  do imsg = 1,NUMMSGS_FACES
-  read(IIN,*) imsg_type(imsg),iprocfrom_faces(imsg),iprocto_faces(imsg)
-  if      (iprocfrom_faces(imsg) < 0 &
-      .or. iprocto_faces(imsg) < 0 &
-      .or. iprocfrom_faces(imsg) > NPROCTOT-1 &
-      .or. iprocto_faces(imsg) > NPROCTOT-1) &
-    call exit_MPI(myrank,'incorrect chunk faces numbering')
-  if (imsg_type(imsg) < 1 .or. imsg_type(imsg) > 3) &
-    call exit_MPI(myrank,'incorrect message type labeling')
-  enddo
-  close(IIN)
+    ! file with the list of processors for each message for faces
+    open(unit=IIN,file=trim(OUTPUT_FILES)//'/list_messages_faces.txt',status='old',action='read')
+    do imsg = 1,NUMMSGS_FACES
+      read(IIN,*) imsg_type(imsg),iprocfrom_faces(imsg),iprocto_faces(imsg)
+      if      (iprocfrom_faces(imsg) < 0 &
+          .or. iprocto_faces(imsg) < 0 &
+          .or. iprocfrom_faces(imsg) > NPROCTOT-1 &
+          .or. iprocto_faces(imsg) > NPROCTOT-1) &
+        call exit_MPI(myrank,'incorrect chunk faces numbering')
+      if (imsg_type(imsg) < 1 .or. imsg_type(imsg) > 3) &
+        call exit_MPI(myrank,'incorrect message type labeling')
+    enddo
+    close(IIN)
 
-! file with the list of processors for each message for corners
-  open(unit=IIN,file=trim(OUTPUT_FILES)//'/list_messages_corners.txt',status='old',action='read')
-  do imsg = 1,NCORNERSCHUNKS
-  read(IIN,*) iproc_master_corners(imsg),iproc_worker1_corners(imsg), &
-                          iproc_worker2_corners(imsg)
-  if    (iproc_master_corners(imsg) < 0 &
-    .or. iproc_worker1_corners(imsg) < 0 &
-    .or. iproc_worker2_corners(imsg) < 0 &
-    .or. iproc_master_corners(imsg) > NPROCTOT-1 &
-    .or. iproc_worker1_corners(imsg) > NPROCTOT-1 &
-    .or. iproc_worker2_corners(imsg) > NPROCTOT-1) &
-      call exit_MPI(myrank,'incorrect chunk corner numbering')
-  enddo
-  close(IIN)
+    ! file with the list of processors for each message for corners
+    open(unit=IIN,file=trim(OUTPUT_FILES)//'/list_messages_corners.txt',status='old',action='read')
+    do imsg = 1,NCORNERSCHUNKS
+      read(IIN,*) iproc_master_corners(imsg),iproc_worker1_corners(imsg), &
+                            iproc_worker2_corners(imsg)
+      if    (iproc_master_corners(imsg) < 0 &
+        .or. iproc_worker1_corners(imsg) < 0 &
+        .or. iproc_worker2_corners(imsg) < 0 &
+        .or. iproc_master_corners(imsg) > NPROCTOT-1 &
+        .or. iproc_worker1_corners(imsg) > NPROCTOT-1 &
+        .or. iproc_worker2_corners(imsg) > NPROCTOT-1) &
+        call exit_MPI(myrank,'incorrect chunk corner numbering')
+    enddo
+    close(IIN)
 
   endif
 
@@ -254,33 +252,33 @@
 !---- a given slice can belong to at most one corner
   icount_corners = 0
   do imsg = 1,NCORNERSCHUNKS
-  if(myrank == iproc_master_corners(imsg) .or. &
-       myrank == iproc_worker1_corners(imsg) .or. &
-       myrank == iproc_worker2_corners(imsg)) then
-    icount_corners = icount_corners + 1
-    if(icount_corners>1 .and. (NPROC_XI > 1 .or. NPROC_ETA > 1)) &
-      call exit_MPI(myrank,'more than one corner for this slice')
-    if(icount_corners>4) call exit_MPI(myrank,'more than four corners for this slice')
+    if(myrank == iproc_master_corners(imsg) .or. &
+         myrank == iproc_worker1_corners(imsg) .or. &
+         myrank == iproc_worker2_corners(imsg)) then
+      icount_corners = icount_corners + 1
+      if(icount_corners>1 .and. (NPROC_XI > 1 .or. NPROC_ETA > 1)) &
+        call exit_MPI(myrank,'more than one corner for this slice')
+      if(icount_corners>4) call exit_MPI(myrank,'more than four corners for this slice')
 
-! read file with 1D buffer for corner
-    if(myrank == iproc_master_corners(imsg)) then
-      write(filename,"('buffer_corners_chunks_master_msg',i6.6,'.txt')") imsg
-    else if(myrank == iproc_worker1_corners(imsg)) then
-      write(filename,"('buffer_corners_chunks_worker1_msg',i6.6,'.txt')") imsg
-    else if(myrank == iproc_worker2_corners(imsg)) then
-      write(filename,"('buffer_corners_chunks_worker2_msg',i6.6,'.txt')") imsg
-    endif
+      ! read file with 1D buffer for corner
+      if(myrank == iproc_master_corners(imsg)) then
+        write(filename,"('buffer_corners_chunks_master_msg',i6.6,'.txt')") imsg
+      else if(myrank == iproc_worker1_corners(imsg)) then
+        write(filename,"('buffer_corners_chunks_worker1_msg',i6.6,'.txt')") imsg
+      else if(myrank == iproc_worker2_corners(imsg)) then
+        write(filename,"('buffer_corners_chunks_worker2_msg',i6.6,'.txt')") imsg
+      endif
 
-! matching codes
-    open(unit=IIN,file=prname(1:len_trim(prname))//filename,status='old',action='read')
-    read(IIN,*) npoin1D_corner
-    if(npoin1D_corner /= NGLOB1D_RADIAL) &
-      call exit_MPI(myrank,'incorrect nb of points in corner buffer')
-    do ipoin1D = 1,npoin1D_corner
-      read(IIN,*) iboolcorner(ipoin1D,icount_corners),xdummy,ydummy,zdummy
-    enddo
-    close(IIN)
-  endif
+      ! matching codes
+      open(unit=IIN,file=prname(1:len_trim(prname))//filename,status='old',action='read')
+      read(IIN,*) npoin1D_corner
+      if(npoin1D_corner /= NGLOB1D_RADIAL) &
+        call exit_MPI(myrank,'incorrect nb of points in corner buffer')
+      do ipoin1D = 1,npoin1D_corner
+        read(IIN,*) iboolcorner(ipoin1D,icount_corners),xdummy,ydummy,zdummy
+      enddo
+      close(IIN)
+    endif
   enddo
 
   endif

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_forward_arrays.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_forward_arrays.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_forward_arrays.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -25,77 +25,17 @@
 !
 !=====================================================================
 
-  subroutine read_forward_arrays_startrun(myrank,NSTEP, &
-                    SIMULATION_TYPE,NUMBER_OF_RUNS,NUMBER_OF_THIS_RUN, &
-                    it_begin,it_end, &
-                    displ_crust_mantle,veloc_crust_mantle,accel_crust_mantle, &
-                    displ_inner_core,veloc_inner_core,accel_inner_core, &
-                    displ_outer_core,veloc_outer_core,accel_outer_core, &
-                    R_memory_crust_mantle,R_memory_inner_core, &
-                    epsilondev_crust_mantle,epsilondev_inner_core, &
-                    A_array_rotation,B_array_rotation, &
-                    b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle, &
-                    b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core, &
-                    b_displ_outer_core,b_veloc_outer_core,b_accel_outer_core, &
-                    b_R_memory_crust_mantle,b_R_memory_inner_core, &
-                    b_epsilondev_crust_mantle,b_epsilondev_inner_core, &
-                    b_A_array_rotation,b_B_array_rotation,LOCAL_PATH)
+  subroutine read_forward_arrays_startrun()
 
 ! reads in saved wavefields
 
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
   implicit none
 
-  include "constants.h"
-  include "OUTPUT_FILES/values_from_mesher.h"
-
-  integer myrank,NSTEP
-
-  integer SIMULATION_TYPE
-
-  integer NUMBER_OF_RUNS,NUMBER_OF_THIS_RUN,it_begin,it_end
-
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: &
-    displ_crust_mantle,veloc_crust_mantle,accel_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE) :: &
-    displ_inner_core,veloc_inner_core,accel_inner_core
-  real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE) :: &
-    displ_outer_core,veloc_outer_core,accel_outer_core
-
-  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUAT) :: &
-    R_memory_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_OR_ATT) :: &
-    epsilondev_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ATTENUATION) :: &
-    R_memory_inner_core
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STR_OR_ATT) :: &
-    epsilondev_inner_core
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_ROTATION) :: &
-    A_array_rotation,B_array_rotation
-
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE_ADJOINT) :: &
-    b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE_ADJOINT) :: &
-    b_displ_outer_core,b_veloc_outer_core,b_accel_outer_core
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE_ADJOINT) :: &
-    b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core
-
-  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_AND_ATT) :: &
-    b_R_memory_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ADJOINT) :: &
-    b_epsilondev_crust_mantle
-
-  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STR_AND_ATT) :: &
-    b_R_memory_inner_core
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ADJOINT) :: &
-    b_epsilondev_inner_core
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_ROT_ADJOINT) :: &
-    b_A_array_rotation,b_B_array_rotation
-
-  character(len=150) LOCAL_PATH
-
-  !local parameters
+  ! local parameters
   character(len=150) outputname
 
   ! define correct time steps if restart files
@@ -133,6 +73,7 @@
   if(NUMBER_OF_THIS_RUN > 1) then
     write(outputname,"('dump_all_arrays',i6.6)") myrank
     open(unit=55,file=trim(LOCAL_PATH)//'/'//outputname,status='old',action='read',form='unformatted')
+    
     read(55) displ_crust_mantle
     read(55) veloc_crust_mantle
     read(55) accel_crust_mantle
@@ -142,12 +83,34 @@
     read(55) displ_outer_core
     read(55) veloc_outer_core
     read(55) accel_outer_core
-    read(55) epsilondev_crust_mantle
-    read(55) epsilondev_inner_core
+
+    read(55) epsilondev_xx_crust_mantle
+    read(55) epsilondev_yy_crust_mantle
+    read(55) epsilondev_xy_crust_mantle
+    read(55) epsilondev_xz_crust_mantle
+    read(55) epsilondev_yz_crust_mantle
+
+    read(55) epsilondev_xx_inner_core
+    read(55) epsilondev_yy_inner_core
+    read(55) epsilondev_xy_inner_core
+    read(55) epsilondev_xz_inner_core
+    read(55) epsilondev_yz_inner_core
+
     read(55) A_array_rotation
     read(55) B_array_rotation
-    read(55) R_memory_crust_mantle
-    read(55) R_memory_inner_core
+
+    read(55) R_xx_crust_mantle
+    read(55) R_yy_crust_mantle
+    read(55) R_xy_crust_mantle
+    read(55) R_xz_crust_mantle
+    read(55) R_yz_crust_mantle
+
+    read(55) R_xx_inner_core
+    read(55) R_yy_inner_core
+    read(55) R_xy_inner_core
+    read(55) R_xz_inner_core
+    read(55) R_yz_inner_core
+
     close(55)
   endif
 
@@ -162,15 +125,36 @@
     b_displ_outer_core = 0._CUSTOM_REAL
     b_veloc_outer_core = 0._CUSTOM_REAL
     b_accel_outer_core = 0._CUSTOM_REAL
-    b_epsilondev_crust_mantle = 0._CUSTOM_REAL
-    b_epsilondev_inner_core = 0._CUSTOM_REAL
+
+    b_epsilondev_xx_crust_mantle = 0._CUSTOM_REAL
+    b_epsilondev_yy_crust_mantle = 0._CUSTOM_REAL
+    b_epsilondev_xy_crust_mantle = 0._CUSTOM_REAL
+    b_epsilondev_xz_crust_mantle = 0._CUSTOM_REAL
+    b_epsilondev_yz_crust_mantle = 0._CUSTOM_REAL
+
+    b_epsilondev_xx_inner_core = 0._CUSTOM_REAL
+    b_epsilondev_yy_inner_core = 0._CUSTOM_REAL
+    b_epsilondev_xy_inner_core = 0._CUSTOM_REAL
+    b_epsilondev_xz_inner_core = 0._CUSTOM_REAL
+    b_epsilondev_yz_inner_core = 0._CUSTOM_REAL
+
     if (ROTATION_VAL) then
       b_A_array_rotation = 0._CUSTOM_REAL
       b_B_array_rotation = 0._CUSTOM_REAL
     endif
     if (ATTENUATION_VAL) then
-      b_R_memory_crust_mantle = 0._CUSTOM_REAL
-      b_R_memory_inner_core = 0._CUSTOM_REAL
+      b_R_xx_crust_mantle = 0._CUSTOM_REAL
+      b_R_yy_crust_mantle = 0._CUSTOM_REAL
+      b_R_xy_crust_mantle = 0._CUSTOM_REAL
+      b_R_xz_crust_mantle = 0._CUSTOM_REAL
+      b_R_yz_crust_mantle = 0._CUSTOM_REAL
+
+      b_R_xx_inner_core = 0._CUSTOM_REAL
+      b_R_yy_inner_core = 0._CUSTOM_REAL
+      b_R_xy_inner_core = 0._CUSTOM_REAL
+      b_R_xz_inner_core = 0._CUSTOM_REAL
+      b_R_yz_inner_core = 0._CUSTOM_REAL
+
     endif
   endif
 
@@ -180,51 +164,22 @@
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine read_forward_arrays(myrank, &
-                    b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle, &
-                    b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core, &
-                    b_displ_outer_core,b_veloc_outer_core,b_accel_outer_core, &
-                    b_R_memory_crust_mantle,b_R_memory_inner_core, &
-                    b_epsilondev_crust_mantle,b_epsilondev_inner_core, &
-                    b_A_array_rotation,b_B_array_rotation,LOCAL_PATH)
+  subroutine read_forward_arrays()
 
-! reads in saved wavefields
+! reads in saved wavefields to reconstruct/backward wavefield
 
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
   implicit none
 
-  include "constants.h"
-  include "OUTPUT_FILES/values_from_mesher.h"
-
-  integer myrank
-
-  ! backward/reconstructed wavefields
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE_ADJOINT) :: &
-    b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE_ADJOINT) :: &
-    b_displ_outer_core,b_veloc_outer_core,b_accel_outer_core
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE_ADJOINT) :: &
-    b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core
-
-  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_AND_ATT) :: &
-    b_R_memory_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ADJOINT) :: &
-    b_epsilondev_crust_mantle
-
-  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STR_AND_ATT) :: &
-    b_R_memory_inner_core
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ADJOINT) :: &
-    b_epsilondev_inner_core
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_ROT_ADJOINT) :: &
-    b_A_array_rotation,b_B_array_rotation
-
-  character(len=150) LOCAL_PATH
-
   !local parameters
   character(len=150) outputname
 
   write(outputname,'(a,i6.6,a)') 'proc',myrank,'_save_forward_arrays.bin'
   open(unit=55,file=trim(LOCAL_PATH)//'/'//outputname,status='old',action='read',form='unformatted')
+  
   read(55) b_displ_crust_mantle
   read(55) b_veloc_crust_mantle
   read(55) b_accel_crust_mantle
@@ -234,16 +189,38 @@
   read(55) b_displ_outer_core
   read(55) b_veloc_outer_core
   read(55) b_accel_outer_core
-  read(55) b_epsilondev_crust_mantle
-  read(55) b_epsilondev_inner_core
+
+  read(55) b_epsilondev_xx_crust_mantle
+  read(55) b_epsilondev_yy_crust_mantle
+  read(55) b_epsilondev_xy_crust_mantle
+  read(55) b_epsilondev_xz_crust_mantle
+  read(55) b_epsilondev_yz_crust_mantle
+
+  read(55) b_epsilondev_xx_inner_core
+  read(55) b_epsilondev_yy_inner_core
+  read(55) b_epsilondev_xy_inner_core
+  read(55) b_epsilondev_xz_inner_core
+  read(55) b_epsilondev_yz_inner_core
+
   if (ROTATION_VAL) then
     read(55) b_A_array_rotation
     read(55) b_B_array_rotation
   endif
   if (ATTENUATION_VAL) then
-    read(55) b_R_memory_crust_mantle
-    read(55) b_R_memory_inner_core
+    read(55) b_R_xx_crust_mantle
+    read(55) b_R_yy_crust_mantle
+    read(55) b_R_xy_crust_mantle
+    read(55) b_R_xz_crust_mantle
+    read(55) b_R_yz_crust_mantle
+
+    read(55) b_R_xx_inner_core
+    read(55) b_R_yy_inner_core
+    read(55) b_R_xy_inner_core
+    read(55) b_R_xz_inner_core
+    read(55) b_R_yz_inner_core
+
   endif
+  
   close(55)
 
   end subroutine read_forward_arrays

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -25,136 +25,97 @@
 !
 !=====================================================================
 
-  subroutine read_mesh_databases(myrank,rho_vp_crust_mantle,rho_vs_crust_mantle, &
-            xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
-            xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-            etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
-            gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
-            rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
-            kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-            nspec_iso,nspec_tiso,nspec_ani, &
-            c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle, &
-            c14store_crust_mantle,c15store_crust_mantle,c16store_crust_mantle, &
-            c22store_crust_mantle,c23store_crust_mantle,c24store_crust_mantle, &
-            c25store_crust_mantle,c26store_crust_mantle,c33store_crust_mantle, &
-            c34store_crust_mantle,c35store_crust_mantle,c36store_crust_mantle, &
-            c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-            c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-            ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
-         ! -- idoubling_crust_mantle
-            is_on_a_slice_edge_crust_mantle,rmass_crust_mantle,rmass_ocean_load, &
-            vp_outer_core,xstore_outer_core,ystore_outer_core,zstore_outer_core, &
-            xix_outer_core,xiy_outer_core,xiz_outer_core, &
-            etax_outer_core,etay_outer_core,etaz_outer_core, &
-            gammax_outer_core,gammay_outer_core,gammaz_outer_core, &
-            rhostore_outer_core,kappavstore_outer_core, &
-            ibool_outer_core,idoubling_outer_core,ispec_is_tiso_outer_core, &
-            is_on_a_slice_edge_outer_core,rmass_outer_core, &
-            xstore_inner_core,ystore_inner_core,zstore_inner_core, &
-            xix_inner_core,xiy_inner_core,xiz_inner_core, &
-            etax_inner_core,etay_inner_core,etaz_inner_core, &
-            gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
-            rhostore_inner_core,kappavstore_inner_core,muvstore_inner_core, &
-            c11store_inner_core,c12store_inner_core,c13store_inner_core, &
-            c33store_inner_core,c44store_inner_core, &
-            ibool_inner_core,idoubling_inner_core,ispec_is_tiso_inner_core, &
-            is_on_a_slice_edge_inner_core,rmass_inner_core, &
-            ABSORBING_CONDITIONS,LOCAL_PATH)
+  subroutine read_mesh_databases()
 
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
   implicit none
 
-  include "constants.h"
-  include "OUTPUT_FILES/values_from_mesher.h"
+  include 'mpif.h'
+  
+  ! local parameters
+  integer :: ier
 
-  integer myrank
+  ! get MPI starting time
+  time_start = MPI_WTIME()
 
-  ! Stacey
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STACEY) :: &
-    rho_vp_crust_mantle,rho_vs_crust_mantle
+  ! allocates temporary arrays
+  allocate( is_on_a_slice_edge_crust_mantle(NSPEC_CRUST_MANTLE), &
+           is_on_a_slice_edge_inner_core(NSPEC_INNER_CORE), &
+           is_on_a_slice_edge_outer_core(NSPEC_OUTER_CORE), &
+           stat=ier)
+  if( ier /= 0 ) call exit_mpi(myrank,'error allocating temporary is_on_a_slice_edge arrays')
 
-  ! mesh parameters
-  real(kind=CUSTOM_REAL), dimension(NGLOB_CRUST_MANTLE) :: &
-        xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: &
-        xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle,&
-        etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
-        gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ISO_MANTLE) :: &
-        rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle
-  ! arrays for anisotropic elements stored only where needed to save space
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_TISO_MANTLE) :: &
-        kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle
+  ! start reading the databases
+  ! read arrays created by the mesher
 
-  ! arrays for full anisotropy only when needed
-  integer nspec_iso,nspec_tiso,nspec_ani
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ANISO_MANTLE) :: &
-        c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle, &
-        c14store_crust_mantle,c15store_crust_mantle,c16store_crust_mantle, &
-        c22store_crust_mantle,c23store_crust_mantle,c24store_crust_mantle, &
-        c25store_crust_mantle,c26store_crust_mantle,c33store_crust_mantle, &
-        c34store_crust_mantle,c35store_crust_mantle,c36store_crust_mantle, &
-        c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-        c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle
+  ! reads "solver_data_1.bin" & "solver_data_2.bin" files for crust and mantle
+  call read_mesh_databases_CM()
+  
+  ! reads "solver_data_1.bin" & "solver_data_2.bin" files for outer core
+  call read_mesh_databases_OC()
+  
+  ! reads "solver_data_1.bin" & "solver_data_2.bin" files for inner core
+  call read_mesh_databases_IC()
 
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: ibool_crust_mantle
+  ! reads "boundary.bin" files to couple mantle with outer core and inner core boundaries
+  call read_mesh_databases_coupling()
+  
+  ! reads "addressing.txt" 2-D addressing for summation between slices with MPI
+  call read_mesh_databases_addressing()
 
-!  integer, dimension(NSPEC_CRUST_MANTLE) :: idoubling_crust_mantle
-  logical, dimension(NSPEC_CRUST_MANTLE) :: ispec_is_tiso_crust_mantle
+  ! reads "iboolleft_..txt", "iboolright_..txt" (and "list_messages_..txt", "buffer_...txt") files and sets up MPI buffers
+  call read_mesh_databases_MPIbuffers()
 
-  ! mass matrix
-  real(kind=CUSTOM_REAL), dimension(NGLOB_CRUST_MANTLE) :: rmass_crust_mantle
-  ! additional mass matrix for ocean load
-  real(kind=CUSTOM_REAL), dimension(NGLOB_CRUST_MANTLE_OCEANS) :: rmass_ocean_load
+  ! sets up MPI interfaces
+  call read_mesh_databases_MPIinter()
 
-  ! stacy outer core
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_STACEY) :: vp_outer_core
-  ! mesh parameters
-  real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE) :: &
-        xstore_outer_core,ystore_outer_core,zstore_outer_core
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE) :: &
-        xix_outer_core,xiy_outer_core,xiz_outer_core,&
-        etax_outer_core,etay_outer_core,etaz_outer_core, &
-        gammax_outer_core,gammay_outer_core,gammaz_outer_core
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE) :: &
-        rhostore_outer_core,kappavstore_outer_core
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE) :: ibool_outer_core
-  integer, dimension(NSPEC_OUTER_CORE) :: idoubling_outer_core
-  logical, dimension(NSPEC_OUTER_CORE) :: ispec_is_tiso_outer_core
+  ! sets up inner/outer element arrays
+  call read_mesh_databases_InnerOuter()
 
-  real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE) :: rmass_outer_core
+  ! absorbing boundaries
+  if(ABSORBING_CONDITIONS) then
+    ! reads "stacey.bin" files and sets up arrays for Stacey conditions
+    call read_mesh_databases_stacey()
+  endif
 
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: &
-        xix_inner_core,xiy_inner_core,xiz_inner_core,&
-        etax_inner_core,etay_inner_core,etaz_inner_core, &
-        gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
-        rhostore_inner_core, kappavstore_inner_core,muvstore_inner_core
-  real(kind=CUSTOM_REAL), dimension(NGLOB_INNER_CORE) :: &
-        xstore_inner_core,ystore_inner_core,zstore_inner_core
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ANISO_IC) :: &
-        c11store_inner_core,c33store_inner_core,c12store_inner_core, &
-        c13store_inner_core,c44store_inner_core
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: ibool_inner_core
-  integer, dimension(NSPEC_INNER_CORE) :: idoubling_inner_core
-  logical, dimension(NSPEC_INNER_CORE) :: ispec_is_tiso_inner_core
+  ! user output
+  if( myrank == 0 ) then
+    ! elapsed time since beginning of mesh generation
+    tCPU = MPI_WTIME() - time_start
+    write(IMAIN,*)
+    write(IMAIN,*) 'Elapsed time for reading mesh in seconds = ',sngl(tCPU)
+    write(IMAIN,*)
+  endif
+  
+  ! frees temporary allocated arrays
+  deallocate(is_on_a_slice_edge_crust_mantle, &
+            is_on_a_slice_edge_outer_core, &
+            is_on_a_slice_edge_inner_core)
 
-  real(kind=CUSTOM_REAL), dimension(NGLOB_INNER_CORE) :: rmass_inner_core
+  end subroutine read_mesh_databases
 
-  logical ABSORBING_CONDITIONS
-  character(len=150) LOCAL_PATH
+!
+!-------------------------------------------------------------------------------------------------
+!
 
-  !local parameters
-  logical READ_KAPPA_MU,READ_TISO
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,1) :: dummy_array
-  integer, dimension(NSPEC_CRUST_MANTLE) :: dummy_i
+  subroutine read_mesh_databases_CM()
 
-! this for non blocking MPI
-  logical, dimension(NSPEC_CRUST_MANTLE) :: is_on_a_slice_edge_crust_mantle
-  logical, dimension(NSPEC_OUTER_CORE) :: is_on_a_slice_edge_outer_core
-  logical, dimension(NSPEC_INNER_CORE) :: is_on_a_slice_edge_inner_core
+! mesh for CRUST MANTLE region
 
-  ! start reading the databases
-  ! read arrays created by the mesher
+  use specfem_par
+  use specfem_par_crustmantle
+  implicit none
 
+  ! local parameters
+  integer :: nspec_iso,nspec_tiso,nspec_ani
+  logical :: READ_KAPPA_MU,READ_TISO
+
+  ! dummy array that does not need to be actually read
+  integer, dimension(NSPEC_CRUST_MANTLE) :: dummy_i
+
   ! crust and mantle
   if(ANISOTROPIC_3D_MANTLE_VAL) then
     READ_KAPPA_MU = .false.
@@ -173,6 +134,7 @@
     READ_KAPPA_MU = .true.
     READ_TISO = .true.
   endif
+  
   call read_arrays_solver(IREGION_CRUST_MANTLE,myrank, &
             rho_vp_crust_mantle,rho_vs_crust_mantle, &
             xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
@@ -190,13 +152,42 @@
             c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
             c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
             ibool_crust_mantle,dummy_i, &
-          ! --idoubling_crust_mantle,
             ispec_is_tiso_crust_mantle, &
             is_on_a_slice_edge_crust_mantle,rmass_crust_mantle,rmass_ocean_load, &
             NSPEC_CRUST_MANTLE,NGLOB_CRUST_MANTLE, &
             READ_KAPPA_MU,READ_TISO,TRANSVERSE_ISOTROPY_VAL,ANISOTROPIC_3D_MANTLE_VAL, &
             ANISOTROPIC_INNER_CORE_VAL,OCEANS_VAL,LOCAL_PATH,ABSORBING_CONDITIONS)
 
+  ! check that the number of points in this slice is correct
+  if(minval(ibool_crust_mantle(:,:,:,:)) /= 1 .or. &
+    maxval(ibool_crust_mantle(:,:,:,:)) /= NGLOB_CRUST_MANTLE) &
+      call exit_MPI(myrank,'incorrect global numbering: iboolmax does not equal nglob in crust and mantle')
+
+
+  end subroutine read_mesh_databases_CM
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine read_mesh_databases_OC()
+
+! mesh for OUTER CORE region
+
+  use specfem_par
+  use specfem_par_outercore
+  implicit none
+
+  ! local parameters
+  integer :: nspec_iso,nspec_tiso,nspec_ani
+  logical :: READ_KAPPA_MU,READ_TISO
+  
+  ! dummy array that does not need to be actually read
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,1) :: dummy_array
+
+  logical, dimension(:),allocatable:: dummy_ispec_is_tiso
+  integer, dimension(:),allocatable :: dummy_idoubling_outer_core
+  
   ! outer core (no anisotropy nor S velocity)
   ! rmass_ocean_load is not used in this routine because it is meaningless in the outer core
   READ_KAPPA_MU = .false.
@@ -204,6 +195,10 @@
   nspec_iso = NSPEC_OUTER_CORE
   nspec_tiso = 1
   nspec_ani = 1
+  
+  ! dummy allocation
+  allocate(dummy_ispec_is_tiso(NSPEC_OUTER_CORE))
+  allocate(dummy_idoubling_outer_core(NSPEC_OUTER_CORE))
 
   call read_arrays_solver(IREGION_OUTER_CORE,myrank, &
             vp_outer_core,dummy_array, &
@@ -221,12 +216,42 @@
             dummy_array,dummy_array,dummy_array, &
             dummy_array,dummy_array,dummy_array, &
             dummy_array,dummy_array,dummy_array, &
-            ibool_outer_core,idoubling_outer_core,ispec_is_tiso_outer_core, &
+            ibool_outer_core,dummy_idoubling_outer_core,dummy_ispec_is_tiso, &
             is_on_a_slice_edge_outer_core,rmass_outer_core,rmass_ocean_load, &
             NSPEC_OUTER_CORE,NGLOB_OUTER_CORE, &
             READ_KAPPA_MU,READ_TISO,TRANSVERSE_ISOTROPY_VAL,ANISOTROPIC_3D_MANTLE_VAL, &
             ANISOTROPIC_INNER_CORE_VAL,OCEANS_VAL,LOCAL_PATH,ABSORBING_CONDITIONS)
 
+  deallocate(dummy_idoubling_outer_core)
+  deallocate(dummy_ispec_is_tiso)
+
+  ! check that the number of points in this slice is correct
+  if(minval(ibool_outer_core(:,:,:,:)) /= 1 .or. &
+     maxval(ibool_outer_core(:,:,:,:)) /= NGLOB_OUTER_CORE) &
+    call exit_MPI(myrank,'incorrect global numbering: iboolmax does not equal nglob in outer core')
+
+  end subroutine read_mesh_databases_OC
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine read_mesh_databases_IC()
+
+! mesh for INNER CORE region
+
+  use specfem_par
+  use specfem_par_innercore
+  implicit none
+
+  ! local parameters
+  integer :: nspec_iso,nspec_tiso,nspec_ani
+  logical :: READ_KAPPA_MU,READ_TISO
+
+  ! dummy array that does not need to be actually read
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,1) :: dummy_array
+  logical, dimension(:),allocatable:: dummy_ispec_is_tiso
+  
   ! inner core (no anisotropy)
   ! rmass_ocean_load is not used in this routine because it is meaningless in the inner core
   READ_KAPPA_MU = .true.
@@ -238,7 +263,10 @@
   else
     nspec_ani = 1
   endif
-
+  
+  ! dummy allocation
+  allocate(dummy_ispec_is_tiso(NSPEC_INNER_CORE))
+  
   call read_arrays_solver(IREGION_INNER_CORE,myrank, &
             dummy_array,dummy_array, &
             xstore_inner_core,ystore_inner_core,zstore_inner_core, &
@@ -255,104 +283,193 @@
             dummy_array,dummy_array,dummy_array, &
             c44store_inner_core,dummy_array,dummy_array, &
             dummy_array,dummy_array,dummy_array, &
-            ibool_inner_core,idoubling_inner_core,ispec_is_tiso_inner_core, &
+            ibool_inner_core,idoubling_inner_core,dummy_ispec_is_tiso, &
             is_on_a_slice_edge_inner_core,rmass_inner_core,rmass_ocean_load, &
             NSPEC_INNER_CORE,NGLOB_INNER_CORE, &
             READ_KAPPA_MU,READ_TISO,TRANSVERSE_ISOTROPY_VAL,ANISOTROPIC_3D_MANTLE_VAL, &
             ANISOTROPIC_INNER_CORE_VAL,OCEANS_VAL,LOCAL_PATH,ABSORBING_CONDITIONS)
 
+  deallocate(dummy_ispec_is_tiso)
+
   ! check that the number of points in this slice is correct
-  if(minval(ibool_crust_mantle(:,:,:,:)) /= 1 .or. &
-    maxval(ibool_crust_mantle(:,:,:,:)) /= NGLOB_CRUST_MANTLE) &
-      call exit_MPI(myrank,'incorrect global numbering: iboolmax does not equal nglob in crust and mantle')
-
-  if(minval(ibool_outer_core(:,:,:,:)) /= 1 .or. &
-     maxval(ibool_outer_core(:,:,:,:)) /= NGLOB_OUTER_CORE) &
-    call exit_MPI(myrank,'incorrect global numbering: iboolmax does not equal nglob in outer core')
-
   if(minval(ibool_inner_core(:,:,:,:)) /= 1 .or. maxval(ibool_inner_core(:,:,:,:)) /= NGLOB_INNER_CORE) &
     call exit_MPI(myrank,'incorrect global numbering: iboolmax does not equal nglob in inner core')
 
-  end subroutine read_mesh_databases
+  end subroutine read_mesh_databases_IC
 
 !
 !-------------------------------------------------------------------------------------------------
 !
 
+  subroutine read_mesh_databases_coupling()
 
-  subroutine read_mesh_databases_addressing(myrank, &
-                    iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle, &
-                    iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-                    npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-                    iboolfaces_crust_mantle,npoin2D_faces_crust_mantle, &
-                    iboolcorner_crust_mantle, &
-                    iboolleft_xi_outer_core,iboolright_xi_outer_core, &
-                    iboolleft_eta_outer_core,iboolright_eta_outer_core, &
-                    npoin2D_xi_outer_core,npoin2D_eta_outer_core,&
-                    iboolfaces_outer_core,npoin2D_faces_outer_core, &
-                    iboolcorner_outer_core, &
-                    iboolleft_xi_inner_core,iboolright_xi_inner_core, &
-                    iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-                    npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-                    iboolfaces_inner_core,npoin2D_faces_inner_core, &
-                    iboolcorner_inner_core, &
-                    iprocfrom_faces,iprocto_faces,imsg_type, &
-                    iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-                    LOCAL_PATH,OUTPUT_FILES, &
-                    NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX,NGLOB1D_RADIAL, &
-                    NGLOB2DMAX_XY,NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
-                    addressing,ichunk_slice,iproc_xi_slice,iproc_eta_slice, &
-                    ichunk,iproc_xi,iproc_eta)
+! to couple mantle with outer core
 
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
+
   implicit none
 
   include 'mpif.h'
-  include "constants.h"
-  include "OUTPUT_FILES/values_from_mesher.h"
+  
+  ! local parameters
+  integer njunk1,njunk2,njunk3
 
-  integer myrank
+  ! crust and mantle
+  ! create name of database
+  call create_name_database(prname,myrank,IREGION_CRUST_MANTLE,LOCAL_PATH)
 
-  ! 2-D addressing and buffers for summation between slices
-  integer, dimension(NGLOB2DMAX_XMIN_XMAX_CM) :: iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle
-  integer, dimension(NGLOB2DMAX_YMIN_YMAX_CM) :: iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle
-  integer, dimension(NGLOB2DMAX_XMIN_XMAX_OC) :: iboolleft_xi_outer_core,iboolright_xi_outer_core
-  integer, dimension(NGLOB2DMAX_YMIN_YMAX_OC) :: iboolleft_eta_outer_core,iboolright_eta_outer_core
-  integer, dimension(NGLOB2DMAX_XMIN_XMAX_IC) :: iboolleft_xi_inner_core,iboolright_xi_inner_core
-  integer, dimension(NGLOB2DMAX_YMIN_YMAX_IC) :: iboolleft_eta_inner_core,iboolright_eta_inner_core
+  ! Stacey put back
+  open(unit=27,file=prname(1:len_trim(prname))//'boundary.bin', &
+        status='old',form='unformatted',action='read')
+  read(27) nspec2D_xmin_crust_mantle
+  read(27) nspec2D_xmax_crust_mantle
+  read(27) nspec2D_ymin_crust_mantle
+  read(27) nspec2D_ymax_crust_mantle
+  read(27) njunk1
+  read(27) njunk2
 
-  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle
-  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_outer_core,npoin2D_eta_outer_core
-  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_inner_core,npoin2D_eta_inner_core
+! boundary parameters
+  read(27) ibelm_xmin_crust_mantle
+  read(27) ibelm_xmax_crust_mantle
+  read(27) ibelm_ymin_crust_mantle
+  read(27) ibelm_ymax_crust_mantle
+  read(27) ibelm_bottom_crust_mantle
+  read(27) ibelm_top_crust_mantle
 
-  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_crust_mantle, &
-      iboolfaces_outer_core,iboolfaces_inner_core
+  read(27) normal_xmin_crust_mantle
+  read(27) normal_xmax_crust_mantle
+  read(27) normal_ymin_crust_mantle
+  read(27) normal_ymax_crust_mantle
+  read(27) normal_bottom_crust_mantle
+  read(27) normal_top_crust_mantle
 
-  integer npoin2D_faces_crust_mantle(NUMFACES_SHARED)
-  integer npoin2D_faces_outer_core(NUMFACES_SHARED)
-  integer npoin2D_faces_inner_core(NUMFACES_SHARED)
+  read(27) jacobian2D_xmin_crust_mantle
+  read(27) jacobian2D_xmax_crust_mantle
+  read(27) jacobian2D_ymin_crust_mantle
+  read(27) jacobian2D_ymax_crust_mantle
+  read(27) jacobian2D_bottom_crust_mantle
+  read(27) jacobian2D_top_crust_mantle
+  close(27)
 
-  ! indirect addressing for each corner of the chunks
-  integer, dimension(NGLOB1D_RADIAL_CM,NUMCORNERS_SHARED) :: iboolcorner_crust_mantle
-  integer, dimension(NGLOB1D_RADIAL_OC,NUMCORNERS_SHARED) :: iboolcorner_outer_core
-  integer, dimension(NGLOB1D_RADIAL_IC,NUMCORNERS_SHARED) :: iboolcorner_inner_core
 
-  ! communication pattern for faces between chunks
-  integer, dimension(NUMMSGS_FACES_VAL) :: iprocfrom_faces,iprocto_faces,imsg_type
-  ! communication pattern for corners between chunks
-  integer, dimension(NCORNERSCHUNKS_VAL) :: iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners
+  ! read parameters to couple fluid and solid regions
+  !
+  ! outer core
 
-  character(len=150) LOCAL_PATH,OUTPUT_FILES
+  ! create name of database
+  call create_name_database(prname,myrank,IREGION_OUTER_CORE,LOCAL_PATH)
 
-  integer, dimension(MAX_NUM_REGIONS) :: NGLOB1D_RADIAL,NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX
-  integer NGLOB2DMAX_XY
+  ! boundary parameters
 
-  integer NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS
+  ! Stacey put back
+  open(unit=27,file=prname(1:len_trim(prname))//'boundary.bin', &
+        status='old',form='unformatted',action='read')
+  read(27) nspec2D_xmin_outer_core
+  read(27) nspec2D_xmax_outer_core
+  read(27) nspec2D_ymin_outer_core
+  read(27) nspec2D_ymax_outer_core
+  read(27) njunk1
+  read(27) njunk2
 
-  ! for addressing of the slices
-  integer, dimension(NCHUNKS_VAL,0:NPROC_XI_VAL-1,0:NPROC_ETA_VAL-1) :: addressing
-  integer, dimension(0:NPROCTOT_VAL-1) :: ichunk_slice,iproc_xi_slice,iproc_eta_slice
-  integer ichunk,iproc_xi,iproc_eta
+  read(27) ibelm_xmin_outer_core
+  read(27) ibelm_xmax_outer_core
+  read(27) ibelm_ymin_outer_core
+  read(27) ibelm_ymax_outer_core
+  read(27) ibelm_bottom_outer_core
+  read(27) ibelm_top_outer_core
 
+  read(27) normal_xmin_outer_core
+  read(27) normal_xmax_outer_core
+  read(27) normal_ymin_outer_core
+  read(27) normal_ymax_outer_core
+  read(27) normal_bottom_outer_core
+  read(27) normal_top_outer_core
+
+  read(27) jacobian2D_xmin_outer_core
+  read(27) jacobian2D_xmax_outer_core
+  read(27) jacobian2D_ymin_outer_core
+  read(27) jacobian2D_ymax_outer_core
+  read(27) jacobian2D_bottom_outer_core
+  read(27) jacobian2D_top_outer_core
+  close(27)
+
+
+  !
+  ! inner core
+  !
+
+  ! create name of database
+  call create_name_database(prname,myrank,IREGION_INNER_CORE,LOCAL_PATH)
+
+  ! read info for vertical edges for central cube matching in inner core
+  open(unit=27,file=prname(1:len_trim(prname))//'boundary.bin', &
+        status='old',form='unformatted',action='read')
+  read(27) nspec2D_xmin_inner_core
+  read(27) nspec2D_xmax_inner_core
+  read(27) nspec2D_ymin_inner_core
+  read(27) nspec2D_ymax_inner_core
+  read(27) njunk1
+  read(27) njunk2
+
+  ! boundary parameters
+  read(27) ibelm_xmin_inner_core
+  read(27) ibelm_xmax_inner_core
+  read(27) ibelm_ymin_inner_core
+  read(27) ibelm_ymax_inner_core
+  read(27) ibelm_bottom_inner_core
+  read(27) ibelm_top_inner_core
+  close(27)
+
+
+  ! -- Boundary Mesh for crust and mantle ---
+  if (SAVE_BOUNDARY_MESH .and. SIMULATION_TYPE == 3) then
+
+    call create_name_database(prname,myrank,IREGION_CRUST_MANTLE,LOCAL_PATH)
+
+    open(unit=27,file=prname(1:len_trim(prname))//'boundary_disc.bin', &
+          status='old',form='unformatted',action='read')
+    read(27) njunk1,njunk2,njunk3
+    if (njunk1 /= NSPEC2D_MOHO .and. njunk2 /= NSPEC2D_400 .and. njunk3 /= NSPEC2D_670) &
+               call exit_mpi(myrank, 'Error reading ibelm_disc.bin file')
+    read(27) ibelm_moho_top
+    read(27) ibelm_moho_bot
+    read(27) ibelm_400_top
+    read(27) ibelm_400_bot
+    read(27) ibelm_670_top
+    read(27) ibelm_670_bot
+    read(27) normal_moho
+    read(27) normal_400
+    read(27) normal_670
+    close(27)
+
+    k_top = 1
+    k_bot = NGLLZ
+
+    ! initialization
+    moho_kl = 0.; d400_kl = 0.; d670_kl = 0.; cmb_kl = 0.; icb_kl = 0.
+  endif
+
+  end subroutine read_mesh_databases_coupling
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+
+  subroutine read_mesh_databases_addressing()
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
+
+  implicit none
+
+  include 'mpif.h'
+
   ! local parameters
   integer :: ier,iproc,iproc_read
   integer :: NUM_FACES,NPROC_ONE_DIRECTION
@@ -462,12 +579,32 @@
   ! debug checks with compiled value
   !if( NUMMSGS_FACES /= NUMMSGS_FACES_VAL ) then
   !  print*,'check: NUMMSGS_FACES',NUMMSGS_FACES,NUMMSGS_FACES_VAL
-  !  stop 'error NUMMSGS_FACES_VAL, please recompile solver'
+  !  call exit_mpi(myrank,'error NUMMSGS_FACES_VAL, please recompile solver')
   !endif
 
+  end subroutine read_mesh_databases_addressing
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine read_mesh_databases_MPIbuffers()
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore  
+  implicit none
+
+  ! local parameters
+  integer :: ier
+  character(len=150) :: filename
+
   ! read 2-D addressing for summation between slices with MPI
 
   ! mantle and crust
+  if(myrank == 0) write(IMAIN,*) 'crust/mantle region:'
+  
   call read_arrays_buffers_solver(IREGION_CRUST_MANTLE,myrank,iboolleft_xi_crust_mantle, &
      iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
      npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
@@ -480,6 +617,8 @@
      NUMMSGS_FACES,NCORNERSCHUNKS,NPROCTOT_VAL,NPROC_XI_VAL,NPROC_ETA_VAL,LOCAL_PATH,NCHUNKS_VAL)
 
   ! outer core
+  if(myrank == 0) write(IMAIN,*) 'outer core region:'
+
   call read_arrays_buffers_solver(IREGION_OUTER_CORE,myrank, &
      iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
      npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
@@ -492,6 +631,8 @@
      NUMMSGS_FACES,NCORNERSCHUNKS,NPROCTOT_VAL,NPROC_XI_VAL,NPROC_ETA_VAL,LOCAL_PATH,NCHUNKS_VAL)
 
   ! inner core
+  if(myrank == 0) write(IMAIN,*) 'inner core region:'
+  
   call read_arrays_buffers_solver(IREGION_INNER_CORE,myrank, &
      iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
      npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
@@ -503,322 +644,1326 @@
      NGLOB2DMAX_YMIN_YMAX(IREGION_INNER_CORE),NGLOB2DMAX_XY,NGLOB1D_RADIAL(IREGION_INNER_CORE), &
      NUMMSGS_FACES,NCORNERSCHUNKS,NPROCTOT_VAL,NPROC_XI_VAL,NPROC_ETA_VAL,LOCAL_PATH,NCHUNKS_VAL)
 
+  ! added this to reduce the size of the buffers
+  ! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
+  npoin2D_max_all_CM_IC = max(maxval(npoin2D_xi_crust_mantle(:) + npoin2D_xi_inner_core(:)), &
+                        maxval(npoin2D_eta_crust_mantle(:) + npoin2D_eta_inner_core(:)))
 
-  end subroutine read_mesh_databases_addressing
+  allocate(buffer_send_faces(NDIM,npoin2D_max_all_CM_IC,NUMFACES_SHARED), &
+          buffer_received_faces(NDIM,npoin2D_max_all_CM_IC,NUMFACES_SHARED),stat=ier)
+  if( ier /= 0 ) call exit_MPI(myrank,'error allocating mpi buffer')
 
+  allocate(b_buffer_send_faces(NDIM,npoin2D_max_all_CM_IC,NUMFACES_SHARED), &
+          b_buffer_received_faces(NDIM,npoin2D_max_all_CM_IC,NUMFACES_SHARED),stat=ier)
+  if( ier /= 0 ) call exit_MPI(myrank,'error allocating mpi b_buffer')
 
+  ! central cube buffers
+  if(INCLUDE_CENTRAL_CUBE) then
+
+    if(myrank == 0) write(IMAIN,*) 'including central cube'
+
+    ! compute number of messages to expect in cube as well as their size
+    call comp_central_cube_buffer_size(iproc_xi,iproc_eta,ichunk, &
+                NPROC_XI_VAL,NPROC_ETA_VAL,NSPEC2D_BOTTOM(IREGION_INNER_CORE), &
+                nb_msgs_theor_in_cube,npoin2D_cube_from_slices)
+
+    ! this value is used for dynamic memory allocation, therefore make sure it is never zero
+    if(nb_msgs_theor_in_cube > 0) then
+      non_zero_nb_msgs_theor_in_cube = nb_msgs_theor_in_cube
+    else
+      non_zero_nb_msgs_theor_in_cube = 1
+    endif
+
+    ! allocate buffers for cube and slices
+    allocate(sender_from_slices_to_cube(non_zero_nb_msgs_theor_in_cube), &
+            buffer_all_cube_from_slices(non_zero_nb_msgs_theor_in_cube,npoin2D_cube_from_slices,NDIM), &
+            b_buffer_all_cube_from_slices(non_zero_nb_msgs_theor_in_cube,npoin2D_cube_from_slices,NDIM), &
+            buffer_slices(npoin2D_cube_from_slices,NDIM), &
+            b_buffer_slices(npoin2D_cube_from_slices,NDIM), &
+            buffer_slices2(npoin2D_cube_from_slices,NDIM), &
+            ibool_central_cube(non_zero_nb_msgs_theor_in_cube,npoin2D_cube_from_slices),stat=ier)
+    if( ier /= 0 ) call exit_MPI(myrank,'error allocating cube buffers')
+
+    ! handles the communications with the central cube if it was included in the mesh
+    ! create buffers to assemble with the central cube
+    call create_central_cube_buffers(myrank,iproc_xi,iproc_eta,ichunk, &
+               NPROC_XI_VAL,NPROC_ETA_VAL,NCHUNKS_VAL,NSPEC_INNER_CORE,NGLOB_INNER_CORE, &
+               NSPEC2DMAX_XMIN_XMAX(IREGION_INNER_CORE),NSPEC2DMAX_YMIN_YMAX(IREGION_INNER_CORE), &
+               NSPEC2D_BOTTOM(IREGION_INNER_CORE), &
+               addressing,ibool_inner_core,idoubling_inner_core, &
+               xstore_inner_core,ystore_inner_core,zstore_inner_core, &
+               nspec2D_xmin_inner_core,nspec2D_xmax_inner_core, &
+               nspec2D_ymin_inner_core,nspec2D_ymax_inner_core, &
+               ibelm_xmin_inner_core,ibelm_xmax_inner_core, &
+               ibelm_ymin_inner_core,ibelm_ymax_inner_core,ibelm_bottom_inner_core, &
+               nb_msgs_theor_in_cube,non_zero_nb_msgs_theor_in_cube,npoin2D_cube_from_slices, &
+               receiver_cube_from_slices,sender_from_slices_to_cube,ibool_central_cube, &
+               buffer_slices,buffer_slices2,buffer_all_cube_from_slices)
+
+    if(myrank == 0) write(IMAIN,*) ''
+
+  else
+
+    ! allocate fictitious buffers for cube and slices with a dummy size
+    ! just to be able to use them as arguments in subroutine calls
+    allocate(sender_from_slices_to_cube(1), &
+            buffer_all_cube_from_slices(1,1,1), &
+            b_buffer_all_cube_from_slices(1,1,1), &
+            buffer_slices(1,1), &
+            b_buffer_slices(1,1), &
+            buffer_slices2(1,1), &
+            ibool_central_cube(1,1),stat=ier)
+    if( ier /= 0 ) call exit_MPI(myrank,'error allocating dummy buffers')
+
+  endif
+
+  ! note: the fix_... routines below update the is_on_a_slice_edge_... arrays:
+  !          they assign a flag to each element that lies on a rim of the slice;
+  !          thus, they include elements on top and bottom that are not shared with other MPI partitions
+  !
+  !          we will re-set these flags when setting up inner/outer elements, but for now we
+  !          use these arrays as an initial guess for the search for elements that share a global point
+  !          between different MPI processes
+  call fix_non_blocking_slices(is_on_a_slice_edge_crust_mantle,iboolright_xi_crust_mantle, &
+         iboolleft_xi_crust_mantle,iboolright_eta_crust_mantle,iboolleft_eta_crust_mantle, &
+         npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle,ibool_crust_mantle, &
+         mask_ibool,NSPEC_CRUST_MANTLE,NGLOB_CRUST_MANTLE,NGLOB2DMAX_XMIN_XMAX_CM,NGLOB2DMAX_YMIN_YMAX_CM)
+
+  call fix_non_blocking_slices(is_on_a_slice_edge_outer_core,iboolright_xi_outer_core, &
+         iboolleft_xi_outer_core,iboolright_eta_outer_core,iboolleft_eta_outer_core, &
+         npoin2D_xi_outer_core,npoin2D_eta_outer_core,ibool_outer_core, &
+         mask_ibool,NSPEC_OUTER_CORE,NGLOB_OUTER_CORE,NGLOB2DMAX_XMIN_XMAX_OC,NGLOB2DMAX_YMIN_YMAX_OC)
+
+  call fix_non_blocking_slices(is_on_a_slice_edge_inner_core,iboolright_xi_inner_core, &
+         iboolleft_xi_inner_core,iboolright_eta_inner_core,iboolleft_eta_inner_core, &
+         npoin2D_xi_inner_core,npoin2D_eta_inner_core,ibool_inner_core, &
+         mask_ibool,NSPEC_INNER_CORE,NGLOB_INNER_CORE,NGLOB2DMAX_XMIN_XMAX_IC,NGLOB2DMAX_YMIN_YMAX_IC)
+
+  if(INCLUDE_CENTRAL_CUBE) then
+    ! updates flags for elements on slice boundaries
+    call fix_non_blocking_central_cube(is_on_a_slice_edge_inner_core, &
+         ibool_inner_core,NSPEC_INNER_CORE,NGLOB_INNER_CORE,nb_msgs_theor_in_cube,ibelm_bottom_inner_core, &
+         idoubling_inner_core,npoin2D_cube_from_slices,ibool_central_cube, &
+         NSPEC2D_BOTTOM(IREGION_INNER_CORE),ichunk)
+  endif
+  
+  ! debug: saves element flags
+  ! crust mantle
+  !write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_is_on_a_slice_edge_crust_mantle_proc',myrank  
+  !call write_VTK_data_elem_l(NSPEC_CRUST_MANTLE,NGLOB_CRUST_MANTLE, &
+  !                          xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
+  !                          ibool_crust_mantle, &
+  !                          is_on_a_slice_edge_crust_mantle,filename)  
+  ! outer core
+  !write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_is_on_a_slice_edge_outer_core_proc',myrank  
+  !call write_VTK_data_elem_l(NSPEC_OUTER_CORE,NGLOB_OUTER_CORE, &
+  !                          xstore_outer_core,ystore_outer_core,zstore_outer_core, &
+  !                          ibool_outer_core, &
+  !                          is_on_a_slice_edge_outer_core,filename)
+!daniel  
+  ! inner core
+  write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_is_on_a_slice_edge_inner_core_proc',myrank  
+  call write_VTK_data_elem_l(NSPEC_INNER_CORE,NGLOB_INNER_CORE, &
+                            xstore_inner_core,ystore_inner_core,zstore_inner_core, &
+                            ibool_inner_core, &
+                            is_on_a_slice_edge_inner_core,filename)
+  
+  end subroutine read_mesh_databases_MPIbuffers
+  
+  
 !
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine read_mesh_databases_coupling(myrank, &
-              nspec2D_xmin_crust_mantle,nspec2D_xmax_crust_mantle, &
-              nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle, &
-              ibelm_xmin_crust_mantle,ibelm_xmax_crust_mantle,ibelm_ymin_crust_mantle, &
-              ibelm_ymax_crust_mantle,ibelm_bottom_crust_mantle,ibelm_top_crust_mantle, &
-              normal_xmin_crust_mantle,normal_xmax_crust_mantle,normal_ymin_crust_mantle, &
-              normal_ymax_crust_mantle,normal_bottom_crust_mantle,normal_top_crust_mantle, &
-              jacobian2D_xmin_crust_mantle,jacobian2D_xmax_crust_mantle,jacobian2D_ymin_crust_mantle, &
-              jacobian2D_ymax_crust_mantle,jacobian2D_bottom_crust_mantle,jacobian2D_top_crust_mantle, &
-              nspec2D_xmin_outer_core,nspec2D_xmax_outer_core, &
-              nspec2D_ymin_outer_core,nspec2D_ymax_outer_core, &
-              ibelm_xmin_outer_core,ibelm_xmax_outer_core,ibelm_ymin_outer_core, &
-              ibelm_ymax_outer_core,ibelm_bottom_outer_core,ibelm_top_outer_core, &
-              normal_xmin_outer_core,normal_xmax_outer_core,normal_ymin_outer_core, &
-              normal_ymax_outer_core,normal_bottom_outer_core,normal_top_outer_core, &
-              jacobian2D_xmin_outer_core,jacobian2D_xmax_outer_core,jacobian2D_ymin_outer_core, &
-              jacobian2D_ymax_outer_core,jacobian2D_bottom_outer_core,jacobian2D_top_outer_core, &
-              nspec2D_xmin_inner_core,nspec2D_xmax_inner_core, &
-              nspec2D_ymin_inner_core,nspec2D_ymax_inner_core, &
-              ibelm_xmin_inner_core,ibelm_xmax_inner_core,ibelm_ymin_inner_core, &
-              ibelm_ymax_inner_core,ibelm_bottom_inner_core,ibelm_top_inner_core, &
-              ibelm_moho_top,ibelm_moho_bot,ibelm_400_top,ibelm_400_bot, &
-              ibelm_670_top,ibelm_670_bot,normal_moho,normal_400,normal_670, &
-              k_top,k_bot,moho_kl,d400_kl,d670_kl,cmb_kl,icb_kl, &
-              LOCAL_PATH,SIMULATION_TYPE)
+  subroutine read_mesh_databases_MPIinter()
 
-! to couple mantle with outer core
+! sets up interfaces between MPI processes
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore  
   implicit none
 
   include 'mpif.h'
-  include "constants.h"
-  include "OUTPUT_FILES/values_from_mesher.h"
+  
+  ! local parameters  
+  integer :: ier,ndim_assemble
+  character(len=150) :: filename
 
-  integer myrank
+  ! temporary buffers for send and receive between faces of the slices and the chunks
+  real(kind=CUSTOM_REAL), dimension(npoin2D_max_all_CM_IC) ::  &
+    buffer_send_faces_scalar,buffer_received_faces_scalar
+  
+  ! assigns initial maximum arrays
+  ! for global slices, maximum number of neighbor is around 17 ( 8 horizontal, max of 8 on bottom )
+  integer, parameter :: MAX_NEIGHBOURS = 8 + NCORNERSCHUNKS_VAL
+  integer, dimension(MAX_NEIGHBOURS) :: my_neighbours,nibool_neighbours
+  integer, dimension(:,:),allocatable :: ibool_neighbours
+  integer :: max_nibool  
+  real(kind=CUSTOM_REAL),dimension(:),allocatable :: test_flag
+  real(kind=CUSTOM_REAL),dimension(:),allocatable :: test_flag_cc
+  integer :: i,j,k,ispec,iglob
+  
+  ! estimates initial maximum ibool array
+  max_nibool = npoin2D_max_all_CM_IC * NUMFACES_SHARED &
+               + non_zero_nb_msgs_theor_in_cube*npoin2D_cube_from_slices
 
-  ! for crust/oceans coupling
-  integer nspec2D_xmin_crust_mantle,nspec2D_xmax_crust_mantle, &
-    nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle
-  integer, dimension(NSPEC2DMAX_XMIN_XMAX_CM) :: ibelm_xmin_crust_mantle,ibelm_xmax_crust_mantle
-  integer, dimension(NSPEC2DMAX_YMIN_YMAX_CM) :: ibelm_ymin_crust_mantle,ibelm_ymax_crust_mantle
-  integer, dimension(NSPEC2D_BOTTOM_CM) :: ibelm_bottom_crust_mantle
-  integer, dimension(NSPEC2D_TOP_CM) :: ibelm_top_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_BOTTOM_CM) :: jacobian2D_bottom_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_TOP_CM) :: jacobian2D_top_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ,NSPEC2DMAX_XMIN_XMAX_CM) :: &
-    jacobian2D_xmin_crust_mantle,jacobian2D_xmax_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ,NSPEC2DMAX_YMIN_YMAX_CM) :: &
-    jacobian2D_ymin_crust_mantle,jacobian2D_ymax_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLY,NGLLZ,NSPEC2DMAX_XMIN_XMAX_CM) :: &
-    normal_xmin_crust_mantle,normal_xmax_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2DMAX_YMIN_YMAX_CM) :: &
-    normal_ymin_crust_mantle,normal_ymax_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_BOTTOM_CM) :: normal_bottom_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_TOP_CM) :: normal_top_crust_mantle
+  allocate(ibool_neighbours(max_nibool,MAX_NEIGHBOURS), stat=ier)
+  if( ier /= 0 ) call exit_mpi(myrank,'error allocating ibool_neighbours')
 
-  ! arrays to couple with the fluid regions by pointwise matching
-  integer nspec2D_xmin_outer_core,nspec2D_xmax_outer_core, &
-    nspec2D_ymin_outer_core,nspec2D_ymax_outer_core
-  integer, dimension(NSPEC2DMAX_XMIN_XMAX_OC) :: ibelm_xmin_outer_core,ibelm_xmax_outer_core
-  integer, dimension(NSPEC2DMAX_YMIN_YMAX_OC) :: ibelm_ymin_outer_core,ibelm_ymax_outer_core
-  integer, dimension(NSPEC2D_BOTTOM_OC) :: ibelm_bottom_outer_core
-  integer, dimension(NSPEC2D_TOP_OC) :: ibelm_top_outer_core
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_BOTTOM_OC) :: jacobian2D_bottom_outer_core
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_TOP_OC) :: jacobian2D_top_outer_core
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ,NSPEC2DMAX_XMIN_XMAX_OC) :: &
-    jacobian2D_xmin_outer_core,jacobian2D_xmax_outer_core
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ,NSPEC2DMAX_YMIN_YMAX_OC) :: &
-    jacobian2D_ymin_outer_core,jacobian2D_ymax_outer_core
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLY,NGLLZ,NSPEC2DMAX_XMIN_XMAX_OC) :: &
-    normal_xmin_outer_core,normal_xmax_outer_core
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLZ,NSPEC2DMAX_YMIN_YMAX_OC) :: &
-    normal_ymin_outer_core,normal_ymax_outer_core
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_BOTTOM_OC) :: normal_bottom_outer_core
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_TOP_OC) :: normal_top_outer_core
 
-  ! inner core
-  integer nspec2D_xmin_inner_core,nspec2D_xmax_inner_core, &
-    nspec2D_ymin_inner_core,nspec2D_ymax_inner_core
-  integer, dimension(NSPEC2DMAX_XMIN_XMAX_IC) :: ibelm_xmin_inner_core,ibelm_xmax_inner_core
-  integer, dimension(NSPEC2DMAX_YMIN_YMAX_IC) :: ibelm_ymin_inner_core,ibelm_ymax_inner_core
-  integer, dimension(NSPEC2D_BOTTOM_IC) :: ibelm_bottom_inner_core
-  integer, dimension(NSPEC2D_TOP_IC) :: ibelm_top_inner_core
+! sets up MPI interfaces  
+! crust mantle region  
+  if( myrank == 0 ) write(IMAIN,*) 'crust mantle mpi:'
+  allocate(test_flag(NGLOB_CRUST_MANTLE), &
+          stat=ier)
+  if( ier /= 0 ) call exit_mpi(myrank,'error allocating test_flag')
+  
+  ! sets flag to rank id (+1 to avoid problems with zero rank)
+  test_flag(:) = myrank + 1.0
 
-  ! boundary
-  integer, dimension(NSPEC2D_MOHO) :: ibelm_moho_top,ibelm_moho_bot
-  integer, dimension(NSPEC2D_400) :: ibelm_400_top,ibelm_400_bot
-  integer, dimension(NSPEC2D_670) :: ibelm_670_top,ibelm_670_bot
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_MOHO) :: normal_moho
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_400) :: normal_400
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_670) :: normal_670
+  ! assembles values
+  call assemble_MPI_scalar_block(myrank,test_flag, &
+            NGLOB_CRUST_MANTLE, &
+            iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
+            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
+            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
+            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces_scalar,buffer_received_faces_scalar,npoin2D_max_all_CM_IC, &
+            buffer_send_chunkcorn_scalar,buffer_recv_chunkcorn_scalar, &
+            NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
+            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL(IREGION_CRUST_MANTLE), &
+            NGLOB2DMAX_XMIN_XMAX(IREGION_CRUST_MANTLE),NGLOB2DMAX_YMIN_YMAX(IREGION_CRUST_MANTLE),NGLOB2DMAX_XY,NCHUNKS_VAL)
+    
+  ! removes own myrank id (+1)
+  test_flag(:) = test_flag(:) - ( myrank + 1.0)
 
-  integer k_top,k_bot
+  ! debug: saves array
+  !write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_test_flag_crust_mantle_proc',myrank
+  !call write_VTK_glob_points(NGLOB_CRUST_MANTLE, &
+  !                      xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
+  !                      test_flag,filename)
+  
+  ! determines neighbor rank for shared faces 
+  call rmd_get_MPI_interfaces(myrank,NGLOB_CRUST_MANTLE,NSPEC_CRUST_MANTLE, &
+                            test_flag,my_neighbours,nibool_neighbours,ibool_neighbours, &
+                            num_interfaces_crust_mantle,max_nibool_interfaces_crust_mantle, &
+                            max_nibool,MAX_NEIGHBOURS, &
+                            ibool_crust_mantle,&
+                            is_on_a_slice_edge_crust_mantle, &
+                            IREGION_CRUST_MANTLE,.false.)  
 
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_MOHO) :: moho_kl
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_400) :: d400_kl
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_670) ::  d670_kl
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_CMB) :: cmb_kl
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_ICB) :: icb_kl
+  deallocate(test_flag)
+    
+  ! stores MPI interfaces informations
+  allocate(my_neighbours_crust_mantle(num_interfaces_crust_mantle), &
+          nibool_interfaces_crust_mantle(num_interfaces_crust_mantle), &          
+          stat=ier)
+  if( ier /= 0 ) call exit_mpi(myrank,'error allocating array my_neighbours_crust_mantle etc.')
 
-  character(len=150) LOCAL_PATH
-  integer SIMULATION_TYPE
+  ! copies interfaces arrays
+  if( num_interfaces_crust_mantle > 0 ) then
+    allocate(ibool_interfaces_crust_mantle(max_nibool_interfaces_crust_mantle,num_interfaces_crust_mantle), &
+           stat=ier)
+    if( ier /= 0 ) call exit_mpi(myrank,'error allocating array ibool_interfaces_crust_mantle')
 
-  ! local parameters
-  integer njunk1,njunk2,njunk3
-  character(len=150) prname
+    ! ranks of neighbour processes
+    my_neighbours_crust_mantle(:) = my_neighbours(1:num_interfaces_crust_mantle)
+    ! number of global ibool entries on each interface
+    nibool_interfaces_crust_mantle(:) = nibool_neighbours(1:num_interfaces_crust_mantle)
+    ! global iglob point ids on each interface
+    ibool_interfaces_crust_mantle(:,:) = ibool_neighbours(1:max_nibool_interfaces_crust_mantle,1:num_interfaces_crust_mantle)    
+  else
+    ! dummy allocation (fortran90 should allow allocate statement with zero array size)
+    max_nibool_interfaces_crust_mantle = 0
+    allocate(ibool_interfaces_crust_mantle(0,0),stat=ier)
+  endif
+  
+  ! debug: saves 1. MPI interface
+  !if( num_interfaces_crust_mantle >= 1 ) then
+  !  write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_1_points_crust_mantle_proc',myrank
+  !  call write_VTK_data_points(NGLOB_CRUST_MANTLE, &
+  !                      xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
+  !                      ibool_interfaces_crust_mantle(1:nibool_interfaces_crust_mantle(1),1), &
+  !                      nibool_interfaces_crust_mantle(1),filename)
+  !endif
 
 
-  ! crust and mantle
-  ! create name of database
-  call create_name_database(prname,myrank,IREGION_CRUST_MANTLE,LOCAL_PATH)
+! outer core region  
+  if( myrank == 0 ) write(IMAIN,*) 'outer core mpi:'
 
-  ! Stacey put back
-  open(unit=27,file=prname(1:len_trim(prname))//'boundary.bin', &
-        status='old',form='unformatted',action='read')
-  read(27) nspec2D_xmin_crust_mantle
-  read(27) nspec2D_xmax_crust_mantle
-  read(27) nspec2D_ymin_crust_mantle
-  read(27) nspec2D_ymax_crust_mantle
-  read(27) njunk1
-  read(27) njunk2
+  allocate(test_flag(NGLOB_OUTER_CORE), &
+          stat=ier)
+  if( ier /= 0 ) call exit_mpi(myrank,'error allocating test_flag outer core')
+  
+  ! sets flag to rank id (+1 to avoid problems with zero rank)
+  test_flag(:) = myrank + 1.0
 
-! boundary parameters
-  read(27) ibelm_xmin_crust_mantle
-  read(27) ibelm_xmax_crust_mantle
-  read(27) ibelm_ymin_crust_mantle
-  read(27) ibelm_ymax_crust_mantle
-  read(27) ibelm_bottom_crust_mantle
-  read(27) ibelm_top_crust_mantle
+  ! assembles values
+  call assemble_MPI_scalar_block(myrank,test_flag, &
+            NGLOB_OUTER_CORE, &
+            iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
+            npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
+            iboolfaces_outer_core,iboolcorner_outer_core, &
+            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces_scalar,buffer_received_faces_scalar,npoin2D_max_all_CM_IC, &
+            buffer_send_chunkcorn_scalar,buffer_recv_chunkcorn_scalar, &
+            NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
+            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL(IREGION_OUTER_CORE), &
+            NGLOB2DMAX_XMIN_XMAX(IREGION_OUTER_CORE),NGLOB2DMAX_YMIN_YMAX(IREGION_OUTER_CORE),NGLOB2DMAX_XY,NCHUNKS_VAL)
 
-  read(27) normal_xmin_crust_mantle
-  read(27) normal_xmax_crust_mantle
-  read(27) normal_ymin_crust_mantle
-  read(27) normal_ymax_crust_mantle
-  read(27) normal_bottom_crust_mantle
-  read(27) normal_top_crust_mantle
+    
+  ! removes own myrank id (+1)
+  test_flag(:) = test_flag(:) - ( myrank + 1.0)
 
-  read(27) jacobian2D_xmin_crust_mantle
-  read(27) jacobian2D_xmax_crust_mantle
-  read(27) jacobian2D_ymin_crust_mantle
-  read(27) jacobian2D_ymax_crust_mantle
-  read(27) jacobian2D_bottom_crust_mantle
-  read(27) jacobian2D_top_crust_mantle
-  close(27)
+  ! debug: saves array
+  !write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_test_flag_outer_core_proc',myrank
+  !call write_VTK_glob_points(NGLOB_OUTER_CORE, &
+  !                      xstore_outer_core,ystore_outer_core,zstore_outer_core, &
+  !                      test_flag,filename)
 
+  
+  ! determines neighbor rank for shared faces 
+  call rmd_get_MPI_interfaces(myrank,NGLOB_OUTER_CORE,NSPEC_OUTER_CORE, &
+                            test_flag,my_neighbours,nibool_neighbours,ibool_neighbours, &
+                            num_interfaces_outer_core,max_nibool_interfaces_outer_core, &
+                            max_nibool,MAX_NEIGHBOURS, &
+                            ibool_outer_core,&
+                            is_on_a_slice_edge_outer_core, &
+                            IREGION_OUTER_CORE,.false.)  
 
-  ! read parameters to couple fluid and solid regions
-  !
-  ! outer core
+  deallocate(test_flag)
+    
+  ! stores MPI interface information
+  allocate(my_neighbours_outer_core(num_interfaces_outer_core), &
+          nibool_interfaces_outer_core(num_interfaces_outer_core), &          
+          stat=ier)
+  if( ier /= 0 ) call exit_mpi(myrank,'error allocating array my_neighbours_outer_core etc.')
 
-  ! create name of database
-  call create_name_database(prname,myrank,IREGION_OUTER_CORE,LOCAL_PATH)
+  ! copies interfaces arrays
+  if( num_interfaces_outer_core > 0 ) then
+    allocate(ibool_interfaces_outer_core(max_nibool_interfaces_outer_core,num_interfaces_outer_core), &
+           stat=ier)
+    if( ier /= 0 ) call exit_mpi(myrank,'error allocating array ibool_interfaces_outer_core')
 
-  ! boundary parameters
+    ! ranks of neighbour processes
+    my_neighbours_outer_core(:) = my_neighbours(1:num_interfaces_outer_core)
+    ! number of global ibool entries on each interface
+    nibool_interfaces_outer_core(:) = nibool_neighbours(1:num_interfaces_outer_core)
+    ! global iglob point ids on each interface
+    ibool_interfaces_outer_core(:,:) = ibool_neighbours(1:max_nibool_interfaces_outer_core,1:num_interfaces_outer_core)    
+  else
+    ! dummy allocation (fortran90 should allow allocate statement with zero array size)
+    max_nibool_interfaces_outer_core = 0
+    allocate(ibool_interfaces_outer_core(0,0),stat=ier)
+  endif
 
-  ! Stacey put back
-  open(unit=27,file=prname(1:len_trim(prname))//'boundary.bin', &
-        status='old',form='unformatted',action='read')
-  read(27) nspec2D_xmin_outer_core
-  read(27) nspec2D_xmax_outer_core
-  read(27) nspec2D_ymin_outer_core
-  read(27) nspec2D_ymax_outer_core
-  read(27) njunk1
-  read(27) njunk2
+  ! debug: saves the first MPI interface
+  !if( num_interfaces_outer_core >= 1 ) then
+  !  write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_1_points_outer_core_proc',myrank
+  !  call write_VTK_data_points(NGLOB_OUTER_CORE, &
+  !                      xstore_outer_core,ystore_outer_core,zstore_outer_core, &
+  !                      ibool_interfaces_outer_core(1:nibool_interfaces_outer_core(1),1), &
+  !                      nibool_interfaces_outer_core(1),filename)
+  !endif
 
-  read(27) ibelm_xmin_outer_core
-  read(27) ibelm_xmax_outer_core
-  read(27) ibelm_ymin_outer_core
-  read(27) ibelm_ymax_outer_core
-  read(27) ibelm_bottom_outer_core
-  read(27) ibelm_top_outer_core
+! inner core
+  if( myrank == 0 ) write(IMAIN,*) 'inner core mpi:'
 
-  read(27) normal_xmin_outer_core
-  read(27) normal_xmax_outer_core
-  read(27) normal_ymin_outer_core
-  read(27) normal_ymax_outer_core
-  read(27) normal_bottom_outer_core
-  read(27) normal_top_outer_core
+  allocate(test_flag(NGLOB_INNER_CORE), &
+          stat=ier)
+  if( ier /= 0 ) call exit_mpi(myrank,'error allocating test_flag inner core')
+  
+  ! initializes flags to zero, then sets them to the rank id (+1 to avoid problems with zero rank) at all non-fictitious points
+  test_flag(:) = 0.0
+  do ispec=1,NSPEC_INNER_CORE
+    ! suppress fictitious elements in central cube
+    if(idoubling_inner_core(ispec) == IFLAG_IN_FICTITIOUS_CUBE) cycle    
+    do k = 1,NGLLZ
+      do j = 1,NGLLY
+        do i = 1,NGLLX
+          iglob = ibool_inner_core(i,j,k,ispec)          
+          test_flag(iglob) = myrank + 1.0
+        enddo
+      enddo
+    enddo
+  enddo
 
-  read(27) jacobian2D_xmin_outer_core
-  read(27) jacobian2D_xmax_outer_core
-  read(27) jacobian2D_ymin_outer_core
-  read(27) jacobian2D_ymax_outer_core
-  read(27) jacobian2D_bottom_outer_core
-  read(27) jacobian2D_top_outer_core
-  close(27)
+  ! assembles values
+  call assemble_MPI_scalar_block(myrank,test_flag, &
+            NGLOB_INNER_CORE, &
+            iproc_xi,iproc_eta,ichunk,addressing, &
+            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
+            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
+            iboolfaces_inner_core,iboolcorner_inner_core, &
+            iprocfrom_faces,iprocto_faces,imsg_type, &
+            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
+            buffer_send_faces_scalar,buffer_received_faces_scalar,npoin2D_max_all_CM_IC, &
+            buffer_send_chunkcorn_scalar,buffer_recv_chunkcorn_scalar, &
+            NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
+            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL(IREGION_INNER_CORE), &
+            NGLOB2DMAX_XMIN_XMAX(IREGION_INNER_CORE),NGLOB2DMAX_YMIN_YMAX(IREGION_INNER_CORE),NGLOB2DMAX_XY,NCHUNKS_VAL)
 
+  ! removes own myrank id (+1)
+  test_flag(:) = test_flag(:) - ( myrank + 1.0)  
+  where( test_flag(:) < 0.0 ) test_flag(:) = 0.0
 
-  !
+  ! debug: saves array
+  write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_test_flag_inner_core_A_proc',myrank
+  call write_VTK_glob_points(NGLOB_INNER_CORE, &
+                        xstore_inner_core,ystore_inner_core,zstore_inner_core, &
+                        test_flag,filename)
+
+!  ! gets new interfaces for inner_core without central cube yet
+!  ! determines neighbor rank for shared faces 
+!  call rmd_get_MPI_interfaces(myrank,NGLOB_INNER_CORE,NSPEC_INNER_CORE, &
+!                            test_flag,my_neighbours,nibool_neighbours,ibool_neighbours, &
+!                            num_interfaces_inner_core,max_nibool_interfaces_inner_core, &
+!                            max_nibool,MAX_NEIGHBOURS, &
+!                            ibool_inner_core,&
+!                            is_on_a_slice_edge_inner_core, &
+!                            IREGION_INNER_CORE,.false.,idoubling_inner_core)  
+
+  ! including central cube
+  if(INCLUDE_CENTRAL_CUBE) then
+    if( myrank == 0 ) write(IMAIN,*) 'inner core with central cube mpi:'
+
+    allocate(test_flag_cc(NGLOB_INNER_CORE), &
+            stat=ier)
+    if( ier /= 0 ) call exit_mpi(myrank,'error allocating test_flag_cc inner core')
+  
+    ! re-sets flag to rank id (+1 to avoid problems with zero rank)
+    test_flag_cc(:) = 0.0
+    do ispec=1,NSPEC_INNER_CORE
+      ! suppress fictitious elements in central cube
+      if(idoubling_inner_core(ispec) == IFLAG_IN_FICTITIOUS_CUBE) cycle    
+      do k = 1,NGLLZ
+        do j = 1,NGLLY
+          do i = 1,NGLLX
+            iglob = ibool_inner_core(i,j,k,ispec)          
+            test_flag_cc(iglob) = myrank + 1.0
+          enddo
+        enddo
+      enddo
+    enddo  
+  
+    ! test_flag is a scalar, not a vector
+    ndim_assemble = 1
+    ! use central cube buffers to assemble the inner core mass matrix with the central cube
+    call assemble_MPI_central_cube_block(ichunk,nb_msgs_theor_in_cube, sender_from_slices_to_cube, &
+                 npoin2D_cube_from_slices, buffer_all_cube_from_slices, &
+                 buffer_slices, buffer_slices2, ibool_central_cube, &
+                 receiver_cube_from_slices, ibool_inner_core, &
+                 idoubling_inner_core, NSPEC_INNER_CORE, &
+                 ibelm_bottom_inner_core, NSPEC2D_BOTTOM(IREGION_INNER_CORE), &
+                 NGLOB_INNER_CORE, &
+                 test_flag_cc,ndim_assemble)
+                 
+
+    ! removes own myrank id (+1)
+    test_flag_cc(:) = test_flag_cc(:) - ( myrank + 1.0)  
+    where( test_flag_cc(:) < 0.0 ) test_flag_cc(:) = 0.0
+                 
+    write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_test_flag_inner_core_B_proc',myrank
+    call write_VTK_glob_points(NGLOB_INNER_CORE, &
+                        xstore_inner_core,ystore_inner_core,zstore_inner_core, &
+                        test_flag_cc,filename)
+
+!    ! adds additional inner core points
+!    call rmd_get_MPI_interfaces(myrank,NGLOB_INNER_CORE,NSPEC_INNER_CORE, &
+!                            test_flag,my_neighbours,nibool_neighbours,ibool_neighbours, &
+!                            num_interfaces_inner_core,max_nibool_interfaces_inner_core, &
+!                            max_nibool,MAX_NEIGHBOURS, &
+!                            ibool_inner_core,&
+!                            is_on_a_slice_edge_inner_core, &
+!                            IREGION_INNER_CORE,.true.,idoubling_inner_core)
+     
+    ! adds both together 
+    test_flag(:) = test_flag(:) + test_flag_cc(:)
+
+    deallocate(test_flag_cc)    
+
+    ! debug: saves array
+    write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_test_flag_inner_core_C_proc',myrank
+    call write_VTK_glob_points(NGLOB_INNER_CORE, &
+                              xstore_inner_core,ystore_inner_core,zstore_inner_core, &
+                              test_flag,filename)
+  
+  endif
+
+  ! gets new interfaces for inner_core without central cube yet
+  ! determines neighbor rank for shared faces 
+  call rmd_get_MPI_interfaces(myrank,NGLOB_INNER_CORE,NSPEC_INNER_CORE, &
+                            test_flag,my_neighbours,nibool_neighbours,ibool_neighbours, &
+                            num_interfaces_inner_core,max_nibool_interfaces_inner_core, &
+                            max_nibool,MAX_NEIGHBOURS, &
+                            ibool_inner_core,&
+                            is_on_a_slice_edge_inner_core, &
+                            IREGION_INNER_CORE,.false.,idoubling_inner_core)  
+      
+  deallocate(test_flag)
+  
+  ! stores MPI interface information
+  allocate(my_neighbours_inner_core(num_interfaces_inner_core), &
+          nibool_interfaces_inner_core(num_interfaces_inner_core), &          
+          stat=ier)
+  if( ier /= 0 ) call exit_mpi(myrank,'error allocating array my_neighbours_inner_core etc.')
+
+  ! copies interfaces arrays
+  if( num_interfaces_inner_core > 0 ) then
+    allocate(ibool_interfaces_inner_core(max_nibool_interfaces_inner_core,num_interfaces_inner_core), &
+           stat=ier)
+    if( ier /= 0 ) call exit_mpi(myrank,'error allocating array ibool_interfaces_inner_core')
+
+    ! ranks of neighbour processes
+    my_neighbours_inner_core(:) = my_neighbours(1:num_interfaces_inner_core)
+    ! number of global ibool entries on each interface
+    nibool_interfaces_inner_core(:) = nibool_neighbours(1:num_interfaces_inner_core)
+    ! global iglob point ids on each interface
+    ibool_interfaces_inner_core(:,:) = ibool_neighbours(1:max_nibool_interfaces_inner_core,1:num_interfaces_inner_core)    
+  else
+    ! dummy allocation (fortran90 should allow allocate statement with zero array size)
+    max_nibool_interfaces_inner_core = 0
+    allocate(ibool_interfaces_inner_core(0,0),stat=ier)
+  endif
+
+  ! debug: saves the first MPI interface
+  if( myrank == 0 .and. num_interfaces_inner_core >= 1 ) then
+    write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_1_points_inner_core_proc',myrank  
+    call write_VTK_data_points(NGLOB_INNER_CORE, &
+                        xstore_inner_core,ystore_inner_core,zstore_inner_core, &
+                        ibool_interfaces_inner_core(1:nibool_interfaces_inner_core(1),1), &
+                        nibool_interfaces_inner_core(1),filename)
+    !print*,'saved: ',trim(filename)//'.vtk'
+  endif  
+
+  ! synchronizes MPI processes
+  call sync_all()
+
+  ! frees temporary array
+  deallocate(ibool_neighbours)  
+
+
+  ! allocates MPI buffers
+  ! crust mantle
+  allocate(buffer_send_vector_crust_mantle(NDIM,max_nibool_interfaces_crust_mantle,num_interfaces_crust_mantle), &
+          buffer_recv_vector_crust_mantle(NDIM,max_nibool_interfaces_crust_mantle,num_interfaces_crust_mantle), &
+          request_send_vector_crust_mantle(num_interfaces_crust_mantle), &
+          request_recv_vector_crust_mantle(num_interfaces_crust_mantle), &
+          stat=ier)
+  if( ier /= 0 ) call exit_mpi(myrank,'error allocating array buffer_send_vector_crust_mantle etc.')
+  if( SIMULATION_TYPE == 3 ) then
+    allocate(b_buffer_send_vector_crust_mantle(NDIM,max_nibool_interfaces_crust_mantle,num_interfaces_crust_mantle), &
+            b_buffer_recv_vector_crust_mantle(NDIM,max_nibool_interfaces_crust_mantle,num_interfaces_crust_mantle), &
+            b_request_send_vector_crust_mantle(num_interfaces_crust_mantle), &
+            b_request_recv_vector_crust_mantle(num_interfaces_crust_mantle), &
+            stat=ier)
+    if( ier /= 0 ) call exit_mpi(myrank,'error allocating array b_buffer_send_vector_crust_mantle etc.')
+  endif
+
+  ! outer core
+  allocate(buffer_send_scalar_outer_core(max_nibool_interfaces_outer_core,num_interfaces_outer_core), &
+          buffer_recv_scalar_outer_core(max_nibool_interfaces_outer_core,num_interfaces_outer_core), &
+          request_send_scalar_outer_core(num_interfaces_outer_core), &
+          request_recv_scalar_outer_core(num_interfaces_outer_core), &
+          stat=ier)
+  if( ier /= 0 ) call exit_mpi(myrank,'error allocating array buffer_send_vector_outer_core etc.')
+  if( SIMULATION_TYPE == 3 ) then
+    allocate(b_buffer_send_scalar_outer_core(max_nibool_interfaces_outer_core,num_interfaces_outer_core), &
+            b_buffer_recv_scalar_outer_core(max_nibool_interfaces_outer_core,num_interfaces_outer_core), &
+            b_request_send_scalar_outer_core(num_interfaces_outer_core), &
+            b_request_recv_scalar_outer_core(num_interfaces_outer_core), &
+            stat=ier)
+    if( ier /= 0 ) call exit_mpi(myrank,'error allocating array b_buffer_send_vector_outer_core etc.')
+  endif
+  
   ! inner core
-  !
+  allocate(buffer_send_vector_inner_core(NDIM,max_nibool_interfaces_inner_core,num_interfaces_inner_core), &
+          buffer_recv_vector_inner_core(NDIM,max_nibool_interfaces_inner_core,num_interfaces_inner_core), &
+          request_send_vector_inner_core(num_interfaces_inner_core), &
+          request_recv_vector_inner_core(num_interfaces_inner_core), &
+          stat=ier)
+  if( ier /= 0 ) call exit_mpi(myrank,'error allocating array buffer_send_vector_inner_core etc.')
+  if( SIMULATION_TYPE == 3 ) then
+    allocate(b_buffer_send_vector_inner_core(NDIM,max_nibool_interfaces_inner_core,num_interfaces_inner_core), &
+            b_buffer_recv_vector_inner_core(NDIM,max_nibool_interfaces_inner_core,num_interfaces_inner_core), &
+            b_request_send_vector_inner_core(num_interfaces_inner_core), &
+            b_request_recv_vector_inner_core(num_interfaces_inner_core), &
+            stat=ier)
+    if( ier /= 0 ) call exit_mpi(myrank,'error allocating array b_buffer_send_vector_inner_core etc.')
+  endif
+  
+  end subroutine read_mesh_databases_MPIinter
 
-  ! create name of database
-  call create_name_database(prname,myrank,IREGION_INNER_CORE,LOCAL_PATH)
 
-  ! read info for vertical edges for central cube matching in inner core
-  open(unit=27,file=prname(1:len_trim(prname))//'boundary.bin', &
-        status='old',form='unformatted',action='read')
-  read(27) nspec2D_xmin_inner_core
-  read(27) nspec2D_xmax_inner_core
-  read(27) nspec2D_ymin_inner_core
-  read(27) nspec2D_ymax_inner_core
-  read(27) njunk1
-  read(27) njunk2
+!
+!-------------------------------------------------------------------------------------------------
+!
 
-  ! boundary parameters
-  read(27) ibelm_xmin_inner_core
-  read(27) ibelm_xmax_inner_core
-  read(27) ibelm_ymin_inner_core
-  read(27) ibelm_ymax_inner_core
-  read(27) ibelm_bottom_inner_core
-  read(27) ibelm_top_inner_core
-  close(27)
+  subroutine rmd_get_MPI_interfaces(myrank,NGLOB,NSPEC, &
+                                    test_flag,my_neighbours,nibool_neighbours,ibool_neighbours, &
+                                    num_interfaces,max_nibool_interfaces, &
+                                    max_nibool,MAX_NEIGHBOURS, &
+                                    ibool,&
+                                    is_on_a_slice_edge, &
+                                    IREGION,add_central_cube,idoubling)
 
+  use constants
+  
+  implicit none
 
-  ! -- Boundary Mesh for crust and mantle ---
-  if (SAVE_BOUNDARY_MESH .and. SIMULATION_TYPE == 3) then
+  include 'mpif.h'
 
-    call create_name_database(prname,myrank,IREGION_CRUST_MANTLE,LOCAL_PATH)
+  integer,intent(in) :: myrank,NGLOB,NSPEC
 
-    open(unit=27,file=prname(1:len_trim(prname))//'boundary_disc.bin', &
-          status='old',form='unformatted',action='read')
-    read(27) njunk1,njunk2,njunk3
-    if (njunk1 /= NSPEC2D_MOHO .and. njunk2 /= NSPEC2D_400 .and. njunk3 /= NSPEC2D_670) &
-               call exit_mpi(myrank, 'Error reading ibelm_disc.bin file')
-    read(27) ibelm_moho_top
-    read(27) ibelm_moho_bot
-    read(27) ibelm_400_top
-    read(27) ibelm_400_bot
-    read(27) ibelm_670_top
-    read(27) ibelm_670_bot
-    read(27) normal_moho
-    read(27) normal_400
-    read(27) normal_670
-    close(27)
+  real(kind=CUSTOM_REAL),dimension(NGLOB),intent(in) :: test_flag
 
-    k_top = 1
-    k_bot = NGLLZ
+  integer,intent(in) :: max_nibool
+  integer,intent(in) :: MAX_NEIGHBOURS
+  integer, dimension(MAX_NEIGHBOURS),intent(inout) :: my_neighbours,nibool_neighbours
+  integer, dimension(max_nibool,MAX_NEIGHBOURS),intent(inout) :: ibool_neighbours
 
-    ! initialization
-    moho_kl = 0.; d400_kl = 0.; d670_kl = 0.; cmb_kl = 0.; icb_kl = 0.
+  integer,intent(out) :: num_interfaces,max_nibool_interfaces
+
+  integer,dimension(NGLLX,NGLLY,NGLLZ,NSPEC),intent(in) :: ibool
+  
+  logical,dimension(NSPEC),intent(inout) :: is_on_a_slice_edge
+  
+  integer,intent(in) :: IREGION  
+  logical,intent(in) :: add_central_cube
+  integer,dimension(NSPEC),optional:: idoubling
+  
+  ! local parameters  
+  integer :: ispec,iglob,j,k
+  integer :: iface,iedge,icorner
+  integer :: ii,iinterface,icurrent,rank
+  integer :: npoin
+  logical :: is_done,ispec_is_outer
+  integer,dimension(NGLOB) :: work_test_flag
+  logical,dimension(NSPEC) :: work_ispec_is_outer
+  
+  ! initializes 
+  if( add_central_cube) then
+    ! adds points to existing inner_core interfaces
+    iinterface = num_interfaces
+    work_ispec_is_outer(:) = is_on_a_slice_edge(:)    
+  else
+    ! creates new interfaces
+    iinterface = 0
+    num_interfaces = 0
+    max_nibool_interfaces = 0
+    my_neighbours(:) = -1
+    nibool_neighbours(:) = 0
+    ibool_neighbours(:,:) = 0
+    work_ispec_is_outer(:) = .false.
   endif
+  
+  ! makes working copy (converted to nearest integers)
+  work_test_flag(:) = nint( test_flag(:) )
 
-  end subroutine read_mesh_databases_coupling
+  ! loops over all elements
+  do ispec = 1,NSPEC
 
+    ! exclude elements in inner part of slice
+    !if( .not. is_on_a_slice_edge(ispec) ) cycle
 
+    ! exclude elements in fictitious core
+    if( IREGION == IREGION_INNER_CORE) then
+      if( idoubling(ispec) == IFLAG_IN_FICTITIOUS_CUBE ) cycle
+    endif
+
+    ! sets flag if element has global points shared with other processes    
+    ispec_is_outer = .false.    
+
+    ! 1. finds neighbours which share a whole face with this process
+    ! (faces are shared only with 1 other neighbour process)
+        
+    ! loops over all faces of element
+    do iface = 1, 6
+    
+      ! chooses a point inside face
+      select case( iface )
+      case( 1 )
+        ! face I == 1    
+        iglob = ibool(1,2,2,ispec) 
+      case( 2 )
+        ! face I == NGLLX
+        iglob = ibool(NGLLX,2,2,ispec) 
+      case( 3 )
+        ! face J == 1
+        iglob = ibool(2,1,2,ispec) 
+      case( 4 )
+        ! face J == NGLLY
+        iglob = ibool(2,NGLLY,2,ispec) 
+      case( 5 )
+        ! face K == 1
+        iglob = ibool(2,2,1,ispec)
+      case( 6 )
+        ! face K == NGLLZ
+        iglob = ibool(2,2,NGLLZ,ispec)
+      end select      
+
+      ! checks assembled flag on global point
+      if( work_test_flag(iglob) > 0 ) then
+        ispec_is_outer = .true.
+        
+        ! rank of neighbor process
+        rank = work_test_flag(iglob) - 1
+        
+        ! checks ranks range
+        if( rank < 0 .or. rank >= NPROCTOT_VAL ) then
+          print*,'error face rank: ',myrank,'ispec=',ispec
+          print*,'  neighbor rank = ',rank,'exceeds total nproc:',NPROCTOT_VAL
+          print*,'  face ',iface
+          call exit_mpi(myrank,'error face neighbor mpi rank')
+        endif
+        
+        ! checks if already stored
+        icurrent = 0
+        is_done = .false.
+        do ii = 1,iinterface
+          if( rank == my_neighbours(ii) ) then 
+            icurrent = ii
+            is_done = .true.
+            exit
+          endif
+        enddo
+        
+        ! updates interfaces array
+        if( .not. is_done ) then
+          iinterface = iinterface + 1
+          if( iinterface > MAX_NEIGHBOURS ) &
+            call exit_mpi(myrank,'interface face exceeds MAX_NEIGHBOURS range')
+          ! adds as neighbor new interface
+          my_neighbours(iinterface) = rank
+          icurrent = iinterface
+        endif
+        if( icurrent == 0 ) &
+          call exit_mpi(myrank,'could not find current interface for this neighbor, please check my_neighbours')
+        
+        ! adds interface points and removes neighbor flag from face
+        ! assumes NGLLX == NGLLY == NGLLZ
+        do k=1,NGLLX
+          do j=1,NGLLX
+            select case( iface )
+            case( 1 )
+              ! face I == 1    
+              iglob = ibool(1,j,k,ispec) 
+            case( 2 )
+              ! face I == NGLLX
+              iglob = ibool(NGLLX,j,k,ispec) 
+            case( 3 )
+              ! face J == 1
+              iglob = ibool(j,1,k,ispec) 
+            case( 4 )
+              ! face J == NGLLY
+              iglob = ibool(j,NGLLY,k,ispec) 
+            case( 5 )
+              ! face K == 1
+              iglob = ibool(j,k,1,ispec)
+            case( 6 )
+              ! face K == NGLLZ
+              iglob = ibool(j,k,NGLLZ,ispec)
+            end select      
+            
+            ! checks that we take each global point (on edges and corners) only once
+            if( work_test_flag(iglob) <= 0 ) cycle ! continues to next point
+            
+            ! increases number of total points on this interface
+            nibool_neighbours(icurrent) = nibool_neighbours(icurrent) + 1
+            if( nibool_neighbours(icurrent) > max_nibool) &
+              call exit_mpi(myrank,'interface face exceeds max_nibool range')
+              
+            ! stores interface iglob index
+            ibool_neighbours( nibool_neighbours(icurrent),icurrent ) = iglob
+            
+            ! re-sets flag
+            work_test_flag(iglob) = work_test_flag(iglob) - ( rank + 1 )
+            ! debug
+            if( work_test_flag(iglob) < 0 ) then
+              print*,'error face flag:',myrank,'ispec=',ispec,'rank=',rank
+              print*,'  flag=',work_test_flag(iglob),'iface jk=',iface,j,k
+              call exit_mpi(myrank,'error face flag')
+            endif
+          enddo
+        enddo
+      endif    
+    enddo ! iface
+
+    ! 2. finds neighbours which share a single edge with this process
+    ! note: by now, faces have subtracted their neighbours, edges can hold only one more process info
+
+    ! loops over all edges of element
+    do iedge = 1, 12
+    
+      ! chooses a point inside edge but not corner
+      select case( iedge )
+      case( 1 )
+        ! face I == 1, J == 1
+        iglob = ibool(1,1,2,ispec) 
+      case( 2 )
+        ! face I == 1, J == NGLLY
+        iglob = ibool(1,NGLLY,2,ispec)
+      case( 3 )
+        ! face I == 1, K == 1
+        iglob = ibool(1,2,1,ispec)
+      case( 4 )
+        ! face I == 1, K == NGLLZ
+        iglob = ibool(1,2,NGLLZ,ispec)
+      case( 5 )
+        ! face I == NGLLX, J == 1
+        iglob = ibool(NGLLX,1,2,ispec)
+      case( 6 )
+        ! face I == NGLLX, J == NGLLY
+        iglob = ibool(NGLLX,NGLLY,2,ispec)
+      case( 7 )
+        ! face I == NGLLX, K == 1
+        iglob = ibool(NGLLX,2,1,ispec)
+      case( 8 )
+        ! face I == NGLLX, K == NGLLZ
+        iglob = ibool(NGLLX,2,NGLLZ,ispec)
+      case( 9 )
+        ! face J == 1, K == 1
+        iglob = ibool(2,1,1,ispec)
+      case( 10 )
+        ! face J == 1, K == NGLLZ
+        iglob = ibool(2,1,NGLLZ,ispec)
+      case( 11 )
+        ! face J == NGLLY, K == 1
+        iglob = ibool(2,NGLLY,1,ispec)
+      case( 12 )
+        ! face J == NGLLY, K == NGLLZ
+        iglob = ibool(2,NGLLY,NGLLZ,ispec)
+      end select
+
+      ! checks assembled flag on global point
+      if( work_test_flag(iglob) > 0 ) then
+        ispec_is_outer = .true.
+        
+        ! rank of neighbor process
+        rank = work_test_flag(iglob) - 1
+        
+        ! checks ranks range
+        if( rank < 0 .or. rank >= NPROCTOT_VAL ) then
+          print*,'error egde rank: ',myrank
+          print*,'  neighbor rank = ',rank,'exceeds total nproc:',NPROCTOT_VAL
+          print*,'  edge ',iedge
+          call exit_mpi(myrank,'error edge neighbor mpi rank')
+        endif
+        
+        ! checks if already stored
+        icurrent = 0
+        is_done = .false.
+        do ii = 1,iinterface
+          if( rank == my_neighbours(ii) ) then 
+            icurrent = ii
+            is_done = .true.
+            exit
+          endif
+        enddo
+        
+        ! updates interfaces array
+        if( .not. is_done ) then
+          iinterface = iinterface + 1
+          if( iinterface > MAX_NEIGHBOURS ) &
+            call exit_mpi(myrank,'interface edge exceeds MAX_NEIGHBOURS range')
+          ! adds as neighbor new interface
+          my_neighbours(iinterface) = rank
+          icurrent = iinterface
+        endif
+        if( icurrent == 0 ) &
+          call exit_mpi(myrank,'could not find current interface for this neighbor, please check my_neighbours')
+        
+        ! adds interface points and removes neighbor flag from edge
+        ! assumes NGLLX == NGLLY == NGLLZ
+        do k = 1,NGLLX
+          select case( iedge )
+          case( 1 )
+            ! face I == 1, J == 1
+            iglob = ibool(1,1,k,ispec) 
+          case( 2 )
+            ! face I == 1, J == NGLLY
+            iglob = ibool(1,NGLLY,k,ispec)
+          case( 3 )
+            ! face I == 1, K == 1
+            iglob = ibool(1,k,1,ispec)
+          case( 4 )
+            ! face I == 1, K == NGLLZ
+            iglob = ibool(1,k,NGLLZ,ispec)
+          case( 5 )
+            ! face I == NGLLX, J == 1
+            iglob = ibool(NGLLX,1,k,ispec)
+          case( 6 )
+            ! face I == NGLLX, J == NGLLY
+            iglob = ibool(NGLLX,NGLLY,k,ispec)
+          case( 7 )
+            ! face I == NGLLX, K == 1
+            iglob = ibool(NGLLX,k,1,ispec)
+          case( 8 )
+            ! face I == NGLLX, K == NGLLZ
+            iglob = ibool(NGLLX,k,NGLLZ,ispec)
+          case( 9 )
+            ! face J == 1, K == 1
+            iglob = ibool(k,1,1,ispec)
+          case( 10 )
+            ! face J == 1, K == NGLLZ
+            iglob = ibool(k,1,NGLLZ,ispec)
+          case( 11 )
+            ! face J == NGLLY, K == 1
+            iglob = ibool(k,NGLLY,1,ispec)
+          case( 12 )
+            ! face J == NGLLY, K == NGLLZ
+            iglob = ibool(k,NGLLY,NGLLZ,ispec)
+          end select
+
+          ! checks that we take each global point (on edges and corners) only once
+          if( work_test_flag(iglob) <= 0 ) cycle ! continues to next point
+
+          ! increases number of total points on this interface
+          nibool_neighbours(icurrent) = nibool_neighbours(icurrent) + 1
+          if( nibool_neighbours(icurrent) > max_nibool) &
+            call exit_mpi(myrank,'interface edge exceeds max_nibool range')
+            
+          ! stores interface iglob index          
+          ibool_neighbours( nibool_neighbours(icurrent),icurrent ) = iglob
+            
+          ! re-sets flag
+          work_test_flag(iglob) = work_test_flag(iglob) - ( rank + 1 )
+
+          ! debug
+          if( work_test_flag(iglob) < 0 ) call exit_mpi(myrank,'error edge flag')
+          
+        enddo        
+      endif    
+    enddo ! iedge
+
+
+    ! 3. finds neighbours which share a single corner with this process
+    ! note: faces and edges have subtracted their neighbors, only one more process left possible
+
+    ! loops over all corners of element
+    do icorner = 1, 8
+    
+      ! chooses a corner point
+      select case( icorner )
+      case( 1 )
+        ! face I == 1
+        iglob = ibool(1,1,1,ispec)
+      case( 2 )
+        ! face I == 1
+        iglob = ibool(1,NGLLY,1,ispec)
+      case( 3 )
+        ! face I == 1
+        iglob = ibool(1,1,NGLLZ,ispec)
+      case( 4 )
+        ! face I == 1
+        iglob = ibool(1,NGLLY,NGLLZ,ispec)
+      case( 5 )
+        ! face I == NGLLX
+        iglob = ibool(NGLLX,1,1,ispec)
+      case( 6 )
+        ! face I == NGLLX
+        iglob = ibool(NGLLX,NGLLY,1,ispec)
+      case( 7 )
+        ! face I == NGLLX
+        iglob = ibool(NGLLX,1,NGLLZ,ispec)
+      case( 8 )
+        ! face I == NGLLX
+        iglob = ibool(NGLLX,NGLLY,NGLLZ,ispec)
+      end select
+
+      ! makes sure that all elements on mpi interfaces are included
+      ! uses original test_flag array, since the working copy reduces values
+      ! note: there can be elements which have an edge or corner shared with
+      !          other mpi partitions, but have the work_test_flag value already set to zero
+      !          since the iglob point was found before.
+      !          also, this check here would suffice to determine the outer flag, but we also include the 
+      !          check everywhere we encounter it too
+      if( test_flag(iglob) > 0.5 ) then
+        ispec_is_outer = .true.
+      endif
+
+      ! checks assembled flag on global point
+      if( work_test_flag(iglob) > 0 ) then
+        ispec_is_outer = .true.
+        
+        ! rank of neighbor process
+        rank = work_test_flag(iglob) - 1
+        
+        ! checks ranks range
+        if( rank < 0 .or. rank >= NPROCTOT_VAL ) then
+          print*,'error corner: ',myrank
+          print*,'  neighbor rank = ',rank,'exceeds total nproc:',NPROCTOT_VAL
+          print*,'  corner ',icorner
+          call exit_mpi(myrank,'error corner neighbor mpi rank')
+        endif
+        
+        ! checks if already stored
+        icurrent = 0
+        is_done = .false.
+        do ii = 1,iinterface
+          if( rank == my_neighbours(ii) ) then 
+            icurrent = ii
+            is_done = .true.
+            exit
+          endif
+        enddo
+        
+        ! updates interfaces array
+        if( .not. is_done ) then
+          iinterface = iinterface + 1
+          if( iinterface > MAX_NEIGHBOURS ) &
+            call exit_mpi(myrank,'interface corner exceed MAX_NEIGHBOURS range')
+          ! adds as neighbor new interface
+          my_neighbours(iinterface) = rank
+          icurrent = iinterface
+        endif
+        if( icurrent == 0 ) &
+          call exit_mpi(myrank,'could not find current interface for this neighbor, please check my_neighbours')
+        
+        ! adds this corner as interface point and removes neighbor flag from face
+        ! increases number of total points on this interface
+        nibool_neighbours(icurrent) = nibool_neighbours(icurrent) + 1
+        if( nibool_neighbours(icurrent) > max_nibool) &
+          call exit_mpi(myrank,'interface corner exceeds max_nibool range')
+          
+        ! stores interface iglob index        
+        ibool_neighbours( nibool_neighbours(icurrent),icurrent ) = iglob
+            
+        ! re-sets flag
+        work_test_flag(iglob) = work_test_flag(iglob) - ( rank + 1 )
+
+        ! debug
+        if( work_test_flag(iglob) < 0 ) call exit_mpi(myrank,'error corner flag')
+        
+      endif
+      
+    enddo ! icorner
+        
+    ! stores flags for outer elements when recognized as such 
+    ! (inner/outer elements separated for non-blocking mpi communications)
+    if( ispec_is_outer ) then
+      work_ispec_is_outer(ispec) = .true.
+    endif
+    
+  enddo 
+
+  ! number of outer elements (on MPI interfaces)
+  npoin = count( work_ispec_is_outer )
+    
+  ! debug: user output
+  if( myrank == 0 ) then
+    write(IMAIN,*) '  interfaces : ',iinterface
+    write(IMAIN,*) '  my_neighbours: ',my_neighbours(1:iinterface)  
+    write(IMAIN,*) '  nibool_neighbours: ',nibool_neighbours(1:iinterface)
+    write(IMAIN,*) '  test flag min/max: ',minval(work_test_flag),maxval(work_test_flag)
+    write(IMAIN,*) '  outer elements: ',npoin
+    write(IMAIN,*)
+  endif
+  call sync_all()
+
+  ! checks if all points were recognized
+  if( maxval(work_test_flag) > 0 ) then
+    print*,'error mpi interface rank: ',myrank
+    print*,'  work_test_flag min/max :',minval(work_test_flag),maxval(work_test_flag)
+    call exit_mpi(myrank,'error: mpi points remain unrecognized, please check mesh interfaces')
+  endif
+
+  ! checks if all points were taken only once
+  if( minval(work_test_flag) < 0 ) then
+    print*,'error mpi interface rank: ',myrank
+    print*,'  work_test_flag min/max :',minval(work_test_flag),maxval(work_test_flag)
+    call exit_mpi(myrank,'error: mpi points counted more than once, please check mesh interfaces')
+  endif
+
+  ! sets interfaces infos
+  num_interfaces = iinterface
+  max_nibool_interfaces = maxval( nibool_neighbours(1:num_interfaces) )
+
+  ! optional: ibool usually is already sorted,
+  !                this makes sure ibool_neighbours arrays are still sorted 
+  !               (iglob indices in increasing order; we will access acceleration fields accel(:,iglob), 
+  !                thus it helps if iglob strides are short and accesses are close-by)
+  do iinterface = 1,num_interfaces
+    npoin = nibool_neighbours(iinterface)    
+    call heap_sort( npoin, ibool_neighbours(1:npoin,iinterface) )
+    
+    ! debug: checks if unique set of iglob values
+    do j=1,npoin-1
+      if( ibool_neighbours(j,iinterface) == ibool_neighbours(j+1,iinterface) ) then
+        print*,'error mpi interface rank:',myrank
+        print*,'  interface: ',my_neighbours(iinterface),'point: ',j,'of',npoin
+        call exit_mpi(myrank,'error: mpi points not unique on interface')
+      endif
+    enddo
+  enddo
+
+
+  ! re-sets flags for outer elements
+  is_on_a_slice_edge(:) = work_ispec_is_outer(:)
+
+  end subroutine rmd_get_MPI_interfaces
+
 !
 !-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
 !
 
+  subroutine read_mesh_databases_InnerOuter()
 
-  subroutine read_mesh_databases_stacey(myrank, &
-                      nimin_crust_mantle,nimax_crust_mantle,njmin_crust_mantle, &
-                      njmax_crust_mantle,nkmin_xi_crust_mantle,nkmin_eta_crust_mantle, &
-                      nspec2D_xmin_crust_mantle,nspec2D_xmax_crust_mantle, &
-                      nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle, &
-                      reclen_xmin_crust_mantle,reclen_xmax_crust_mantle, &
-                      reclen_ymin_crust_mantle,reclen_ymax_crust_mantle, &
-                      nimin_outer_core,nimax_outer_core,njmin_outer_core, &
-                      njmax_outer_core,nkmin_xi_outer_core,nkmin_eta_outer_core, &
-                      nspec2D_xmin_outer_core,nspec2D_xmax_outer_core, &
-                      nspec2D_ymin_outer_core,nspec2D_ymax_outer_core, &
-                      reclen_xmin_outer_core,reclen_xmax_outer_core, &
-                      reclen_ymin_outer_core,reclen_ymax_outer_core, &
-                      reclen_zmin,NSPEC2D_BOTTOM, &
-                      SIMULATION_TYPE,SAVE_FORWARD,LOCAL_PATH,NSTEP)
+! sets up inner/outer elements for non-blocking MPI communication
+!
+! For each region (crust/mantle, outer core, inner core), partitions the
+! spectral elements into "outer" elements (flagged in is_on_a_slice_edge_*,
+! i.e. touching an MPI interface) and "inner" elements, and records their
+! ispec indices in phase_ispec_inner_<region>(:,iphase):
+!   column 1 = outer-phase elements, column 2 = inner-phase elements.
+! All counts and arrays referenced here live in the specfem_par* modules.
 
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore  
   implicit none
+  
+  ! local parameters
+  real :: percentage_edge
+  integer :: ier,ispec,iinner,iouter
+  ! filename is only used by the debug VTK output at the end of this routine
+  character(len=150) :: filename
 
-  include "constants.h"
-  include "OUTPUT_FILES/values_from_mesher.h"
+  ! stores inner / outer elements
+  !
+  ! note: arrays is_on_a_slice_edge_.. have flags set for elements which need to
+  !         communicate with other MPI processes
+  
+  ! crust_mantle
+  nspec_outer_crust_mantle = count( is_on_a_slice_edge_crust_mantle )
+  nspec_inner_crust_mantle = NSPEC_CRUST_MANTLE - nspec_outer_crust_mantle
+  
+  ! first dimension sized by the larger of the two phases so one array holds both
+  num_phase_ispec_crust_mantle = max(nspec_inner_crust_mantle,nspec_outer_crust_mantle)
+  
+  allocate(phase_ispec_inner_crust_mantle(num_phase_ispec_crust_mantle,2),stat=ier)
+  if( ier /= 0 ) call exit_mpi(myrank,'error allocating array phase_ispec_inner_crust_mantle')
+  
+  phase_ispec_inner_crust_mantle(:,:) = 0
+  iinner = 0
+  iouter = 0
+  do ispec=1,NSPEC_CRUST_MANTLE
+    if( is_on_a_slice_edge_crust_mantle(ispec) ) then
+      ! outer element
+      iouter = iouter + 1
+      phase_ispec_inner_crust_mantle(iouter,1) = ispec
+    else
+      ! inner element
+      iinner = iinner + 1
+      phase_ispec_inner_crust_mantle(iinner,2) = ispec
+    endif
+  enddo
 
-  integer myrank
+  ! outer_core
+  nspec_outer_outer_core = count( is_on_a_slice_edge_outer_core )
+  nspec_inner_outer_core = NSPEC_OUTER_CORE - nspec_outer_outer_core
 
-  integer, dimension(2,NSPEC2DMAX_YMIN_YMAX_CM) ::  &
-    nimin_crust_mantle,nimax_crust_mantle,nkmin_eta_crust_mantle
-  integer, dimension(2,NSPEC2DMAX_XMIN_XMAX_CM) ::  &
-    njmin_crust_mantle,njmax_crust_mantle,nkmin_xi_crust_mantle
-  integer nspec2D_xmin_crust_mantle,nspec2D_xmax_crust_mantle, &
-    nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle
-  integer reclen_xmin_crust_mantle, reclen_xmax_crust_mantle, reclen_ymin_crust_mantle, &
-    reclen_ymax_crust_mantle
+  num_phase_ispec_outer_core = max(nspec_inner_outer_core,nspec_outer_outer_core)
+  
+  allocate(phase_ispec_inner_outer_core(num_phase_ispec_outer_core,2),stat=ier)
+  if( ier /= 0 ) call exit_mpi(myrank,'error allocating array phase_ispec_inner_outer_core')
+  
+  phase_ispec_inner_outer_core(:,:) = 0
+  iinner = 0
+  iouter = 0
+  do ispec=1,NSPEC_OUTER_CORE
+    if( is_on_a_slice_edge_outer_core(ispec) ) then
+      ! outer element
+      iouter = iouter + 1
+      phase_ispec_inner_outer_core(iouter,1) = ispec
+    else
+      ! inner element
+      iinner = iinner + 1
+      phase_ispec_inner_outer_core(iinner,2) = ispec
+    endif
+  enddo
 
-  integer, dimension(2,NSPEC2DMAX_YMIN_YMAX_OC) :: nimin_outer_core,nimax_outer_core,nkmin_eta_outer_core
-  integer, dimension(2,NSPEC2DMAX_XMIN_XMAX_OC) :: njmin_outer_core,njmax_outer_core,nkmin_xi_outer_core
-  integer nspec2D_xmin_outer_core,nspec2D_xmax_outer_core, &
-    nspec2D_ymin_outer_core,nspec2D_ymax_outer_core
-  integer reclen_xmin_outer_core, reclen_xmax_outer_core,reclen_ymin_outer_core, &
-    reclen_ymax_outer_core
+  ! inner_core
+  nspec_outer_inner_core = count( is_on_a_slice_edge_inner_core )
+  nspec_inner_inner_core = NSPEC_INNER_CORE - nspec_outer_inner_core
 
-  integer reclen_zmin
-  integer, dimension(MAX_NUM_REGIONS) :: NSPEC2D_BOTTOM
+  num_phase_ispec_inner_core = max(nspec_inner_inner_core,nspec_outer_inner_core)
+  
+  allocate(phase_ispec_inner_inner_core(num_phase_ispec_inner_core,2),stat=ier)
+  if( ier /= 0 ) call exit_mpi(myrank,'error allocating array phase_ispec_inner_inner_core')
+  
+  phase_ispec_inner_inner_core(:,:) = 0
+  iinner = 0
+  iouter = 0
+  do ispec=1,NSPEC_INNER_CORE
+    if( is_on_a_slice_edge_inner_core(ispec) ) then
+      ! outer element
+      iouter = iouter + 1
+      phase_ispec_inner_inner_core(iouter,1) = ispec
+    else
+      ! inner element
+      iinner = iinner + 1
+      phase_ispec_inner_inner_core(iinner,2) = ispec
+    endif
+  enddo
+  
+  ! user output
+  if(myrank == 0) then
 
-  integer SIMULATION_TYPE
-  logical SAVE_FORWARD
-  character(len=150) LOCAL_PATH
-  integer NSTEP
+    write(IMAIN,*) 'for overlapping of communications with calculations:'
+    write(IMAIN,*)
 
+    ! "edge" percentage below is actually the inner-element fraction
+    ! (100 - outer fraction), mirroring the original formulation
+    percentage_edge = 100.0 - 100. * nspec_outer_crust_mantle / real(NSPEC_CRUST_MANTLE)
+    write(IMAIN,*) 'percentage of edge elements in crust/mantle ',percentage_edge,'%'
+    write(IMAIN,*) 'percentage of volume elements in crust/mantle ',100. - percentage_edge,'%'
+    write(IMAIN,*)
+
+    percentage_edge = 100.0 - 100.* nspec_outer_outer_core / real(NSPEC_OUTER_CORE)
+    write(IMAIN,*) 'percentage of edge elements in outer core ',percentage_edge,'%'
+    write(IMAIN,*) 'percentage of volume elements in outer core ',100. - percentage_edge,'%'
+    write(IMAIN,*)
+
+    percentage_edge = 100.0 - 100. * nspec_outer_inner_core / real(NSPEC_INNER_CORE)
+    write(IMAIN,*) 'percentage of edge elements in inner core ',percentage_edge,'%'
+    write(IMAIN,*) 'percentage of volume elements in inner core ',100. - percentage_edge,'%'
+    write(IMAIN,*)
+
+  endif
+
+  ! debug: saves element flags
+  ! crust mantle
+  !write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_innerouter_crust_mantle_proc',myrank  
+  !call write_VTK_data_elem_l(NSPEC_CRUST_MANTLE,NGLOB_CRUST_MANTLE, &
+  !                          xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
+  !                          ibool_crust_mantle, &
+  !                          is_on_a_slice_edge_crust_mantle,filename)  
+  ! outer core
+  !write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_innerouter_outer_core_proc',myrank  
+  !call write_VTK_data_elem_l(NSPEC_OUTER_CORE,NGLOB_OUTER_CORE, &
+  !                          xstore_outer_core,ystore_outer_core,zstore_outer_core, &
+  !                          ibool_outer_core, &
+  !                          is_on_a_slice_edge_outer_core,filename)
+  ! inner core
+  ! NOTE(review): unlike the crust/mantle and outer-core debug outputs above,
+  !               this inner-core VTK dump is still active (not commented out) -
+  !               confirm whether it should remain enabled in production runs
+  write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_innerouter_inner_core_proc',myrank  
+  call write_VTK_data_elem_l(NSPEC_INNER_CORE,NGLOB_INNER_CORE, &
+                            xstore_inner_core,ystore_inner_core,zstore_inner_core, &
+                            ibool_inner_core, &
+                            is_on_a_slice_edge_inner_core,filename)
+
+  end subroutine read_mesh_databases_InnerOuter
+
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine read_mesh_databases_stacey()
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
+
+  implicit none
+
   ! local parameters
   integer(kind=8) :: filesize
-  character(len=150) prname
+  integer :: ier
 
+  ! sets up absorbing boundary buffer arrays
 
+  ! crust_mantle
+  if (nspec2D_xmin_crust_mantle > 0 .and. (SIMULATION_TYPE == 3 &
+    .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
+    nabs_xmin_cm = nspec2D_xmin_crust_mantle
+  else
+    nabs_xmin_cm = 1
+  endif
+  !daniel: not sure why dimensions are (..,..,..,..,8)  ?
+  !allocate(absorb_xmin_crust_mantle5(NDIM,NGLLY,NGLLZ,nabs_xmin_cm,8),stat=ier)
+  allocate(absorb_xmin_crust_mantle(NDIM,NGLLY,NGLLZ,nabs_xmin_cm),stat=ier)
+  if( ier /= 0 ) call exit_MPI(myrank,'error allocating absorb xmin')
+
+  if (nspec2D_xmax_crust_mantle > 0 .and. (SIMULATION_TYPE == 3 &
+    .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
+    nabs_xmax_cm = nspec2D_xmax_crust_mantle
+  else
+    nabs_xmax_cm = 1
+  endif
+  !daniel: not sure why dimensions are (..,..,..,..,8)
+  !allocate(absorb_xmax_crust_mantle5(NDIM,NGLLY,NGLLZ,nabs_xmax_cm,8),stat=ier)
+  allocate(absorb_xmax_crust_mantle(NDIM,NGLLY,NGLLZ,nabs_xmax_cm),stat=ier)
+  if( ier /= 0 ) call exit_MPI(myrank,'error allocating absorb xmax')
+
+  if (nspec2D_ymin_crust_mantle > 0 .and. (SIMULATION_TYPE == 3 &
+    .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
+    nabs_ymin_cm = nspec2D_ymin_crust_mantle
+  else
+    nabs_ymin_cm = 1
+  endif
+  !daniel: not sure why dimensions are (..,..,..,..,8)
+  !allocate(absorb_ymin_crust_mantle5(NDIM,NGLLX,NGLLZ,nabs_ymin_cm,8),stat=ier)
+  allocate(absorb_ymin_crust_mantle(NDIM,NGLLX,NGLLZ,nabs_ymin_cm),stat=ier)
+  if( ier /= 0 ) call exit_MPI(myrank,'error allocating absorb ymin')
+
+  if (nspec2D_ymax_crust_mantle > 0 .and. (SIMULATION_TYPE == 3 &
+    .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
+    nabs_ymax_cm = nspec2D_ymax_crust_mantle
+  else
+    nabs_ymax_cm = 1
+  endif
+  !daniel: not sure why dimensions are (..,..,..,..,8)
+  !allocate(absorb_ymax_crust_mantle5(NDIM,NGLLX,NGLLZ,nabs_ymax_cm,8),stat=ier)
+  allocate(absorb_ymax_crust_mantle(NDIM,NGLLX,NGLLZ,nabs_ymax_cm),stat=ier)
+  if( ier /= 0 ) call exit_MPI(myrank,'error allocating absorb ymax')
+
+  ! outer_core
+  if (nspec2D_xmin_outer_core > 0 .and. (SIMULATION_TYPE == 3 &
+    .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
+    nabs_xmin_oc = nspec2D_xmin_outer_core
+  else
+    nabs_xmin_oc = 1
+  endif
+  allocate(absorb_xmin_outer_core(NGLLY,NGLLZ,nabs_xmin_oc),stat=ier)
+  if( ier /= 0 ) call exit_MPI(myrank,'error allocating absorb xmin')
+
+  if (nspec2D_xmax_outer_core > 0 .and. (SIMULATION_TYPE == 3 &
+    .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
+    nabs_xmax_oc = nspec2D_xmax_outer_core
+  else
+    nabs_xmax_oc = 1
+  endif
+  allocate(absorb_xmax_outer_core(NGLLY,NGLLZ,nabs_xmax_oc),stat=ier)
+  if( ier /= 0 ) call exit_MPI(myrank,'error allocating absorb xmax')
+
+  if (nspec2D_ymin_outer_core > 0 .and. (SIMULATION_TYPE == 3 &
+    .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
+    nabs_ymin_oc = nspec2D_ymin_outer_core
+  else
+    nabs_ymin_oc = 1
+  endif
+  allocate(absorb_ymin_outer_core(NGLLX,NGLLZ,nabs_ymin_oc),stat=ier)
+  if( ier /= 0 ) call exit_MPI(myrank,'error allocating absorb ymin')
+
+  if (nspec2D_ymax_outer_core > 0 .and. (SIMULATION_TYPE == 3 &
+    .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
+    nabs_ymax_oc = nspec2D_ymax_outer_core
+  else
+    nabs_ymax_oc = 1
+  endif
+  allocate(absorb_ymax_outer_core(NGLLX,NGLLZ,nabs_ymax_oc),stat=ier)
+  if( ier /= 0 ) call exit_MPI(myrank,'error allocating absorb ymax')
+
+  if (NSPEC2D_BOTTOM(IREGION_OUTER_CORE) > 0 .and. &
+     (SIMULATION_TYPE == 3 .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
+    nabs_zmin_oc = NSPEC2D_BOTTOM(IREGION_OUTER_CORE)
+  else
+    nabs_zmin_oc = 1
+  endif
+  allocate(absorb_zmin_outer_core(NGLLX,NGLLY,nabs_zmin_oc),stat=ier)
+  if( ier /= 0 ) call exit_MPI(myrank,'error allocating absorb zmin')
+
+
   ! crust and mantle
+  
   ! create name of database
   call create_name_database(prname,myrank,IREGION_CRUST_MANTLE,LOCAL_PATH)
 
@@ -944,6 +2089,7 @@
 
 
   ! outer core
+  
   ! create name of database
   call create_name_database(prname,myrank,IREGION_OUTER_CORE,LOCAL_PATH)
 
@@ -1097,3 +2243,274 @@
   endif
 
   end subroutine read_mesh_databases_stacey
+
+
+!
+!-------------------------------------------------------------------------------------------------
+!-------------------------------------------------------------------------------------------------
+!-------------------------------------------------------------------------------------------------
+!
+
+!daniel: TODO - place this auxiliary function...
+
+  subroutine heap_sort( N, array )
+
+! heap sort algorithm
+! sorts integer array (in increasing order, like 1 - 5 - 6 - 9 - 12 - 13 - 14 -...)
+!
+! in-place, O(N log N): first arranges array(1:N) as a max-heap rooted at
+! array(1), then repeatedly swaps the heap root with the last element of the
+! shrinking unsorted section and restores the heap property with
+! heap_sort_siftdown.
+!
+! in:    N      - number of entries in array
+! inout: array  - integer values, sorted ascending on return
+  
+  implicit none
+  integer,intent(in) :: N
+  integer,dimension(N),intent(inout) :: array
+  
+  ! local parameters
+  integer :: tmp
+  integer :: i
+  
+  ! checks if anything to do
+  if( N < 2 ) return
+  
+  ! builds heap
+  ! (starts from last internal node N/2 and sifts down to index 1)
+  do i = N/2, 1, -1  
+    call heap_sort_siftdown(N,array,i,N)
+  enddo
+    
+  ! sorts array
+  do i = N, 2, -1
+    ! swaps last and first entry in this section
+    ! (moves current maximum array(1) to its final position i)
+    tmp = array(1)
+    array(1) = array(i)
+    array(i) = tmp
+    call heap_sort_siftdown(N,array,1,i-1)
+  enddo
+  
+  end subroutine heap_sort
+
+!
+!----
+!  
+
+  subroutine heap_sort_siftdown(N,array,start,bottom)
+
+! restores the max-heap property for the subtree rooted at index 'start',
+! considering only entries array(1:bottom); helper for heap_sort.
+!
+! in:    N      - declared size of array
+! inout: array  - heap storage (children of i are at 2*i and 2*i+1)
+! in:    start  - root index of the subtree to sift down
+! in:    bottom - last index belonging to the active heap section
+
+  implicit none
+  
+  integer,intent(in):: N
+  integer,dimension(N),intent(inout) :: array
+  ! NOTE(review): start and bottom carry no intent attribute; they are only
+  !               read here, so intent(in) would be the natural declaration
+  integer :: start,bottom
+  
+  ! local parameters
+  integer :: i,j
+  integer :: tmp
+  
+  ! tmp holds the value being sifted; its slot moves down the tree
+  i = start
+  tmp = array(i)  
+  j = 2*i
+  do while( j <= bottom )
+    ! chooses larger value first in this section
+    if( j < bottom ) then
+      if( array(j) <= array(j+1) ) j = j + 1
+    endif
+    
+    ! checks if section already smaller than initial value
+    if( array(j) < tmp ) exit
+        
+    ! promotes the larger child and descends
+    array(i) = array(j)
+    i = j
+    j = 2*i
+  enddo
+  
+  ! places the sifted value in its final slot
+  array(i) = tmp      
+  return
+  
+  end subroutine heap_sort_siftdown
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+!daniel: TODO - place this auxiliary function...
+
+! external mesh routine for saving vtk files for points locations
+
+  subroutine write_VTK_data_points(nglob, &
+                                  xstore_dummy,ystore_dummy,zstore_dummy, &
+                                  points_globalindices,num_points_globalindices, &
+                                  prname_file)
+
+! writes a legacy-ASCII VTK file (UNSTRUCTURED_GRID) containing only the
+! coordinates of the selected global points; '.vtk' is appended to prname_file.
+! stops with an error if any selected index is outside [1,nglob].
+
+  implicit none
+
+  include "constants.h"
+
+  ! total number of global points available in the coordinate arrays
+  integer :: nglob
+
+! global coordinates
+  real(kind=CUSTOM_REAL), dimension(nglob) :: xstore_dummy,ystore_dummy,zstore_dummy
+
+! gll data values array
+  ! subset of global point indices to output
+  integer :: num_points_globalindices
+  integer, dimension(num_points_globalindices) :: points_globalindices
+
+! file name
+  ! output file name without the '.vtk' extension
+  character(len=150) prname_file
+
+  integer :: i,iglob
+
+! write source and receiver VTK files for Paraview
+  !debug
+  !write(IMAIN,*) '  vtk file: '
+  !write(IMAIN,*) '    ',prname_file(1:len_trim(prname_file))//'.vtk'
+
+  open(IOVTK,file=prname_file(1:len_trim(prname_file))//'.vtk',status='unknown')
+  write(IOVTK,'(a)') '# vtk DataFile Version 3.1'
+  write(IOVTK,'(a)') 'material model VTK file'
+  write(IOVTK,'(a)') 'ASCII'
+  write(IOVTK,'(a)') 'DATASET UNSTRUCTURED_GRID'
+  write(IOVTK, '(a,i12,a)') 'POINTS ', num_points_globalindices, ' float'
+  do i=1,num_points_globalindices
+    iglob = points_globalindices(i)
+    ! validates the global index before dereferencing the coordinate arrays
+    if( iglob <= 0 .or. iglob > nglob ) then
+      print*,'error: '//prname_file(1:len_trim(prname_file))//'.vtk'
+      print*,'error global index: ',iglob,i
+      close(IOVTK)
+      stop 'error vtk points file'
+    endif
+
+    write(IOVTK,'(3e18.6)') xstore_dummy(iglob),ystore_dummy(iglob),zstore_dummy(iglob)
+  enddo
+  write(IOVTK,*) ""
+  
+  close(IOVTK)
+
+  end subroutine write_VTK_data_points
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+! external mesh routine for saving vtk files for points locations
+
+  subroutine write_VTK_glob_points(nglob, &
+                                  xstore_dummy,ystore_dummy,zstore_dummy, &
+                                  glob_values, &
+                                  prname_file)
+
+! writes a legacy-ASCII VTK file (UNSTRUCTURED_GRID) with all nglob global
+! points and one scalar value per point (POINT_DATA "glob_data");
+! '.vtk' is appended to prname_file.
+
+  implicit none
+
+  include "constants.h"
+
+  ! number of global points
+  integer :: nglob
+
+  ! global coordinates
+  real(kind=CUSTOM_REAL), dimension(nglob) :: xstore_dummy,ystore_dummy,zstore_dummy
+
+  ! gll data values array
+  ! one scalar value per global point, written as POINT_DATA
+  real(kind=CUSTOM_REAL), dimension(nglob) :: glob_values
+
+  ! file name
+  ! output file name without the '.vtk' extension
+  character(len=150) prname_file
+
+  integer :: iglob
+
+  ! write source and receiver VTK files for Paraview
+  !debug
+  !write(IMAIN,*) '  vtk file: '
+  !write(IMAIN,*) '    ',prname_file(1:len_trim(prname_file))//'.vtk'
+
+  open(IOVTK,file=prname_file(1:len_trim(prname_file))//'.vtk',status='unknown')
+  write(IOVTK,'(a)') '# vtk DataFile Version 3.1'
+  write(IOVTK,'(a)') 'material model VTK file'
+  write(IOVTK,'(a)') 'ASCII'
+  write(IOVTK,'(a)') 'DATASET UNSTRUCTURED_GRID'
+  write(IOVTK, '(a,i12,a)') 'POINTS ', nglob, ' float'
+  do iglob=1,nglob
+    write(IOVTK,*) xstore_dummy(iglob),ystore_dummy(iglob),zstore_dummy(iglob)
+  enddo
+  write(IOVTK,*) ""
+
+  ! writes out gll-data (velocity) for each element point
+  write(IOVTK,'(a,i12)') "POINT_DATA ",nglob
+  write(IOVTK,'(a)') "SCALARS glob_data float"
+  write(IOVTK,'(a)') "LOOKUP_TABLE default"
+  do iglob=1,nglob
+    write(IOVTK,*) glob_values(iglob)
+  enddo
+  write(IOVTK,*) ""
+  
+  close(IOVTK)
+
+  end subroutine write_VTK_glob_points
+
+
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+! routine for saving vtk file holding logical flag on each spectral element
+
+  subroutine write_VTK_data_elem_l(nspec,nglob, &
+                        xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
+                        elem_flag,prname_file)
+
+! routine for saving vtk file holding logical flag on each spectral element
+!
+! writes a legacy-ASCII VTK file (UNSTRUCTURED_GRID): all nglob point
+! coordinates, one hexahedral cell per spectral element built from the 8
+! corner nodes in ibool, and per-cell integer data (1 = flag true, 0 = false);
+! '.vtk' is appended to prname_file.
+
+  implicit none
+
+  include "constants.h"
+
+  ! nspec = number of spectral elements, nglob = number of global points
+  integer :: nspec,nglob
+
+! global coordinates
+  ! ibool maps local GLL indices (i,j,k,ispec) to global point numbers
+  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
+  real(kind=CUSTOM_REAL), dimension(nglob) :: xstore_dummy,ystore_dummy,zstore_dummy
+
+! element flag array
+  ! logical flag to visualize, one entry per element
+  logical, dimension(nspec) :: elem_flag
+  integer :: ispec,i
+
+! file name
+  ! output file name without the '.vtk' extension
+  character(len=150) prname_file
+
+! write source and receiver VTK files for Paraview
+  !debug
+  !write(IMAIN,*) '  vtk file: '
+  !write(IMAIN,*) '    ',prname_file(1:len_trim(prname_file))//'.vtk'
+
+  open(IOVTK,file=prname_file(1:len_trim(prname_file))//'.vtk',status='unknown')
+  write(IOVTK,'(a)') '# vtk DataFile Version 3.1'
+  write(IOVTK,'(a)') 'material model VTK file'
+  write(IOVTK,'(a)') 'ASCII'
+  write(IOVTK,'(a)') 'DATASET UNSTRUCTURED_GRID'
+  write(IOVTK, '(a,i12,a)') 'POINTS ', nglob, ' float'
+  do i=1,nglob
+    write(IOVTK,'(3e18.6)') xstore_dummy(i),ystore_dummy(i),zstore_dummy(i)
+  enddo
+  write(IOVTK,*) ""
+
+  ! note: indices for vtk start at 0
+  ! each cell line: "8" (corner count) followed by the 8 corner node indices
+  write(IOVTK,'(a,i12,i12)') "CELLS ",nspec,nspec*9
+  do ispec=1,nspec
+    write(IOVTK,'(9i12)') 8,ibool(1,1,1,ispec)-1,ibool(NGLLX,1,1,ispec)-1,ibool(NGLLX,NGLLY,1,ispec)-1,ibool(1,NGLLY,1,ispec)-1,&
+          ibool(1,1,NGLLZ,ispec)-1,ibool(NGLLX,1,NGLLZ,ispec)-1,ibool(NGLLX,NGLLY,NGLLZ,ispec)-1,ibool(1,NGLLY,NGLLZ,ispec)-1
+  enddo
+  write(IOVTK,*) ""
+
+  ! type: hexahedrons
+  ! (VTK cell type id 12 = hexahedron, repeated for every element)
+  write(IOVTK,'(a,i12)') "CELL_TYPES ",nspec
+  write(IOVTK,*) (12,ispec=1,nspec)
+  write(IOVTK,*) ""
+
+  ! per-cell scalar: 1 where elem_flag is true, 0 otherwise
+  write(IOVTK,'(a,i12)') "CELL_DATA ",nspec
+  write(IOVTK,'(a)') "SCALARS elem_flag integer"
+  write(IOVTK,'(a)') "LOOKUP_TABLE default"
+  do ispec = 1,nspec
+    if( elem_flag(ispec) .eqv. .true. ) then
+      write(IOVTK,*) 1
+    else
+      write(IOVTK,*) 0
+    endif
+  enddo
+  write(IOVTK,*) ""
+  close(IOVTK)
+
+
+  end subroutine write_VTK_data_elem_l
+

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_topography_bathymetry.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_topography_bathymetry.f90	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_topography_bathymetry.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,66 @@
+!=====================================================================
+!
+!          S p e c f e m 3 D  G l o b e  V e r s i o n  5 . 1
+!          --------------------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!                        Princeton University, USA
+!             and University of Pau / CNRS / INRIA, France
+! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+!                            April 2011
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+  subroutine read_topography_bathymetry()
+
+! sets up ellipticity splines and, when TOPOGRAPHY or OCEANS_VAL is enabled,
+! reads the topography/bathymetry grid on the master rank and broadcasts it
+! to all processes. Shared variables (time_start, tCPU, ibathy_topo, flags)
+! come from the specfem_par* modules above.
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
+  implicit none
+
+  include 'mpif.h'
+
+  ! local parameters
+  integer :: ier
+  
+  ! get MPI starting time
+  time_start = MPI_WTIME()
+
+  ! make ellipticity
+  if(ELLIPTICITY_VAL) then
+    call make_ellipticity(nspl,rspl,espl,espl2,ONE_CRUST)
+  endif
+
+  ! read topography and bathymetry file
+  ! (only rank 0 reads the file from disk)
+  if(myrank == 0 .and. (TOPOGRAPHY .or. OCEANS_VAL)) then
+    call read_topo_bathy_file(ibathy_topo)
+  endif
+
+  ! broadcast the information read on the master to the nodes
+  ! NOTE(review): this broadcast is unconditional - when neither TOPOGRAPHY nor
+  !               OCEANS_VAL is set, the (unfilled) ibathy_topo array is still
+  !               broadcast; confirm whether it should be inside the same guard
+  call MPI_BCAST(ibathy_topo,NX_BATHY*NY_BATHY,MPI_INTEGER,0,MPI_COMM_WORLD,ier)
+
+  ! user output
+  if( myrank == 0 .and. (TOPOGRAPHY .or. OCEANS_VAL)) then
+    ! elapsed time since beginning of mesh generation
+    tCPU = MPI_WTIME() - time_start
+    write(IMAIN,*)
+    write(IMAIN,*) 'Elapsed time for reading topo/bathy in seconds = ',sngl(tCPU)
+    write(IMAIN,*)
+  endif
+
+  end subroutine read_topography_bathymetry

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/save_forward_arrays.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/save_forward_arrays.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/save_forward_arrays.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -25,56 +25,23 @@
 !
 !=====================================================================
 
-  subroutine save_forward_arrays(myrank,SIMULATION_TYPE,SAVE_FORWARD, &
-                    NUMBER_OF_RUNS,NUMBER_OF_THIS_RUN, &
-                    displ_crust_mantle,veloc_crust_mantle,accel_crust_mantle, &
-                    displ_inner_core,veloc_inner_core,accel_inner_core, &
-                    displ_outer_core,veloc_outer_core,accel_outer_core, &
-                    R_memory_crust_mantle,R_memory_inner_core, &
-                    epsilondev_crust_mantle,epsilondev_inner_core, &
-                    A_array_rotation,B_array_rotation, &
-                    LOCAL_PATH)
+  subroutine save_forward_arrays()
 
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
+
   implicit none
 
-  include "constants.h"
-  include "OUTPUT_FILES/values_from_mesher.h"
-
-  integer myrank
-
-  integer SIMULATION_TYPE
-  logical SAVE_FORWARD
-  integer NUMBER_OF_RUNS,NUMBER_OF_THIS_RUN
-
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: &
-    displ_crust_mantle,veloc_crust_mantle,accel_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE) :: &
-    displ_inner_core,veloc_inner_core,accel_inner_core
-  real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE) :: &
-    displ_outer_core,veloc_outer_core,accel_outer_core
-
-  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUAT) :: &
-    R_memory_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_OR_ATT) :: &
-    epsilondev_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ATTENUATION) :: &
-    R_memory_inner_core
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STR_OR_ATT) :: &
-    epsilondev_inner_core
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_ROTATION) :: &
-    A_array_rotation,B_array_rotation
-
-  character(len=150) LOCAL_PATH
-
   ! local parameters
   character(len=150) outputname
 
-
   ! save files to local disk or tape system if restart file
   if(NUMBER_OF_RUNS > 1 .and. NUMBER_OF_THIS_RUN < NUMBER_OF_RUNS) then
     write(outputname,"('dump_all_arrays',i6.6)") myrank
     open(unit=55,file=trim(LOCAL_PATH)//'/'//outputname,status='unknown',form='unformatted',action='write')
+    
     write(55) displ_crust_mantle
     write(55) veloc_crust_mantle
     write(55) accel_crust_mantle
@@ -84,12 +51,34 @@
     write(55) displ_outer_core
     write(55) veloc_outer_core
     write(55) accel_outer_core
-    write(55) epsilondev_crust_mantle
-    write(55) epsilondev_inner_core
+
+    write(55) epsilondev_xx_crust_mantle
+    write(55) epsilondev_yy_crust_mantle
+    write(55) epsilondev_xy_crust_mantle
+    write(55) epsilondev_xz_crust_mantle
+    write(55) epsilondev_yz_crust_mantle
+
+    write(55) epsilondev_xx_inner_core
+    write(55) epsilondev_yy_inner_core
+    write(55) epsilondev_xy_inner_core
+    write(55) epsilondev_xz_inner_core
+    write(55) epsilondev_yz_inner_core
+
     write(55) A_array_rotation
     write(55) B_array_rotation
-    write(55) R_memory_crust_mantle
-    write(55) R_memory_inner_core
+
+    write(55) R_xx_crust_mantle
+    write(55) R_yy_crust_mantle
+    write(55) R_xy_crust_mantle
+    write(55) R_xz_crust_mantle
+    write(55) R_yz_crust_mantle
+
+    write(55) R_xx_inner_core
+    write(55) R_yy_inner_core
+    write(55) R_xy_inner_core
+    write(55) R_xz_inner_core
+    write(55) R_yz_inner_core
+
     close(55)
   endif
 
@@ -97,6 +86,7 @@
   if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
     write(outputname,'(a,i6.6,a)') 'proc',myrank,'_save_forward_arrays.bin'
     open(unit=55,file=trim(LOCAL_PATH)//'/'//outputname,status='unknown',form='unformatted',action='write')
+    
     write(55) displ_crust_mantle
     write(55) veloc_crust_mantle
     write(55) accel_crust_mantle
@@ -106,15 +96,36 @@
     write(55) displ_outer_core
     write(55) veloc_outer_core
     write(55) accel_outer_core
-    write(55) epsilondev_crust_mantle
-    write(55) epsilondev_inner_core
+
+    write(55) epsilondev_xx_crust_mantle
+    write(55) epsilondev_yy_crust_mantle
+    write(55) epsilondev_xy_crust_mantle
+    write(55) epsilondev_xz_crust_mantle
+    write(55) epsilondev_yz_crust_mantle
+
+    write(55) epsilondev_xx_inner_core
+    write(55) epsilondev_yy_inner_core
+    write(55) epsilondev_xy_inner_core
+    write(55) epsilondev_xz_inner_core
+    write(55) epsilondev_yz_inner_core
+
     if (ROTATION_VAL) then
       write(55) A_array_rotation
       write(55) B_array_rotation
     endif
     if (ATTENUATION_VAL) then
-      write(55) R_memory_crust_mantle
-      write(55) R_memory_inner_core
+      write(55) R_xx_crust_mantle
+      write(55) R_yy_crust_mantle
+      write(55) R_xy_crust_mantle
+      write(55) R_xz_crust_mantle
+      write(55) R_yz_crust_mantle
+
+      write(55) R_xx_inner_core
+      write(55) R_yy_inner_core
+      write(55) R_xy_inner_core
+      write(55) R_xz_inner_core
+      write(55) R_yz_inner_core
+
     endif
     close(55)
   endif

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/setup_GLL_points.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/setup_GLL_points.f90	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/setup_GLL_points.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,57 @@
+!=====================================================================
+!
+!          S p e c f e m 3 D  G l o b e  V e r s i o n  5 . 1
+!          --------------------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!                        Princeton University, USA
+!             and University of Pau / CNRS / INRIA, France
+! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+!                            April 2011
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+  subroutine setup_GLL_points()
+
+  use specfem_par
+  implicit none
+
+  ! local parameters
+  integer :: i,j
+
+  ! set up GLL points, weights and derivation matrices
+  call define_derivation_matrices(xigll,yigll,zigll,wxgll,wygll,wzgll, &
+         hprime_xx,hprime_yy,hprime_zz, &
+         hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
+         wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube)
+
+  if( USE_DEVILLE_PRODUCTS_VAL ) then
+
+    ! check that the optimized routines from Deville et al. (2002) can be used
+    if(NGLLX /= 5 .or. NGLLY /= 5 .or. NGLLZ /= 5) &
+      stop 'Deville et al. (2002) routines can only be used if NGLLX = NGLLY = NGLLZ = 5'
+
+    ! define transpose of derivation matrix
+    do j = 1,NGLLY
+      do i = 1,NGLLX
+        hprime_xxT(j,i) = hprime_xx(i,j)
+        hprimewgll_xxT(j,i) = hprimewgll_xx(i,j)
+      enddo
+    enddo
+  endif
+
+  end subroutine setup_GLL_points

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/setup_sources_receivers.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/setup_sources_receivers.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/setup_sources_receivers.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -25,94 +25,79 @@
 !
 !=====================================================================
 
-  subroutine setup_sources_receivers(NSOURCES,myrank,ibool_crust_mantle, &
-                      xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
-                      xigll,yigll,zigll,TOPOGRAPHY, &
-                      sec,tshift_cmt,theta_source,phi_source, &
-                      NSTEP,DT,hdur,hdur_gaussian,t0,Mxx,Myy,Mzz,Mxy,Mxz,Myz, &
-                      islice_selected_source,ispec_selected_source, &
-                      xi_source,eta_source,gamma_source,nu_source, &
-                      rspl,espl,espl2,nspl,ibathy_topo,NEX_XI,PRINT_SOURCE_TIME_FUNCTION, &
-                      rec_filename,nrec,islice_selected_rec,ispec_selected_rec, &
-                      xi_receiver,eta_receiver,gamma_receiver,station_name,network_name, &
-                      stlat,stlon,stele,stbur,nu, &
-                      nrec_local,nadj_rec_local,nrec_simulation, &
-                      SIMULATION_TYPE,RECEIVERS_CAN_BE_BURIED,MOVIE_SURFACE,MOVIE_VOLUME, &
-                      HDUR_MOVIE,OUTPUT_FILES,LOCAL_PATH)
+  subroutine setup_sources_receivers()
 
-
+  use specfem_par
   implicit none
 
-  include 'mpif.h'
-  include "constants.h"
-  include "OUTPUT_FILES/values_from_mesher.h"
+  ! locates sources and determines simulation start time t0
+  call setup_sources()
 
-  integer NSOURCES,myrank
+  ! reads in stations file and locates receivers
+  call setup_receivers()
+  
+  ! write source and receiver VTK files for Paraview
+  call setup_sources_receivers_VTKfile()
+  
+  ! pre-compute source arrays
+  call setup_sources_precompute_arrays()
 
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: ibool_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NGLOB_CRUST_MANTLE) :: &
-        xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle
+  ! pre-compute receiver interpolation factors
+  call setup_receivers_precompute_intp()
 
-  double precision, dimension(NGLLX) :: xigll
-  double precision, dimension(NGLLY) :: yigll
-  double precision, dimension(NGLLZ) :: zigll
+  ! user output
+  if(myrank == 0) then  
+    write(IMAIN,*)
+    write(IMAIN,*) 'Total number of samples for seismograms = ',NSTEP
+    write(IMAIN,*)
+    if(NSOURCES > 1) write(IMAIN,*) 'Using ',NSOURCES,' point sources'
+  endif
 
-  logical TOPOGRAPHY
+  end subroutine setup_sources_receivers
 
-  double precision sec,DT,t0,min_tshift_cmt_original
+!
+!-------------------------------------------------------------------------------------------------
+!
 
-  double precision, dimension(NSOURCES) :: tshift_cmt,hdur,hdur_gaussian
-  double precision, dimension(NSOURCES) :: theta_source,phi_source
-  double precision, dimension(NSOURCES) :: Mxx,Myy,Mzz,Mxy,Mxz,Myz
-  double precision, dimension(NSOURCES) :: xi_source,eta_source,gamma_source,nu_source
+  subroutine setup_sources()
 
-  integer, dimension(NSOURCES) :: islice_selected_source,ispec_selected_source
-  integer NSTEP
+  use specfem_par
+  use specfem_par_crustmantle  
+  use specfem_par_movie
+  implicit none
 
-  ! for ellipticity
-  integer nspl
-  double precision rspl(NR),espl(NR),espl2(NR)
-
-  integer, dimension(NX_BATHY,NY_BATHY) :: ibathy_topo
-
-  integer NEX_XI
-  logical PRINT_SOURCE_TIME_FUNCTION
-
-  character(len=150) rec_filename
-
-  integer nrec
-  integer, dimension(nrec) :: islice_selected_rec,ispec_selected_rec
-
-  double precision, dimension(nrec) :: xi_receiver,eta_receiver,gamma_receiver
-  character(len=MAX_LENGTH_STATION_NAME), dimension(nrec) :: station_name
-  character(len=MAX_LENGTH_NETWORK_NAME), dimension(nrec) :: network_name
-
-  double precision, dimension(nrec) :: stlat,stlon,stele,stbur
-  double precision, dimension(NDIM,NDIM,nrec) :: nu
-
-  integer nrec_local,nadj_rec_local,nrec_simulation
-
-  integer SIMULATION_TYPE
-
-  logical RECEIVERS_CAN_BE_BURIED,MOVIE_SURFACE,MOVIE_VOLUME
-
-  double precision HDUR_MOVIE
-
-  character(len=150) OUTPUT_FILES
-  character(len=150) LOCAL_PATH
-
   ! local parameters
-  double precision :: junk
+  double precision :: min_tshift_cmt_original
+  double precision :: sec
   integer :: yr,jda,ho,mi
-  integer :: irec,isource,nrec_tot_found,ier
-  integer :: icomp,itime,nadj_files_found,nadj_files_found_tot
-  character(len=3),dimension(NDIM) :: comp
-  character(len=256) :: filename,adj_source_file,system_command,filename_new
-  character(len=2) :: bic
+  integer :: isource 
+  character(len=256) :: filename
+  integer :: ier
+
   ! makes smaller hdur for movies
   logical,parameter :: USE_SMALLER_HDUR_MOVIE = .false.
 
-! sources
+  ! allocate arrays for source
+  allocate(islice_selected_source(NSOURCES), &
+          ispec_selected_source(NSOURCES), &
+          Mxx(NSOURCES), &
+          Myy(NSOURCES), &
+          Mzz(NSOURCES), &
+          Mxy(NSOURCES), &
+          Mxz(NSOURCES), &
+          Myz(NSOURCES), &
+          xi_source(NSOURCES), &
+          eta_source(NSOURCES), &
+          gamma_source(NSOURCES), &
+          tshift_cmt(NSOURCES), &
+          hdur(NSOURCES), &
+          hdur_gaussian(NSOURCES), &
+          theta_source(NSOURCES), &
+          phi_source(NSOURCES), &
+          nu_source(NDIM,NDIM,NSOURCES),stat=ier)
+  if( ier /= 0 ) call exit_MPI(myrank,'error allocating source arrays')
+
+  ! sources
   ! BS BS moved open statement and writing of first lines into sr.vtk before the
   ! call to locate_sources, where further write statements to that file follow
   if(myrank == 0) then
@@ -140,6 +125,15 @@
 
   if(abs(minval(tshift_cmt)) > TINYVAL) call exit_MPI(myrank,'one tshift_cmt must be zero, others must be positive')
 
+  ! count number of sources located in this slice
+  nsources_local = 0
+  if (SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then
+    do isource = 1,NSOURCES
+      if(myrank == islice_selected_source(isource)) nsources_local = nsources_local + 1
+    enddo
+  endif
+
+
   ! filter source time function by Gaussian with hdur = HDUR_MOVIE when outputing movies or shakemaps
   if (MOVIE_SURFACE .or. MOVIE_VOLUME ) then
     ! smaller hdur_movie will do
@@ -223,6 +217,57 @@
     call exit_mpi(myrank,'error negative USER_T0 parameter in constants.h')
   endif
 
+  ! get information about event name and location for SAC seismograms
+
+  ! The following call is added for the get_event_info subroutine,
+  ! because the way NSOURCES_SAC was declared has been changed.
+  ! The rest of the changes in this program are just updates to the
+  ! subroutines that I changed, e.g., adding/removing parameters. (Ebru Bozdag)
+  call get_event_info_parallel(myrank,yr_SAC,jda_SAC,ho_SAC,mi_SAC,sec_SAC,&
+                              event_name_SAC,t_cmt_SAC,t_shift_SAC, &
+                              elat_SAC,elon_SAC,depth_SAC,mb_SAC,cmt_lat_SAC,&
+                              cmt_lon_SAC,cmt_depth_SAC,cmt_hdur_SAC,NSOURCES)
+
+  end subroutine setup_sources
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine setup_receivers()
+
+  use specfem_par
+  use specfem_par_crustmantle
+  implicit none
+
+  include 'mpif.h'
+
+  ! local parameters
+  double precision :: sec
+  double precision :: junk
+  integer :: yr,jda,ho,mi
+  integer :: irec,isource,nrec_tot_found
+  integer :: icomp,itime,nadj_files_found,nadj_files_found_tot
+  character(len=3),dimension(NDIM) :: comp
+  character(len=256) :: filename,adj_source_file 
+  character(len=2) :: bic
+  integer :: ier
+
+  ! allocate memory for receiver arrays
+  allocate(islice_selected_rec(nrec), &
+          ispec_selected_rec(nrec), &
+          xi_receiver(nrec), &
+          eta_receiver(nrec), &
+          gamma_receiver(nrec), &
+          station_name(nrec), &
+          network_name(nrec), &
+          stlat(nrec), &
+          stlon(nrec), &
+          stele(nrec), &
+          stbur(nrec), &
+          nu(NDIM,NDIM,nrec),stat=ier)
+  if( ier /= 0 ) call exit_MPI(myrank,'error allocating receiver arrays')
+
   !  receivers
   if(myrank == 0) then
     write(IMAIN,*)
@@ -334,7 +379,22 @@
       write(IMAIN,*) 'this total is okay'
     endif
   endif
+  
+  end subroutine setup_receivers
 
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine setup_sources_receivers_VTKfile()
+
+  use specfem_par
+  implicit none
+
+  ! local parameters
+  character(len=256) :: filename,system_command,filename_new
+
   ! user output
   if(myrank == 0) then
 
@@ -365,22 +425,73 @@
       "'",NSOURCES,"'",trim(filename),trim(filename_new),trim(filename)
     call system(system_command)
 
+  endif
 
-    write(IMAIN,*)
-    write(IMAIN,*) 'Total number of samples for seismograms = ',NSTEP
-    write(IMAIN,*)
+  end subroutine setup_sources_receivers_VTKfile
 
+!
+!-------------------------------------------------------------------------------------------------
+!
 
-    if(NSOURCES > 1) write(IMAIN,*) 'Using ',NSOURCES,' point sources'
+  subroutine setup_sources_precompute_arrays()
+
+  use specfem_par
+  use specfem_par_crustmantle
+  implicit none
+
+  ! local parameters
+  integer :: ier
+  
+  ! allocates source arrays
+  if (SIMULATION_TYPE == 1  .or. SIMULATION_TYPE == 3) then
+    allocate(sourcearrays(NDIM,NGLLX,NGLLY,NGLLZ,NSOURCES),stat=ier)
+    if( ier /= 0 ) call exit_MPI(myrank,'error allocating sourcearrays')
+
+    ! stores source arrays
+    call setup_sources_receivers_srcarr(NSOURCES,myrank, &
+                      ispec_selected_source,islice_selected_source, &
+                      xi_source,eta_source,gamma_source, &
+                      Mxx,Myy,Mzz,Mxy,Mxz,Myz, &
+                      xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
+                      etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
+                      gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
+                      xigll,yigll,zigll,sourcearrays)
   endif
 
-  end subroutine setup_sources_receivers
+  ! adjoint source arrays
+  if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then
+    NSTEP_SUB_ADJ = ceiling( dble(NSTEP)/dble(NTSTEP_BETWEEN_READ_ADJSRC) )
+    allocate(iadj_vec(NSTEP),stat=ier)
+    if( ier /= 0 ) call exit_MPI(myrank,'error allocating iadj_vec')
 
+    ! initializes iadj_vec
+    do it=1,NSTEP
+       iadj_vec(it) = NSTEP-it+1  ! default is for reversing entire record
+    enddo
+
+    if(nadj_rec_local > 0) then
+      ! allocate adjoint source arrays
+      allocate(adj_sourcearrays(NDIM,NGLLX,NGLLY,NGLLZ,nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC),stat=ier)
+      if( ier /= 0 ) call exit_MPI(myrank,'error allocating adjoint sourcearrays')
+      adj_sourcearrays(:,:,:,:,:,:) = 0._CUSTOM_REAL
+
+      ! allocate indexing arrays
+      allocate(iadjsrc(NSTEP_SUB_ADJ,2), &
+              iadjsrc_len(NSTEP_SUB_ADJ),stat=ier)
+      if( ier /= 0 ) call exit_MPI(myrank,'error allocating adjoint indexing arrays')
+      ! initializes iadjsrc, iadjsrc_len and iadj_vec
+      call setup_sources_receivers_adjindx(NSTEP,NSTEP_SUB_ADJ, &
+                      NTSTEP_BETWEEN_READ_ADJSRC, &
+                      iadjsrc,iadjsrc_len,iadj_vec)
+    endif
+  endif
+
+  end subroutine setup_sources_precompute_arrays  
+
 !
 !-------------------------------------------------------------------------------------------------
 !
 
-
   subroutine setup_sources_receivers_srcarr(NSOURCES,myrank, &
                       ispec_selected_source,islice_selected_source, &
                       xi_source,eta_source,gamma_source, &
@@ -517,11 +628,89 @@
 
   end subroutine setup_sources_receivers_adjindx
 
+
 !
 !-------------------------------------------------------------------------------------------------
 !
 
+  subroutine setup_receivers_precompute_intp()
 
+  use specfem_par
+  implicit none
+
+  ! local parameters
+  integer :: ier
+  
+  ! define local to global receiver numbering mapping
+  ! needs to be allocated for subroutine calls (even if nrec_local == 0)
+  allocate(number_receiver_global(nrec_local),stat=ier)
+  if( ier /= 0 ) call exit_MPI(myrank,'error allocating global receiver numbering')
+
+  ! allocates receiver interpolators
+  if (nrec_local > 0) then
+    ! allocate Lagrange interpolators for receivers
+    allocate(hxir_store(nrec_local,NGLLX), &
+            hetar_store(nrec_local,NGLLY), &
+            hgammar_store(nrec_local,NGLLZ),stat=ier)
+    if( ier /= 0 ) call exit_MPI(myrank,'error allocating receiver interpolators')
+
+    ! define and store Lagrange interpolators at all the receivers
+    if (SIMULATION_TYPE == 2) then
+      nadj_hprec_local = nrec_local
+    else
+      nadj_hprec_local = 1
+    endif
+    allocate(hpxir_store(nadj_hprec_local,NGLLX), &
+            hpetar_store(nadj_hprec_local,NGLLY), &
+            hpgammar_store(nadj_hprec_local,NGLLZ),stat=ier)
+    if( ier /= 0 ) call exit_MPI(myrank,'error allocating derivative interpolators')
+
+    ! stores interpolators for receiver positions
+    call setup_sources_receivers_intp(NSOURCES,myrank, &
+                      islice_selected_source, &
+                      xi_source,eta_source,gamma_source, &
+                      xigll,yigll,zigll, &
+                      SIMULATION_TYPE,nrec,nrec_local, &
+                      islice_selected_rec,number_receiver_global, &
+                      xi_receiver,eta_receiver,gamma_receiver, &
+                      hxir_store,hetar_store,hgammar_store, &
+                      nadj_hprec_local,hpxir_store,hpetar_store,hpgammar_store)
+
+    ! allocate seismogram array
+    if (SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then
+      allocate(seismograms(NDIM,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS),stat=ier)
+      if(ier /= 0) stop 'error while allocating seismograms'
+    else
+      allocate(seismograms(NDIM*NDIM,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS),stat=ier)
+      if(ier /= 0) stop 'error while allocating seismograms'
+      ! allocate Frechet derivatives array
+      allocate(moment_der(NDIM,NDIM,nrec_local),sloc_der(NDIM,nrec_local), &
+              stshift_der(nrec_local),shdur_der(nrec_local),stat=ier)
+      if( ier /= 0 ) call exit_MPI(myrank,'error allocating frechet derivatives arrays')
+
+      moment_der(:,:,:) = 0._CUSTOM_REAL
+      sloc_der(:,:) = 0._CUSTOM_REAL
+      stshift_der(:) = 0._CUSTOM_REAL
+      shdur_der(:) = 0._CUSTOM_REAL
+
+    endif
+    ! initialize seismograms
+    seismograms(:,:,:) = 0._CUSTOM_REAL
+    nit_written = 0
+  else
+    ! allocate a dummy array since we need it to pass as an argument, e.g. in the write_seismograms() routine
+    ! note: nrec_local is zero here; Fortran 90/95 allows zero-sized array allocation
+    allocate(seismograms(NDIM,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS),stat=ier)
+    if( ier /= 0) stop 'error while allocating zero seismograms'
+  endif
+
+  end subroutine setup_receivers_precompute_intp
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+
   subroutine setup_sources_receivers_intp(NSOURCES,myrank, &
                       islice_selected_source, &
                       xi_source,eta_source,gamma_source, &

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D.F90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D.F90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -34,28 +34,27 @@
 !
 !#define _HANDOPT
 
-! BEWARE:
-! BEWARE: we have observed that using _HANDOPT in combination with -O3 or higher can lead to problems on some machines;
-! BEWARE: thus, be careful when using it. At the very least, run the same test simulation once with _HANDOPT and once without
-! BEWARE: and make sure that all the seismograms you get are the same down to roundoff noise.
-! BEWARE:
-
 ! note: these hand optimizations should help compilers to pipeline the code and make better use of the cache;
 !          depending on compilers, it can further decrease the computation time by ~ 30%.
 !          the original routines are commented with "! way 1", the hand-optimized routines with  "! way 2"
 
   program xspecfem3D
 
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
+  use specfem_par_movie
   implicit none
 
-! standard include of the MPI library
+  ! standard include of the MPI library
   include 'mpif.h'
 
-  include "constants.h"
-  include "precision.h"
+  ! local parameters
+  integer :: ier
 
 ! include values created by the mesher
-  include "OUTPUT_FILES/values_from_mesher.h"
+!  include "OUTPUT_FILES/values_from_mesher.h"
 
 !=======================================================================!
 !                                                                       !
@@ -374,531 +373,7 @@
 ! Its first time derivative is called veloc_outer_core.
 ! Its second time derivative is called accel_outer_core.
 
-! memory variables and standard linear solids for attenuation
-  real(kind=CUSTOM_REAL), dimension(ATT1,ATT2,ATT3,ATT4) :: one_minus_sum_beta_crust_mantle, factor_scale_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(ATT1,ATT2,ATT3,ATT5) :: one_minus_sum_beta_inner_core, factor_scale_inner_core
 
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval, betaval, gammaval
-  real(kind=CUSTOM_REAL), dimension(N_SLS,ATT1,ATT2,ATT3,ATT4) :: factor_common_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(N_SLS,ATT1,ATT2,ATT3,ATT5) :: factor_common_inner_core
-
-  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUAT) :: R_memory_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_OR_ATT) :: epsilondev_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STRAIN_ONLY) :: eps_trace_over_3_crust_mantle
-
-  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ATTENUATION) :: R_memory_inner_core
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STR_OR_ATT) :: epsilondev_inner_core
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STRAIN_ONLY) :: eps_trace_over_3_inner_core
-
-! ADJOINT
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: b_alphaval, b_betaval, b_gammaval
-
-  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_AND_ATT) :: b_R_memory_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ADJOINT) :: b_epsilondev_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ADJOINT) :: b_eps_trace_over_3_crust_mantle
-
-  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STR_AND_ATT) :: b_R_memory_inner_core
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ADJOINT) :: b_epsilondev_inner_core
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ADJOINT) :: b_eps_trace_over_3_inner_core
-
-! for matching with central cube in inner core
-  integer, dimension(:), allocatable :: sender_from_slices_to_cube
-  integer, dimension(:,:), allocatable :: ibool_central_cube
-  double precision, dimension(:,:), allocatable :: buffer_slices,b_buffer_slices,buffer_slices2
-  double precision, dimension(:,:,:), allocatable :: buffer_all_cube_from_slices,b_buffer_all_cube_from_slices
-  integer nb_msgs_theor_in_cube,non_zero_nb_msgs_theor_in_cube,npoin2D_cube_from_slices,receiver_cube_from_slices
-
-  integer nspec2D_xmin_inner_core,nspec2D_xmax_inner_core,nspec2D_ymin_inner_core,nspec2D_ymax_inner_core
-
-! to save movie frames
-  integer nmovie_points,NIT
-  real(kind=CUSTOM_REAL), dimension(:), allocatable :: &
-      store_val_x,store_val_y,store_val_z, &
-      store_val_ux,store_val_uy,store_val_uz
-  real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: &
-      store_val_x_all,store_val_y_all,store_val_z_all, &
-      store_val_ux_all,store_val_uy_all,store_val_uz_all
-
-! to save movie volume
-  integer :: npoints_3dmovie,nspecel_3dmovie
-  integer, dimension(NGLOB_CRUST_MANTLE) :: num_ibool_3dmovie
-  double precision :: scalingval
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_OR_ATT) :: muvstore_crust_mantle_3dmovie
-  real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: nu_3dmovie
-  logical, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STRAIN_ONLY) :: mask_3dmovie
-
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_OR_ATT) :: Iepsilondev_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STRAIN_ONLY) :: Ieps_trace_over_3_crust_mantle
-
-! use integer array to store values
-  integer, dimension(NX_BATHY,NY_BATHY) :: ibathy_topo
-
-! for crust/oceans coupling
-  integer, dimension(NSPEC2DMAX_XMIN_XMAX_CM) :: ibelm_xmin_crust_mantle,ibelm_xmax_crust_mantle
-  integer, dimension(NSPEC2DMAX_YMIN_YMAX_CM) :: ibelm_ymin_crust_mantle,ibelm_ymax_crust_mantle
-  integer, dimension(NSPEC2D_BOTTOM_CM) :: ibelm_bottom_crust_mantle
-  integer, dimension(NSPEC2D_TOP_CM) :: ibelm_top_crust_mantle
-
-! additional mass matrix for ocean load
-  real(kind=CUSTOM_REAL), dimension(NGLOB_CRUST_MANTLE_OCEANS) :: rmass_ocean_load
-
-! flag to mask ocean-bottom degrees of freedom for ocean load
-  logical, dimension(NGLOB_CRUST_MANTLE_OCEANS) :: updated_dof_ocean_load
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_BOTTOM_CM) :: jacobian2D_bottom_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_TOP_CM) :: jacobian2D_top_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ,NSPEC2DMAX_XMIN_XMAX_CM) :: jacobian2D_xmin_crust_mantle,&
-  jacobian2D_xmax_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ,NSPEC2DMAX_YMIN_YMAX_CM) :: jacobian2D_ymin_crust_mantle,&
-  jacobian2D_ymax_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLY,NGLLZ,NSPEC2DMAX_XMIN_XMAX_CM) :: &
-  normal_xmin_crust_mantle,normal_xmax_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2DMAX_YMIN_YMAX_CM) :: &
-  normal_ymin_crust_mantle,normal_ymax_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_BOTTOM_CM) :: normal_bottom_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_TOP_CM) :: normal_top_crust_mantle
-
-! Stacey
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STACEY) :: rho_vp_crust_mantle,rho_vs_crust_mantle
-  integer nspec2D_xmin_crust_mantle,nspec2D_xmax_crust_mantle,nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle
-  integer, dimension(2,NSPEC2DMAX_YMIN_YMAX_CM) :: nimin_crust_mantle,nimax_crust_mantle,nkmin_eta_crust_mantle
-  integer, dimension(2,NSPEC2DMAX_XMIN_XMAX_CM) :: njmin_crust_mantle,njmax_crust_mantle,nkmin_xi_crust_mantle
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_STACEY) :: vp_outer_core
-  integer nspec2D_xmin_outer_core,nspec2D_xmax_outer_core,nspec2D_ymin_outer_core,nspec2D_ymax_outer_core
-  integer, dimension(2,NSPEC2DMAX_YMIN_YMAX_OC) :: nimin_outer_core,nimax_outer_core,nkmin_eta_outer_core
-  integer, dimension(2,NSPEC2DMAX_XMIN_XMAX_OC) :: njmin_outer_core,njmax_outer_core,nkmin_xi_outer_core
-
-! arrays to couple with the fluid regions by pointwise matching
-  integer, dimension(NSPEC2DMAX_XMIN_XMAX_OC) :: ibelm_xmin_outer_core,ibelm_xmax_outer_core
-  integer, dimension(NSPEC2DMAX_YMIN_YMAX_OC) :: ibelm_ymin_outer_core,ibelm_ymax_outer_core
-  integer, dimension(NSPEC2D_BOTTOM_OC) :: ibelm_bottom_outer_core
-  integer, dimension(NSPEC2D_TOP_OC) :: ibelm_top_outer_core
-
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLY,NGLLZ,NSPEC2DMAX_XMIN_XMAX_OC) :: normal_xmin_outer_core,normal_xmax_outer_core
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLZ,NSPEC2DMAX_YMIN_YMAX_OC) :: normal_ymin_outer_core,normal_ymax_outer_core
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_BOTTOM_OC) :: normal_bottom_outer_core
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_TOP_OC) :: normal_top_outer_core
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_BOTTOM_OC) :: jacobian2D_bottom_outer_core
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_TOP_OC) :: jacobian2D_top_outer_core
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ,NSPEC2DMAX_XMIN_XMAX_OC) :: jacobian2D_xmin_outer_core,jacobian2D_xmax_outer_core
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ,NSPEC2DMAX_YMIN_YMAX_OC) :: jacobian2D_ymin_outer_core,jacobian2D_ymax_outer_core
-
-
-  integer, dimension(NSPEC2DMAX_XMIN_XMAX_IC) :: ibelm_xmin_inner_core,ibelm_xmax_inner_core
-  integer, dimension(NSPEC2DMAX_YMIN_YMAX_IC) :: ibelm_ymin_inner_core,ibelm_ymax_inner_core
-  integer, dimension(NSPEC2D_BOTTOM_IC) :: ibelm_bottom_inner_core
-  integer, dimension(NSPEC2D_TOP_IC) :: ibelm_top_inner_core
-
-! for ellipticity
-  integer nspl
-  double precision rspl(NR),espl(NR),espl2(NR)
-
-! for conversion from x y z to r theta phi
-  real(kind=CUSTOM_REAL) rval,thetaval,phival
-
-! ---- arrays to assemble between chunks
-
-! communication pattern for faces between chunks
-  integer, dimension(NUMMSGS_FACES_VAL) :: iprocfrom_faces,iprocto_faces,imsg_type
-
-! communication pattern for corners between chunks
-  integer, dimension(NCORNERSCHUNKS_VAL) :: iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners
-
-! indirect addressing for each message for faces and corners of the chunks
-! a given slice can belong to at most one corner and at most two faces
-  integer NGLOB2DMAX_XY
-  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_crust_mantle, &
-      iboolfaces_outer_core,iboolfaces_inner_core
-
-! this for non blocking MPI
-
-! buffers for send and receive between faces of the slices and the chunks
-! we use the same buffers to assemble scalars and vectors because vectors are
-! always three times bigger and therefore scalars can use the first part
-! of the vector buffer in memory even if it has an additional index here
-  integer :: npoin2D_max_all_CM_IC
-  real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: buffer_send_faces,buffer_received_faces, &
-                                                           b_buffer_send_faces,b_buffer_received_faces
-
-! for non blocking communications
-  logical, dimension(NSPEC_CRUST_MANTLE) :: is_on_a_slice_edge_crust_mantle
-  logical, dimension(NSPEC_OUTER_CORE) :: is_on_a_slice_edge_outer_core
-  logical, dimension(NSPEC_INNER_CORE) :: is_on_a_slice_edge_inner_core
-  logical, dimension(NGLOB_CRUST_MANTLE) :: mask_ibool
-  real :: percentage_edge
-
-! assembling phase number for non blocking MPI
-! iphase is for the crust_mantle, outer_core and inner_core regions
-! iphase_CC is for the central cube
-  integer :: iphase,iphase_CC,icall
-  integer :: b_iphase,b_iphase_CC,b_icall
-
-! -------- arrays specific to each region here -----------
-
-! ----------------- crust, mantle and oceans ---------------------
-
-! mesh parameters
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: ibool_crust_mantle
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: &
-        xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle,&
-        etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
-        gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NGLOB_CRUST_MANTLE) :: &
-        xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle
-
-! arrays for isotropic elements stored only where needed to save space
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ISO_MANTLE) :: &
-        rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle
-
-! arrays for anisotropic elements stored only where needed to save space
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_TISO_MANTLE) :: &
-        kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle
-
-! arrays for full anisotropy only when needed
-  integer nspec_iso,nspec_tiso,nspec_ani
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ANISO_MANTLE) :: &
-        c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle, &
-        c14store_crust_mantle,c15store_crust_mantle,c16store_crust_mantle, &
-        c22store_crust_mantle,c23store_crust_mantle,c24store_crust_mantle, &
-        c25store_crust_mantle,c26store_crust_mantle,c33store_crust_mantle, &
-        c34store_crust_mantle,c35store_crust_mantle,c36store_crust_mantle, &
-        c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-        c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle
-
-! local to global mapping
-!  integer, dimension(NSPEC_CRUST_MANTLE) :: idoubling_crust_mantle
-  logical, dimension(NSPEC_CRUST_MANTLE) :: ispec_is_tiso_crust_mantle
-
-! mass matrix
-  real(kind=CUSTOM_REAL), dimension(NGLOB_CRUST_MANTLE) :: rmass_crust_mantle
-
-! displacement, velocity, acceleration
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: &
-     displ_crust_mantle,veloc_crust_mantle,accel_crust_mantle
-
-! ----------------- outer core ---------------------
-
-! mesh parameters
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE) :: ibool_outer_core
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE) :: &
-        xix_outer_core,xiy_outer_core,xiz_outer_core,&
-        etax_outer_core,etay_outer_core,etaz_outer_core, &
-        gammax_outer_core,gammay_outer_core,gammaz_outer_core
-  real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE) :: &
-        xstore_outer_core,ystore_outer_core,zstore_outer_core
-
- real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE) :: &
-        rhostore_outer_core,kappavstore_outer_core
-
-! local to global mapping
-  integer, dimension(NSPEC_OUTER_CORE) :: idoubling_outer_core
-  logical, dimension(NSPEC_OUTER_CORE) :: ispec_is_tiso_outer_core ! only needed for compute_boundary_kernel()
-
-! mass matrix
-  real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE) :: rmass_outer_core
-
-! velocity potential
-  real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE) :: displ_outer_core, &
-    veloc_outer_core,accel_outer_core
-
-! ----------------- inner core ---------------------
-
-! mesh parameters
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: ibool_inner_core
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: &
-        xix_inner_core,xiy_inner_core,xiz_inner_core,&
-        etax_inner_core,etay_inner_core,etaz_inner_core, &
-        gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
-        rhostore_inner_core, kappavstore_inner_core,muvstore_inner_core
-  real(kind=CUSTOM_REAL), dimension(NGLOB_INNER_CORE) :: &
-        xstore_inner_core,ystore_inner_core,zstore_inner_core
-
-! arrays for inner-core anisotropy only when needed
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ANISO_IC) :: &
-        c11store_inner_core,c33store_inner_core,c12store_inner_core, &
-        c13store_inner_core,c44store_inner_core
-
-! local to global mapping
-  integer, dimension(NSPEC_INNER_CORE) :: idoubling_inner_core
-  logical, dimension(NSPEC_INNER_CORE) :: ispec_is_tiso_inner_core ! only needed for compute_boundary_kernel() routine
-
-! mass matrix
-  real(kind=CUSTOM_REAL), dimension(NGLOB_INNER_CORE) :: rmass_inner_core
-
-! displacement, velocity, acceleration
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE) :: &
-     displ_inner_core,veloc_inner_core,accel_inner_core
-
-! Newmark time scheme parameters and non-dimensionalization
-  real(kind=CUSTOM_REAL) time,deltat,deltatover2,deltatsqover2
-  double precision scale_t,scale_t_inv,scale_displ,scale_veloc
-
-! ADJOINT
-  real(kind=CUSTOM_REAL) b_deltat,b_deltatover2,b_deltatsqover2
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE_ADJOINT) :: &
-    b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE_ADJOINT) :: &
-    b_displ_outer_core,b_veloc_outer_core,b_accel_outer_core
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE_ADJOINT) :: &
-    b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_ADJOINT) :: div_displ_outer_core
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_ADJOINT) :: b_div_displ_outer_core
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ADJOINT) :: rho_kl_crust_mantle, &
-     beta_kl_crust_mantle, alpha_kl_crust_mantle, Sigma_kl_crust_mantle
-! For anisotropic kernels (see compute_kernels.f90 for a definition of the array)
-  real(kind=CUSTOM_REAL), dimension(21,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ADJOINT) :: cijkl_kl_crust_mantle
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_ADJOINT) :: rho_kl_outer_core, &
-     alpha_kl_outer_core
-
-  ! approximate hessian
-  real(kind=CUSTOM_REAL), dimension(:,:,:,:),allocatable :: hess_kl_crust_mantle
-
-  ! check for deviatoric kernel for outer core region
-  real(kind=CUSTOM_REAL), dimension(:,:,:,:),allocatable :: beta_kl_outer_core
-  integer :: nspec_beta_kl_outer_core
-  logical,parameter:: deviatoric_outercore = .false.
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ADJOINT) :: rho_kl_inner_core, &
-     beta_kl_inner_core, alpha_kl_inner_core
-
-  real(kind=CUSTOM_REAL), dimension(:,:,:,:,:), allocatable :: absorb_xmin_crust_mantle5, &
-     absorb_xmax_crust_mantle5, absorb_ymin_crust_mantle5, absorb_ymax_crust_mantle5
-
-  real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: absorb_xmin_outer_core, &
-     absorb_xmax_outer_core, absorb_ymin_outer_core, absorb_ymax_outer_core, &
-     absorb_zmin_outer_core
-  integer nabs_xmin_cm,nabs_xmax_cm,nabs_ymin_cm,nabs_ymax_cm
-  integer nabs_xmin_oc,nabs_xmax_oc,nabs_ymin_oc,nabs_ymax_oc,nabs_zmin_oc
-
-  integer reclen_xmin_crust_mantle, reclen_xmax_crust_mantle, reclen_ymin_crust_mantle, &
-     reclen_ymax_crust_mantle, reclen_xmin_outer_core, reclen_xmax_outer_core,&
-     reclen_ymin_outer_core, reclen_ymax_outer_core, reclen_zmin
-
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_OUTER_CORE) :: vector_accel_outer_core,&
-             vector_displ_outer_core, b_vector_displ_outer_core
-
-  integer npoin2D_faces_crust_mantle(NUMFACES_SHARED)
-  integer npoin2D_faces_outer_core(NUMFACES_SHARED)
-  integer npoin2D_faces_inner_core(NUMFACES_SHARED)
-
-! parameters for the source
-  integer it
-  integer, dimension(:), allocatable :: islice_selected_source,ispec_selected_source
-  real(kind=CUSTOM_REAL), dimension(:,:,:,:,:), allocatable :: sourcearrays
-  double precision, dimension(:,:,:) ,allocatable:: nu_source
-  double precision sec
-  double precision, dimension(:), allocatable :: Mxx,Myy,Mzz,Mxy,Mxz,Myz
-  double precision, dimension(:), allocatable :: xi_source,eta_source,gamma_source
-  double precision, dimension(:), allocatable :: tshift_cmt,hdur,hdur_gaussian
-  double precision, dimension(:), allocatable :: theta_source,phi_source
-  double precision, external :: comp_source_time_function
-  double precision t0
-
-! receiver information
-  integer nrec,nrec_local
-  integer, dimension(:), allocatable :: islice_selected_rec,ispec_selected_rec,number_receiver_global
-  double precision, dimension(:), allocatable :: xi_receiver,eta_receiver,gamma_receiver
-  character(len=150) :: STATIONS,rec_filename
-  double precision, dimension(:,:,:), allocatable :: nu
-  double precision, allocatable, dimension(:) :: stlat,stlon,stele,stbur
-  character(len=MAX_LENGTH_STATION_NAME), dimension(:), allocatable  :: station_name
-  character(len=MAX_LENGTH_NETWORK_NAME), dimension(:), allocatable :: network_name
-
-!ADJOINT
-  real(kind=CUSTOM_REAL), dimension(:,:,:,:,:,:), allocatable :: adj_sourcearrays
-  integer nrec_simulation, nadj_rec_local
-  integer NSTEP_SUB_ADJ  ! to read input in chunks
-  integer, dimension(:,:), allocatable :: iadjsrc ! to read input in chunks
-  integer, dimension(:), allocatable :: iadjsrc_len,iadj_vec
-! source frechet derivatives
-  real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: moment_der
-  real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: sloc_der
-  real(kind=CUSTOM_REAL), dimension(:), allocatable :: stshift_der, shdur_der
-  double precision, dimension(:,:), allocatable :: hpxir_store,hpetar_store,hpgammar_store
-  integer :: nadj_hprec_local
-
-! seismograms
-  integer it_begin,it_end,nit_written
-  real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: seismograms
-  integer :: seismo_offset, seismo_current
-
-! non-dimensionalized rotation rate of the Earth times two
-  real(kind=CUSTOM_REAL) two_omega_earth
-
-! for the Euler scheme for rotation
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_ROTATION) :: &
-    A_array_rotation,B_array_rotation
-
-! number of faces between chunks
-  integer NUMMSGS_FACES
-
-! number of corners between chunks
-  integer NCORNERSCHUNKS
-
-! number of message types
-  integer NUM_MSG_TYPES
-
-! indirect addressing for each corner of the chunks
-  integer, dimension(NGLOB1D_RADIAL_CM,NUMCORNERS_SHARED) :: iboolcorner_crust_mantle
-  integer, dimension(NGLOB1D_RADIAL_OC,NUMCORNERS_SHARED) :: iboolcorner_outer_core
-  integer, dimension(NGLOB1D_RADIAL_IC,NUMCORNERS_SHARED) :: iboolcorner_inner_core
-
-! buffers for send and receive between corners of the chunks
-  real(kind=CUSTOM_REAL), dimension(NGLOB1D_RADIAL_CM) :: buffer_send_chunkcorn_scalar,buffer_recv_chunkcorn_scalar, &
-                                                          b_buffer_send_chunkcorn_scalar,b_buffer_recv_chunkcorn_scalar
-! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB1D_RADIAL_CM + NGLOB1D_RADIAL_IC) :: &
-     buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector, &
-     b_buffer_send_chunkcorn_vector,b_buffer_recv_chunkcorn_vector
-
-! Gauss-Lobatto-Legendre points of integration and weights
-  double precision, dimension(NGLLX) :: xigll,wxgll
-  double precision, dimension(NGLLY) :: yigll,wygll
-  double precision, dimension(NGLLZ) :: zigll,wzgll
-
-! product of weights for gravity term
-  double precision, dimension(NGLLX,NGLLY,NGLLZ) :: wgll_cube
-
-! array with derivatives of Lagrange polynomials and precalculated products
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprimewgll_xx
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xxT,hprimewgll_xxT
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLY) :: hprime_yy,hprimewgll_yy
-  real(kind=CUSTOM_REAL), dimension(NGLLZ,NGLLZ) :: hprime_zz,hprimewgll_zz
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
-
-! Lagrange interpolators at receivers
-  double precision, dimension(:,:), allocatable :: hxir_store,hetar_store,hgammar_store
-
-! 2-D addressing and buffers for summation between slices
-  integer, dimension(NGLOB2DMAX_XMIN_XMAX_CM) :: iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle
-  integer, dimension(NGLOB2DMAX_YMIN_YMAX_CM) :: iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle
-
-  integer, dimension(NGLOB2DMAX_XMIN_XMAX_OC) :: iboolleft_xi_outer_core,iboolright_xi_outer_core
-  integer, dimension(NGLOB2DMAX_YMIN_YMAX_OC) :: iboolleft_eta_outer_core,iboolright_eta_outer_core
-
-  integer, dimension(NGLOB2DMAX_XMIN_XMAX_IC) :: iboolleft_xi_inner_core,iboolright_xi_inner_core
-  integer, dimension(NGLOB2DMAX_YMIN_YMAX_IC) :: iboolleft_eta_inner_core,iboolright_eta_inner_core
-
-! for addressing of the slices
-  integer, dimension(NCHUNKS_VAL,0:NPROC_XI_VAL-1,0:NPROC_ETA_VAL-1) :: addressing
-  integer, dimension(0:NPROCTOT_VAL-1) :: ichunk_slice,iproc_xi_slice,iproc_eta_slice
-
-! proc numbers for MPI
-  integer myrank
-
-  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle
-  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_outer_core,npoin2D_eta_outer_core
-  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_inner_core,npoin2D_eta_inner_core
-
-  integer ichunk,iproc_xi,iproc_eta
-
-!ADJOINT
-  real(kind=CUSTOM_REAL) b_two_omega_earth
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_ROT_ADJOINT) :: &
-    b_A_array_rotation,b_B_array_rotation
-
-  double precision :: time_start
-
-! parameters read from parameter file
-  integer MIN_ATTENUATION_PERIOD,MAX_ATTENUATION_PERIOD,NER_CRUST, &
-          NER_80_MOHO,NER_220_80,NER_400_220,NER_600_400,NER_670_600,NER_771_670, &
-          NER_TOPDDOUBLEPRIME_771,NER_CMB_TOPDDOUBLEPRIME,NER_OUTER_CORE, &
-          NER_TOP_CENTRAL_CUBE_ICB,NEX_XI,NEX_ETA, &
-          NTSTEP_BETWEEN_OUTPUT_SEISMOS,&
-          NTSTEP_BETWEEN_READ_ADJSRC,NSTEP,NSOURCES,NTSTEP_BETWEEN_FRAMES, &
-          NTSTEP_BETWEEN_OUTPUT_INFO,NUMBER_OF_RUNS,NUMBER_OF_THIS_RUN,SIMULATION_TYPE, &
-          MOVIE_VOLUME_TYPE,MOVIE_START,MOVIE_STOP,NOISE_TOMOGRAPHY
-
-  double precision DT,ROCEAN,RMIDDLE_CRUST, &
-          RMOHO,R80,R220,R400,R600,R670,R771,RTOPDDOUBLEPRIME,RCMB,RICB, &
-          RHO_TOP_OC,RHO_BOTTOM_OC,RHO_OCEANS,HDUR_MOVIE, &
-          MOVIE_TOP,MOVIE_BOTTOM,MOVIE_WEST,MOVIE_EAST,MOVIE_NORTH,MOVIE_SOUTH, &
-          ANGULAR_WIDTH_XI_IN_DEGREES
-
-  logical ONE_CRUST,TOPOGRAPHY,MOVIE_SURFACE,MOVIE_VOLUME,MOVIE_COARSE, &
-          RECEIVERS_CAN_BE_BURIED,PRINT_SOURCE_TIME_FUNCTION, &
-          SAVE_MESH_FILES,ABSORBING_CONDITIONS,INCLUDE_CENTRAL_CUBE,SAVE_FORWARD, &
-          OUTPUT_SEISMOS_ASCII_TEXT,OUTPUT_SEISMOS_SAC_ALPHANUM,OUTPUT_SEISMOS_SAC_BINARY, &
-          ROTATE_SEISMOGRAMS_RT,HONOR_1D_SPHERICAL_MOHO,WRITE_SEISMOGRAMS_BY_MASTER,&
-          SAVE_ALL_SEISMOS_IN_ONE_FILE,USE_BINARY_FOR_LARGE_FILE
-
-  character(len=150) OUTPUT_FILES,LOCAL_PATH,MODEL
-
-!  logical COMPUTE_AND_STORE_STRAIN
-
-! for SAC headers for seismograms
-  integer yr_SAC,jda_SAC,ho_SAC,mi_SAC
-  real mb_SAC
-  double precision t_cmt_SAC,t_shift_SAC,elat_SAC,elon_SAC,depth_SAC, &
-    cmt_lat_SAC,cmt_lon_SAC,cmt_depth_SAC,cmt_hdur_SAC,sec_SAC
-  character(len=20) event_name_SAC
-
-! this for all the regions
-  integer, dimension(MAX_NUM_REGIONS) :: NSPEC2DMAX_XMIN_XMAX,NSPEC2DMAX_YMIN_YMAX, &
-               NSPEC2D_BOTTOM,NSPEC2D_TOP, &
-               NGLOB1D_RADIAL, &
-               NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX
-
-  character(len=150) prname
-
-!daniel: debugging
-!  character(len=256) :: filename
-!  logical, parameter :: SNAPSHOT_INNER_CORE = .true.
-
-! lookup table every km for gravity
-  real(kind=CUSTOM_REAL) minus_g_cmb,minus_g_icb
-  double precision, dimension(NRAD_GRAVITY) :: minus_gravity_table, &
-    minus_deriv_gravity_table,density_table,d_ln_density_dr_table,minus_rho_g_over_kappa_fluid
-
-! dummy array that does not need to be actually read
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,1) :: dummy_array
-
-! computed in read_compute_parameters
-  integer, dimension(MAX_NUMBER_OF_MESH_LAYERS) :: ner,ratio_sampling_array
-  integer, dimension(MAX_NUMBER_OF_MESH_LAYERS) :: doubling_index
-  double precision, dimension(MAX_NUMBER_OF_MESH_LAYERS) :: r_bottom,r_top
-  logical, dimension(MAX_NUMBER_OF_MESH_LAYERS) :: this_region_has_a_doubling
-  double precision, dimension(MAX_NUMBER_OF_MESH_LAYERS) :: rmins,rmaxs
-
-! Boundary Mesh and Kernels
-  integer k_top,k_bot,iregion_code
-  integer, dimension(NSPEC2D_MOHO) :: ibelm_moho_top,ibelm_moho_bot
-  integer, dimension(NSPEC2D_400) :: ibelm_400_top,ibelm_400_bot
-  integer, dimension(NSPEC2D_670) :: ibelm_670_top,ibelm_670_bot
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_MOHO) :: normal_moho
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_400) :: normal_400
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_670) :: normal_670
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_MOHO) :: moho_kl, moho_kl_top, moho_kl_bot
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_400) :: d400_kl, d400_kl_top, d400_kl_bot
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_670) ::  d670_kl, d670_kl_top, d670_kl_bot
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_CMB) :: cmb_kl, cmb_kl_top, cmb_kl_bot
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_ICB) :: icb_kl, icb_kl_top, icb_kl_bot
-  logical :: fluid_solid_boundary
-
-  integer :: i,ier
-
-! NOISE_TOMOGRAPHY
-  real(kind=CUSTOM_REAL), dimension(:,:,:,:,:), allocatable :: noise_sourcearray
-  real(kind=CUSTOM_REAL), dimension(:), allocatable :: &
-             normal_x_noise,normal_y_noise,normal_z_noise, mask_noise
-  real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: noise_surface_movie
-  integer :: irec_master_noise
-
-#ifdef _HANDOPT
-  integer :: imodulo_NGLOB_CRUST_MANTLE,imodulo_NGLOB_CRUST_MANTLE4, &
-            imodulo_NGLOB_INNER_CORE
-#endif
-
 ! ************** PROGRAM STARTS HERE **************
 !
 !-------------------------------------------------------------------------------------------------
@@ -933,7 +408,7 @@
 !             passing them along as arguments to the routine makes the code slower.
 !             it seems that this stack/heap criterion is more complicated.
 !
-!             another reason why modules are avoided is to make the code thread safe.
+!             another reason why the use of modules is restricted is to make the code thread safe.
 !             having different threads access the same data structure and modifying it at the same time
 !             would lead to problems. passing arguments is a way to avoid such complications.
 !
@@ -970,3673 +445,29 @@
   call force_ftz()
 
   ! initializes simulation parameters
-  call initialize_simulation(myrank,MIN_ATTENUATION_PERIOD,MAX_ATTENUATION_PERIOD,NER_CRUST, &
-                NER_80_MOHO,NER_220_80,NER_400_220,NER_600_400,NER_670_600,NER_771_670, &
-                NER_TOPDDOUBLEPRIME_771,NER_CMB_TOPDDOUBLEPRIME,NER_OUTER_CORE, &
-                NER_TOP_CENTRAL_CUBE_ICB,ANGULAR_WIDTH_XI_IN_DEGREES,NEX_XI,NEX_ETA, &
-                NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
-                NTSTEP_BETWEEN_READ_ADJSRC,NSTEP,NSOURCES,NTSTEP_BETWEEN_FRAMES, &
-                NTSTEP_BETWEEN_OUTPUT_INFO,NUMBER_OF_RUNS,NUMBER_OF_THIS_RUN,SIMULATION_TYPE, &
-                DT,ROCEAN,RMIDDLE_CRUST,RMOHO,R80,R220,R400,R600,R670,R771,&
-                RTOPDDOUBLEPRIME,RCMB,RICB, &
-                RHO_TOP_OC,RHO_BOTTOM_OC,RHO_OCEANS, &
-                MOVIE_VOLUME_TYPE,MOVIE_START,MOVIE_STOP, &
-                HDUR_MOVIE,MOVIE_TOP,MOVIE_BOTTOM,MOVIE_WEST,MOVIE_EAST, &
-                MOVIE_NORTH,MOVIE_SOUTH,MOVIE_SURFACE,MOVIE_VOLUME, &
-                RECEIVERS_CAN_BE_BURIED,PRINT_SOURCE_TIME_FUNCTION, &
-                SAVE_MESH_FILES,ABSORBING_CONDITIONS,INCLUDE_CENTRAL_CUBE,SAVE_FORWARD, &
-                SAVE_ALL_SEISMOS_IN_ONE_FILE,MOVIE_COARSE,OUTPUT_SEISMOS_ASCII_TEXT, &
-                OUTPUT_SEISMOS_SAC_ALPHANUM,OUTPUT_SEISMOS_SAC_BINARY, &
-                ROTATE_SEISMOGRAMS_RT,WRITE_SEISMOGRAMS_BY_MASTER,USE_BINARY_FOR_LARGE_FILE, &
-                LOCAL_PATH,MODEL,OUTPUT_FILES, &
-                ratio_sampling_array, ner, doubling_index,r_bottom,r_top, &
-                this_region_has_a_doubling,rmins,rmaxs, &
-                TOPOGRAPHY,HONOR_1D_SPHERICAL_MOHO,ONE_CRUST, &
-                nspl,rspl,espl,espl2,ibathy_topo, &
-                NSPEC2DMAX_XMIN_XMAX,NSPEC2DMAX_YMIN_YMAX,NSPEC2D_BOTTOM,NSPEC2D_TOP, &
-                NGLOB1D_RADIAL,NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX, &
-                xigll,yigll,zigll,wxgll,wygll,wzgll,wgll_cube, &
-                hprime_xx,hprime_yy,hprime_zz,hprime_xxT, &
-                hprimewgll_xx,hprimewgll_yy,hprimewgll_zz,hprimewgll_xxT, &
-                wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                rec_filename,STATIONS,nrec,NOISE_TOMOGRAPHY)
-!
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!
-! starts reading the databases
-  call read_mesh_databases(myrank,rho_vp_crust_mantle,rho_vs_crust_mantle, &
-              xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
-              xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-              etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
-              gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
-              rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
-              kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-              nspec_iso,nspec_tiso,nspec_ani, &
-              c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle, &
-              c14store_crust_mantle,c15store_crust_mantle,c16store_crust_mantle, &
-              c22store_crust_mantle,c23store_crust_mantle,c24store_crust_mantle, &
-              c25store_crust_mantle,c26store_crust_mantle,c33store_crust_mantle, &
-              c34store_crust_mantle,c35store_crust_mantle,c36store_crust_mantle, &
-              c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-              c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-              ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
-            ! -- idoubling_crust_mantle,
-              is_on_a_slice_edge_crust_mantle,rmass_crust_mantle,rmass_ocean_load, &
-              vp_outer_core,xstore_outer_core,ystore_outer_core,zstore_outer_core, &
-              xix_outer_core,xiy_outer_core,xiz_outer_core, &
-              etax_outer_core,etay_outer_core,etaz_outer_core, &
-              gammax_outer_core,gammay_outer_core,gammaz_outer_core, &
-              rhostore_outer_core,kappavstore_outer_core, &
-              ibool_outer_core,idoubling_outer_core,ispec_is_tiso_outer_core, &
-              is_on_a_slice_edge_outer_core,rmass_outer_core, &
-              xstore_inner_core,ystore_inner_core,zstore_inner_core, &
-              xix_inner_core,xiy_inner_core,xiz_inner_core, &
-              etax_inner_core,etay_inner_core,etaz_inner_core, &
-              gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
-              rhostore_inner_core,kappavstore_inner_core,muvstore_inner_core, &
-              c11store_inner_core,c12store_inner_core,c13store_inner_core, &
-              c33store_inner_core,c44store_inner_core, &
-              ibool_inner_core,idoubling_inner_core,ispec_is_tiso_inner_core, &
-              is_on_a_slice_edge_inner_core,rmass_inner_core, &
-              ABSORBING_CONDITIONS,LOCAL_PATH)
+  call initialize_simulation()
 
-  ! read 2-D addressing for summation between slices with MPI
-  call read_mesh_databases_addressing(myrank, &
-              iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle, &
-              iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-              npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-              iboolfaces_crust_mantle,npoin2D_faces_crust_mantle, &
-              iboolcorner_crust_mantle, &
-              iboolleft_xi_outer_core,iboolright_xi_outer_core, &
-              iboolleft_eta_outer_core,iboolright_eta_outer_core, &
-              npoin2D_xi_outer_core,npoin2D_eta_outer_core,&
-              iboolfaces_outer_core,npoin2D_faces_outer_core, &
-              iboolcorner_outer_core, &
-              iboolleft_xi_inner_core,iboolright_xi_inner_core, &
-              iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-              npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-              iboolfaces_inner_core,npoin2D_faces_inner_core, &
-              iboolcorner_inner_core, &
-              iprocfrom_faces,iprocto_faces,imsg_type, &
-              iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-              LOCAL_PATH,OUTPUT_FILES, &
-              NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX,NGLOB1D_RADIAL, &
-              NGLOB2DMAX_XY,NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
-              addressing,ichunk_slice,iproc_xi_slice,iproc_eta_slice, &
-              ichunk,iproc_xi,iproc_eta)
+  ! starts reading the databases
+  call read_mesh_databases()
 
-  ! to couple mantle with outer core
-  call read_mesh_databases_coupling(myrank, &
-              nspec2D_xmin_crust_mantle,nspec2D_xmax_crust_mantle, &
-              nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle, &
-              ibelm_xmin_crust_mantle,ibelm_xmax_crust_mantle,ibelm_ymin_crust_mantle, &
-              ibelm_ymax_crust_mantle,ibelm_bottom_crust_mantle,ibelm_top_crust_mantle, &
-              normal_xmin_crust_mantle,normal_xmax_crust_mantle,normal_ymin_crust_mantle, &
-              normal_ymax_crust_mantle,normal_bottom_crust_mantle,normal_top_crust_mantle, &
-              jacobian2D_xmin_crust_mantle,jacobian2D_xmax_crust_mantle,jacobian2D_ymin_crust_mantle, &
-              jacobian2D_ymax_crust_mantle,jacobian2D_bottom_crust_mantle,jacobian2D_top_crust_mantle, &
-              nspec2D_xmin_outer_core,nspec2D_xmax_outer_core, &
-              nspec2D_ymin_outer_core,nspec2D_ymax_outer_core, &
-              ibelm_xmin_outer_core,ibelm_xmax_outer_core,ibelm_ymin_outer_core, &
-              ibelm_ymax_outer_core,ibelm_bottom_outer_core,ibelm_top_outer_core, &
-              normal_xmin_outer_core,normal_xmax_outer_core,normal_ymin_outer_core, &
-              normal_ymax_outer_core,normal_bottom_outer_core,normal_top_outer_core, &
-              jacobian2D_xmin_outer_core,jacobian2D_xmax_outer_core,jacobian2D_ymin_outer_core, &
-              jacobian2D_ymax_outer_core,jacobian2D_bottom_outer_core,jacobian2D_top_outer_core, &
-              nspec2D_xmin_inner_core,nspec2D_xmax_inner_core, &
-              nspec2D_ymin_inner_core,nspec2D_ymax_inner_core, &
-              ibelm_xmin_inner_core,ibelm_xmax_inner_core,ibelm_ymin_inner_core, &
-              ibelm_ymax_inner_core,ibelm_bottom_inner_core,ibelm_top_inner_core, &
-              ibelm_moho_top,ibelm_moho_bot,ibelm_400_top,ibelm_400_bot, &
-              ibelm_670_top,ibelm_670_bot,normal_moho,normal_400,normal_670, &
-              k_top,k_bot,moho_kl,d400_kl,d670_kl,cmb_kl,icb_kl, &
-              LOCAL_PATH,SIMULATION_TYPE)
+  ! sets up reference element GLL points/weights/derivatives
+  call setup_GLL_points()
 
-! added this to reduce the size of the buffers
-! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
-  npoin2D_max_all_CM_IC = max(maxval(npoin2D_xi_crust_mantle(:) + npoin2D_xi_inner_core(:)), &
-                        maxval(npoin2D_eta_crust_mantle(:) + npoin2D_eta_inner_core(:)))
+  ! reads topography & bathymetry & ellipticity
+  call read_topography_bathymetry()
 
-  allocate(buffer_send_faces(NDIM,npoin2D_max_all_CM_IC,NUMFACES_SHARED), &
-          buffer_received_faces(NDIM,npoin2D_max_all_CM_IC,NUMFACES_SHARED),stat=ier)
-  if( ier /= 0 ) call exit_MPI(myrank,'error allocating mpi buffer')
+  ! prepares sources and receivers
+  call setup_sources_receivers()
 
-  allocate(b_buffer_send_faces(NDIM,npoin2D_max_all_CM_IC,NUMFACES_SHARED), &
-          b_buffer_received_faces(NDIM,npoin2D_max_all_CM_IC,NUMFACES_SHARED),stat=ier)
-  if( ier /= 0 ) call exit_MPI(myrank,'error allocating mpi b_buffer')
+  ! sets up and precomputes simulation arrays
+  call prepare_timerun()
 
-  call fix_non_blocking_slices(is_on_a_slice_edge_crust_mantle,iboolright_xi_crust_mantle, &
-         iboolleft_xi_crust_mantle,iboolright_eta_crust_mantle,iboolleft_eta_crust_mantle, &
-         npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle,ibool_crust_mantle, &
-         mask_ibool,NSPEC_CRUST_MANTLE,NGLOB_CRUST_MANTLE,NGLOB2DMAX_XMIN_XMAX_CM,NGLOB2DMAX_YMIN_YMAX_CM)
+  ! steps through time iterations
+  call iterate_time()
 
-  call fix_non_blocking_slices(is_on_a_slice_edge_outer_core,iboolright_xi_outer_core, &
-         iboolleft_xi_outer_core,iboolright_eta_outer_core,iboolleft_eta_outer_core, &
-         npoin2D_xi_outer_core,npoin2D_eta_outer_core,ibool_outer_core, &
-         mask_ibool,NSPEC_OUTER_CORE,NGLOB_OUTER_CORE,NGLOB2DMAX_XMIN_XMAX_OC,NGLOB2DMAX_YMIN_YMAX_OC)
+  ! saves last time frame and finishes kernel calculations
+  call finalize_simulation()
 
-  call fix_non_blocking_slices(is_on_a_slice_edge_inner_core,iboolright_xi_inner_core, &
-         iboolleft_xi_inner_core,iboolright_eta_inner_core,iboolleft_eta_inner_core, &
-         npoin2D_xi_inner_core,npoin2D_eta_inner_core,ibool_inner_core, &
-         mask_ibool,NSPEC_INNER_CORE,NGLOB_INNER_CORE,NGLOB2DMAX_XMIN_XMAX_IC,NGLOB2DMAX_YMIN_YMAX_IC)
-
-  ! absorbing boundaries
-  if(ABSORBING_CONDITIONS) then
-    ! crust_mantle
-    if (nspec2D_xmin_crust_mantle > 0 .and. (SIMULATION_TYPE == 3 &
-      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
-      nabs_xmin_cm = nspec2D_xmin_crust_mantle
-    else
-      nabs_xmin_cm = 1
-    endif
-    allocate(absorb_xmin_crust_mantle5(NDIM,NGLLY,NGLLZ,nabs_xmin_cm,8),stat=ier)
-    if( ier /= 0 ) call exit_MPI(myrank,'error allocating absorb xmin')
-
-    if (nspec2D_xmax_crust_mantle > 0 .and. (SIMULATION_TYPE == 3 &
-      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
-      nabs_xmax_cm = nspec2D_xmax_crust_mantle
-    else
-      nabs_xmax_cm = 1
-    endif
-    allocate(absorb_xmax_crust_mantle5(NDIM,NGLLY,NGLLZ,nabs_xmax_cm,8),stat=ier)
-    if( ier /= 0 ) call exit_MPI(myrank,'error allocating absorb xmax')
-
-    if (nspec2D_ymin_crust_mantle > 0 .and. (SIMULATION_TYPE == 3 &
-      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
-      nabs_ymin_cm = nspec2D_ymin_crust_mantle
-    else
-      nabs_ymin_cm = 1
-    endif
-    allocate(absorb_ymin_crust_mantle5(NDIM,NGLLX,NGLLZ,nabs_ymin_cm,8),stat=ier)
-    if( ier /= 0 ) call exit_MPI(myrank,'error allocating absorb ymin')
-
-    if (nspec2D_ymax_crust_mantle > 0 .and. (SIMULATION_TYPE == 3 &
-      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
-      nabs_ymax_cm = nspec2D_ymax_crust_mantle
-    else
-      nabs_ymax_cm = 1
-    endif
-    allocate(absorb_ymax_crust_mantle5(NDIM,NGLLX,NGLLZ,nabs_ymax_cm,8),stat=ier)
-    if( ier /= 0 ) call exit_MPI(myrank,'error allocating absorb ymax')
-
-    ! outer_core
-    if (nspec2D_xmin_outer_core > 0 .and. (SIMULATION_TYPE == 3 &
-      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
-      nabs_xmin_oc = nspec2D_xmin_outer_core
-    else
-      nabs_xmin_oc = 1
-    endif
-    allocate(absorb_xmin_outer_core(NGLLY,NGLLZ,nabs_xmin_oc),stat=ier)
-    if( ier /= 0 ) call exit_MPI(myrank,'error allocating absorb xmin')
-
-    if (nspec2D_xmax_outer_core > 0 .and. (SIMULATION_TYPE == 3 &
-      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
-      nabs_xmax_oc = nspec2D_xmax_outer_core
-    else
-      nabs_xmax_oc = 1
-    endif
-    allocate(absorb_xmax_outer_core(NGLLY,NGLLZ,nabs_xmax_oc),stat=ier)
-    if( ier /= 0 ) call exit_MPI(myrank,'error allocating absorb xmax')
-
-    if (nspec2D_ymin_outer_core > 0 .and. (SIMULATION_TYPE == 3 &
-      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
-      nabs_ymin_oc = nspec2D_ymin_outer_core
-    else
-      nabs_ymin_oc = 1
-    endif
-    allocate(absorb_ymin_outer_core(NGLLX,NGLLZ,nabs_ymin_oc),stat=ier)
-    if( ier /= 0 ) call exit_MPI(myrank,'error allocating absorb ymin')
-
-    if (nspec2D_ymax_outer_core > 0 .and. (SIMULATION_TYPE == 3 &
-      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
-      nabs_ymax_oc = nspec2D_ymax_outer_core
-    else
-      nabs_ymax_oc = 1
-    endif
-    allocate(absorb_ymax_outer_core(NGLLX,NGLLZ,nabs_ymax_oc),stat=ier)
-    if( ier /= 0 ) call exit_MPI(myrank,'error allocating absorb ymax')
-
-    if (NSPEC2D_BOTTOM(IREGION_OUTER_CORE) > 0 .and. &
-       (SIMULATION_TYPE == 3 .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
-      nabs_zmin_oc = NSPEC2D_BOTTOM(IREGION_OUTER_CORE)
-    else
-      nabs_zmin_oc = 1
-    endif
-    allocate(absorb_zmin_outer_core(NGLLX,NGLLY,nabs_zmin_oc),stat=ier)
-    if( ier /= 0 ) call exit_MPI(myrank,'error allocating absorb zmin')
-
-    ! read arrays for Stacey conditions
-    call read_mesh_databases_stacey(myrank, &
-                      nimin_crust_mantle,nimax_crust_mantle,njmin_crust_mantle, &
-                      njmax_crust_mantle,nkmin_xi_crust_mantle,nkmin_eta_crust_mantle, &
-                      nspec2D_xmin_crust_mantle,nspec2D_xmax_crust_mantle, &
-                      nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle, &
-                      reclen_xmin_crust_mantle,reclen_xmax_crust_mantle, &
-                      reclen_ymin_crust_mantle,reclen_ymax_crust_mantle, &
-                      nimin_outer_core,nimax_outer_core,njmin_outer_core, &
-                      njmax_outer_core,nkmin_xi_outer_core,nkmin_eta_outer_core, &
-                      nspec2D_xmin_outer_core,nspec2D_xmax_outer_core, &
-                      nspec2D_ymin_outer_core,nspec2D_ymax_outer_core, &
-                      reclen_xmin_outer_core,reclen_xmax_outer_core, &
-                      reclen_ymin_outer_core,reclen_ymax_outer_core, &
-                      reclen_zmin,NSPEC2D_BOTTOM, &
-                      SIMULATION_TYPE,SAVE_FORWARD,LOCAL_PATH,NSTEP)
-
-  endif
-
-!
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!
-! source and receivers
-
-  ! allocate arrays for source
-  allocate(islice_selected_source(NSOURCES), &
-          ispec_selected_source(NSOURCES), &
-          Mxx(NSOURCES), &
-          Myy(NSOURCES), &
-          Mzz(NSOURCES), &
-          Mxy(NSOURCES), &
-          Mxz(NSOURCES), &
-          Myz(NSOURCES), &
-          xi_source(NSOURCES), &
-          eta_source(NSOURCES), &
-          gamma_source(NSOURCES), &
-          tshift_cmt(NSOURCES), &
-          hdur(NSOURCES), &
-          hdur_gaussian(NSOURCES), &
-          theta_source(NSOURCES), &
-          phi_source(NSOURCES), &
-          nu_source(NDIM,NDIM,NSOURCES),stat=ier)
-  if( ier /= 0 ) call exit_MPI(myrank,'error allocating source arrays')
-
-  ! allocate memory for receiver arrays
-  allocate(islice_selected_rec(nrec), &
-          ispec_selected_rec(nrec), &
-          xi_receiver(nrec), &
-          eta_receiver(nrec), &
-          gamma_receiver(nrec), &
-          station_name(nrec), &
-          network_name(nrec), &
-          stlat(nrec), &
-          stlon(nrec), &
-          stele(nrec), &
-          stbur(nrec), &
-          nu(NDIM,NDIM,nrec),stat=ier)
-  if( ier /= 0 ) call exit_MPI(myrank,'error allocating receiver arrays')
-
-  ! locates sources and receivers
-  call setup_sources_receivers(NSOURCES,myrank,ibool_crust_mantle, &
-                      xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
-                      xigll,yigll,zigll,TOPOGRAPHY, &
-                      sec,tshift_cmt,theta_source,phi_source, &
-                      NSTEP,DT,hdur,hdur_gaussian,t0,Mxx,Myy,Mzz,Mxy,Mxz,Myz, &
-                      islice_selected_source,ispec_selected_source, &
-                      xi_source,eta_source,gamma_source,nu_source, &
-                      rspl,espl,espl2,nspl,ibathy_topo,NEX_XI,PRINT_SOURCE_TIME_FUNCTION, &
-                      rec_filename,nrec,islice_selected_rec,ispec_selected_rec, &
-                      xi_receiver,eta_receiver,gamma_receiver,station_name,network_name, &
-                      stlat,stlon,stele,stbur,nu, &
-                      nrec_local,nadj_rec_local,nrec_simulation, &
-                      SIMULATION_TYPE,RECEIVERS_CAN_BE_BURIED,MOVIE_SURFACE,MOVIE_VOLUME, &
-                      HDUR_MOVIE,OUTPUT_FILES,LOCAL_PATH)
-
-  ! allocates source arrays
-  if (SIMULATION_TYPE == 1  .or. SIMULATION_TYPE == 3) then
-    allocate(sourcearrays(NDIM,NGLLX,NGLLY,NGLLZ,NSOURCES),stat=ier)
-    if( ier /= 0 ) call exit_MPI(myrank,'error allocating sourcearrays')
-
-    ! stores source arrays
-    call setup_sources_receivers_srcarr(NSOURCES,myrank, &
-                      ispec_selected_source,islice_selected_source, &
-                      xi_source,eta_source,gamma_source, &
-                      Mxx,Myy,Mzz,Mxy,Mxz,Myz, &
-                      xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-                      etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
-                      gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
-                      xigll,yigll,zigll,sourcearrays)
-  endif
-
-
-  if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then
-    NSTEP_SUB_ADJ = ceiling( dble(NSTEP)/dble(NTSTEP_BETWEEN_READ_ADJSRC) )
-    allocate(iadj_vec(NSTEP),stat=ier)
-    if( ier /= 0 ) call exit_MPI(myrank,'error allocating iadj_vec')
-
-    ! initializes iadj_vec
-    do it=1,NSTEP
-       iadj_vec(it) = NSTEP-it+1  ! default is for reversing entire record
-    enddo
-
-    if(nadj_rec_local > 0) then
-      ! allocate adjoint source arrays
-      allocate(adj_sourcearrays(NDIM,NGLLX,NGLLY,NGLLZ,nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC),stat=ier)
-      if( ier /= 0 ) call exit_MPI(myrank,'error allocating adjoint sourcearrays')
-      adj_sourcearrays(:,:,:,:,:,:) = 0._CUSTOM_REAL
-
-      ! allocate indexing arrays
-      allocate(iadjsrc(NSTEP_SUB_ADJ,2), &
-              iadjsrc_len(NSTEP_SUB_ADJ),stat=ier)
-      if( ier /= 0 ) call exit_MPI(myrank,'error allocating adjoint indexing arrays')
-      ! initializes iadjsrc, iadjsrc_len and iadj_vec
-      call setup_sources_receivers_adjindx(NSTEP,NSTEP_SUB_ADJ, &
-                      NTSTEP_BETWEEN_READ_ADJSRC, &
-                      iadjsrc,iadjsrc_len,iadj_vec)
-    endif
-  endif
-
-  ! allocates receiver interpolators
-  if (nrec_local > 0) then
-    ! allocate Lagrange interpolators for receivers
-    allocate(hxir_store(nrec_local,NGLLX), &
-            hetar_store(nrec_local,NGLLY), &
-            hgammar_store(nrec_local,NGLLZ),stat=ier)
-    if( ier /= 0 ) call exit_MPI(myrank,'error allocating receiver interpolators')
-    ! define local to global receiver numbering mapping
-    allocate(number_receiver_global(nrec_local),stat=ier)
-    if( ier /= 0 ) call exit_MPI(myrank,'error allocating global receiver numbering')
-    ! define and store Lagrange interpolators at all the receivers
-    if (SIMULATION_TYPE == 2) then
-      nadj_hprec_local = nrec_local
-    else
-      nadj_hprec_local = 1
-    endif
-    allocate(hpxir_store(nadj_hprec_local,NGLLX), &
-            hpetar_store(nadj_hprec_local,NGLLY), &
-            hpgammar_store(nadj_hprec_local,NGLLZ),stat=ier)
-    if( ier /= 0 ) call exit_MPI(myrank,'error allocating derivative interpolators')
-
-    ! stores interpolators for receiver positions
-    call setup_sources_receivers_intp(NSOURCES,myrank, &
-                      islice_selected_source, &
-                      xi_source,eta_source,gamma_source, &
-                      xigll,yigll,zigll, &
-                      SIMULATION_TYPE,nrec,nrec_local, &
-                      islice_selected_rec,number_receiver_global, &
-                      xi_receiver,eta_receiver,gamma_receiver, &
-                      hxir_store,hetar_store,hgammar_store, &
-                      nadj_hprec_local,hpxir_store,hpetar_store,hpgammar_store)
-
-    ! allocate seismogram array
-    if (SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then
-      allocate(seismograms(NDIM,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS),stat=ier)
-      if(ier /= 0) stop 'error while allocating seismograms'
-    else
-      allocate(seismograms(NDIM*NDIM,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS),stat=ier)
-      if(ier /= 0) stop 'error while allocating seismograms'
-      ! allocate Frechet derivatives array
-      allocate(moment_der(NDIM,NDIM,nrec_local),sloc_der(NDIM,nrec_local), &
-              stshift_der(nrec_local),shdur_der(nrec_local),stat=ier)
-      if( ier /= 0 ) call exit_MPI(myrank,'error allocating frechet derivatives arrays')
-
-      moment_der(:,:,:) = 0._CUSTOM_REAL
-      sloc_der(:,:) = 0._CUSTOM_REAL
-      stshift_der(:) = 0._CUSTOM_REAL
-      shdur_der(:) = 0._CUSTOM_REAL
-
-    endif
-    ! initialize seismograms
-    seismograms(:,:,:) = 0._CUSTOM_REAL
-    nit_written = 0
-  else
-    ! allocate dummy array since we need it to pass as argument e.g. in write_seismograms() routine
-    ! note: nrec_local is zero, fortran 90/95 should allow zero-sized array allocation...
-    allocate(seismograms(NDIM,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS),stat=ier)
-    if( ier /= 0) stop 'error while allocating zero seismograms'
-    allocate(number_receiver_global(nrec_local),stat=ier)
-    if( ier /= 0) stop 'error while allocating zero number_receiver_global'
-  endif
-
-  ! get information about event name and location for SAC seismograms
-
-  ! The following line is added for get_event_info subroutine.
-  ! Because the way NSOURCES_SAC was declared has been changed.
-  ! The rest of the changes in this program is just the updates of the subroutines that
-  ! I did changes, e.g., adding/removing parameters. by Ebru Bozdag
-  call get_event_info_parallel(myrank,yr_SAC,jda_SAC,ho_SAC,mi_SAC,sec_SAC,&
-                              event_name_SAC,t_cmt_SAC,t_shift_SAC, &
-                              elat_SAC,elon_SAC,depth_SAC,mb_SAC,cmt_lat_SAC,&
-                              cmt_lon_SAC,cmt_depth_SAC,cmt_hdur_SAC,NSOURCES)
-
-!
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!
-
-  ! user output
-  if(myrank == 0) then
-
-    write(IMAIN,*)
-    write(IMAIN,*) 'Reference radius of the Earth used is ',R_EARTH_KM,' km'
-    write(IMAIN,*)
-
-    write(IMAIN,*)
-    if(OCEANS_VAL) then
-      write(IMAIN,*) 'incorporating the oceans using equivalent load'
-    else
-      write(IMAIN,*) 'no oceans'
-    endif
-
-    write(IMAIN,*)
-    if(ELLIPTICITY_VAL) then
-      write(IMAIN,*) 'incorporating ellipticity'
-    else
-      write(IMAIN,*) 'no ellipticity'
-    endif
-
-    write(IMAIN,*)
-    if(TOPOGRAPHY) then
-      write(IMAIN,*) 'incorporating surface topography'
-    else
-      write(IMAIN,*) 'no surface topography'
-    endif
-
-    write(IMAIN,*)
-    if(GRAVITY_VAL) then
-      write(IMAIN,*) 'incorporating self-gravitation (Cowling approximation)'
-    else
-      write(IMAIN,*) 'no self-gravitation'
-    endif
-
-    write(IMAIN,*)
-    if(ROTATION_VAL) then
-      write(IMAIN,*) 'incorporating rotation'
-    else
-      write(IMAIN,*) 'no rotation'
-    endif
-
-    write(IMAIN,*)
-    if(ATTENUATION_VAL) then
-      write(IMAIN,*) 'incorporating attenuation using ',N_SLS,' standard linear solids'
-
-      if(ATTENUATION_3D_VAL) write(IMAIN,*) 'using 3D attenuation'
-
-      if(USE_ATTENUATION_MIMIC ) write(IMAIN,*) 'mimicking effects on velocity only'
-    else
-      write(IMAIN,*) 'no attenuation'
-    endif
-
-    write(IMAIN,*)
-    write(IMAIN,*)
-    write(IMAIN,*)
-
-  endif
-
-  ! the mass matrix needs to be assembled with MPI here once and for all
-  call prepare_timerun_rmass(myrank,rmass_ocean_load,rmass_crust_mantle, &
-                      rmass_outer_core,rmass_inner_core, &
-                      iproc_xi,iproc_eta,ichunk,addressing, &
-                      iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle, &
-                      iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-                      npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-                      iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-                      iboolleft_xi_outer_core,iboolright_xi_outer_core, &
-                      iboolleft_eta_outer_core,iboolright_eta_outer_core, &
-                      npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
-                      iboolfaces_outer_core,iboolcorner_outer_core, &
-                      iboolleft_xi_inner_core,iboolright_xi_inner_core, &
-                      iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-                      npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-                      iboolfaces_inner_core,iboolcorner_inner_core, &
-                      iprocfrom_faces,iprocto_faces,imsg_type, &
-                      iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-                      buffer_send_faces,buffer_received_faces, &
-                      buffer_send_chunkcorn_scalar,buffer_recv_chunkcorn_scalar, &
-                      NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
-                      NGLOB1D_RADIAL,NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX,NGLOB2DMAX_XY,npoin2D_max_all_CM_IC)
-
-  ! mass matrix including central cube
-  if(INCLUDE_CENTRAL_CUBE) then
-
-    if(myrank == 0) write(IMAIN,*) 'including central cube'
-
-    ! compute number of messages to expect in cube as well as their size
-    call comp_central_cube_buffer_size(iproc_xi,iproc_eta,ichunk, &
-                NPROC_XI_VAL,NPROC_ETA_VAL,NSPEC2D_BOTTOM(IREGION_INNER_CORE), &
-                nb_msgs_theor_in_cube,npoin2D_cube_from_slices)
-
-    ! this value is used for dynamic memory allocation, therefore make sure it is never zero
-    if(nb_msgs_theor_in_cube > 0) then
-      non_zero_nb_msgs_theor_in_cube = nb_msgs_theor_in_cube
-    else
-      non_zero_nb_msgs_theor_in_cube = 1
-    endif
-
-    ! allocate buffers for cube and slices
-    allocate(sender_from_slices_to_cube(non_zero_nb_msgs_theor_in_cube), &
-            buffer_all_cube_from_slices(non_zero_nb_msgs_theor_in_cube,npoin2D_cube_from_slices,NDIM), &
-            b_buffer_all_cube_from_slices(non_zero_nb_msgs_theor_in_cube,npoin2D_cube_from_slices,NDIM), &
-            buffer_slices(npoin2D_cube_from_slices,NDIM), &
-            b_buffer_slices(npoin2D_cube_from_slices,NDIM), &
-            buffer_slices2(npoin2D_cube_from_slices,NDIM), &
-            ibool_central_cube(non_zero_nb_msgs_theor_in_cube,npoin2D_cube_from_slices),stat=ier)
-    if( ier /= 0 ) call exit_MPI(myrank,'error allocating cube buffers')
-
-    ! handles the communications with the central cube if it was included in the mesh
-    call prepare_timerun_centralcube(myrank,rmass_inner_core, &
-                      iproc_xi,iproc_eta,ichunk, &
-                      NSPEC2DMAX_XMIN_XMAX,NSPEC2DMAX_YMIN_YMAX,NSPEC2D_BOTTOM, &
-                      addressing,ibool_inner_core,idoubling_inner_core, &
-                      xstore_inner_core,ystore_inner_core,zstore_inner_core, &
-                      nspec2D_xmin_inner_core,nspec2D_xmax_inner_core, &
-                      nspec2D_ymin_inner_core,nspec2D_ymax_inner_core, &
-                      ibelm_xmin_inner_core,ibelm_xmax_inner_core, &
-                      ibelm_ymin_inner_core,ibelm_ymax_inner_core,ibelm_bottom_inner_core, &
-                      nb_msgs_theor_in_cube,non_zero_nb_msgs_theor_in_cube, &
-                      npoin2D_cube_from_slices,receiver_cube_from_slices, &
-                      sender_from_slices_to_cube,ibool_central_cube, &
-                      buffer_slices,buffer_slices2,buffer_all_cube_from_slices)
-
-    call fix_non_blocking_central_cube(is_on_a_slice_edge_inner_core, &
-         ibool_inner_core,NSPEC_INNER_CORE,NGLOB_INNER_CORE,nb_msgs_theor_in_cube,ibelm_bottom_inner_core, &
-         idoubling_inner_core,npoin2D_cube_from_slices,ibool_central_cube, &
-         NSPEC2D_BOTTOM(IREGION_INNER_CORE),ichunk)
-
-
-  else
-
-    ! allocate fictitious buffers for cube and slices with a dummy size
-    ! just to be able to use them as arguments in subroutine calls
-    allocate(sender_from_slices_to_cube(1), &
-            buffer_all_cube_from_slices(1,1,1), &
-            b_buffer_all_cube_from_slices(1,1,1), &
-            buffer_slices(1,1), &
-            b_buffer_slices(1,1), &
-            buffer_slices2(1,1), &
-            ibool_central_cube(1,1),stat=ier)
-    if( ier /= 0 ) call exit_MPI(myrank,'error allocating dummy buffers')
-
-  endif
-
-  ! check that all the mass matrices are positive
-  if(OCEANS_VAL) then
-    if(minval(rmass_ocean_load) <= 0.) call exit_MPI(myrank,'negative mass matrix term for the oceans')
-  endif
-  if(minval(rmass_crust_mantle) <= 0.) call exit_MPI(myrank,'negative mass matrix term for the crust_mantle')
-  if(minval(rmass_inner_core) <= 0.) call exit_MPI(myrank,'negative mass matrix term for the inner core')
-  if(minval(rmass_outer_core) <= 0.) call exit_MPI(myrank,'negative mass matrix term for the outer core')
-
-  ! for efficiency, invert final mass matrix once and for all on each slice
-  if(OCEANS_VAL) rmass_ocean_load = 1._CUSTOM_REAL / rmass_ocean_load
-  rmass_crust_mantle = 1._CUSTOM_REAL / rmass_crust_mantle
-  rmass_outer_core = 1._CUSTOM_REAL / rmass_outer_core
-  rmass_inner_core = 1._CUSTOM_REAL / rmass_inner_core
-
-
-  ! change x, y, z to r, theta and phi once and for all
-  ! IMPROVE dangerous: old name kept (xstore ystore zstore) for new values
-
-  ! convert in the crust and mantle
-  do i = 1,NGLOB_CRUST_MANTLE
-    call xyz_2_rthetaphi(xstore_crust_mantle(i), &
-                        ystore_crust_mantle(i), &
-                        zstore_crust_mantle(i),rval,thetaval,phival)
-    xstore_crust_mantle(i) = rval
-    ystore_crust_mantle(i) = thetaval
-    zstore_crust_mantle(i) = phival
-  enddo
-
-  ! convert in the outer core
-  do i = 1,NGLOB_OUTER_CORE
-    call xyz_2_rthetaphi(xstore_outer_core(i), &
-                        ystore_outer_core(i), &
-                        zstore_outer_core(i),rval,thetaval,phival)
-    xstore_outer_core(i) = rval
-    ystore_outer_core(i) = thetaval
-    zstore_outer_core(i) = phival
-  enddo
-
-  ! convert in the inner core
-  do i = 1,NGLOB_INNER_CORE
-    call xyz_2_rthetaphi(xstore_inner_core(i), &
-                        ystore_inner_core(i), &
-                        zstore_inner_core(i),rval,thetaval,phival)
-    xstore_inner_core(i) = rval
-    ystore_inner_core(i) = thetaval
-    zstore_inner_core(i) = phival
-  enddo
-
-  ! allocate files to save movies
-  if(MOVIE_SURFACE .or. NOISE_TOMOGRAPHY /=0) then    ! for noise tomography, store_val_x/y/z/ux/uy/uz needed for 'surface movie'
-    if(MOVIE_COARSE .and. NOISE_TOMOGRAPHY ==0) then  ! only output corners !for noise tomography, must NOT be coarse
-       nmovie_points = 2 * 2 * NSPEC2D_TOP(IREGION_CRUST_MANTLE)
-       if(NGLLX /= NGLLY) &
-        call exit_MPI(myrank,'MOVIE_COARSE together with MOVIE_SURFACE requires NGLLX=NGLLY')
-       NIT = NGLLX - 1
-    else
-       nmovie_points = NGLLX * NGLLY * NSPEC2D_TOP(IREGION_CRUST_MANTLE)
-       NIT = 1
-    endif
-    allocate(store_val_x(nmovie_points), &
-            store_val_y(nmovie_points), &
-            store_val_z(nmovie_points), &
-            store_val_ux(nmovie_points), &
-            store_val_uy(nmovie_points), &
-            store_val_uz(nmovie_points),stat=ier)
-    if( ier /= 0 ) call exit_MPI(myrank,'error allocating movie surface arrays')
-
-    if (MOVIE_SURFACE) then  ! those arrays are not neccessary for noise tomography, so only allocate them in MOVIE_SURFACE case
-       allocate(store_val_x_all(nmovie_points,0:NPROCTOT_VAL-1), &
-              store_val_y_all(nmovie_points,0:NPROCTOT_VAL-1), &
-              store_val_z_all(nmovie_points,0:NPROCTOT_VAL-1), &
-              store_val_ux_all(nmovie_points,0:NPROCTOT_VAL-1), &
-              store_val_uy_all(nmovie_points,0:NPROCTOT_VAL-1), &
-              store_val_uz_all(nmovie_points,0:NPROCTOT_VAL-1),stat=ier)
-       if( ier /= 0 ) call exit_MPI(myrank,'error allocating movie surface all arrays')
-    endif
-    if(myrank == 0) then
-      write(IMAIN,*)
-      write(IMAIN,*) 'Movie surface:'
-      write(IMAIN,*) '  Writing to moviedata*** files in output directory'
-      if(MOVIE_VOLUME_TYPE == 5) then
-        write(IMAIN,*) '  movie output: displacement'
-      else
-        write(IMAIN,*) '  movie output: velocity'
-      endif
-      write(IMAIN,*) '  time steps every: ',NTSTEP_BETWEEN_FRAMES
-    endif
-  endif
-
-
-  ! output point and element information for 3D movies
-  if(MOVIE_VOLUME) then
-    ! the following has to be true for the the array dimensions of eps to match with those of xstore etc..
-    ! note that epsilondev and eps_trace_over_3 don't have the same dimensions.. could cause trouble
-    if (NSPEC_CRUST_MANTLE_STR_OR_ATT /= NSPEC_CRUST_MANTLE) &
-      stop 'NSPEC_CRUST_MANTLE_STRAINS_ATT /= NSPEC_CRUST_MANTLE'
-    if (NSPEC_CRUST_MANTLE_STRAIN_ONLY /= NSPEC_CRUST_MANTLE) &
-      stop 'NSPEC_CRUST_MANTLE_STRAIN_ONLY /= NSPEC_CRUST_MANTLE'
-
-    write(prname,'(a,i6.6,a)') trim(LOCAL_PATH)//'/'//'proc',myrank,'_'
-    call count_points_movie_volume(prname,ibool_crust_mantle, xstore_crust_mantle,ystore_crust_mantle, &
-                zstore_crust_mantle,MOVIE_TOP,MOVIE_BOTTOM,MOVIE_WEST,MOVIE_EAST,MOVIE_NORTH,MOVIE_SOUTH, &
-                MOVIE_COARSE,npoints_3dmovie,nspecel_3dmovie,num_ibool_3dmovie,mask_ibool,mask_3dmovie)
-
-
-    allocate(nu_3dmovie(3,3,npoints_3dmovie),stat=ier)
-    if( ier /= 0 ) call exit_MPI(myrank,'error allocating nu for 3d movie')
-
-    call write_movie_volume_mesh(npoints_3dmovie,prname,ibool_crust_mantle,xstore_crust_mantle, &
-                           ystore_crust_mantle,zstore_crust_mantle, muvstore_crust_mantle_3dmovie, &
-                           mask_3dmovie,mask_ibool,num_ibool_3dmovie,nu_3dmovie,MOVIE_COARSE)
-
-    if(myrank == 0) then
-      write(IMAIN,*)
-      write(IMAIN,*) 'Movie volume:'
-      write(IMAIN,*) '  Writing to movie3D*** files on local disk databases directory'
-      if(MOVIE_VOLUME_TYPE == 1) then
-        write(IMAIN,*) '  movie output: strain'
-      else if(MOVIE_VOLUME_TYPE == 2) then
-        write(IMAIN,*) '  movie output: time integral of strain'
-      else if(MOVIE_VOLUME_TYPE == 3) then
-        write(IMAIN,*) '  movie output: potency or integral of strain'
-      else if(MOVIE_VOLUME_TYPE == 4) then
-        write(IMAIN,*) '  movie output: divergence and curl'
-      else if(MOVIE_VOLUME_TYPE == 5) then
-        write(IMAIN,*) '  movie output: displacement'
-      else if(MOVIE_VOLUME_TYPE == 6) then
-        write(IMAIN,*) '  movie output: velocity'
-      endif
-      write(IMAIN,*) '  depth(T,B):',MOVIE_TOP,MOVIE_BOTTOM
-      write(IMAIN,*) '  lon(W,E)  :',MOVIE_WEST,MOVIE_EAST
-      write(IMAIN,*) '  lat(S,N)  :',MOVIE_SOUTH,MOVIE_NORTH
-      write(IMAIN,*) '  Starting at time step:',MOVIE_START, 'ending at:',MOVIE_STOP,'every: ',NTSTEP_BETWEEN_FRAMES
-    endif
-
-    if( MOVIE_VOLUME_TYPE < 1 .or. MOVIE_VOLUME_TYPE > 6) &
-        call exit_MPI(myrank, 'MOVIE_VOLUME_TYPE has to be 1,2,3,4,5 or 6')
-
-  endif ! MOVIE_VOLUME
-
-  ! sets up time increments and rotation constants
-  call prepare_timerun_constants(myrank,NSTEP, &
-                    DT,t0,scale_t,scale_t_inv,scale_displ,scale_veloc, &
-                    deltat,deltatover2,deltatsqover2, &
-                    b_deltat,b_deltatover2,b_deltatsqover2, &
-                    two_omega_earth,A_array_rotation,B_array_rotation, &
-                    b_two_omega_earth, SIMULATION_TYPE)
-
-  ! precomputes gravity factors
-  call prepare_timerun_gravity(myrank, &
-                    minus_g_cmb,minus_g_icb, &
-                    minus_gravity_table,minus_deriv_gravity_table, &
-                    density_table,d_ln_density_dr_table,minus_rho_g_over_kappa_fluid, &
-                    ONE_CRUST,RICB,RCMB,RTOPDDOUBLEPRIME, &
-                    R600,R670,R220,R771,R400,R80,RMOHO,RMIDDLE_CRUST,ROCEAN)
-
-  ! precomputes attenuation factors
-  if(ATTENUATION_VAL) then
-    call prepare_timerun_attenuation(myrank, &
-                factor_scale_crust_mantle,one_minus_sum_beta_crust_mantle,factor_common_crust_mantle, &
-                factor_scale_inner_core,one_minus_sum_beta_inner_core,factor_common_inner_core, &
-                c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle, &
-                c22store_crust_mantle,c23store_crust_mantle, &
-                c33store_crust_mantle,c44store_crust_mantle, &
-                c55store_crust_mantle,c66store_crust_mantle, &
-                muvstore_crust_mantle,muhstore_crust_mantle,ispec_is_tiso_crust_mantle, &
-                muvstore_inner_core, &
-                SIMULATION_TYPE,MOVIE_VOLUME,muvstore_crust_mantle_3dmovie, &
-                c11store_inner_core,c12store_inner_core,c13store_inner_core, &
-                c33store_inner_core,c44store_inner_core, &
-                alphaval,betaval,gammaval,b_alphaval,b_betaval,b_gammaval, &
-                deltat,b_deltat,LOCAL_PATH)
-  endif
-
-  if(myrank == 0) then
-
-  write(IMAIN,*) 'for overlapping of communications with calculations:'
-  write(IMAIN,*)
-
-  percentage_edge = 100.*count(is_on_a_slice_edge_crust_mantle(:))/real(NSPEC_CRUST_MANTLE)
-  write(IMAIN,*) 'percentage of edge elements in crust/mantle ',percentage_edge,'%'
-  write(IMAIN,*) 'percentage of volume elements in crust/mantle ',100. - percentage_edge,'%'
-  write(IMAIN,*)
-
-  percentage_edge = 100.*count(is_on_a_slice_edge_outer_core(:))/real(NSPEC_OUTER_CORE)
-  write(IMAIN,*) 'percentage of edge elements in outer core ',percentage_edge,'%'
-  write(IMAIN,*) 'percentage of volume elements in outer core ',100. - percentage_edge,'%'
-  write(IMAIN,*)
-
-  percentage_edge = 100.*count(is_on_a_slice_edge_inner_core(:))/real(NSPEC_INNER_CORE)
-  write(IMAIN,*) 'percentage of edge elements in inner core ',percentage_edge,'%'
-  write(IMAIN,*) 'percentage of volume elements in inner core ',100. - percentage_edge,'%'
-  write(IMAIN,*)
-
-  endif
-
-  if(.not. USE_NONBLOCKING_COMMS) then
-    is_on_a_slice_edge_crust_mantle(:) = .true.
-    is_on_a_slice_edge_outer_core(:) = .true.
-    is_on_a_slice_edge_inner_core(:) = .true.
-  endif
-
-  ! initialize arrays to zero
-  displ_crust_mantle(:,:) = 0._CUSTOM_REAL
-  veloc_crust_mantle(:,:) = 0._CUSTOM_REAL
-  accel_crust_mantle(:,:) = 0._CUSTOM_REAL
-
-  displ_outer_core(:) = 0._CUSTOM_REAL
-  veloc_outer_core(:) = 0._CUSTOM_REAL
-  accel_outer_core(:) = 0._CUSTOM_REAL
-
-  displ_inner_core(:,:) = 0._CUSTOM_REAL
-  veloc_inner_core(:,:) = 0._CUSTOM_REAL
-  accel_inner_core(:,:) = 0._CUSTOM_REAL
-
-  ! put negligible initial value to avoid very slow underflow trapping
-  if(FIX_UNDERFLOW_PROBLEM) then
-    displ_crust_mantle(:,:) = VERYSMALLVAL
-    displ_outer_core(:) = VERYSMALLVAL
-    displ_inner_core(:,:) = VERYSMALLVAL
-  endif
-
-! if doing benchmark runs to measure scaling of the code,
-! set the initial field to 1 to make sure gradual underflow trapping does not slow down the code
-  if (DO_BENCHMARK_RUN_ONLY .and. SET_INITIAL_FIELD_TO_1_IN_BENCH) then
-    displ_crust_mantle(:,:) = 1._CUSTOM_REAL
-    veloc_crust_mantle(:,:) = 1._CUSTOM_REAL
-    accel_crust_mantle(:,:) = 1._CUSTOM_REAL
-
-    displ_outer_core(:) = 1._CUSTOM_REAL
-    veloc_outer_core(:) = 1._CUSTOM_REAL
-    accel_outer_core(:) = 1._CUSTOM_REAL
-
-    displ_inner_core(:,:) = 1._CUSTOM_REAL
-    veloc_inner_core(:,:) = 1._CUSTOM_REAL
-    accel_inner_core(:,:) = 1._CUSTOM_REAL
-  endif
-
-  if (SIMULATION_TYPE == 3) then
-    rho_kl_crust_mantle(:,:,:,:) = 0._CUSTOM_REAL
-    beta_kl_crust_mantle(:,:,:,:) = 0._CUSTOM_REAL
-    alpha_kl_crust_mantle(:,:,:,:) = 0._CUSTOM_REAL
-    if (NOISE_TOMOGRAPHY == 3) Sigma_kl_crust_mantle(:,:,:,:) = 0._CUSTOM_REAL
-
-    ! approximate hessian
-    if( APPROXIMATE_HESS_KL ) then
-      allocate( hess_kl_crust_mantle(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ADJOINT),stat=ier)
-      if( ier /= 0 ) call exit_MPI(myrank,'error allocating hessian')
-      hess_kl_crust_mantle(:,:,:,:) = 0._CUSTOM_REAL
-    endif
-
-    ! For anisotropic kernels (in crust_mantle only)
-    cijkl_kl_crust_mantle(:,:,:,:,:) = 0._CUSTOM_REAL
-
-    rho_kl_outer_core(:,:,:,:) = 0._CUSTOM_REAL
-    alpha_kl_outer_core(:,:,:,:) = 0._CUSTOM_REAL
-
-    rho_kl_inner_core(:,:,:,:) = 0._CUSTOM_REAL
-    beta_kl_inner_core(:,:,:,:) = 0._CUSTOM_REAL
-    alpha_kl_inner_core(:,:,:,:) = 0._CUSTOM_REAL
-
-    div_displ_outer_core(:,:,:,:) = 0._CUSTOM_REAL
-    b_div_displ_outer_core(:,:,:,:) = 0._CUSTOM_REAL
-
-    ! deviatoric kernel check
-    if( deviatoric_outercore) then
-      nspec_beta_kl_outer_core = NSPEC_OUTER_CORE_ADJOINT
-    else
-      nspec_beta_kl_outer_core = 1
-    endif
-    allocate(beta_kl_outer_core(NGLLX,NGLLY,NGLLZ,nspec_beta_kl_outer_core),stat=ier)
-    if( ier /= 0 ) call exit_MPI(myrank,'error allocating beta outercore')
-    beta_kl_outer_core(:,:,:,:) = 0._CUSTOM_REAL
-  endif
-
-  ! initialize to be on the safe side for adjoint runs SIMULATION_TYPE==2
-  eps_trace_over_3_crust_mantle(:,:,:,:) = 0._CUSTOM_REAL
-  epsilondev_crust_mantle(:,:,:,:,:) = 0._CUSTOM_REAL
-  eps_trace_over_3_inner_core(:,:,:,:) = 0._CUSTOM_REAL
-  epsilondev_inner_core(:,:,:,:,:) = 0._CUSTOM_REAL
-  if(FIX_UNDERFLOW_PROBLEM) then
-    eps_trace_over_3_crust_mantle(:,:,:,:) = VERYSMALLVAL
-    epsilondev_crust_mantle(:,:,:,:,:) = VERYSMALLVAL
-    eps_trace_over_3_inner_core(:,:,:,:) = VERYSMALLVAL
-    epsilondev_inner_core(:,:,:,:,:) = VERYSMALLVAL
-  endif
-
-  if (COMPUTE_AND_STORE_STRAIN) then
-    if(MOVIE_VOLUME .and. (MOVIE_VOLUME_TYPE == 2 .or. MOVIE_VOLUME_TYPE == 3)) then
-      Iepsilondev_crust_mantle(:,:,:,:,:) = 0._CUSTOM_REAL
-      Ieps_trace_over_3_crust_mantle(:,:,:,:)=0._CUSTOM_REAL
-    endif
-  endif
-
-  ! clear memory variables if attenuation
-  if(ATTENUATION_VAL) then
-    R_memory_crust_mantle(:,:,:,:,:,:) = 0._CUSTOM_REAL
-    R_memory_inner_core(:,:,:,:,:,:) = 0._CUSTOM_REAL
-    if(FIX_UNDERFLOW_PROBLEM) then
-      R_memory_crust_mantle(:,:,:,:,:,:) = VERYSMALLVAL
-      R_memory_inner_core(:,:,:,:,:,:) = VERYSMALLVAL
-    endif
-  endif
-
-  ! reads files back from local disk or MT tape system if restart file
-  ! note: for SIMULATION_TYPE 3 simulations, the stored wavefields
-  !          will be read in the time loop after the Newmark time scheme update.
-  !          this makes indexing and timing easier to match with adjoint wavefields indexing.
-  call read_forward_arrays_startrun(myrank,NSTEP, &
-                    SIMULATION_TYPE,NUMBER_OF_RUNS,NUMBER_OF_THIS_RUN, &
-                    it_begin,it_end, &
-                    displ_crust_mantle,veloc_crust_mantle,accel_crust_mantle, &
-                    displ_inner_core,veloc_inner_core,accel_inner_core, &
-                    displ_outer_core,veloc_outer_core,accel_outer_core, &
-                    R_memory_crust_mantle,R_memory_inner_core, &
-                    epsilondev_crust_mantle,epsilondev_inner_core, &
-                    A_array_rotation,B_array_rotation, &
-                    b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle, &
-                    b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core, &
-                    b_displ_outer_core,b_veloc_outer_core,b_accel_outer_core, &
-                    b_R_memory_crust_mantle,b_R_memory_inner_core, &
-                    b_epsilondev_crust_mantle,b_epsilondev_inner_core, &
-                    b_A_array_rotation,b_B_array_rotation,LOCAL_PATH)
-
-!<YANGL
-  ! NOISE TOMOGRAPHY
-  if ( NOISE_TOMOGRAPHY /= 0 ) then
-    allocate(noise_sourcearray(NDIM,NGLLX,NGLLY,NGLLZ,NSTEP), &
-            normal_x_noise(nmovie_points), &
-            normal_y_noise(nmovie_points), &
-            normal_z_noise(nmovie_points), &
-            mask_noise(nmovie_points), &
-            noise_surface_movie(NDIM,NGLLX,NGLLY,NSPEC2D_TOP(IREGION_CRUST_MANTLE)),stat=ier)
-    if( ier /= 0 ) call exit_MPI(myrank,'error allocating noise arrays')
-
-    noise_sourcearray(:,:,:,:,:) = 0._CUSTOM_REAL
-    normal_x_noise(:)            = 0._CUSTOM_REAL
-    normal_y_noise(:)            = 0._CUSTOM_REAL
-    normal_z_noise(:)            = 0._CUSTOM_REAL
-    mask_noise(:)                = 0._CUSTOM_REAL
-    noise_surface_movie(:,:,:,:) = 0._CUSTOM_REAL
-
-    call read_parameters_noise(myrank,nrec,NSTEP,nmovie_points, &
-                              islice_selected_rec,xi_receiver,eta_receiver,gamma_receiver,nu, &
-                              noise_sourcearray,xigll,yigll,zigll,NSPEC2D_TOP(IREGION_CRUST_MANTLE), &
-                              NIT, ibool_crust_mantle, ibelm_top_crust_mantle, &
-                              xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
-                              irec_master_noise,normal_x_noise,normal_y_noise,normal_z_noise,mask_noise)
-
-    call check_parameters_noise(myrank,NOISE_TOMOGRAPHY,SIMULATION_TYPE,SAVE_FORWARD, &
-                              NUMBER_OF_RUNS, NUMBER_OF_THIS_RUN,ROTATE_SEISMOGRAMS_RT, &
-                              SAVE_ALL_SEISMOS_IN_ONE_FILE, USE_BINARY_FOR_LARGE_FILE, &
-                              MOVIE_COARSE,LOCAL_PATH,NSPEC2D_TOP(IREGION_CRUST_MANTLE),NSTEP)
-  endif
-!>YANGL
-
-!
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!
-
-!
-!   s t a r t   t i m e   i t e r a t i o n s
-!
-
-! synchronize all processes to make sure everybody is ready to start time loop
-  call MPI_BARRIER(MPI_COMM_WORLD,ier)
-  if(myrank == 0) write(IMAIN,*) 'All processes are synchronized before time loop'
-
-  if(myrank == 0) then
-    write(IMAIN,*)
-    write(IMAIN,*) 'Starting time iteration loop...'
-    write(IMAIN,*)
-  endif
-
-! create an empty file to monitor the start of the simulation
-  if(myrank == 0) then
-    open(unit=IOUT,file=trim(OUTPUT_FILES)//'/starttimeloop.txt',status='unknown',action='write')
-    write(IOUT,*) 'hello, starting time loop'
-    close(IOUT)
-  endif
-
-! initialize variables for writing seismograms
-  seismo_offset = it_begin-1
-  seismo_current = 0
-
-#ifdef _HANDOPT
-  imodulo_NGLOB_CRUST_MANTLE = mod(NGLOB_CRUST_MANTLE,3)
-  imodulo_NGLOB_CRUST_MANTLE4 = mod(NGLOB_CRUST_MANTLE,4)
-  imodulo_NGLOB_INNER_CORE = mod(NGLOB_INNER_CORE,3)
-#endif
-
-! get MPI starting time
-  time_start = MPI_WTIME()
-
-! *********************************************************
-! ************* MAIN LOOP OVER THE TIME STEPS *************
-! *********************************************************
-
-  do it = it_begin,it_end
-
-    ! update position in seismograms
-    seismo_current = seismo_current + 1
-
-    ! Newmark time scheme update
-#ifdef _HANDOPT
-! way 2:
-! One common technique in computational science to help enhance pipelining is loop unrolling
-!
-! we're accessing NDIM=3 components at each line,
-! that is, for an iteration, the register must contain
-! NDIM * displ_ + NDIM * veloc_ + NDIM * accel + deltat + deltatsq..
-! in most cases a real (CUSTOM_REAL) value will have 4 bytes,
-! assuming a default cache size of about 128 bytes, we unroll here in steps of 3, thus 29 reals or 116 bytes,
-! rather than with steps of 4
-    ! mantle
-    if(imodulo_NGLOB_CRUST_MANTLE >= 1) then
-      do i = 1,imodulo_NGLOB_CRUST_MANTLE
-        displ_crust_mantle(:,i) = displ_crust_mantle(:,i) &
-          + deltat*veloc_crust_mantle(:,i) + deltatsqover2*accel_crust_mantle(:,i)
-
-        veloc_crust_mantle(:,i) = veloc_crust_mantle(:,i) &
-          + deltatover2*accel_crust_mantle(:,i)
-
-        accel_crust_mantle(:,i) = 0._CUSTOM_REAL
-      enddo
-    endif
-    do i = imodulo_NGLOB_CRUST_MANTLE+1,NGLOB_CRUST_MANTLE, 3 ! in steps of 3
-      displ_crust_mantle(:,i) = displ_crust_mantle(:,i) &
-        + deltat*veloc_crust_mantle(:,i) + deltatsqover2*accel_crust_mantle(:,i)
-      displ_crust_mantle(:,i+1) = displ_crust_mantle(:,i+1) &
-        + deltat*veloc_crust_mantle(:,i+1) + deltatsqover2*accel_crust_mantle(:,i+1)
-      displ_crust_mantle(:,i+2) = displ_crust_mantle(:,i+2) &
-        + deltat*veloc_crust_mantle(:,i+2) + deltatsqover2*accel_crust_mantle(:,i+2)
-
-
-      veloc_crust_mantle(:,i) = veloc_crust_mantle(:,i) &
-        + deltatover2*accel_crust_mantle(:,i)
-      veloc_crust_mantle(:,i+1) = veloc_crust_mantle(:,i+1) &
-        + deltatover2*accel_crust_mantle(:,i+1)
-      veloc_crust_mantle(:,i+2) = veloc_crust_mantle(:,i+2) &
-        + deltatover2*accel_crust_mantle(:,i+2)
-
-      ! set acceleration to zero
-      ! note: we do initialize acceleration in this loop since it is read already into the cache,
-      !           otherwise it would have to be read in again for this explicitly,
-      !           which would make this step more expensive
-      accel_crust_mantle(:,i) = 0._CUSTOM_REAL
-      accel_crust_mantle(:,i+1) = 0._CUSTOM_REAL
-      accel_crust_mantle(:,i+2) = 0._CUSTOM_REAL
-    enddo
-
-    ! outer core
-    do i=1,NGLOB_OUTER_CORE
-      displ_outer_core(i) = displ_outer_core(i) &
-        + deltat*veloc_outer_core(i) + deltatsqover2*accel_outer_core(i)
-
-      veloc_outer_core(i) = veloc_outer_core(i) &
-        + deltatover2*accel_outer_core(i)
-
-      accel_outer_core(i) = 0._CUSTOM_REAL
-    enddo
-
-    ! inner core
-    if(imodulo_NGLOB_INNER_CORE >= 1) then
-      do i = 1,imodulo_NGLOB_INNER_CORE
-        displ_inner_core(:,i) = displ_inner_core(:,i) &
-          + deltat*veloc_inner_core(:,i) + deltatsqover2*accel_inner_core(:,i)
-
-        veloc_inner_core(:,i) = veloc_inner_core(:,i) &
-          + deltatover2*accel_inner_core(:,i)
-
-        accel_inner_core(:,i) = 0._CUSTOM_REAL
-      enddo
-    endif
-    do i = imodulo_NGLOB_INNER_CORE+1,NGLOB_INNER_CORE, 3 ! in steps of 3
-      displ_inner_core(:,i) = displ_inner_core(:,i) &
-        + deltat*veloc_inner_core(:,i) + deltatsqover2*accel_inner_core(:,i)
-      displ_inner_core(:,i+1) = displ_inner_core(:,i+1) &
-        + deltat*veloc_inner_core(:,i+1) + deltatsqover2*accel_inner_core(:,i+1)
-      displ_inner_core(:,i+2) = displ_inner_core(:,i+2) &
-        + deltat*veloc_inner_core(:,i+2) + deltatsqover2*accel_inner_core(:,i+2)
-
-
-      veloc_inner_core(:,i) = veloc_inner_core(:,i) &
-        + deltatover2*accel_inner_core(:,i)
-      veloc_inner_core(:,i+1) = veloc_inner_core(:,i+1) &
-        + deltatover2*accel_inner_core(:,i+1)
-      veloc_inner_core(:,i+2) = veloc_inner_core(:,i+2) &
-        + deltatover2*accel_inner_core(:,i+2)
-
-      accel_inner_core(:,i) = 0._CUSTOM_REAL
-      accel_inner_core(:,i+1) = 0._CUSTOM_REAL
-      accel_inner_core(:,i+2) = 0._CUSTOM_REAL
-    enddo
-
-#else
-! way 1:
-    ! mantle
-    do i=1,NGLOB_CRUST_MANTLE
-      displ_crust_mantle(:,i) = displ_crust_mantle(:,i) &
-        + deltat*veloc_crust_mantle(:,i) + deltatsqover2*accel_crust_mantle(:,i)
-      veloc_crust_mantle(:,i) = veloc_crust_mantle(:,i) &
-        + deltatover2*accel_crust_mantle(:,i)
-      accel_crust_mantle(:,i) = 0._CUSTOM_REAL
-    enddo
-    ! outer core
-    do i=1,NGLOB_OUTER_CORE
-      displ_outer_core(i) = displ_outer_core(i) &
-        + deltat*veloc_outer_core(i) + deltatsqover2*accel_outer_core(i)
-      veloc_outer_core(i) = veloc_outer_core(i) &
-        + deltatover2*accel_outer_core(i)
-      accel_outer_core(i) = 0._CUSTOM_REAL
-    enddo
-    ! inner core
-    do i=1,NGLOB_INNER_CORE
-      displ_inner_core(:,i) = displ_inner_core(:,i) &
-        + deltat*veloc_inner_core(:,i) + deltatsqover2*accel_inner_core(:,i)
-      veloc_inner_core(:,i) = veloc_inner_core(:,i) &
-        + deltatover2*accel_inner_core(:,i)
-      accel_inner_core(:,i) = 0._CUSTOM_REAL
-    enddo
-#endif
-
-
-
-
-    ! backward field
-    if (SIMULATION_TYPE == 3) then
-
-#ifdef _HANDOPT
-! way 2:
-      ! mantle
-      if(imodulo_NGLOB_CRUST_MANTLE >= 1) then
-        do i=1,imodulo_NGLOB_CRUST_MANTLE
-          b_displ_crust_mantle(:,i) = b_displ_crust_mantle(:,i) &
-            + b_deltat*b_veloc_crust_mantle(:,i) + b_deltatsqover2*b_accel_crust_mantle(:,i)
-          b_veloc_crust_mantle(:,i) = b_veloc_crust_mantle(:,i) &
-            + b_deltatover2*b_accel_crust_mantle(:,i)
-          b_accel_crust_mantle(:,i) = 0._CUSTOM_REAL
-        enddo
-      endif
-      do i=imodulo_NGLOB_CRUST_MANTLE+1,NGLOB_CRUST_MANTLE,3
-        b_displ_crust_mantle(:,i) = b_displ_crust_mantle(:,i) &
-          + b_deltat*b_veloc_crust_mantle(:,i) + b_deltatsqover2*b_accel_crust_mantle(:,i)
-        b_displ_crust_mantle(:,i+1) = b_displ_crust_mantle(:,i+1) &
-          + b_deltat*b_veloc_crust_mantle(:,i+1) + b_deltatsqover2*b_accel_crust_mantle(:,i+1)
-        b_displ_crust_mantle(:,i+2) = b_displ_crust_mantle(:,i+2) &
-          + b_deltat*b_veloc_crust_mantle(:,i+2) + b_deltatsqover2*b_accel_crust_mantle(:,i+2)
-
-
-        b_veloc_crust_mantle(:,i) = b_veloc_crust_mantle(:,i) &
-          + b_deltatover2*b_accel_crust_mantle(:,i)
-        b_veloc_crust_mantle(:,i+1) = b_veloc_crust_mantle(:,i+1) &
-          + b_deltatover2*b_accel_crust_mantle(:,i+1)
-        b_veloc_crust_mantle(:,i+2) = b_veloc_crust_mantle(:,i+2) &
-          + b_deltatover2*b_accel_crust_mantle(:,i+2)
-
-        b_accel_crust_mantle(:,i) = 0._CUSTOM_REAL
-        b_accel_crust_mantle(:,i+1) = 0._CUSTOM_REAL
-        b_accel_crust_mantle(:,i+2) = 0._CUSTOM_REAL
-      enddo
-
-      ! outer core
-      do i=1,NGLOB_OUTER_CORE
-        b_displ_outer_core(i) = b_displ_outer_core(i) &
-          + b_deltat*b_veloc_outer_core(i) + b_deltatsqover2*b_accel_outer_core(i)
-        b_veloc_outer_core(i) = b_veloc_outer_core(i) &
-          + b_deltatover2*b_accel_outer_core(i)
-        b_accel_outer_core(i) = 0._CUSTOM_REAL
-      enddo
-
-      ! inner core
-      if(imodulo_NGLOB_INNER_CORE >= 1) then
-        do i=1,imodulo_NGLOB_INNER_CORE
-          b_displ_inner_core(:,i) = b_displ_inner_core(:,i) &
-            + b_deltat*b_veloc_inner_core(:,i) + b_deltatsqover2*b_accel_inner_core(:,i)
-          b_veloc_inner_core(:,i) = b_veloc_inner_core(:,i) &
-            + b_deltatover2*b_accel_inner_core(:,i)
-          b_accel_inner_core(:,i) = 0._CUSTOM_REAL
-        enddo
-      endif
-      do i=imodulo_NGLOB_INNER_CORE+1,NGLOB_INNER_CORE,3
-        b_displ_inner_core(:,i) = b_displ_inner_core(:,i) &
-          + b_deltat*b_veloc_inner_core(:,i) + b_deltatsqover2*b_accel_inner_core(:,i)
-        b_displ_inner_core(:,i+1) = b_displ_inner_core(:,i+1) &
-          + b_deltat*b_veloc_inner_core(:,i+1) + b_deltatsqover2*b_accel_inner_core(:,i+1)
-        b_displ_inner_core(:,i+2) = b_displ_inner_core(:,i+2) &
-          + b_deltat*b_veloc_inner_core(:,i+2) + b_deltatsqover2*b_accel_inner_core(:,i+2)
-
-        b_veloc_inner_core(:,i) = b_veloc_inner_core(:,i) &
-          + b_deltatover2*b_accel_inner_core(:,i)
-        b_veloc_inner_core(:,i+1) = b_veloc_inner_core(:,i+1) &
-          + b_deltatover2*b_accel_inner_core(:,i+1)
-        b_veloc_inner_core(:,i+2) = b_veloc_inner_core(:,i+2) &
-          + b_deltatover2*b_accel_inner_core(:,i+2)
-
-        b_accel_inner_core(:,i) = 0._CUSTOM_REAL
-        b_accel_inner_core(:,i+1) = 0._CUSTOM_REAL
-        b_accel_inner_core(:,i+2) = 0._CUSTOM_REAL
-      enddo
-#else
-! way 1:
-      ! mantle
-      do i=1,NGLOB_CRUST_MANTLE
-        b_displ_crust_mantle(:,i) = b_displ_crust_mantle(:,i) &
-          + b_deltat*b_veloc_crust_mantle(:,i) + b_deltatsqover2*b_accel_crust_mantle(:,i)
-        b_veloc_crust_mantle(:,i) = b_veloc_crust_mantle(:,i) &
-          + b_deltatover2*b_accel_crust_mantle(:,i)
-        b_accel_crust_mantle(:,i) = 0._CUSTOM_REAL
-      enddo
-      ! outer core
-      do i=1,NGLOB_OUTER_CORE
-        b_displ_outer_core(i) = b_displ_outer_core(i) &
-          + b_deltat*b_veloc_outer_core(i) + b_deltatsqover2*b_accel_outer_core(i)
-        b_veloc_outer_core(i) = b_veloc_outer_core(i) &
-          + b_deltatover2*b_accel_outer_core(i)
-        b_accel_outer_core(i) = 0._CUSTOM_REAL
-      enddo
-      ! inner core
-      do i=1,NGLOB_INNER_CORE
-        b_displ_inner_core(:,i) = b_displ_inner_core(:,i) &
-          + b_deltat*b_veloc_inner_core(:,i) + b_deltatsqover2*b_accel_inner_core(:,i)
-        b_veloc_inner_core(:,i) = b_veloc_inner_core(:,i) &
-          + b_deltatover2*b_accel_inner_core(:,i)
-        b_accel_inner_core(:,i) = 0._CUSTOM_REAL
-      enddo
-#endif
-    endif ! SIMULATION_TYPE == 3
-
-    ! integral of strain for adjoint movie volume
-    if(MOVIE_VOLUME .and. (MOVIE_VOLUME_TYPE == 2 .or. MOVIE_VOLUME_TYPE == 3) ) then
-      Iepsilondev_crust_mantle(:,:,:,:,:) = Iepsilondev_crust_mantle(:,:,:,:,:)  &
-                                              + deltat*epsilondev_crust_mantle(:,:,:,:,:)
-      Ieps_trace_over_3_crust_mantle(:,:,:,:) = Ieps_trace_over_3_crust_mantle(:,:,:,:) &
-                                              + deltat*eps_trace_over_3_crust_mantle(:,:,:,:)
-    endif
-
-    ! daniel: debugging
-    !if( maxval(displ_crust_mantle(1,:)**2 + &
-    !                displ_crust_mantle(2,:)**2 + displ_crust_mantle(3,:)**2) > 1.e4 ) then
-    !  print*,'slice',myrank
-    !  print*,'  crust_mantle displ:', maxval(displ_crust_mantle(1,:)), &
-    !           maxval(displ_crust_mantle(2,:)),maxval(displ_crust_mantle(3,:))
-    !  print*,'  indxs: ',maxloc( displ_crust_mantle(1,:)),maxloc( displ_crust_mantle(2,:)),maxloc( displ_crust_mantle(3,:))
-    !  indx = maxloc( displ_crust_mantle(3,:) )
-    !  rval = xstore_crust_mantle(indx(1))
-    !  thetaval = ystore_crust_mantle(indx(1))
-    !  phival = zstore_crust_mantle(indx(1))
-    !  !thetaval = PI/2.0d0-datan(1.006760466d0*dcos(dble(thetaval))/dmax1(TINYVAL,dsin(dble(thetaval))))
-    !  print*,'r/lat/lon:',rval*R_EARTH_KM,90.0-thetaval*180./PI,phival*180./PI
-    !  call rthetaphi_2_xyz(rval,thetaval,phival,xstore_crust_mantle(indx(1)),&
-    !                     ystore_crust_mantle(indx(1)),zstore_crust_mantle(indx(1)))
-    !  print*,'x/y/z:',rval,thetaval,phival
-    !  call exit_MPI(myrank,'error stability')
-    !endif
-
-
-    ! compute the maximum of the norm of the displacement
-    ! in all the slices using an MPI reduction
-    ! and output timestamp file to check that simulation is running fine
-    if(mod(it,NTSTEP_BETWEEN_OUTPUT_INFO) == 0 .or. it == 5 .or. it == NSTEP) &
-      call check_simulation_stability(it,displ_crust_mantle,displ_inner_core,displ_outer_core, &
-                          b_displ_crust_mantle,b_displ_inner_core,b_displ_outer_core, &
-                          eps_trace_over_3_crust_mantle,epsilondev_crust_mantle, &
-                          SIMULATION_TYPE,OUTPUT_FILES,time_start,DT,t0,NSTEP, &
-                          myrank)
-
-
-    ! ****************************************************
-    !   big loop over all spectral elements in the fluid
-    ! ****************************************************
-
-    ! compute internal forces in the fluid region
-    if(CUSTOM_REAL == SIZE_REAL) then
-      time = sngl((dble(it-1)*DT-t0)*scale_t_inv)
-    else
-      time = (dble(it-1)*DT-t0)*scale_t_inv
-    endif
-
-    iphase = 0 ! do not start any non blocking communications at this stage
-    icall = 1  ! compute all the outer elements first in the case of non blocking MPI
-
-    if( USE_DEVILLE_PRODUCTS_VAL ) then
-      ! uses Deville et al. (2002) routine
-      call compute_forces_outer_core_Dev(time,deltat,two_omega_earth, &
-           A_array_rotation,B_array_rotation,d_ln_density_dr_table, &
-           minus_rho_g_over_kappa_fluid, &
-           displ_outer_core,accel_outer_core,div_displ_outer_core, &
-           xstore_outer_core,ystore_outer_core,zstore_outer_core, &
-           xix_outer_core,xiy_outer_core,xiz_outer_core, &
-           etax_outer_core,etay_outer_core,etaz_outer_core, &
-           gammax_outer_core,gammay_outer_core,gammaz_outer_core, &
-          is_on_a_slice_edge_outer_core, &
-          myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-          iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
-          npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
-          iboolfaces_outer_core,iboolcorner_outer_core, &
-          iprocfrom_faces,iprocto_faces, &
-          iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-          buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-          buffer_send_chunkcorn_scalar,buffer_recv_chunkcorn_scalar,iphase,icall, &
-           hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
-           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-           ibool_outer_core,MOVIE_VOLUME)
-    else
-      ! div_displ_outer_core is initialized to zero in the following subroutine.
-      call compute_forces_outer_core(time,deltat,two_omega_earth, &
-           A_array_rotation,B_array_rotation,d_ln_density_dr_table, &
-           minus_rho_g_over_kappa_fluid, &
-           displ_outer_core,accel_outer_core,div_displ_outer_core, &
-           xstore_outer_core,ystore_outer_core,zstore_outer_core, &
-           xix_outer_core,xiy_outer_core,xiz_outer_core, &
-           etax_outer_core,etay_outer_core,etaz_outer_core, &
-           gammax_outer_core,gammay_outer_core,gammaz_outer_core, &
-          is_on_a_slice_edge_outer_core, &
-          myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-          iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
-          npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
-          iboolfaces_outer_core,iboolcorner_outer_core, &
-          iprocfrom_faces,iprocto_faces, &
-          iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-          buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-          buffer_send_chunkcorn_scalar,buffer_recv_chunkcorn_scalar,iphase,icall, &
-           hprime_xx,hprime_yy,hprime_zz,hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
-           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-           ibool_outer_core,MOVIE_VOLUME)
-    endif
-
-    if (SIMULATION_TYPE == 3) then
-      ! note on backward/reconstructed wavefields:
-      !       time for b_displ( it=1 ) corresponds to (NSTEP - 1)*DT - t0  (after Newmark scheme...)
-      !       as we start with saved wavefields b_displ( 1 ) <-> displ( NSTEP ) which correspond
-      !       to a time (NSTEP - (it-1) - 1)*DT - t0
-      !       for reconstructing the rotational contributions
-      if(CUSTOM_REAL == SIZE_REAL) then
-        time = sngl((dble(NSTEP-it)*DT-t0)*scale_t_inv)
-      else
-        time = (dble(NSTEP-it)*DT-t0)*scale_t_inv
-      endif
-
-      b_iphase = 0 ! do not start any non blocking communications at this stage
-      b_icall = 1  ! compute all the outer elements first in the case of non blocking MPI
-
-      if( USE_DEVILLE_PRODUCTS_VAL ) then
-        ! uses Deville et al. (2002) routine
-        call compute_forces_outer_core_Dev(time,b_deltat,b_two_omega_earth, &
-           b_A_array_rotation,b_B_array_rotation,d_ln_density_dr_table, &
-           minus_rho_g_over_kappa_fluid, &
-           b_displ_outer_core,b_accel_outer_core,b_div_displ_outer_core, &
-           xstore_outer_core,ystore_outer_core,zstore_outer_core, &
-           xix_outer_core,xiy_outer_core,xiz_outer_core, &
-           etax_outer_core,etay_outer_core,etaz_outer_core, &
-           gammax_outer_core,gammay_outer_core,gammaz_outer_core, &
-          is_on_a_slice_edge_outer_core, &
-          myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-          iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
-          npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
-          iboolfaces_outer_core,iboolcorner_outer_core, &
-          iprocfrom_faces,iprocto_faces, &
-          iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-          b_buffer_send_faces,b_buffer_received_faces,npoin2D_max_all_CM_IC, &
-          b_buffer_send_chunkcorn_scalar,b_buffer_recv_chunkcorn_scalar,b_iphase,b_icall, &
-           hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
-           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-           ibool_outer_core,MOVIE_VOLUME)
-      else
-        call compute_forces_outer_core(time,b_deltat,b_two_omega_earth, &
-           b_A_array_rotation,b_B_array_rotation,d_ln_density_dr_table, &
-           minus_rho_g_over_kappa_fluid, &
-           b_displ_outer_core,b_accel_outer_core,b_div_displ_outer_core, &
-           xstore_outer_core,ystore_outer_core,zstore_outer_core, &
-           xix_outer_core,xiy_outer_core,xiz_outer_core, &
-           etax_outer_core,etay_outer_core,etaz_outer_core, &
-           gammax_outer_core,gammay_outer_core,gammaz_outer_core, &
-          is_on_a_slice_edge_outer_core, &
-          myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-          iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
-          npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
-          iboolfaces_outer_core,iboolcorner_outer_core, &
-          iprocfrom_faces,iprocto_faces, &
-          iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-          b_buffer_send_faces,b_buffer_received_faces,npoin2D_max_all_CM_IC, &
-          b_buffer_send_chunkcorn_scalar,b_buffer_recv_chunkcorn_scalar,b_iphase,b_icall, &
-           hprime_xx,hprime_yy,hprime_zz,hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
-           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-           ibool_outer_core,MOVIE_VOLUME)
-      endif
-    endif
-
-    ! Stacey absorbing boundaries
-    if(NCHUNKS_VAL /= 6 .and. ABSORBING_CONDITIONS) then
-      call compute_stacey_outer_core(ichunk,SIMULATION_TYPE,SAVE_FORWARD, &
-                              NSTEP,it,ibool_outer_core, &
-                              veloc_outer_core,accel_outer_core,b_accel_outer_core, &
-                              vp_outer_core,wgllwgll_xz,wgllwgll_yz,wgllwgll_xy, &
-                              jacobian2D_bottom_outer_core, &
-                              jacobian2D_xmin_outer_core,jacobian2D_xmax_outer_core, &
-                              jacobian2D_ymin_outer_core,jacobian2D_ymax_outer_core, &
-                              ibelm_bottom_outer_core, &
-                              ibelm_xmin_outer_core,ibelm_xmax_outer_core, &
-                              ibelm_ymin_outer_core,ibelm_ymax_outer_core, &
-                              nimin_outer_core,nimax_outer_core, &
-                              njmin_outer_core,njmax_outer_core, &
-                              nkmin_xi_outer_core,nkmin_eta_outer_core, &
-                              NSPEC2D_BOTTOM, &
-                              nspec2D_xmin_outer_core,nspec2D_xmax_outer_core, &
-                              nspec2D_ymin_outer_core,nspec2D_ymax_outer_core, &
-                              reclen_zmin, &
-                              reclen_xmin_outer_core,reclen_xmax_outer_core, &
-                              reclen_ymin_outer_core,reclen_ymax_outer_core, &
-                              nabs_zmin_oc, &
-                              nabs_xmin_oc,nabs_xmax_oc,nabs_ymin_oc,nabs_ymax_oc, &
-                              absorb_zmin_outer_core, &
-                              absorb_xmin_outer_core,absorb_xmax_outer_core, &
-                              absorb_ymin_outer_core,absorb_ymax_outer_core)
-    endif ! Stacey conditions
-
-
-    ! ****************************************************
-    ! **********  add matching with solid part  **********
-    ! ****************************************************
-
-    ! only for elements in first matching layer in the fluid
-
-    !---
-    !--- couple with mantle at the top of the outer core
-    !---
-    if(ACTUALLY_COUPLE_FLUID_CMB) &
-      call compute_coupling_fluid_CMB(displ_crust_mantle,b_displ_crust_mantle, &
-                            ibool_crust_mantle,ibelm_bottom_crust_mantle,  &
-                            accel_outer_core,b_accel_outer_core, &
-                            normal_top_outer_core,jacobian2D_top_outer_core, &
-                            wgllwgll_xy,ibool_outer_core,ibelm_top_outer_core, &
-                            SIMULATION_TYPE,NSPEC2D_TOP(IREGION_OUTER_CORE))
-
-    !---
-    !--- couple with inner core at the bottom of the outer core
-    !---
-    if(ACTUALLY_COUPLE_FLUID_ICB) &
-      call compute_coupling_fluid_ICB(displ_inner_core,b_displ_inner_core, &
-                            ibool_inner_core,ibelm_top_inner_core,  &
-                            accel_outer_core,b_accel_outer_core, &
-                            normal_bottom_outer_core,jacobian2D_bottom_outer_core, &
-                            wgllwgll_xy,ibool_outer_core,ibelm_bottom_outer_core, &
-                            SIMULATION_TYPE,NSPEC2D_BOTTOM(IREGION_OUTER_CORE))
-
-
-    ! assemble all the contributions between slices using MPI
-
-    ! outer core
-    if(USE_NONBLOCKING_COMMS) then
-      iphase = 1 ! start the non blocking communications
-      call assemble_MPI_scalar(myrank,accel_outer_core,NGLOB_OUTER_CORE, &
-            iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
-            npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
-            iboolfaces_outer_core,iboolcorner_outer_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-            buffer_send_chunkcorn_scalar,buffer_recv_chunkcorn_scalar, &
-            NUMMSGS_FACES,NCORNERSCHUNKS, &
-            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL(IREGION_OUTER_CORE), &
-            NGLOB2DMAX_XMIN_XMAX(IREGION_OUTER_CORE),NGLOB2DMAX_YMIN_YMAX(IREGION_OUTER_CORE), &
-            NGLOB2DMAX_XY,NCHUNKS_VAL,iphase)
-
-      icall = 2 ! now compute all the inner elements in the case of non blocking MPI
-
-      if( USE_DEVILLE_PRODUCTS_VAL ) then
-        ! uses Deville et al. (2002) routine
-        call compute_forces_outer_core_Dev(time,deltat,two_omega_earth, &
-           A_array_rotation,B_array_rotation,d_ln_density_dr_table, &
-           minus_rho_g_over_kappa_fluid,displ_outer_core,accel_outer_core,div_displ_outer_core, &
-           xstore_outer_core,ystore_outer_core,zstore_outer_core, &
-           xix_outer_core,xiy_outer_core,xiz_outer_core, &
-           etax_outer_core,etay_outer_core,etaz_outer_core, &
-           gammax_outer_core,gammay_outer_core,gammaz_outer_core, &
-          is_on_a_slice_edge_outer_core, &
-          myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-          iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
-          npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
-          iboolfaces_outer_core,iboolcorner_outer_core, &
-          iprocfrom_faces,iprocto_faces, &
-          iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-          buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-          buffer_send_chunkcorn_scalar,buffer_recv_chunkcorn_scalar,iphase,icall, &
-           hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
-           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-           ibool_outer_core,MOVIE_VOLUME)
-      else
-        ! div_displ_outer_core is initialized to zero in the following subroutine.
-        call compute_forces_outer_core(time,deltat,two_omega_earth, &
-           A_array_rotation,B_array_rotation,d_ln_density_dr_table, &
-           minus_rho_g_over_kappa_fluid,displ_outer_core,accel_outer_core,div_displ_outer_core, &
-           xstore_outer_core,ystore_outer_core,zstore_outer_core, &
-           xix_outer_core,xiy_outer_core,xiz_outer_core, &
-           etax_outer_core,etay_outer_core,etaz_outer_core, &
-           gammax_outer_core,gammay_outer_core,gammaz_outer_core, &
-          is_on_a_slice_edge_outer_core, &
-          myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-          iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
-          npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
-          iboolfaces_outer_core,iboolcorner_outer_core, &
-          iprocfrom_faces,iprocto_faces, &
-          iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-          buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-          buffer_send_chunkcorn_scalar,buffer_recv_chunkcorn_scalar,iphase,icall, &
-           hprime_xx,hprime_yy,hprime_zz,hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
-           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-           ibool_outer_core,MOVIE_VOLUME)
-      endif
-
-      do while (iphase <= 7) ! make sure the last communications are finished and processed
-        call assemble_MPI_scalar(myrank,accel_outer_core,NGLOB_OUTER_CORE, &
-            iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
-            npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
-            iboolfaces_outer_core,iboolcorner_outer_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-            buffer_send_chunkcorn_scalar,buffer_recv_chunkcorn_scalar, &
-            NUMMSGS_FACES,NCORNERSCHUNKS, &
-            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL(IREGION_OUTER_CORE), &
-            NGLOB2DMAX_XMIN_XMAX(IREGION_OUTER_CORE),NGLOB2DMAX_YMIN_YMAX(IREGION_OUTER_CORE), &
-            NGLOB2DMAX_XY,NCHUNKS_VAL,iphase)
-      enddo
-
-    else ! if(.not. USE_NONBLOCKING_COMMS) then
-
-      call assemble_MPI_scalar_block(myrank,accel_outer_core,NGLOB_OUTER_CORE, &
-            iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_outer_core,iboolright_xi_outer_core, &
-            iboolleft_eta_outer_core,iboolright_eta_outer_core, &
-            npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
-            iboolfaces_outer_core,iboolcorner_outer_core, &
-            iprocfrom_faces,iprocto_faces,imsg_type, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-            buffer_send_chunkcorn_scalar,buffer_recv_chunkcorn_scalar, &
-            NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
-            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL(IREGION_OUTER_CORE), &
-            NGLOB2DMAX_XMIN_XMAX(IREGION_OUTER_CORE),NGLOB2DMAX_YMIN_YMAX(IREGION_OUTER_CORE), &
-            NGLOB2DMAX_XY,NCHUNKS_VAL)
-
-    endif
-
-    ! multiply by the inverse of the mass matrix and update velocity
-    do i=1,NGLOB_OUTER_CORE
-      accel_outer_core(i) = accel_outer_core(i)*rmass_outer_core(i)
-      veloc_outer_core(i) = veloc_outer_core(i) + deltatover2*accel_outer_core(i)
-    enddo
-
-    if (SIMULATION_TYPE == 3) then
-
-! ------------------- new non blocking implementation -------------------
-
-    ! outer core
-      if(USE_NONBLOCKING_COMMS) then
-        b_iphase = 1 ! start the non blocking communications
-        call assemble_MPI_scalar(myrank,b_accel_outer_core,NGLOB_OUTER_CORE, &
-            iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
-            npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
-            iboolfaces_outer_core,iboolcorner_outer_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            b_buffer_send_faces,b_buffer_received_faces,npoin2D_max_all_CM_IC, &
-            b_buffer_send_chunkcorn_scalar,b_buffer_recv_chunkcorn_scalar, &
-            NUMMSGS_FACES,NCORNERSCHUNKS, &
-            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL(IREGION_OUTER_CORE), &
-            NGLOB2DMAX_XMIN_XMAX(IREGION_OUTER_CORE),NGLOB2DMAX_YMIN_YMAX(IREGION_OUTER_CORE), &
-            NGLOB2DMAX_XY,NCHUNKS_VAL,b_iphase)
-
-        b_icall = 2 ! now compute all the inner elements in the case of non blocking MPI
-
-        if( USE_DEVILLE_PRODUCTS_VAL ) then
-          ! uses Deville et al. (2002) routine
-          call compute_forces_outer_core_Dev(time,b_deltat,b_two_omega_earth, &
-           b_A_array_rotation,b_B_array_rotation,d_ln_density_dr_table, &
-           minus_rho_g_over_kappa_fluid, &
-           b_displ_outer_core,b_accel_outer_core,b_div_displ_outer_core, &
-           xstore_outer_core,ystore_outer_core,zstore_outer_core, &
-           xix_outer_core,xiy_outer_core,xiz_outer_core, &
-           etax_outer_core,etay_outer_core,etaz_outer_core, &
-           gammax_outer_core,gammay_outer_core,gammaz_outer_core, &
-          is_on_a_slice_edge_outer_core, &
-          myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-          iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
-          npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
-          iboolfaces_outer_core,iboolcorner_outer_core, &
-          iprocfrom_faces,iprocto_faces, &
-          iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-          b_buffer_send_faces,b_buffer_received_faces,npoin2D_max_all_CM_IC, &
-          b_buffer_send_chunkcorn_scalar,b_buffer_recv_chunkcorn_scalar,b_iphase,b_icall, &
-           hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
-           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-           ibool_outer_core,MOVIE_VOLUME)
-        else
-          ! div_displ_outer_core is initialized to zero in the following subroutine.
-          call compute_forces_outer_core(time,b_deltat,b_two_omega_earth, &
-           b_A_array_rotation,b_B_array_rotation,d_ln_density_dr_table, &
-           minus_rho_g_over_kappa_fluid, &
-           b_displ_outer_core,b_accel_outer_core,b_div_displ_outer_core, &
-           xstore_outer_core,ystore_outer_core,zstore_outer_core, &
-           xix_outer_core,xiy_outer_core,xiz_outer_core, &
-           etax_outer_core,etay_outer_core,etaz_outer_core, &
-           gammax_outer_core,gammay_outer_core,gammaz_outer_core, &
-          is_on_a_slice_edge_outer_core, &
-          myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-          iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
-          npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
-          iboolfaces_outer_core,iboolcorner_outer_core, &
-          iprocfrom_faces,iprocto_faces, &
-          iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-          b_buffer_send_faces,b_buffer_received_faces,npoin2D_max_all_CM_IC, &
-          b_buffer_send_chunkcorn_scalar,b_buffer_recv_chunkcorn_scalar,b_iphase,b_icall, &
-           hprime_xx,hprime_yy,hprime_zz,hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
-           wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-           ibool_outer_core,MOVIE_VOLUME)
-        endif
-
-        do while (b_iphase <= 7) ! make sure the last communications are finished and processed
-          call assemble_MPI_scalar(myrank,b_accel_outer_core,NGLOB_OUTER_CORE, &
-            iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_outer_core,iboolright_xi_outer_core,iboolleft_eta_outer_core,iboolright_eta_outer_core, &
-            npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
-            iboolfaces_outer_core,iboolcorner_outer_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            b_buffer_send_faces,b_buffer_received_faces,npoin2D_max_all_CM_IC, &
-            b_buffer_send_chunkcorn_scalar,b_buffer_recv_chunkcorn_scalar, &
-            NUMMSGS_FACES,NCORNERSCHUNKS, &
-            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL(IREGION_OUTER_CORE), &
-            NGLOB2DMAX_XMIN_XMAX(IREGION_OUTER_CORE),NGLOB2DMAX_YMIN_YMAX(IREGION_OUTER_CORE), &
-            NGLOB2DMAX_XY,NCHUNKS_VAL,b_iphase)
-        enddo
-
-      else ! if(.not. USE_NONBLOCKING_COMMS) then
-
-        call assemble_MPI_scalar_block(myrank,b_accel_outer_core,NGLOB_OUTER_CORE, &
-            iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_outer_core,iboolright_xi_outer_core, &
-            iboolleft_eta_outer_core,iboolright_eta_outer_core, &
-            npoin2D_faces_outer_core,npoin2D_xi_outer_core,npoin2D_eta_outer_core, &
-            iboolfaces_outer_core,iboolcorner_outer_core, &
-            iprocfrom_faces,iprocto_faces,imsg_type, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            b_buffer_send_faces,b_buffer_received_faces,npoin2D_max_all_CM_IC, &
-            b_buffer_send_chunkcorn_scalar,b_buffer_recv_chunkcorn_scalar, &
-            NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
-            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL(IREGION_OUTER_CORE), &
-            NGLOB2DMAX_XMIN_XMAX(IREGION_OUTER_CORE),NGLOB2DMAX_YMIN_YMAX(IREGION_OUTER_CORE), &
-            NGLOB2DMAX_XY,NCHUNKS_VAL)
-
-      endif
-
-! ------------------- new non blocking implementation -------------------
-
-      ! Newmark time scheme - corrector for fluid parts
-      do i=1,NGLOB_OUTER_CORE
-        b_accel_outer_core(i) = b_accel_outer_core(i)*rmass_outer_core(i)
-        b_veloc_outer_core(i) = b_veloc_outer_core(i) + b_deltatover2*b_accel_outer_core(i)
-      enddo
-
-    endif
-
-    ! ****************************************************
-    !   big loop over all spectral elements in the solid
-    ! ****************************************************
-
-    ! compute internal forces in the solid regions
-
-    ! for anisotropy and gravity, x y and z contain r theta and phi
-
-    iphase = 0 ! do not start any non blocking communications at this stage
-    iphase_CC = 0 ! do not start any non blocking communications at this stage
-    icall = 1  ! compute all the outer elements first in the case of non blocking MPI
-
-    if( USE_DEVILLE_PRODUCTS_VAL ) then
-      call compute_forces_crust_mantle_Dev(minus_gravity_table,density_table,minus_deriv_gravity_table, &
-          displ_crust_mantle,accel_crust_mantle, &
-          xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
-          xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-          etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
-          gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
-!----------------------
-            is_on_a_slice_edge_crust_mantle,icall, &
-            accel_inner_core,ibool_inner_core,idoubling_inner_core, &
-            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-            buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector,iphase, &
-            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
-            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,iphase_CC, &
-!----------------------
-          hprime_xx,hprime_xxT, &
-          hprimewgll_xx,hprimewgll_xxT, &
-          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-          kappavstore_crust_mantle,kappahstore_crust_mantle,muvstore_crust_mantle, &
-          muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-          c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle, &
-          c14store_crust_mantle,c15store_crust_mantle,c16store_crust_mantle, &
-          c22store_crust_mantle,c23store_crust_mantle,c24store_crust_mantle, &
-          c25store_crust_mantle,c26store_crust_mantle,c33store_crust_mantle, &
-          c34store_crust_mantle,c35store_crust_mantle,c36store_crust_mantle, &
-          c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-          c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-          ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
-      ! --idoubling_crust_mantle, &
-          R_memory_crust_mantle,epsilondev_crust_mantle, &
-          eps_trace_over_3_crust_mantle,one_minus_sum_beta_crust_mantle, &
-          alphaval,betaval,gammaval,factor_common_crust_mantle, &
-          size(factor_common_crust_mantle,2), size(factor_common_crust_mantle,3), &
-          size(factor_common_crust_mantle,4), size(factor_common_crust_mantle,5) )
-    else
-      call compute_forces_crust_mantle(minus_gravity_table,density_table,minus_deriv_gravity_table, &
-          displ_crust_mantle,accel_crust_mantle, &
-          xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
-          xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-          etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
-          gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
-!----------------------
-            is_on_a_slice_edge_crust_mantle,icall, &
-            accel_inner_core,ibool_inner_core,idoubling_inner_core, &
-            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-            buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector,iphase, &
-            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
-            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,iphase_CC, &
-!----------------------
-          hprime_xx,hprime_yy,hprime_zz, &
-          hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
-          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-          kappavstore_crust_mantle,kappahstore_crust_mantle,muvstore_crust_mantle, &
-          muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-          c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle, &
-          c14store_crust_mantle,c15store_crust_mantle,c16store_crust_mantle, &
-          c22store_crust_mantle,c23store_crust_mantle,c24store_crust_mantle, &
-          c25store_crust_mantle,c26store_crust_mantle,c33store_crust_mantle, &
-          c34store_crust_mantle,c35store_crust_mantle,c36store_crust_mantle, &
-          c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-          c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-          ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
-        ! --idoubling_crust_mantle, &
-          R_memory_crust_mantle,epsilondev_crust_mantle, &
-          eps_trace_over_3_crust_mantle,one_minus_sum_beta_crust_mantle, &
-          alphaval,betaval,gammaval,factor_common_crust_mantle, &
-          size(factor_common_crust_mantle,2), size(factor_common_crust_mantle,3), &
-          size(factor_common_crust_mantle,4), size(factor_common_crust_mantle,5) )
-    endif
-
-    if (SIMULATION_TYPE == 3 ) then
-
-      b_iphase = 0 ! do not start any non blocking communications at this stage
-      b_iphase_CC = 0 ! do not start any non blocking communications at this stage
-      b_icall = 1  ! compute all the outer elements first in the case of non blocking MPI
-
-    ! for anisotropy and gravity, x y and z contain r theta and phi
-      if( USE_DEVILLE_PRODUCTS_VAL ) then
-        call compute_forces_crust_mantle_Dev(minus_gravity_table,density_table,minus_deriv_gravity_table, &
-          b_displ_crust_mantle,b_accel_crust_mantle, &
-          xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
-          xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-          etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
-          gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
-!----------------------
-            is_on_a_slice_edge_crust_mantle,b_icall, &
-            b_accel_inner_core,ibool_inner_core,idoubling_inner_core, &
-            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            b_buffer_send_faces,b_buffer_received_faces,npoin2D_max_all_CM_IC, &
-            b_buffer_send_chunkcorn_vector,b_buffer_recv_chunkcorn_vector,b_iphase, &
-            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-            npoin2D_cube_from_slices,b_buffer_all_cube_from_slices,b_buffer_slices,ibool_central_cube, &
-            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,b_iphase_CC, &
-!----------------------
-          hprime_xx,hprime_xxT, &
-          hprimewgll_xx,hprimewgll_xxT, &
-          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-          kappavstore_crust_mantle,kappahstore_crust_mantle,muvstore_crust_mantle, &
-          muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-          c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle, &
-          c14store_crust_mantle,c15store_crust_mantle,c16store_crust_mantle, &
-          c22store_crust_mantle,c23store_crust_mantle,c24store_crust_mantle, &
-          c25store_crust_mantle,c26store_crust_mantle,c33store_crust_mantle, &
-          c34store_crust_mantle,c35store_crust_mantle,c36store_crust_mantle, &
-          c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-          c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-          ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
-     ! --     idoubling_crust_mantle, &
-          b_R_memory_crust_mantle,b_epsilondev_crust_mantle, &
-          b_eps_trace_over_3_crust_mantle,one_minus_sum_beta_crust_mantle, &
-          b_alphaval,b_betaval,b_gammaval,factor_common_crust_mantle, &
-          size(factor_common_crust_mantle,2), size(factor_common_crust_mantle,3), &
-          size(factor_common_crust_mantle,4), size(factor_common_crust_mantle,5) )
-      else
-        call compute_forces_crust_mantle(minus_gravity_table,density_table,minus_deriv_gravity_table, &
-          b_displ_crust_mantle,b_accel_crust_mantle, &
-          xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
-          xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-          etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
-          gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
-!----------------------
-            is_on_a_slice_edge_crust_mantle,b_icall, &
-            b_accel_inner_core,ibool_inner_core,idoubling_inner_core, &
-            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            b_buffer_send_faces,b_buffer_received_faces,npoin2D_max_all_CM_IC, &
-            b_buffer_send_chunkcorn_vector,b_buffer_recv_chunkcorn_vector,b_iphase, &
-            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-            npoin2D_cube_from_slices,b_buffer_all_cube_from_slices,b_buffer_slices,ibool_central_cube, &
-            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,b_iphase_CC, &
-!----------------------
-          hprime_xx,hprime_yy,hprime_zz, &
-          hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
-          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-          kappavstore_crust_mantle,kappahstore_crust_mantle,muvstore_crust_mantle, &
-          muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-          c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle, &
-          c14store_crust_mantle,c15store_crust_mantle,c16store_crust_mantle, &
-          c22store_crust_mantle,c23store_crust_mantle,c24store_crust_mantle, &
-          c25store_crust_mantle,c26store_crust_mantle,c33store_crust_mantle, &
-          c34store_crust_mantle,c35store_crust_mantle,c36store_crust_mantle, &
-          c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-          c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-          ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
-      ! --idoubling_crust_mantle, &
-          b_R_memory_crust_mantle,b_epsilondev_crust_mantle, &
-          b_eps_trace_over_3_crust_mantle,one_minus_sum_beta_crust_mantle, &
-          b_alphaval,b_betaval,b_gammaval,factor_common_crust_mantle, &
-          size(factor_common_crust_mantle,2), size(factor_common_crust_mantle,3), &
-          size(factor_common_crust_mantle,4), size(factor_common_crust_mantle,5) )
-
-      endif
-    endif
-
-    ! Deville routine
-    if( USE_DEVILLE_PRODUCTS_VAL ) then
-      call compute_forces_inner_core_Dev(minus_gravity_table,density_table,minus_deriv_gravity_table, &
-          displ_inner_core,accel_inner_core, &
-          xstore_inner_core,ystore_inner_core,zstore_inner_core, &
-          xix_inner_core,xiy_inner_core,xiz_inner_core, &
-          etax_inner_core,etay_inner_core,etaz_inner_core, &
-          gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
-!----------------------
-            is_on_a_slice_edge_inner_core,icall, &
-            accel_crust_mantle,ibool_inner_core,idoubling_inner_core, &
-            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-            buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector,iphase, &
-            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
-            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,iphase_CC, &
-!----------------------
-          hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
-          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-          kappavstore_inner_core,muvstore_inner_core,ibool_inner_core,idoubling_inner_core, &
-          c11store_inner_core,c33store_inner_core,c12store_inner_core, &
-          c13store_inner_core,c44store_inner_core, &
-          R_memory_inner_core,epsilondev_inner_core, eps_trace_over_3_inner_core,&
-          one_minus_sum_beta_inner_core, &
-          alphaval,betaval,gammaval, &
-          factor_common_inner_core, &
-          size(factor_common_inner_core,2), size(factor_common_inner_core,3), &
-          size(factor_common_inner_core,4), size(factor_common_inner_core,5) )
-    else
-      call compute_forces_inner_core(minus_gravity_table,density_table,minus_deriv_gravity_table, &
-          displ_inner_core,accel_inner_core, &
-          xstore_inner_core,ystore_inner_core,zstore_inner_core, &
-          xix_inner_core,xiy_inner_core,xiz_inner_core, &
-          etax_inner_core,etay_inner_core,etaz_inner_core, &
-          gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
-!----------------------
-            is_on_a_slice_edge_inner_core,icall, &
-            accel_crust_mantle,ibool_inner_core,idoubling_inner_core, &
-            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-            buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector,iphase, &
-            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
-            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,iphase_CC, &
-!----------------------
-          hprime_xx,hprime_yy,hprime_zz,hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
-          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-          kappavstore_inner_core,muvstore_inner_core,ibool_inner_core,idoubling_inner_core, &
-          c11store_inner_core,c33store_inner_core,c12store_inner_core, &
-          c13store_inner_core,c44store_inner_core, &
-          R_memory_inner_core,epsilondev_inner_core, eps_trace_over_3_inner_core,&
-          one_minus_sum_beta_inner_core, &
-          alphaval,betaval,gammaval, &
-          factor_common_inner_core, &
-          size(factor_common_inner_core,2), size(factor_common_inner_core,3), &
-          size(factor_common_inner_core,4), size(factor_common_inner_core,5) )
-    endif
-
-    if (SIMULATION_TYPE == 3) then
-      if( USE_DEVILLE_PRODUCTS_VAL ) then
-        call compute_forces_inner_core_Dev(minus_gravity_table,density_table,minus_deriv_gravity_table, &
-          b_displ_inner_core,b_accel_inner_core, &
-          xstore_inner_core,ystore_inner_core,zstore_inner_core, &
-          xix_inner_core,xiy_inner_core,xiz_inner_core, &
-          etax_inner_core,etay_inner_core,etaz_inner_core, &
-          gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
-!----------------------
-            is_on_a_slice_edge_inner_core,b_icall, &
-            b_accel_crust_mantle,ibool_inner_core,idoubling_inner_core, &
-            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            b_buffer_send_faces,b_buffer_received_faces,npoin2D_max_all_CM_IC, &
-            b_buffer_send_chunkcorn_vector,b_buffer_recv_chunkcorn_vector,b_iphase, &
-            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-            npoin2D_cube_from_slices,b_buffer_all_cube_from_slices,b_buffer_slices,ibool_central_cube, &
-            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,b_iphase_CC, &
-!----------------------
-          hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
-          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-          kappavstore_inner_core,muvstore_inner_core,ibool_inner_core,idoubling_inner_core, &
-          c11store_inner_core,c33store_inner_core,c12store_inner_core, &
-          c13store_inner_core,c44store_inner_core, &
-          b_R_memory_inner_core,b_epsilondev_inner_core, b_eps_trace_over_3_inner_core,&
-          one_minus_sum_beta_inner_core, &
-          b_alphaval,b_betaval,b_gammaval, &
-          factor_common_inner_core, &
-          size(factor_common_inner_core,2), size(factor_common_inner_core,3), &
-          size(factor_common_inner_core,4), size(factor_common_inner_core,5) )
-      else
-        call compute_forces_inner_core(minus_gravity_table,density_table,minus_deriv_gravity_table, &
-          b_displ_inner_core,b_accel_inner_core, &
-          xstore_inner_core,ystore_inner_core,zstore_inner_core, &
-          xix_inner_core,xiy_inner_core,xiz_inner_core, &
-          etax_inner_core,etay_inner_core,etaz_inner_core, &
-          gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
-!----------------------
-            is_on_a_slice_edge_inner_core,b_icall, &
-            b_accel_crust_mantle,ibool_inner_core,idoubling_inner_core, &
-            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            b_buffer_send_faces,b_buffer_received_faces,npoin2D_max_all_CM_IC, &
-            b_buffer_send_chunkcorn_vector,b_buffer_recv_chunkcorn_vector,b_iphase, &
-            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-            npoin2D_cube_from_slices,b_buffer_all_cube_from_slices,b_buffer_slices,ibool_central_cube, &
-            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,b_iphase_CC, &
-!----------------------
-          hprime_xx,hprime_yy,hprime_zz,hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
-          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-          kappavstore_inner_core,muvstore_inner_core,ibool_inner_core,idoubling_inner_core, &
-          c11store_inner_core,c33store_inner_core,c12store_inner_core, &
-          c13store_inner_core,c44store_inner_core, &
-          b_R_memory_inner_core,b_epsilondev_inner_core, b_eps_trace_over_3_inner_core,&
-          one_minus_sum_beta_inner_core, &
-          b_alphaval,b_betaval,b_gammaval, &
-          factor_common_inner_core, &
-          size(factor_common_inner_core,2), size(factor_common_inner_core,3), &
-          size(factor_common_inner_core,4), size(factor_common_inner_core,5) )
-      endif
-    endif
-
-    ! Stacey
-    if(NCHUNKS_VAL /= 6 .and. ABSORBING_CONDITIONS) then
-      call compute_stacey_crust_mantle(ichunk,SIMULATION_TYPE, &
-                              NSTEP,it,SAVE_FORWARD,ibool_crust_mantle, &
-                              veloc_crust_mantle,accel_crust_mantle,b_accel_crust_mantle, &
-                              jacobian2D_xmin_crust_mantle,jacobian2D_xmax_crust_mantle, &
-                              jacobian2D_ymin_crust_mantle,jacobian2D_ymax_crust_mantle, &
-                              wgllwgll_xz,wgllwgll_yz, &
-                              normal_xmin_crust_mantle,normal_xmax_crust_mantle, &
-                              normal_ymin_crust_mantle,normal_ymax_crust_mantle, &
-                              rho_vp_crust_mantle,rho_vs_crust_mantle, &
-                              ibelm_xmin_crust_mantle,ibelm_xmax_crust_mantle, &
-                              ibelm_ymin_crust_mantle,ibelm_ymax_crust_mantle, &
-                              nimin_crust_mantle,nimax_crust_mantle, &
-                              njmin_crust_mantle,njmax_crust_mantle, &
-                              nkmin_xi_crust_mantle,nkmin_eta_crust_mantle, &
-                              nspec2D_xmin_crust_mantle,nspec2D_xmax_crust_mantle, &
-                              nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle, &
-                              reclen_xmin_crust_mantle,reclen_xmax_crust_mantle, &
-                              reclen_ymin_crust_mantle,reclen_ymax_crust_mantle, &
-                              nabs_xmin_cm,nabs_xmax_cm,nabs_ymin_cm,nabs_ymax_cm, &
-                              absorb_xmin_crust_mantle5,absorb_xmax_crust_mantle5, &
-                              absorb_ymin_crust_mantle5,absorb_ymax_crust_mantle5)
-    endif ! Stacey conditions
-
-    ! add the sources
-    if (SIMULATION_TYPE == 1) &
-      call compute_add_sources(myrank,NSOURCES, &
-                                accel_crust_mantle,sourcearrays, &
-                                DT,t0,tshift_cmt,hdur_gaussian,ibool_crust_mantle, &
-                                islice_selected_source,ispec_selected_source,it, &
-                                hdur,xi_source,eta_source,gamma_source,nu_source)
-
-    ! add adjoint sources
-    if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then
-      if( nadj_rec_local > 0 ) &
-        call compute_add_sources_adjoint(myrank,nrec, &
-                                nadj_rec_local,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC, &
-                                accel_crust_mantle,adj_sourcearrays, &
-                                nu,xi_receiver,eta_receiver,gamma_receiver, &
-                                xigll,yigll,zigll,ibool_crust_mantle, &
-                                islice_selected_rec,ispec_selected_rec, &
-                                NSTEP_SUB_ADJ,iadjsrc_len,iadjsrc,iadj_vec, &
-                                it,it_begin,station_name,network_name,DT)
-    endif
-
-    ! add sources for backward/reconstructed wavefield
-    if (SIMULATION_TYPE == 3) &
-      call compute_add_sources_backward(myrank,NSOURCES,NSTEP, &
-                                b_accel_crust_mantle,sourcearrays, &
-                                DT,t0,tshift_cmt,hdur_gaussian,ibool_crust_mantle, &
-                                islice_selected_source,ispec_selected_source,it, &
-                                hdur,xi_source,eta_source,gamma_source,nu_source)
-
-!<YANGL
-    ! NOISE_TOMOGRAPHY
-    if ( NOISE_TOMOGRAPHY == 1 ) then
-       ! the first step of noise tomography is to use |S(\omega)|^2 as a point force source at one of the receivers.
-       ! hence, instead of a moment tensor 'sourcearrays', a 'noise_sourcearray' for a point force is needed.
-       ! furthermore, the CMTSOLUTION needs to be zero, i.e., no earthquakes.
-       ! now this must be manually set in DATA/CMTSOLUTION, by USERS.
-       call add_source_master_rec_noise(myrank,nrec, &
-                                NSTEP,accel_crust_mantle,noise_sourcearray, &
-                                ibool_crust_mantle,islice_selected_rec,ispec_selected_rec, &
-                                it,irec_master_noise)
-    elseif ( NOISE_TOMOGRAPHY == 2 ) then
-       ! second step of noise tomography, i.e., read the surface movie saved at every timestep
-       ! use the movie to drive the ensemble forward wavefield
-       call noise_read_add_surface_movie(nmovie_points,accel_crust_mantle, &
-                              normal_x_noise,normal_y_noise,normal_z_noise,mask_noise, &
-                              ibelm_top_crust_mantle,ibool_crust_mantle, &
-                              NSPEC2D_TOP(IREGION_CRUST_MANTLE),noise_surface_movie, &
-                              NSTEP-it+1,jacobian2D_top_crust_mantle,wgllwgll_xy)
-        ! be careful, since ensemble forward sources are reversals of generating wavefield "eta"
-        ! hence the "NSTEP-it+1", i.e., start reading from the last timestep
-        ! note the ensemble forward sources are generally distributed on the surface of the earth
-        ! that's to say, the ensemble forward source is kind of a surface force density, not a body force density
-        ! therefore, we must add it here, before applying the inverse of mass matrix
-    elseif ( NOISE_TOMOGRAPHY == 3 ) then
-        ! third step of noise tomography, i.e., read the surface movie saved at every timestep
-        ! use the movie to reconstruct the ensemble forward wavefield
-        ! the ensemble adjoint wavefield is done as usual
-        ! note instead of "NSTEP-it+1", now we us "it", since reconstruction is a reversal of reversal
-        call noise_read_add_surface_movie(nmovie_points,b_accel_crust_mantle, &
-                              normal_x_noise,normal_y_noise,normal_z_noise,mask_noise, &
-                              ibelm_top_crust_mantle,ibool_crust_mantle, &
-                              NSPEC2D_TOP(IREGION_CRUST_MANTLE),noise_surface_movie, &
-                              it,jacobian2D_top_crust_mantle,wgllwgll_xy)
-    endif
-!>YANGL
-
-    ! ****************************************************
-    ! **********  add matching with fluid part  **********
-    ! ****************************************************
-
-    ! only for elements in first matching layer in the solid
-
-    !---
-    !--- couple with outer core at the bottom of the mantle
-    !---
-    if(ACTUALLY_COUPLE_FLUID_CMB) &
-      call compute_coupling_CMB_fluid(displ_crust_mantle,b_displ_crust_mantle, &
-                            accel_crust_mantle,b_accel_crust_mantle, &
-                            ibool_crust_mantle,ibelm_bottom_crust_mantle,  &
-                            accel_outer_core,b_accel_outer_core, &
-                            normal_top_outer_core,jacobian2D_top_outer_core, &
-                            wgllwgll_xy,ibool_outer_core,ibelm_top_outer_core, &
-                            RHO_TOP_OC,minus_g_cmb, &
-                            SIMULATION_TYPE,NSPEC2D_BOTTOM(IREGION_CRUST_MANTLE))
-
-    !---
-    !--- couple with outer core at the top of the inner core
-    !---
-    if(ACTUALLY_COUPLE_FLUID_ICB) &
-      call compute_coupling_ICB_fluid(displ_inner_core,b_displ_inner_core, &
-                            accel_inner_core,b_accel_inner_core, &
-                            ibool_inner_core,ibelm_top_inner_core,  &
-                            accel_outer_core,b_accel_outer_core, &
-                            normal_bottom_outer_core,jacobian2D_bottom_outer_core, &
-                            wgllwgll_xy,ibool_outer_core,ibelm_bottom_outer_core, &
-                            RHO_BOTTOM_OC,minus_g_icb, &
-                            SIMULATION_TYPE,NSPEC2D_TOP(IREGION_INNER_CORE))
-
-
-    ! assemble all the contributions between slices using MPI
-
-! assemble all the contributions between slices using MPI
-! crust/mantle and inner core handled in the same call
-! in order to reduce the number of MPI messages by 2
-    if(USE_NONBLOCKING_COMMS) then
-
-      iphase = 1 ! initialize the non blocking communication counter
-      iphase_CC = 1 ! initialize the non blocking communication counter for the central cube
-
-! start the non blocking communications
-      call assemble_MPI_vector(myrank,accel_crust_mantle,accel_inner_core, &
-            iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-            buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector, &
-            NUMMSGS_FACES,NCORNERSCHUNKS, &
-            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL(IREGION_CRUST_MANTLE), &
-            NGLOB1D_RADIAL(IREGION_INNER_CORE),NCHUNKS_VAL,iphase)
-
-      icall = 2 ! now compute all the inner elements in the case of non blocking MPI
-
-      ! compute internal forces in the solid regions
-
-      ! for anisotropy and gravity, x y and z contain r theta and phi
-
-      if( USE_DEVILLE_PRODUCTS_VAL ) then
-        call compute_forces_crust_mantle_Dev(minus_gravity_table,density_table,minus_deriv_gravity_table, &
-          displ_crust_mantle,accel_crust_mantle, &
-          xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
-          xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-          etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
-          gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
-!----------------------
-            is_on_a_slice_edge_crust_mantle,icall, &
-            accel_inner_core,ibool_inner_core,idoubling_inner_core, &
-            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-            buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector,iphase, &
-            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
-            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,iphase_CC, &
-!----------------------
-          hprime_xx,hprime_xxT, &
-          hprimewgll_xx,hprimewgll_xxT, &
-          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-          kappavstore_crust_mantle,kappahstore_crust_mantle,muvstore_crust_mantle, &
-          muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-          c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle, &
-          c14store_crust_mantle,c15store_crust_mantle,c16store_crust_mantle, &
-          c22store_crust_mantle,c23store_crust_mantle,c24store_crust_mantle, &
-          c25store_crust_mantle,c26store_crust_mantle,c33store_crust_mantle, &
-          c34store_crust_mantle,c35store_crust_mantle,c36store_crust_mantle, &
-          c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-          c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-          ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
-      !---idoubling_crust_mantle, &
-          R_memory_crust_mantle,epsilondev_crust_mantle, &
-          eps_trace_over_3_crust_mantle,one_minus_sum_beta_crust_mantle, &
-          alphaval,betaval,gammaval,factor_common_crust_mantle, &
-          size(factor_common_crust_mantle,2), size(factor_common_crust_mantle,3), &
-          size(factor_common_crust_mantle,4), size(factor_common_crust_mantle,5) )
-      else
-        call compute_forces_crust_mantle(minus_gravity_table,density_table,minus_deriv_gravity_table, &
-          displ_crust_mantle,accel_crust_mantle, &
-          xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
-          xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-          etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
-          gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
-!----------------------
-            is_on_a_slice_edge_crust_mantle,icall, &
-            accel_inner_core,ibool_inner_core,idoubling_inner_core, &
-            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-            buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector,iphase, &
-            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
-            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,iphase_CC, &
-!----------------------
-          hprime_xx,hprime_yy,hprime_zz, &
-          hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
-          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-          kappavstore_crust_mantle,kappahstore_crust_mantle,muvstore_crust_mantle, &
-          muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-          c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle, &
-          c14store_crust_mantle,c15store_crust_mantle,c16store_crust_mantle, &
-          c22store_crust_mantle,c23store_crust_mantle,c24store_crust_mantle, &
-          c25store_crust_mantle,c26store_crust_mantle,c33store_crust_mantle, &
-          c34store_crust_mantle,c35store_crust_mantle,c36store_crust_mantle, &
-          c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-          c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-          ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
-      ! --idoubling_crust_mantle, &
-          R_memory_crust_mantle,epsilondev_crust_mantle, &
-          eps_trace_over_3_crust_mantle,one_minus_sum_beta_crust_mantle, &
-          alphaval,betaval,gammaval,factor_common_crust_mantle, &
-          size(factor_common_crust_mantle,2), size(factor_common_crust_mantle,3), &
-          size(factor_common_crust_mantle,4), size(factor_common_crust_mantle,5) )
-      endif
-
-      ! Deville routine
-      if( USE_DEVILLE_PRODUCTS_VAL ) then
-        call compute_forces_inner_core_Dev(minus_gravity_table,density_table,minus_deriv_gravity_table, &
-          displ_inner_core,accel_inner_core, &
-          xstore_inner_core,ystore_inner_core,zstore_inner_core, &
-          xix_inner_core,xiy_inner_core,xiz_inner_core, &
-          etax_inner_core,etay_inner_core,etaz_inner_core, &
-          gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
-!----------------------
-            is_on_a_slice_edge_inner_core,icall, &
-            accel_crust_mantle,ibool_inner_core,idoubling_inner_core, &
-            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-            buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector,iphase, &
-            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
-            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,iphase_CC, &
-!----------------------
-          hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
-          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-          kappavstore_inner_core,muvstore_inner_core,ibool_inner_core,idoubling_inner_core, &
-          c11store_inner_core,c33store_inner_core,c12store_inner_core, &
-          c13store_inner_core,c44store_inner_core, &
-          R_memory_inner_core,epsilondev_inner_core, eps_trace_over_3_inner_core,&
-          one_minus_sum_beta_inner_core, &
-          alphaval,betaval,gammaval, &
-          factor_common_inner_core, &
-          size(factor_common_inner_core,2), size(factor_common_inner_core,3), &
-          size(factor_common_inner_core,4), size(factor_common_inner_core,5) )
-      else
-        call compute_forces_inner_core(minus_gravity_table,density_table,minus_deriv_gravity_table, &
-          displ_inner_core,accel_inner_core, &
-          xstore_inner_core,ystore_inner_core,zstore_inner_core, &
-          xix_inner_core,xiy_inner_core,xiz_inner_core, &
-          etax_inner_core,etay_inner_core,etaz_inner_core, &
-          gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
-!----------------------
-            is_on_a_slice_edge_inner_core,icall, &
-            accel_crust_mantle,ibool_inner_core,idoubling_inner_core, &
-            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-            buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector,iphase, &
-            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
-            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,iphase_CC, &
-!----------------------
-          hprime_xx,hprime_yy,hprime_zz,hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
-          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-          kappavstore_inner_core,muvstore_inner_core,ibool_inner_core,idoubling_inner_core, &
-          c11store_inner_core,c33store_inner_core,c12store_inner_core, &
-          c13store_inner_core,c44store_inner_core, &
-          R_memory_inner_core,epsilondev_inner_core, eps_trace_over_3_inner_core,&
-          one_minus_sum_beta_inner_core, &
-          alphaval,betaval,gammaval, &
-          factor_common_inner_core, &
-          size(factor_common_inner_core,2), size(factor_common_inner_core,3), &
-          size(factor_common_inner_core,4), size(factor_common_inner_core,5) )
-      endif
-
-! assemble all the contributions between slices using MPI
-! crust/mantle and inner core handled in the same call
-! in order to reduce the number of MPI messages by 2
-      do while (iphase <= 7) ! make sure the last communications are finished and processed
-        call assemble_MPI_vector(myrank,accel_crust_mantle,accel_inner_core, &
-            iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces,npoin2D_max_all_CM_IC, &
-            buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector, &
-            NUMMSGS_FACES,NCORNERSCHUNKS, &
-            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL(IREGION_CRUST_MANTLE), &
-            NGLOB1D_RADIAL(IREGION_INNER_CORE),NCHUNKS_VAL,iphase)
-      enddo
-    else
-      ! crust/mantle and inner core handled in the same call
-      ! in order to reduce the number of MPI messages by 2
-      call assemble_MPI_vector_block(myrank, &
-            accel_crust_mantle,NGLOB_CRUST_MANTLE, &
-            accel_inner_core,NGLOB_INNER_CORE, &
-            iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle, &
-            iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core, &
-            iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces,imsg_type, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            buffer_send_faces,buffer_received_faces, &
-            buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector, &
-            NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
-            NPROC_XI_VAL,NPROC_ETA_VAL, &
-            NGLOB1D_RADIAL(IREGION_CRUST_MANTLE), &
-            NGLOB2DMAX_XMIN_XMAX(IREGION_CRUST_MANTLE),NGLOB2DMAX_YMIN_YMAX(IREGION_CRUST_MANTLE), &
-            NGLOB1D_RADIAL(IREGION_INNER_CORE), &
-            NGLOB2DMAX_XMIN_XMAX(IREGION_INNER_CORE),NGLOB2DMAX_YMIN_YMAX(IREGION_INNER_CORE), &
-            NGLOB2DMAX_XY,NCHUNKS_VAL)
-    endif
-
-    !---
-    !---  use buffers to assemble forces with the central cube
-    !---
-
-    if(INCLUDE_CENTRAL_CUBE) then
-      if(USE_NONBLOCKING_COMMS) then
-        do while (iphase_CC <= 4) ! make sure the last communications are finished and processed
-          call assemble_MPI_central_cube(ichunk,nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-            npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,ibool_central_cube, &
-            receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core, &
-            ibelm_bottom_inner_core,NSPEC2D_BOTTOM(IREGION_INNER_CORE),accel_inner_core,NDIM,iphase_CC)
-        enddo
-      else
-        call assemble_MPI_central_cube_block(ichunk,nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-          npoin2D_cube_from_slices,buffer_all_cube_from_slices,buffer_slices,buffer_slices2,ibool_central_cube, &
-          receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core,NSPEC_INNER_CORE, &
-          ibelm_bottom_inner_core,NSPEC2D_BOTTOM(IREGION_INNER_CORE),NGLOB_INNER_CORE,accel_inner_core,NDIM)
-      endif
-    endif   ! end of assembling forces with the central cube
-
-#ifdef _HANDOPT
-! way 2:
-    if(imodulo_NGLOB_CRUST_MANTLE4 >= 1) then
-      do i=1,imodulo_NGLOB_CRUST_MANTLE4
-        accel_crust_mantle(1,i) = accel_crust_mantle(1,i)*rmass_crust_mantle(i) &
-               + two_omega_earth*veloc_crust_mantle(2,i)
-        accel_crust_mantle(2,i) = accel_crust_mantle(2,i)*rmass_crust_mantle(i) &
-               - two_omega_earth*veloc_crust_mantle(1,i)
-        accel_crust_mantle(3,i) = accel_crust_mantle(3,i)*rmass_crust_mantle(i)
-      enddo
-    endif
-    do i=imodulo_NGLOB_CRUST_MANTLE4+1,NGLOB_CRUST_MANTLE,4
-      accel_crust_mantle(1,i) = accel_crust_mantle(1,i)*rmass_crust_mantle(i) &
-               + two_omega_earth*veloc_crust_mantle(2,i)
-      accel_crust_mantle(2,i) = accel_crust_mantle(2,i)*rmass_crust_mantle(i) &
-               - two_omega_earth*veloc_crust_mantle(1,i)
-      accel_crust_mantle(3,i) = accel_crust_mantle(3,i)*rmass_crust_mantle(i)
-
-      accel_crust_mantle(1,i+1) = accel_crust_mantle(1,i+1)*rmass_crust_mantle(i+1) &
-               + two_omega_earth*veloc_crust_mantle(2,i+1)
-      accel_crust_mantle(2,i+1) = accel_crust_mantle(2,i+1)*rmass_crust_mantle(i+1) &
-               - two_omega_earth*veloc_crust_mantle(1,i+1)
-      accel_crust_mantle(3,i+1) = accel_crust_mantle(3,i+1)*rmass_crust_mantle(i+1)
-
-      accel_crust_mantle(1,i+2) = accel_crust_mantle(1,i+2)*rmass_crust_mantle(i+2) &
-               + two_omega_earth*veloc_crust_mantle(2,i+2)
-      accel_crust_mantle(2,i+2) = accel_crust_mantle(2,i+2)*rmass_crust_mantle(i+2) &
-               - two_omega_earth*veloc_crust_mantle(1,i+2)
-      accel_crust_mantle(3,i+2) = accel_crust_mantle(3,i+2)*rmass_crust_mantle(i+2)
-
-      accel_crust_mantle(1,i+3) = accel_crust_mantle(1,i+3)*rmass_crust_mantle(i+3) &
-               + two_omega_earth*veloc_crust_mantle(2,i+3)
-      accel_crust_mantle(2,i+3) = accel_crust_mantle(2,i+3)*rmass_crust_mantle(i+3) &
-               - two_omega_earth*veloc_crust_mantle(1,i+3)
-      accel_crust_mantle(3,i+3) = accel_crust_mantle(3,i+3)*rmass_crust_mantle(i+3)
-    enddo
-#else
-! way 1:
-    do i=1,NGLOB_CRUST_MANTLE
-      accel_crust_mantle(1,i) = accel_crust_mantle(1,i)*rmass_crust_mantle(i) &
-               + two_omega_earth*veloc_crust_mantle(2,i)
-      accel_crust_mantle(2,i) = accel_crust_mantle(2,i)*rmass_crust_mantle(i) &
-               - two_omega_earth*veloc_crust_mantle(1,i)
-      accel_crust_mantle(3,i) = accel_crust_mantle(3,i)*rmass_crust_mantle(i)
-    enddo
-#endif
-
-    if (SIMULATION_TYPE == 3) then
-
-! ------------------- new non blocking implementation -------------------
-
-      ! assemble all the contributions between slices using MPI
-
-! assemble all the contributions between slices using MPI
-! crust/mantle and inner core handled in the same call
-! in order to reduce the number of MPI messages by 2
-      if(USE_NONBLOCKING_COMMS) then
-
-        b_iphase = 1 ! initialize the non blocking communication counter
-        b_iphase_CC = 1 ! initialize the non blocking communication counter for the central cube
-
-! start the non blocking communications
-        call assemble_MPI_vector(myrank,b_accel_crust_mantle,b_accel_inner_core, &
-            iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            b_buffer_send_faces,b_buffer_received_faces,npoin2D_max_all_CM_IC, &
-            b_buffer_send_chunkcorn_vector,b_buffer_recv_chunkcorn_vector, &
-            NUMMSGS_FACES,NCORNERSCHUNKS, &
-            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL(IREGION_CRUST_MANTLE), &
-            NGLOB1D_RADIAL(IREGION_INNER_CORE),NCHUNKS_VAL,b_iphase)
-
-        b_icall = 2 ! now compute all the inner elements in the case of non blocking MPI
-
-        ! compute internal forces in the solid regions
-
-        ! for anisotropy and gravity, x y and z contain r theta and phi
-
-        if( USE_DEVILLE_PRODUCTS_VAL ) then
-          call compute_forces_crust_mantle_Dev(minus_gravity_table,density_table,minus_deriv_gravity_table, &
-          b_displ_crust_mantle,b_accel_crust_mantle, &
-          xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
-          xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-          etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
-          gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
-!----------------------
-            is_on_a_slice_edge_crust_mantle,b_icall, &
-            b_accel_inner_core,ibool_inner_core,idoubling_inner_core, &
-            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            b_buffer_send_faces,b_buffer_received_faces,npoin2D_max_all_CM_IC, &
-            b_buffer_send_chunkcorn_vector,b_buffer_recv_chunkcorn_vector,b_iphase, &
-            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-            npoin2D_cube_from_slices,b_buffer_all_cube_from_slices,b_buffer_slices,ibool_central_cube, &
-            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,b_iphase_CC, &
-!----------------------
-          hprime_xx,hprime_xxT, &
-          hprimewgll_xx,hprimewgll_xxT, &
-          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-          kappavstore_crust_mantle,kappahstore_crust_mantle,muvstore_crust_mantle, &
-          muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-          c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle, &
-          c14store_crust_mantle,c15store_crust_mantle,c16store_crust_mantle, &
-          c22store_crust_mantle,c23store_crust_mantle,c24store_crust_mantle, &
-          c25store_crust_mantle,c26store_crust_mantle,c33store_crust_mantle, &
-          c34store_crust_mantle,c35store_crust_mantle,c36store_crust_mantle, &
-          c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-          c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-          ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
-      ! --idoubling_crust_mantle, &
-          b_R_memory_crust_mantle,b_epsilondev_crust_mantle, &
-          b_eps_trace_over_3_crust_mantle,one_minus_sum_beta_crust_mantle, &
-          b_alphaval,b_betaval,b_gammaval,factor_common_crust_mantle, &
-          size(factor_common_crust_mantle,2), size(factor_common_crust_mantle,3), &
-          size(factor_common_crust_mantle,4), size(factor_common_crust_mantle,5) )
-        else
-          call compute_forces_crust_mantle(minus_gravity_table,density_table,minus_deriv_gravity_table, &
-          b_displ_crust_mantle,b_accel_crust_mantle, &
-          xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
-          xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-          etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
-          gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
-!----------------------
-            is_on_a_slice_edge_crust_mantle,b_icall, &
-            b_accel_inner_core,ibool_inner_core,idoubling_inner_core, &
-            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            b_buffer_send_faces,b_buffer_received_faces,npoin2D_max_all_CM_IC, &
-            b_buffer_send_chunkcorn_vector,b_buffer_recv_chunkcorn_vector,b_iphase, &
-            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-            npoin2D_cube_from_slices,b_buffer_all_cube_from_slices,b_buffer_slices,ibool_central_cube, &
-            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,b_iphase_CC, &
-!----------------------
-          hprime_xx,hprime_yy,hprime_zz, &
-          hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
-          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-          kappavstore_crust_mantle,kappahstore_crust_mantle,muvstore_crust_mantle, &
-          muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-          c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle, &
-          c14store_crust_mantle,c15store_crust_mantle,c16store_crust_mantle, &
-          c22store_crust_mantle,c23store_crust_mantle,c24store_crust_mantle, &
-          c25store_crust_mantle,c26store_crust_mantle,c33store_crust_mantle, &
-          c34store_crust_mantle,c35store_crust_mantle,c36store_crust_mantle, &
-          c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-          c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-          ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
-      !--idoubling_crust_mantle, &
-          b_R_memory_crust_mantle,b_epsilondev_crust_mantle, &
-          b_eps_trace_over_3_crust_mantle,one_minus_sum_beta_crust_mantle, &
-          b_alphaval,b_betaval,b_gammaval,factor_common_crust_mantle, &
-          size(factor_common_crust_mantle,2), size(factor_common_crust_mantle,3), &
-          size(factor_common_crust_mantle,4), size(factor_common_crust_mantle,5) )
-        endif
-
-        ! Deville routine
-        if( USE_DEVILLE_PRODUCTS_VAL ) then
-          call compute_forces_inner_core_Dev(minus_gravity_table,density_table,minus_deriv_gravity_table, &
-          b_displ_inner_core,b_accel_inner_core, &
-          xstore_inner_core,ystore_inner_core,zstore_inner_core, &
-          xix_inner_core,xiy_inner_core,xiz_inner_core, &
-          etax_inner_core,etay_inner_core,etaz_inner_core, &
-          gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
-!----------------------
-            is_on_a_slice_edge_inner_core,b_icall, &
-            b_accel_crust_mantle,ibool_inner_core,idoubling_inner_core, &
-            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            b_buffer_send_faces,b_buffer_received_faces,npoin2D_max_all_CM_IC, &
-            b_buffer_send_chunkcorn_vector,b_buffer_recv_chunkcorn_vector,b_iphase, &
-            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-            npoin2D_cube_from_slices,b_buffer_all_cube_from_slices,b_buffer_slices,ibool_central_cube, &
-            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,b_iphase_CC, &
-!----------------------
-          hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
-          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-          kappavstore_inner_core,muvstore_inner_core,ibool_inner_core,idoubling_inner_core, &
-          c11store_inner_core,c33store_inner_core,c12store_inner_core, &
-          c13store_inner_core,c44store_inner_core, &
-          b_R_memory_inner_core,b_epsilondev_inner_core, b_eps_trace_over_3_inner_core,&
-          one_minus_sum_beta_inner_core, &
-          b_alphaval,b_betaval,b_gammaval, &
-          factor_common_inner_core, &
-          size(factor_common_inner_core,2), size(factor_common_inner_core,3), &
-          size(factor_common_inner_core,4), size(factor_common_inner_core,5) )
-        else
-          call compute_forces_inner_core(minus_gravity_table,density_table,minus_deriv_gravity_table, &
-          b_displ_inner_core,b_accel_inner_core, &
-          xstore_inner_core,ystore_inner_core,zstore_inner_core, &
-          xix_inner_core,xiy_inner_core,xiz_inner_core, &
-          etax_inner_core,etay_inner_core,etaz_inner_core, &
-          gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
-!----------------------
-            is_on_a_slice_edge_inner_core,b_icall, &
-            b_accel_crust_mantle,ibool_inner_core,idoubling_inner_core, &
-            myrank,iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            b_buffer_send_faces,b_buffer_received_faces,npoin2D_max_all_CM_IC, &
-            b_buffer_send_chunkcorn_vector,b_buffer_recv_chunkcorn_vector,b_iphase, &
-            nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-            npoin2D_cube_from_slices,b_buffer_all_cube_from_slices,b_buffer_slices,ibool_central_cube, &
-            receiver_cube_from_slices,ibelm_bottom_inner_core,NSPEC2D_BOTTOM_IC,INCLUDE_CENTRAL_CUBE,b_iphase_CC, &
-!----------------------
-          hprime_xx,hprime_yy,hprime_zz,hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
-          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube, &
-          kappavstore_inner_core,muvstore_inner_core,ibool_inner_core,idoubling_inner_core, &
-          c11store_inner_core,c33store_inner_core,c12store_inner_core, &
-          c13store_inner_core,c44store_inner_core, &
-          b_R_memory_inner_core,b_epsilondev_inner_core, b_eps_trace_over_3_inner_core,&
-          one_minus_sum_beta_inner_core, &
-          b_alphaval,b_betaval,b_gammaval, &
-          factor_common_inner_core, &
-          size(factor_common_inner_core,2), size(factor_common_inner_core,3), &
-          size(factor_common_inner_core,4), size(factor_common_inner_core,5) )
-        endif
-
-! assemble all the contributions between slices using MPI
-! crust/mantle and inner core handled in the same call
-! in order to reduce the number of MPI messages by 2
-        do while (b_iphase <= 7) ! make sure the last communications are finished and processed
-          call assemble_MPI_vector(myrank,b_accel_crust_mantle,b_accel_inner_core, &
-            iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            b_buffer_send_faces,b_buffer_received_faces,npoin2D_max_all_CM_IC, &
-            b_buffer_send_chunkcorn_vector,b_buffer_recv_chunkcorn_vector, &
-            NUMMSGS_FACES,NCORNERSCHUNKS, &
-            NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL(IREGION_CRUST_MANTLE), &
-            NGLOB1D_RADIAL(IREGION_INNER_CORE),NCHUNKS_VAL,b_iphase)
-        enddo
-      else
-        ! crust/mantle and inner core handled in the same call
-        ! in order to reduce the number of MPI messages by 2
-        call assemble_MPI_vector_block(myrank, &
-            b_accel_crust_mantle,NGLOB_CRUST_MANTLE, &
-            b_accel_inner_core,NGLOB_INNER_CORE, &
-            iproc_xi,iproc_eta,ichunk,addressing, &
-            iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle, &
-            iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
-            npoin2D_faces_crust_mantle,npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
-            iboolfaces_crust_mantle,iboolcorner_crust_mantle, &
-            iboolleft_xi_inner_core,iboolright_xi_inner_core, &
-            iboolleft_eta_inner_core,iboolright_eta_inner_core, &
-            npoin2D_faces_inner_core,npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
-            iboolfaces_inner_core,iboolcorner_inner_core, &
-            iprocfrom_faces,iprocto_faces,imsg_type, &
-            iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners, &
-            b_buffer_send_faces,b_buffer_received_faces, &
-            b_buffer_send_chunkcorn_vector,b_buffer_recv_chunkcorn_vector, &
-            NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
-            NPROC_XI_VAL,NPROC_ETA_VAL, &
-            NGLOB1D_RADIAL(IREGION_CRUST_MANTLE), &
-            NGLOB2DMAX_XMIN_XMAX(IREGION_CRUST_MANTLE),NGLOB2DMAX_YMIN_YMAX(IREGION_CRUST_MANTLE), &
-            NGLOB1D_RADIAL(IREGION_INNER_CORE), &
-            NGLOB2DMAX_XMIN_XMAX(IREGION_INNER_CORE),NGLOB2DMAX_YMIN_YMAX(IREGION_INNER_CORE), &
-            NGLOB2DMAX_XY,NCHUNKS_VAL)
-      endif
-
-      !---
-      !---  use buffers to assemble forces with the central cube
-      !---
-
-      if(INCLUDE_CENTRAL_CUBE) then
-        if(USE_NONBLOCKING_COMMS) then
-          do while (b_iphase_CC <= 4) ! make sure the last communications are finished and processed
-            call assemble_MPI_central_cube(ichunk,nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-              npoin2D_cube_from_slices,b_buffer_all_cube_from_slices,b_buffer_slices,ibool_central_cube, &
-              receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core, &
-              ibelm_bottom_inner_core,NSPEC2D_BOTTOM(IREGION_INNER_CORE),b_accel_inner_core,NDIM,b_iphase_CC)
-          enddo
-        else
-          call assemble_MPI_central_cube_block(ichunk,nb_msgs_theor_in_cube,sender_from_slices_to_cube, &
-            npoin2D_cube_from_slices,b_buffer_all_cube_from_slices,b_buffer_slices,buffer_slices2,ibool_central_cube, &
-            receiver_cube_from_slices,ibool_inner_core,idoubling_inner_core,NSPEC_INNER_CORE, &
-            ibelm_bottom_inner_core,NSPEC2D_BOTTOM(IREGION_INNER_CORE),NGLOB_INNER_CORE,b_accel_inner_core,NDIM)
-        endif
-      endif   ! end of assembling forces with the central cube
-
-! ------------------- new non blocking implementation -------------------
-
-#ifdef _HANDOPT
-! way 2:
-      if( imodulo_NGLOB_CRUST_MANTLE4 >=1 ) then
-        do i=1,imodulo_NGLOB_CRUST_MANTLE4
-          b_accel_crust_mantle(1,i) = b_accel_crust_mantle(1,i)*rmass_crust_mantle(i) &
-                 + b_two_omega_earth*b_veloc_crust_mantle(2,i)
-          b_accel_crust_mantle(2,i) = b_accel_crust_mantle(2,i)*rmass_crust_mantle(i) &
-                 - b_two_omega_earth*b_veloc_crust_mantle(1,i)
-          b_accel_crust_mantle(3,i) = b_accel_crust_mantle(3,i)*rmass_crust_mantle(i)
-        enddo
-      endif
-      do i=imodulo_NGLOB_CRUST_MANTLE4+1,NGLOB_CRUST_MANTLE,4
-        b_accel_crust_mantle(1,i) = b_accel_crust_mantle(1,i)*rmass_crust_mantle(i) &
-                 + b_two_omega_earth*b_veloc_crust_mantle(2,i)
-        b_accel_crust_mantle(2,i) = b_accel_crust_mantle(2,i)*rmass_crust_mantle(i) &
-                 - b_two_omega_earth*b_veloc_crust_mantle(1,i)
-        b_accel_crust_mantle(3,i) = b_accel_crust_mantle(3,i)*rmass_crust_mantle(i)
-
-        b_accel_crust_mantle(1,i+1) = b_accel_crust_mantle(1,i+1)*rmass_crust_mantle(i+1) &
-                 + b_two_omega_earth*b_veloc_crust_mantle(2,i+1)
-        b_accel_crust_mantle(2,i+1) = b_accel_crust_mantle(2,i+1)*rmass_crust_mantle(i+1) &
-                 - b_two_omega_earth*b_veloc_crust_mantle(1,i+1)
-        b_accel_crust_mantle(3,i+1) = b_accel_crust_mantle(3,i+1)*rmass_crust_mantle(i+1)
-
-        b_accel_crust_mantle(1,i+2) = b_accel_crust_mantle(1,i+2)*rmass_crust_mantle(i+2) &
-                 + b_two_omega_earth*b_veloc_crust_mantle(2,i+2)
-        b_accel_crust_mantle(2,i+2) = b_accel_crust_mantle(2,i+2)*rmass_crust_mantle(i+2) &
-                 - b_two_omega_earth*b_veloc_crust_mantle(1,i+2)
-        b_accel_crust_mantle(3,i+2) = b_accel_crust_mantle(3,i+2)*rmass_crust_mantle(i+2)
-
-        b_accel_crust_mantle(1,i+3) = b_accel_crust_mantle(1,i+3)*rmass_crust_mantle(i+3) &
-                 + b_two_omega_earth*b_veloc_crust_mantle(2,i+3)
-        b_accel_crust_mantle(2,i+3) = b_accel_crust_mantle(2,i+3)*rmass_crust_mantle(i+3) &
-                 - b_two_omega_earth*b_veloc_crust_mantle(1,i+3)
-        b_accel_crust_mantle(3,i+3) = b_accel_crust_mantle(3,i+3)*rmass_crust_mantle(i+3)
-      enddo
-#else
-! way 1:
-      do i=1,NGLOB_CRUST_MANTLE
-        b_accel_crust_mantle(1,i) = b_accel_crust_mantle(1,i)*rmass_crust_mantle(i) &
-                 + b_two_omega_earth*b_veloc_crust_mantle(2,i)
-        b_accel_crust_mantle(2,i) = b_accel_crust_mantle(2,i)*rmass_crust_mantle(i) &
-                 - b_two_omega_earth*b_veloc_crust_mantle(1,i)
-        b_accel_crust_mantle(3,i) = b_accel_crust_mantle(3,i)*rmass_crust_mantle(i)
-      enddo
-#endif
-
-    endif ! SIMULATION_TYPE == 3
-
-    ! couples ocean with crust mantle
-    if(OCEANS_VAL) &
-      call compute_coupling_ocean(accel_crust_mantle,b_accel_crust_mantle, &
-                            rmass_crust_mantle,rmass_ocean_load,normal_top_crust_mantle, &
-                            ibool_crust_mantle,ibelm_top_crust_mantle, &
-                            updated_dof_ocean_load, &
-                            SIMULATION_TYPE,NSPEC2D_TOP(IREGION_CRUST_MANTLE))
-
-
-!
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!
-    ! Newmark time scheme - corrector for elastic parts
-#ifdef _HANDOPT
-! way 2:
-    ! mantle
-    if( imodulo_NGLOB_CRUST_MANTLE4 >= 1 ) then
-      do i=1,imodulo_NGLOB_CRUST_MANTLE4
-        veloc_crust_mantle(:,i) = veloc_crust_mantle(:,i) + deltatover2*accel_crust_mantle(:,i)
-      enddo
-    endif
-    do i=imodulo_NGLOB_CRUST_MANTLE4+1,NGLOB_CRUST_MANTLE,4
-      veloc_crust_mantle(:,i) = veloc_crust_mantle(:,i) + deltatover2*accel_crust_mantle(:,i)
-      veloc_crust_mantle(:,i+1) = veloc_crust_mantle(:,i+1) + deltatover2*accel_crust_mantle(:,i+1)
-      veloc_crust_mantle(:,i+2) = veloc_crust_mantle(:,i+2) + deltatover2*accel_crust_mantle(:,i+2)
-      veloc_crust_mantle(:,i+3) = veloc_crust_mantle(:,i+3) + deltatover2*accel_crust_mantle(:,i+3)
-    enddo
-
-    ! inner core
-    if(imodulo_NGLOB_INNER_CORE >= 1) then
-      do i=1,imodulo_NGLOB_INNER_CORE
-        accel_inner_core(1,i) = accel_inner_core(1,i)*rmass_inner_core(i) &
-               + two_omega_earth*veloc_inner_core(2,i)
-        accel_inner_core(2,i) = accel_inner_core(2,i)*rmass_inner_core(i) &
-               - two_omega_earth*veloc_inner_core(1,i)
-        accel_inner_core(3,i) = accel_inner_core(3,i)*rmass_inner_core(i)
-
-        veloc_inner_core(:,i) = veloc_inner_core(:,i) + deltatover2*accel_inner_core(:,i)
-      enddo
-    endif
-    do i=imodulo_NGLOB_INNER_CORE+1,NGLOB_INNER_CORE,3
-      accel_inner_core(1,i) = accel_inner_core(1,i)*rmass_inner_core(i) &
-             + two_omega_earth*veloc_inner_core(2,i)
-      accel_inner_core(2,i) = accel_inner_core(2,i)*rmass_inner_core(i) &
-             - two_omega_earth*veloc_inner_core(1,i)
-      accel_inner_core(3,i) = accel_inner_core(3,i)*rmass_inner_core(i)
-
-      accel_inner_core(1,i+1) = accel_inner_core(1,i+1)*rmass_inner_core(i+1) &
-             + two_omega_earth*veloc_inner_core(2,i+1)
-      accel_inner_core(2,i+1) = accel_inner_core(2,i+1)*rmass_inner_core(i+1) &
-             - two_omega_earth*veloc_inner_core(1,i+1)
-      accel_inner_core(3,i+1) = accel_inner_core(3,i+1)*rmass_inner_core(i+1)
-
-      accel_inner_core(1,i+2) = accel_inner_core(1,i+2)*rmass_inner_core(i+2) &
-             + two_omega_earth*veloc_inner_core(2,i+2)
-      accel_inner_core(2,i+2) = accel_inner_core(2,i+2)*rmass_inner_core(i+2) &
-             - two_omega_earth*veloc_inner_core(1,i+2)
-      accel_inner_core(3,i+2) = accel_inner_core(3,i+2)*rmass_inner_core(i+2)
-
-      veloc_inner_core(:,i) = veloc_inner_core(:,i) + deltatover2*accel_inner_core(:,i)
-      veloc_inner_core(:,i+1) = veloc_inner_core(:,i+1) + deltatover2*accel_inner_core(:,i+1)
-      veloc_inner_core(:,i+2) = veloc_inner_core(:,i+2) + deltatover2*accel_inner_core(:,i+2)
-    enddo
-#else
-! way 1:
-    ! mantle
-    do i=1,NGLOB_CRUST_MANTLE
-      veloc_crust_mantle(:,i) = veloc_crust_mantle(:,i) + deltatover2*accel_crust_mantle(:,i)
-    enddo
-    ! inner core
-    do i=1,NGLOB_INNER_CORE
-      accel_inner_core(1,i) = accel_inner_core(1,i)*rmass_inner_core(i) &
-             + two_omega_earth*veloc_inner_core(2,i)
-      accel_inner_core(2,i) = accel_inner_core(2,i)*rmass_inner_core(i) &
-             - two_omega_earth*veloc_inner_core(1,i)
-      accel_inner_core(3,i) = accel_inner_core(3,i)*rmass_inner_core(i)
-
-      veloc_inner_core(:,i) = veloc_inner_core(:,i) + deltatover2*accel_inner_core(:,i)
-    enddo
-#endif
-
-    if (SIMULATION_TYPE == 3) then
-#ifdef _HANDOPT
-! way 2:
-      ! mantle
-      if( imodulo_NGLOB_CRUST_MANTLE4 >= 1 ) then
-        do i=1,imodulo_NGLOB_CRUST_MANTLE4
-          b_veloc_crust_mantle(:,i) = b_veloc_crust_mantle(:,i) + b_deltatover2*b_accel_crust_mantle(:,i)
-        enddo
-      endif
-      do i=imodulo_NGLOB_CRUST_MANTLE4+1,NGLOB_CRUST_MANTLE,4
-        b_veloc_crust_mantle(:,i) = b_veloc_crust_mantle(:,i) + b_deltatover2*b_accel_crust_mantle(:,i)
-        b_veloc_crust_mantle(:,i+1) = b_veloc_crust_mantle(:,i+1) + b_deltatover2*b_accel_crust_mantle(:,i+1)
-        b_veloc_crust_mantle(:,i+2) = b_veloc_crust_mantle(:,i+2) + b_deltatover2*b_accel_crust_mantle(:,i+2)
-        b_veloc_crust_mantle(:,i+3) = b_veloc_crust_mantle(:,i+3) + b_deltatover2*b_accel_crust_mantle(:,i+3)
-      enddo
-      ! inner core
-      if(imodulo_NGLOB_INNER_CORE >= 1) then
-        do i=1,imodulo_NGLOB_INNER_CORE
-          b_accel_inner_core(1,i) = b_accel_inner_core(1,i)*rmass_inner_core(i) &
-            + b_two_omega_earth*b_veloc_inner_core(2,i)
-          b_accel_inner_core(2,i) = b_accel_inner_core(2,i)*rmass_inner_core(i) &
-            - b_two_omega_earth*b_veloc_inner_core(1,i)
-          b_accel_inner_core(3,i) = b_accel_inner_core(3,i)*rmass_inner_core(i)
-
-          b_veloc_inner_core(:,i) = b_veloc_inner_core(:,i) + b_deltatover2*b_accel_inner_core(:,i)
-        enddo
-      endif
-      do i=imodulo_NGLOB_INNER_CORE+1,NGLOB_INNER_CORE,3
-        b_accel_inner_core(1,i) = b_accel_inner_core(1,i)*rmass_inner_core(i) &
-         + b_two_omega_earth*b_veloc_inner_core(2,i)
-        b_accel_inner_core(2,i) = b_accel_inner_core(2,i)*rmass_inner_core(i) &
-         - b_two_omega_earth*b_veloc_inner_core(1,i)
-        b_accel_inner_core(3,i) = b_accel_inner_core(3,i)*rmass_inner_core(i)
-
-        b_accel_inner_core(1,i+1) = b_accel_inner_core(1,i+1)*rmass_inner_core(i+1) &
-         + b_two_omega_earth*b_veloc_inner_core(2,i+1)
-        b_accel_inner_core(2,i+1) = b_accel_inner_core(2,i+1)*rmass_inner_core(i+1) &
-         - b_two_omega_earth*b_veloc_inner_core(1,i+1)
-        b_accel_inner_core(3,i+1) = b_accel_inner_core(3,i+1)*rmass_inner_core(i+1)
-
-        b_accel_inner_core(1,i+2) = b_accel_inner_core(1,i+2)*rmass_inner_core(i+2) &
-         + b_two_omega_earth*b_veloc_inner_core(2,i+2)
-        b_accel_inner_core(2,i+2) = b_accel_inner_core(2,i+2)*rmass_inner_core(i+2) &
-         - b_two_omega_earth*b_veloc_inner_core(1,i+2)
-        b_accel_inner_core(3,i+2) = b_accel_inner_core(3,i+2)*rmass_inner_core(i+2)
-
-        b_veloc_inner_core(:,i) = b_veloc_inner_core(:,i) + b_deltatover2*b_accel_inner_core(:,i)
-        b_veloc_inner_core(:,i+1) = b_veloc_inner_core(:,i+1) + b_deltatover2*b_accel_inner_core(:,i+1)
-        b_veloc_inner_core(:,i+2) = b_veloc_inner_core(:,i+2) + b_deltatover2*b_accel_inner_core(:,i+2)
-
-      enddo
-#else
-! way 1:
-      ! mantle
-      do i=1,NGLOB_CRUST_MANTLE
-        b_veloc_crust_mantle(:,i) = b_veloc_crust_mantle(:,i) + b_deltatover2*b_accel_crust_mantle(:,i)
-      enddo
-      ! inner core
-      do i=1,NGLOB_INNER_CORE
-        b_accel_inner_core(1,i) = b_accel_inner_core(1,i)*rmass_inner_core(i) &
-         + b_two_omega_earth*b_veloc_inner_core(2,i)
-        b_accel_inner_core(2,i) = b_accel_inner_core(2,i)*rmass_inner_core(i) &
-         - b_two_omega_earth*b_veloc_inner_core(1,i)
-        b_accel_inner_core(3,i) = b_accel_inner_core(3,i)*rmass_inner_core(i)
-
-        b_veloc_inner_core(:,i) = b_veloc_inner_core(:,i) + b_deltatover2*b_accel_inner_core(:,i)
-      enddo
-#endif
-
-    endif ! SIMULATION_TYPE == 3
-
-
-    ! restores last time snapshot saved for backward/reconstruction of wavefields
-    ! note: this is done here after the Newmark time scheme, otherwise the indexing for sources
-    !          and adjoint sources will become more complicated
-    !          that is, index it for adjoint sources will match index NSTEP - 1 for backward/reconstructed wavefields
-    if( SIMULATION_TYPE == 3 .and. it == 1 ) then
-      call read_forward_arrays(myrank, &
-                    b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle, &
-                    b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core, &
-                    b_displ_outer_core,b_veloc_outer_core,b_accel_outer_core, &
-                    b_R_memory_crust_mantle,b_R_memory_inner_core, &
-                    b_epsilondev_crust_mantle,b_epsilondev_inner_core, &
-                    b_A_array_rotation,b_B_array_rotation,LOCAL_PATH)
-    endif
-
-! write the seismograms with time shift
-
-! store the seismograms only if there is at least one receiver located in this slice
-  if (nrec_local > 0) then
-    if (SIMULATION_TYPE == 1) then
-      call compute_seismograms(nrec_local,nrec,displ_crust_mantle, &
-                                nu,hxir_store,hetar_store,hgammar_store, &
-                                scale_displ,ibool_crust_mantle, &
-                                ispec_selected_rec,number_receiver_global, &
-                                seismo_current,NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
-                                seismograms)
-
-    else if (SIMULATION_TYPE == 2) then
-      call compute_seismograms_adjoint(NSOURCES,nrec_local,displ_crust_mantle, &
-                    eps_trace_over_3_crust_mantle,epsilondev_crust_mantle, &
-                    nu_source,Mxx,Myy,Mzz,Mxy,Mxz,Myz, &
-                    hxir_store,hetar_store,hgammar_store, &
-                    hpxir_store,hpetar_store,hpgammar_store, &
-                    tshift_cmt,hdur_gaussian,DT,t0,scale_displ, &
-                    hprime_xx,hprime_yy,hprime_zz, &
-                    xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-                    etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
-                    gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
-                    moment_der,sloc_der,stshift_der,shdur_der, &
-                    NTSTEP_BETWEEN_OUTPUT_SEISMOS,seismograms,deltat, &
-                    ibool_crust_mantle,ispec_selected_source,number_receiver_global, &
-                    NSTEP,it,nit_written)
-
-    else if (SIMULATION_TYPE == 3) then
-      call compute_seismograms_backward(nrec_local,nrec,b_displ_crust_mantle, &
-                                nu,hxir_store,hetar_store,hgammar_store, &
-                                scale_displ,ibool_crust_mantle, &
-                                ispec_selected_rec,number_receiver_global, &
-                                seismo_current,NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
-                                seismograms)
-
-    endif
-  endif ! nrec_local
-
-  ! write the current or final seismograms
-  if(seismo_current == NTSTEP_BETWEEN_OUTPUT_SEISMOS .or. it == it_end) then
-    if (SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then
-      call write_seismograms(myrank,seismograms,number_receiver_global,station_name, &
-            network_name,stlat,stlon,stele,stbur, &
-            nrec,nrec_local,ANGULAR_WIDTH_XI_IN_DEGREES,NEX_XI,DT,t0,it_end, &
-            yr_SAC,jda_SAC,ho_SAC,mi_SAC,sec_SAC,t_cmt_SAC,t_shift_SAC, &
-            elat_SAC,elon_SAC,depth_SAC,event_name_SAC,cmt_lat_SAC,cmt_lon_SAC,&
-            cmt_depth_SAC,cmt_hdur_SAC,NPROCTOT_VAL, &
-            OUTPUT_SEISMOS_ASCII_TEXT,OUTPUT_SEISMOS_SAC_ALPHANUM, &
-            OUTPUT_SEISMOS_SAC_BINARY,ROTATE_SEISMOGRAMS_RT,NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
-            seismo_offset,seismo_current,WRITE_SEISMOGRAMS_BY_MASTER, &
-            SAVE_ALL_SEISMOS_IN_ONE_FILE,USE_BINARY_FOR_LARGE_FILE,MODEL)
-      if(myrank==0) then
-        write(IMAIN,*)
-        write(IMAIN,*) ' Total number of time steps written: ', it-it_begin+1
-        write(IMAIN,*)
-      endif
-    else
-      if( nrec_local > 0 ) &
-        call write_adj_seismograms(seismograms,number_receiver_global, &
-                                  nrec_local,it,nit_written,DT, &
-                                  NSTEP,NTSTEP_BETWEEN_OUTPUT_SEISMOS,t0,LOCAL_PATH)
-        nit_written = it
-    endif
-    seismo_offset = seismo_offset + seismo_current
-    seismo_current = 0
-  endif
-
-!
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!
-
-! kernel calculations
-  if (SIMULATION_TYPE == 3) then
-    ! crust mantle
-    call compute_kernels_crust_mantle(ibool_crust_mantle, &
-                          rho_kl_crust_mantle,beta_kl_crust_mantle, &
-                          alpha_kl_crust_mantle,cijkl_kl_crust_mantle, &
-                          accel_crust_mantle,b_displ_crust_mantle, &
-                          epsilondev_crust_mantle,b_epsilondev_crust_mantle, &
-                          eps_trace_over_3_crust_mantle,b_eps_trace_over_3_crust_mantle, &
-                          deltat)
-
-    ! outer core
-    call compute_kernels_outer_core(ibool_outer_core, &
-                        xix_outer_core,xiy_outer_core,xiz_outer_core, &
-                        etax_outer_core,etay_outer_core,etaz_outer_core, &
-                        gammax_outer_core,gammay_outer_core,gammaz_outer_core, &
-                        hprime_xx,hprime_yy,hprime_zz, &
-                        displ_outer_core,accel_outer_core, &
-                        b_displ_outer_core,b_accel_outer_core, &
-                        vector_accel_outer_core,vector_displ_outer_core, &
-                        b_vector_displ_outer_core, &
-                        div_displ_outer_core,b_div_displ_outer_core, &
-                        rhostore_outer_core,kappavstore_outer_core, &
-                        rho_kl_outer_core,alpha_kl_outer_core, &
-                        deviatoric_outercore,nspec_beta_kl_outer_core,beta_kl_outer_core, &
-                        deltat)
-
-    ! inner core
-    call compute_kernels_inner_core(ibool_inner_core, &
-                          rho_kl_inner_core,beta_kl_inner_core, &
-                          alpha_kl_inner_core, &
-                          accel_inner_core,b_displ_inner_core, &
-                          epsilondev_inner_core,b_epsilondev_inner_core, &
-                          eps_trace_over_3_inner_core,b_eps_trace_over_3_inner_core, &
-                          deltat)
-
-!<YANGL
-    ! NOISE TOMOGRAPHY --- source strength kernel
-    if (NOISE_TOMOGRAPHY == 3)  &
-       call compute_kernels_strength_noise(nmovie_points,ibool_crust_mantle, &
-                          Sigma_kl_crust_mantle,displ_crust_mantle,deltat,it, &
-                          normal_x_noise,normal_y_noise,normal_z_noise, &
-                          NSPEC2D_TOP(IREGION_CRUST_MANTLE),noise_surface_movie, &
-                          ibelm_top_crust_mantle)
-!>YANGL
-
-    ! --- boundary kernels ------
-    if (SAVE_BOUNDARY_MESH) then
-      fluid_solid_boundary = .false.
-      iregion_code = IREGION_CRUST_MANTLE
-
-      ! Moho
-      if (.not. SUPPRESS_CRUSTAL_MESH .and. HONOR_1D_SPHERICAL_MOHO) then
-        call compute_boundary_kernel(displ_crust_mantle,accel_crust_mantle, &
-                 b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
-                 ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
-            ! -- idoubling_crust_mantle, &
-                 xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-                 etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
-                 gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle,hprime_xx,hprime_yy,hprime_zz, &
-                 rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
-                 kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-                 c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
-                 c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
-                 c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
-                 c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
-                 c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-                 c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-                 k_top,ibelm_moho_top,normal_moho,moho_kl_top,fluid_solid_boundary,NSPEC2D_MOHO)
-
-        call compute_boundary_kernel(displ_crust_mantle,accel_crust_mantle, &
-                 b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
-                 ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
-              ! --idoubling_crust_mantle, &
-                 xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-                 etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
-                 gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle,hprime_xx,hprime_yy,hprime_zz, &
-                 rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
-                 kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-                 c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
-                 c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
-                 c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
-                 c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
-                 c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-                 c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-                 k_bot,ibelm_moho_bot,normal_moho,moho_kl_bot,fluid_solid_boundary,NSPEC2D_MOHO)
-
-        moho_kl = moho_kl + (moho_kl_top - moho_kl_bot) * deltat
-      endif
-
-      ! 400
-      call compute_boundary_kernel(displ_crust_mantle,accel_crust_mantle, &
-                 b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
-                 ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
-            ! --idoubling_crust_mantle, &
-                 xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-                 etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
-                 gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle,hprime_xx,hprime_yy,hprime_zz, &
-                 rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
-                 kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-                 c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
-                 c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
-                 c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
-                 c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
-                 c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-                 c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-                 k_top,ibelm_400_top,normal_400,d400_kl_top,fluid_solid_boundary,NSPEC2D_400)
-
-      call compute_boundary_kernel(displ_crust_mantle,accel_crust_mantle, &
-                 b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
-                 ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
-              ! --idoubling_crust_mantle, &
-                 xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-                 etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
-                 gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle,hprime_xx,hprime_yy,hprime_zz, &
-                 rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
-                 kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-                 c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
-                 c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
-                 c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
-                 c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
-                 c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-                 c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-                 k_bot,ibelm_400_bot,normal_400,d400_kl_bot,fluid_solid_boundary,NSPEC2D_400)
-
-      d400_kl = d400_kl + (d400_kl_top - d400_kl_bot) * deltat
-
-      ! 670
-      call compute_boundary_kernel(displ_crust_mantle,accel_crust_mantle, &
-                 b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
-                 ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
-              ! --idoubling_crust_mantle, &
-                 xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-                 etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
-                 gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle,hprime_xx,hprime_yy,hprime_zz, &
-                 rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
-                 kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-                 c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
-                 c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
-                 c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
-                 c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
-                 c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-                 c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-                 k_top,ibelm_670_top,normal_670,d670_kl_top,fluid_solid_boundary,NSPEC2D_670)
-
-      call compute_boundary_kernel(displ_crust_mantle,accel_crust_mantle, &
-                 b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
-                 ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
-              ! --idoubling_crust_mantle, &
-                 xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-                 etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
-                 gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle,hprime_xx,hprime_yy,hprime_zz, &
-                 rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle, &
-                 kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-                 c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
-                 c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
-                 c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
-                 c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
-                 c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-                 c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-                 k_bot,ibelm_670_bot,normal_670,d670_kl_bot,fluid_solid_boundary,NSPEC2D_670)
-
-      d670_kl = d670_kl + (d670_kl_top - d670_kl_bot) * deltat
-
-      ! CMB
-      fluid_solid_boundary = .true.
-      iregion_code = IREGION_CRUST_MANTLE
-      call compute_boundary_kernel(displ_crust_mantle,accel_crust_mantle, &
-                 b_displ_crust_mantle,nspec_crust_mantle,iregion_code, &
-                 ystore_crust_mantle,zstore_crust_mantle,ibool_crust_mantle,ispec_is_tiso_crust_mantle, &
-              ! -- idoubling_crust_mantle, &
-                 xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
-                 etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle,&
-                 gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle,hprime_xx,hprime_yy,hprime_zz, &
-                 rhostore_crust_mantle,kappavstore_crust_mantle, muvstore_crust_mantle, &
-                 kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle, &
-                 c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle,c14store_crust_mantle, &
-                 c15store_crust_mantle,c16store_crust_mantle,c22store_crust_mantle, &
-                 c23store_crust_mantle,c24store_crust_mantle,c25store_crust_mantle,c26store_crust_mantle, &
-                 c33store_crust_mantle,c34store_crust_mantle,c35store_crust_mantle, &
-                 c36store_crust_mantle,c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
-                 c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
-                 k_top,ibelm_bottom_crust_mantle,normal_top_outer_core, &
-                 cmb_kl_top,fluid_solid_boundary,NSPEC2D_CMB)
-
-      iregion_code = IREGION_OUTER_CORE
-      call compute_boundary_kernel(vector_displ_outer_core,vector_accel_outer_core, &
-                 b_vector_displ_outer_core,nspec_outer_core, &
-                 iregion_code,ystore_outer_core,zstore_outer_core,ibool_outer_core,ispec_is_tiso_outer_core, &
-              ! --idoubling_outer_core, &
-                 xix_outer_core,xiy_outer_core,xiz_outer_core, &
-                 etax_outer_core,etay_outer_core,etaz_outer_core,&
-                 gammax_outer_core,gammay_outer_core,gammaz_outer_core,hprime_xx,hprime_yy,hprime_zz, &
-                 rhostore_outer_core,kappavstore_outer_core,dummy_array, &
-                 dummy_array,dummy_array,dummy_array, &
-                 dummy_array,dummy_array,dummy_array,dummy_array, &
-                 dummy_array,dummy_array,dummy_array, &
-                 dummy_array,dummy_array,dummy_array,dummy_array, &
-                 dummy_array,dummy_array,dummy_array, &
-                 dummy_array,dummy_array,dummy_array,dummy_array, &
-                 dummy_array,dummy_array,dummy_array, &
-                 k_bot,ibelm_top_outer_core,normal_top_outer_core, &
-                 cmb_kl_bot,fluid_solid_boundary,NSPEC2D_CMB)
-
-      cmb_kl = cmb_kl + (cmb_kl_top - cmb_kl_bot) * deltat
-
-      ! ICB
-      fluid_solid_boundary = .true.
-      call compute_boundary_kernel(vector_displ_outer_core,vector_accel_outer_core, &
-                 b_vector_displ_outer_core,nspec_outer_core, &
-                 iregion_code,ystore_outer_core,zstore_outer_core,ibool_outer_core,ispec_is_tiso_outer_core, &
-              ! --idoubling_outer_core, &
-                 xix_outer_core,xiy_outer_core,xiz_outer_core, &
-                 etax_outer_core,etay_outer_core,etaz_outer_core,&
-                 gammax_outer_core,gammay_outer_core,gammaz_outer_core,hprime_xx,hprime_yy,hprime_zz, &
-                 rhostore_outer_core,kappavstore_outer_core,dummy_array, &
-                 dummy_array,dummy_array,dummy_array, &
-                 dummy_array,dummy_array,dummy_array,dummy_array, &
-                 dummy_array,dummy_array,dummy_array, &
-                 dummy_array,dummy_array,dummy_array,dummy_array, &
-                 dummy_array,dummy_array,dummy_array, &
-                 dummy_array,dummy_array,dummy_array,dummy_array, &
-                 dummy_array,dummy_array,dummy_array, &
-                 k_top,ibelm_bottom_outer_core,normal_bottom_outer_core, &
-                 icb_kl_top,fluid_solid_boundary,NSPEC2D_ICB)
-
-      iregion_code = IREGION_INNER_CORE
-      call compute_boundary_kernel(displ_inner_core,accel_inner_core, &
-                 b_displ_inner_core,nspec_inner_core,iregion_code, &
-                 ystore_inner_core,zstore_inner_core,ibool_inner_core,ispec_is_tiso_inner_core, &
-              ! -- idoubling_inner_core, &
-                 xix_inner_core,xiy_inner_core,xiz_inner_core, &
-                 etax_inner_core,etay_inner_core,etaz_inner_core,&
-                 gammax_inner_core,gammay_inner_core,gammaz_inner_core,hprime_xx,hprime_yy,hprime_zz, &
-                 rhostore_inner_core,kappavstore_inner_core,muvstore_inner_core, &
-                 dummy_array,dummy_array,dummy_array, &
-                 c11store_inner_core,c12store_inner_core,c13store_inner_core,dummy_array, &
-                 dummy_array,dummy_array,dummy_array, &
-                 dummy_array,dummy_array,dummy_array,dummy_array, &
-                 c33store_inner_core,dummy_array,dummy_array, &
-                 dummy_array,c44store_inner_core,dummy_array,dummy_array, &
-                 dummy_array,dummy_array,dummy_array, &
-                 k_bot,ibelm_top_inner_core,normal_bottom_outer_core, &
-                 icb_kl_bot,fluid_solid_boundary,NSPEC2D_ICB)
-
-      icb_kl = icb_kl + (icb_kl_top - icb_kl_bot) * deltat
-    endif
-
-    ! approximate hessian
-    if( APPROXIMATE_HESS_KL ) then
-      call compute_kernels_hessian(ibool_crust_mantle, &
-                          hess_kl_crust_mantle,&
-                          accel_crust_mantle,b_accel_crust_mantle, &
-                          deltat)
-    endif
-
-  endif ! end computing kernels
-
-!
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!
-
-!<YANGL
-  ! first step of noise tomography, i.e., save a surface movie at every time step
-  ! modified from the subroutine 'write_movie_surface'
-  if ( NOISE_TOMOGRAPHY == 1 ) then
-        call noise_save_surface_movie(displ_crust_mantle, &
-                            ibelm_top_crust_mantle,ibool_crust_mantle, &
-                            NSPEC2D_TOP(IREGION_CRUST_MANTLE),noise_surface_movie,it)
-  endif
-!>YANGL
-
-  ! save movie on surface
-  if( MOVIE_SURFACE ) then
-    if( mod(it,NTSTEP_BETWEEN_FRAMES) == 0) then
-      ! save velocity here to avoid static offset on displacement for movies
-      call write_movie_surface(myrank,nmovie_points,scale_veloc,veloc_crust_mantle, &
-                    scale_displ,displ_crust_mantle, &
-                    xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
-                    store_val_x,store_val_y,store_val_z, &
-                    store_val_x_all,store_val_y_all,store_val_z_all, &
-                    store_val_ux,store_val_uy,store_val_uz, &
-                    store_val_ux_all,store_val_uy_all,store_val_uz_all, &
-                    ibelm_top_crust_mantle,ibool_crust_mantle, &
-                    NSPEC2D_TOP(IREGION_CRUST_MANTLE), &
-                    NIT,it,OUTPUT_FILES,MOVIE_VOLUME_TYPE)
-    endif
-  endif
-
-
-  ! save movie in full 3D mesh
-  if(MOVIE_VOLUME ) then
-    if( mod(it-MOVIE_START,NTSTEP_BETWEEN_FRAMES) == 0  &
-      .and. it >= MOVIE_START .and. it <= MOVIE_STOP) then
-
-      if (MOVIE_VOLUME_TYPE == 1) then  ! output strains
-
-        call  write_movie_volume_strains(myrank,npoints_3dmovie, &
-                    LOCAL_PATH,MOVIE_VOLUME_TYPE,MOVIE_COARSE, &
-                    it,eps_trace_over_3_crust_mantle,epsilondev_crust_mantle, &
-                    muvstore_crust_mantle_3dmovie, &
-                    mask_3dmovie,nu_3dmovie)
-
-      else if (MOVIE_VOLUME_TYPE == 2 .or. MOVIE_VOLUME_TYPE == 3) then
-        ! output the Time Integral of Strain, or \mu*TIS
-        call  write_movie_volume_strains(myrank,npoints_3dmovie, &
-                    LOCAL_PATH,MOVIE_VOLUME_TYPE,MOVIE_COARSE, &
-                    it,Ieps_trace_over_3_crust_mantle,Iepsilondev_crust_mantle, &
-                    muvstore_crust_mantle_3dmovie, &
-                    mask_3dmovie,nu_3dmovie)
-
-      else if (MOVIE_VOLUME_TYPE == 4) then ! output divergence and curl in whole volume
-
-        call write_movie_volume_divcurl(myrank,it,eps_trace_over_3_crust_mantle,&
-                        div_displ_outer_core, &
-                        accel_outer_core,kappavstore_outer_core,rhostore_outer_core,ibool_outer_core, &
-                        eps_trace_over_3_inner_core, &
-                        epsilondev_crust_mantle,epsilondev_inner_core, &
-                        LOCAL_PATH, &
-                        displ_crust_mantle,displ_inner_core,displ_outer_core, &
-                        veloc_crust_mantle,veloc_inner_core,veloc_outer_core, &
-                        accel_crust_mantle,accel_inner_core, &
-                        ibool_crust_mantle,ibool_inner_core)
-
-      else if (MOVIE_VOLUME_TYPE == 5) then !output displacement
-        scalingval = scale_displ
-        call write_movie_volume_vector(myrank,it,npoints_3dmovie, &
-                    LOCAL_PATH,MOVIE_VOLUME_TYPE, &
-                    MOVIE_COARSE,ibool_crust_mantle,displ_crust_mantle, &
-                    scalingval,mask_3dmovie,nu_3dmovie)
-
-      else if (MOVIE_VOLUME_TYPE == 6) then !output velocity
-        scalingval = scale_veloc
-        call write_movie_volume_vector(myrank,it,npoints_3dmovie, &
-                    LOCAL_PATH,MOVIE_VOLUME_TYPE, &
-                    MOVIE_COARSE,ibool_crust_mantle,veloc_crust_mantle, &
-                    scalingval,mask_3dmovie,nu_3dmovie)
-
-      else
-
-        call exit_MPI(myrank, 'MOVIE_VOLUME_TYPE has to be 1,2,3,4,5 or 6')
-
-      endif ! MOVIE_VOLUME_TYPE
-    endif
-  endif ! MOVIE_VOLUME
-
-!daniel: debugging
-!  if( SNAPSHOT_INNER_CORE .and. mod(it-MOVIE_START,NTSTEP_BETWEEN_FRAMES) == 0  &
-!      .and. it >= MOVIE_START .and. it <= MOVIE_STOP) then
-!    ! VTK file output
-!    ! displacement values
-!    !write(prname,'(a,i6.6,a)') trim(LOCAL_PATH)//'/'//'proc',myrank,'_'
-!    !write(filename,'(a,a,i6.6)') prname(1:len_trim(prname)),'reg_3_displ_',it
-!    !call write_VTK_data_cr(idoubling_inner_core,NSPEC_INNER_CORE,NGLOB_INNER_CORE, &
-!    !                    xstore_inner_core,ystore_inner_core,zstore_inner_core,ibool_inner_core, &
-!    !                    displ_inner_core,filename)
-!
-!    write(prname,'(a)') 'OUTPUT_FILES/snapshot_all_'
-!    write(filename,'(a,a,i6.6)') prname(1:len_trim(prname)),'reg_3_displ_',it
-!    call write_VTK_data_cr_all(myrank,idoubling_inner_core, &
-!                        NSPEC_INNER_CORE,NGLOB_INNER_CORE, &
-!                        xstore_inner_core,ystore_inner_core,zstore_inner_core,ibool_inner_core, &
-!                        displ_inner_core,filename)
-!
-!  endif
-
-
-!---- end of time iteration loop
-!
-  enddo   ! end of main time loop
-!
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!-------------------------------------------------------------------------------------------------
-!
-
-  ! synchronize all processes, waits until all processes have written their seismograms
-  call MPI_BARRIER(MPI_COMM_WORLD,ier)
-  if( ier /= 0 ) call exit_mpi(myrank,'error synchronize after time loop')
-
-  ! closes Stacey absorbing boundary snapshots
-  if( ABSORBING_CONDITIONS ) then
-    ! crust mantle
-    if (nspec2D_xmin_crust_mantle > 0 .and. (SIMULATION_TYPE == 3 &
-      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
-      call close_file_abs(0)
-    endif
-
-    if (nspec2D_xmax_crust_mantle > 0 .and. (SIMULATION_TYPE == 3 &
-      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
-      call close_file_abs(1)
-    endif
-
-    if (nspec2D_ymin_crust_mantle > 0 .and. (SIMULATION_TYPE == 3 &
-      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
-      call close_file_abs(2)
-    endif
-
-    if (nspec2D_ymax_crust_mantle > 0 .and. (SIMULATION_TYPE == 3 &
-      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
-      call close_file_abs(3)
-    endif
-
-    ! outer core
-    if (nspec2D_xmin_outer_core > 0 .and. (SIMULATION_TYPE == 3 &
-      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
-      call close_file_abs(4)
-    endif
-
-    if (nspec2D_xmax_outer_core > 0 .and. (SIMULATION_TYPE == 3 &
-      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
-      call close_file_abs(5)
-    endif
-
-    if (nspec2D_ymin_outer_core > 0 .and. (SIMULATION_TYPE == 3 &
-      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
-      call close_file_abs(6)
-    endif
-
-    if (nspec2D_ymax_outer_core > 0 .and. (SIMULATION_TYPE == 3 &
-      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
-      call close_file_abs(7)
-    endif
-
-    if (NSPEC2D_BOTTOM(IREGION_OUTER_CORE) > 0 .and. (SIMULATION_TYPE == 3 &
-      .or. (SIMULATION_TYPE == 1 .and. SAVE_FORWARD))) then
-      call close_file_abs(8)
-    endif
-
-    ! frees memory
-    deallocate(absorb_xmin_crust_mantle5, &
-              absorb_xmax_crust_mantle5, &
-              absorb_ymin_crust_mantle5, &
-              absorb_ymax_crust_mantle5, &
-              absorb_xmin_outer_core, &
-              absorb_xmax_outer_core, &
-              absorb_ymin_outer_core, &
-              absorb_ymax_outer_core, &
-              absorb_zmin_outer_core)    
-  endif
-
-  ! save/read the surface movie using the same c routine as we do for absorbing boundaries (file ID is 9)
-  if (NOISE_TOMOGRAPHY/=0) then
-    call close_file_abs(9)
-  endif
-
-
-  ! synchronize all processes
-  call MPI_BARRIER(MPI_COMM_WORLD,ier)
-  if( ier /= 0 ) call exit_mpi(myrank,'error synchronize closing snapshots')
-
-  ! save files to local disk or tape system if restart file
-  call save_forward_arrays(myrank,SIMULATION_TYPE,SAVE_FORWARD, &
-                    NUMBER_OF_RUNS,NUMBER_OF_THIS_RUN, &
-                    displ_crust_mantle,veloc_crust_mantle,accel_crust_mantle, &
-                    displ_inner_core,veloc_inner_core,accel_inner_core, &
-                    displ_outer_core,veloc_outer_core,accel_outer_core, &
-                    R_memory_crust_mantle,R_memory_inner_core, &
-                    epsilondev_crust_mantle,epsilondev_inner_core, &
-                    A_array_rotation,B_array_rotation, &
-                    LOCAL_PATH)
-
-  ! synchronize all processes
-  call MPI_BARRIER(MPI_COMM_WORLD,ier)
-  if( ier /= 0 ) call exit_mpi(myrank,'error synchronize saving forward')
-
-  ! dump kernel arrays
-  if (SIMULATION_TYPE == 3) then
-    ! crust mantle
-    call save_kernels_crust_mantle(myrank,scale_t,scale_displ, &
-                  cijkl_kl_crust_mantle,rho_kl_crust_mantle, &
-                  alpha_kl_crust_mantle,beta_kl_crust_mantle, &
-                  ystore_crust_mantle,zstore_crust_mantle, &
-                  rhostore_crust_mantle,muvstore_crust_mantle, &
-                  kappavstore_crust_mantle,ibool_crust_mantle, &
-                  kappahstore_crust_mantle,muhstore_crust_mantle, &
-                  eta_anisostore_crust_mantle,ispec_is_tiso_crust_mantle, &
-              ! --idoubling_crust_mantle, &
-                  LOCAL_PATH)
-
-!<YANGL
-    ! noise strength kernel
-    if (NOISE_TOMOGRAPHY == 3) then
-       call save_kernels_strength_noise(myrank,LOCAL_PATH,Sigma_kl_crust_mantle)
-    endif
-!>YANGL
-
-    ! outer core
-    call save_kernels_outer_core(myrank,scale_t,scale_displ, &
-                        rho_kl_outer_core,alpha_kl_outer_core, &
-                        rhostore_outer_core,kappavstore_outer_core, &
-                        deviatoric_outercore,nspec_beta_kl_outer_core,beta_kl_outer_core, &
-                        LOCAL_PATH)
-
-    ! inner core
-    call save_kernels_inner_core(myrank,scale_t,scale_displ, &
-                          rho_kl_inner_core,beta_kl_inner_core,alpha_kl_inner_core, &
-                          rhostore_inner_core,muvstore_inner_core,kappavstore_inner_core, &
-                          LOCAL_PATH)
-
-    ! boundary kernel
-    if (SAVE_BOUNDARY_MESH) then
-      call save_kernels_boundary_kl(myrank,scale_t,scale_displ, &
-                                  moho_kl,d400_kl,d670_kl,cmb_kl,icb_kl, &
-                                  LOCAL_PATH,HONOR_1D_SPHERICAL_MOHO)
-    endif
-
-    ! approximate hessian
-    if( APPROXIMATE_HESS_KL ) then
-      call save_kernels_hessian(myrank,scale_t,scale_displ, &
-                                            hess_kl_crust_mantle,LOCAL_PATH)
-    endif
-  endif
-
-  ! save source derivatives for adjoint simulations
-  if (SIMULATION_TYPE == 2 .and. nrec_local > 0) then
-    call save_kernels_source_derivatives(nrec_local,NSOURCES,scale_displ,scale_t, &
-                                nu_source,moment_der,sloc_der,stshift_der,shdur_der,number_receiver_global)
-  endif
-
-  ! frees dynamically allocated memory
-  ! mpi buffers
-  deallocate(buffer_send_faces, &
-            buffer_received_faces, &
-            b_buffer_send_faces, &
-            b_buffer_received_faces)
-
-  ! central cube buffers
-  deallocate(sender_from_slices_to_cube, &
-            buffer_all_cube_from_slices, &
-            b_buffer_all_cube_from_slices, &
-            buffer_slices, &
-            b_buffer_slices, &
-            buffer_slices2, &
-            ibool_central_cube)
-
-  ! sources
-  deallocate(islice_selected_source, &
-          ispec_selected_source, &
-          Mxx, &
-          Myy, &
-          Mzz, &
-          Mxy, &
-          Mxz, &
-          Myz, &
-          xi_source, &
-          eta_source, &
-          gamma_source, &
-          tshift_cmt, &
-          hdur, &
-          hdur_gaussian, &
-          theta_source, &
-          phi_source, &
-          nu_source)
-  if (SIMULATION_TYPE == 1  .or. SIMULATION_TYPE == 3) deallocate(sourcearrays)
-  if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then
-    deallocate(iadj_vec)
-    if(nadj_rec_local > 0) then
-      deallocate(adj_sourcearrays)
-      deallocate(iadjsrc,iadjsrc_len)
-    endif
-  endif
-  
-  ! receivers
-  deallocate(islice_selected_rec, &
-          ispec_selected_rec, &
-          xi_receiver, &
-          eta_receiver, &
-          gamma_receiver, &
-          station_name, &
-          network_name, &
-          stlat, &
-          stlon, &
-          stele, &
-          stbur, &
-          nu, &
-          number_receiver_global)
-  if( nrec_local > 0 ) then
-    deallocate(hxir_store, &
-              hetar_store, &
-              hgammar_store)
-    if( SIMULATION_TYPE == 2 ) then
-      deallocate(moment_der,stshift_der)
-    endif
-  endif
-  deallocate(seismograms)
-
-  if (SIMULATION_TYPE == 3) then
-    if( APPROXIMATE_HESS_KL ) then
-      deallocate(hess_kl_crust_mantle)
-    endif
-    deallocate(beta_kl_outer_core)
-  endif
-  
-  ! movies
-  if(MOVIE_SURFACE .or. NOISE_TOMOGRAPHY /= 0 ) then
-    deallocate(store_val_x, &
-              store_val_y, &
-              store_val_z, &
-              store_val_ux, &
-              store_val_uy, &
-              store_val_uz)
-    if (MOVIE_SURFACE) then
-      deallocate(store_val_x_all, &
-            store_val_y_all, &
-            store_val_z_all, &
-            store_val_ux_all, &
-            store_val_uy_all, &
-            store_val_uz_all)
-    endif
-  endif
-  if(MOVIE_VOLUME) then
-    deallocate(nu_3dmovie)
-  endif
-  
-  ! noise simulations
-  if ( NOISE_TOMOGRAPHY /= 0 ) then
-    deallocate(noise_sourcearray, &
-            normal_x_noise, &
-            normal_y_noise, &
-            normal_z_noise, &
-            mask_noise, &
-            noise_surface_movie)
-  endif
-
-  ! close the main output file
-  if(myrank == 0) then
-    write(IMAIN,*)
-    write(IMAIN,*) 'End of the simulation'
-    write(IMAIN,*)
-    close(IMAIN)
-  endif
-
-  ! synchronize all the processes to make sure everybody has finished
-  call MPI_BARRIER(MPI_COMM_WORLD,ier)
-  if( ier /= 0 ) call exit_mpi(myrank,'error synchronize finishing simulation')
-
   ! stop all the MPI processes, and exit
   call MPI_FINALIZE(ier)
 

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D_par.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D_par.F90	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D_par.F90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,827 @@
+!=====================================================================
+!
+!          S p e c f e m 3 D  G l o b e  V e r s i o n  5 . 1
+!          --------------------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!                        Princeton University, USA
+!             and University of Pau / CNRS / INRIA, France
+! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+!                            April 2011
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+!
+! United States and French Government Sponsorship Acknowledged.
+
+module constants
+
+  include "constants.h"
+
+  ! include values created by the mesher
+  ! done for performance only using static allocation to allow for loop unrolling
+  include "OUTPUT_FILES/values_from_mesher.h"
+
+end module constants
+
+!=====================================================================
+
+module specfem_par
+
+! main parameter module for specfem simulations
+
+  use constants
+
+  implicit none
+
+  !-----------------------------------------------------------------
+  ! attenuation parameters
+  !-----------------------------------------------------------------
+
+  ! memory variables and standard linear solids for attenuation
+  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval, betaval, gammaval
+  ! ADJOINT
+  real(kind=CUSTOM_REAL), dimension(N_SLS) :: b_alphaval, b_betaval, b_gammaval
+
+  !-----------------------------------------------------------------
+  ! topography/bathymetry & oceans
+  !-----------------------------------------------------------------
+
+  ! use integer array to store values
+  integer, dimension(NX_BATHY,NY_BATHY) :: ibathy_topo
+
+  ! additional mass matrix for ocean load
+  real(kind=CUSTOM_REAL), dimension(NGLOB_CRUST_MANTLE_OCEANS) :: rmass_ocean_load
+
+  ! flag to mask ocean-bottom degrees of freedom for ocean load
+  logical, dimension(NGLOB_CRUST_MANTLE_OCEANS) :: updated_dof_ocean_load
+
+  !-----------------------------------------------------------------
+  ! ellipticity
+  !-----------------------------------------------------------------
+
+  ! for ellipticity
+  integer :: nspl
+  double precision :: rspl(NR),espl(NR),espl2(NR)
+
+  !-----------------------------------------------------------------
+  ! rotation
+  !-----------------------------------------------------------------
+
+  ! non-dimensionalized rotation rate of the Earth times two
+  real(kind=CUSTOM_REAL) :: two_omega_earth
+
+  ! for the Euler scheme for rotation
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_ROTATION) :: &
+    A_array_rotation,B_array_rotation
+
+  !ADJOINT
+  real(kind=CUSTOM_REAL) b_two_omega_earth
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_ROT_ADJOINT) :: &
+    b_A_array_rotation,b_B_array_rotation
+
+  !-----------------------------------------------------------------
+  ! gravity
+  !-----------------------------------------------------------------
+
+  ! lookup table every km for gravity
+  real(kind=CUSTOM_REAL) :: minus_g_cmb,minus_g_icb
+  double precision, dimension(NRAD_GRAVITY) :: minus_gravity_table, &
+    minus_deriv_gravity_table,density_table,d_ln_density_dr_table,minus_rho_g_over_kappa_fluid
+
+  !-----------------------------------------------------------------
+  ! time scheme
+  !-----------------------------------------------------------------
+
+  integer :: it
+
+  ! Newmark time scheme parameters and non-dimensionalization
+  double precision :: scale_t,scale_t_inv,scale_displ,scale_veloc
+  real(kind=CUSTOM_REAL) :: deltat,deltatover2,deltatsqover2
+  ! ADJOINT
+  real(kind=CUSTOM_REAL) :: b_deltat,b_deltatover2,b_deltatsqover2
+
+#ifdef _HANDOPT
+  integer :: imodulo_NGLOB_CRUST_MANTLE,imodulo_NGLOB_CRUST_MANTLE4, &
+            imodulo_NGLOB_INNER_CORE,imodulo_NGLOB_OUTER_CORE
+#endif
+
+  !-----------------------------------------------------------------
+  ! sources
+  !-----------------------------------------------------------------
+
+  ! parameters for the source
+  integer :: NSOURCES,nsources_local
+  integer, dimension(:), allocatable :: islice_selected_source,ispec_selected_source
+  real(kind=CUSTOM_REAL), dimension(:,:,:,:,:), allocatable :: sourcearrays
+  double precision, dimension(:,:,:) ,allocatable:: nu_source
+
+  double precision, dimension(:), allocatable :: Mxx,Myy,Mzz,Mxy,Mxz,Myz
+  double precision, dimension(:), allocatable :: xi_source,eta_source,gamma_source
+  double precision, dimension(:), allocatable :: tshift_cmt,hdur,hdur_gaussian
+  double precision, dimension(:), allocatable :: theta_source,phi_source
+  double precision, external :: comp_source_time_function
+  double precision :: t0
+
+  !-----------------------------------------------------------------
+  ! receivers
+  !-----------------------------------------------------------------
+
+  ! receiver information
+  integer :: nrec,nrec_local
+  integer, dimension(:), allocatable :: islice_selected_rec,ispec_selected_rec
+  integer, dimension(:), allocatable :: number_receiver_global
+  double precision, dimension(:), allocatable :: xi_receiver,eta_receiver,gamma_receiver
+  double precision, dimension(:,:,:), allocatable :: nu
+  double precision, allocatable, dimension(:) :: stlat,stlon,stele,stbur
+  character(len=MAX_LENGTH_STATION_NAME), dimension(:), allocatable  :: station_name
+  character(len=MAX_LENGTH_NETWORK_NAME), dimension(:), allocatable :: network_name
+  character(len=150) :: STATIONS,rec_filename
+
+  ! Lagrange interpolators at receivers
+  double precision, dimension(:,:), allocatable :: hxir_store,hetar_store,hgammar_store
+
+  !ADJOINT
+  real(kind=CUSTOM_REAL), dimension(:,:,:,:,:,:), allocatable :: adj_sourcearrays
+  integer :: nrec_simulation, nadj_rec_local
+  integer :: NSTEP_SUB_ADJ  ! to read input in chunks
+  integer, dimension(:,:), allocatable :: iadjsrc ! to read input in chunks
+  integer, dimension(:), allocatable :: iadjsrc_len,iadj_vec
+  ! source Frechet derivatives
+  real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: moment_der
+  real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: sloc_der
+  real(kind=CUSTOM_REAL), dimension(:), allocatable :: stshift_der, shdur_der
+  double precision, dimension(:,:), allocatable :: hpxir_store,hpetar_store,hpgammar_store
+  integer :: nadj_hprec_local
+
+  !-----------------------------------------------------------------
+  ! seismograms
+  !-----------------------------------------------------------------
+
+  ! seismograms
+  integer :: it_begin,it_end,nit_written
+  real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: seismograms
+  integer :: seismo_offset, seismo_current
+
+  ! for SAC headers for seismograms
+  integer :: yr_SAC,jda_SAC,ho_SAC,mi_SAC
+  real :: mb_SAC
+  double precision :: t_cmt_SAC,t_shift_SAC,elat_SAC,elon_SAC,depth_SAC, &
+    cmt_lat_SAC,cmt_lon_SAC,cmt_depth_SAC,cmt_hdur_SAC,sec_SAC
+  character(len=20) :: event_name_SAC
+
+
+  !-----------------------------------------------------------------
+  ! GLL points & weights
+  !-----------------------------------------------------------------
+
+  ! Gauss-Lobatto-Legendre points of integration and weights
+  double precision, dimension(NGLLX) :: xigll,wxgll
+  double precision, dimension(NGLLY) :: yigll,wygll
+  double precision, dimension(NGLLZ) :: zigll,wzgll
+
+  ! product of weights for gravity term
+  double precision, dimension(NGLLX,NGLLY,NGLLZ) :: wgll_cube
+
+  ! array with derivatives of Lagrange polynomials and precalculated products
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xx,hprimewgll_xx
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xxT,hprimewgll_xxT
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLY) :: hprime_yy,hprimewgll_yy
+  real(kind=CUSTOM_REAL), dimension(NGLLZ,NGLLZ) :: hprime_zz,hprimewgll_zz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
+
+
+  !-----------------------------------------------------------------
+  ! file parameters
+  !-----------------------------------------------------------------
+
+  ! parameters read from parameter file
+  integer MIN_ATTENUATION_PERIOD,MAX_ATTENUATION_PERIOD,NER_CRUST, &
+          NER_80_MOHO,NER_220_80,NER_400_220,NER_600_400,NER_670_600,NER_771_670, &
+          NER_TOPDDOUBLEPRIME_771,NER_CMB_TOPDDOUBLEPRIME,NER_OUTER_CORE, &
+          NER_TOP_CENTRAL_CUBE_ICB,NEX_XI,NEX_ETA, &
+          NTSTEP_BETWEEN_OUTPUT_SEISMOS,&
+          NTSTEP_BETWEEN_READ_ADJSRC,NSTEP,NTSTEP_BETWEEN_FRAMES, &
+          NTSTEP_BETWEEN_OUTPUT_INFO,NUMBER_OF_RUNS,NUMBER_OF_THIS_RUN,SIMULATION_TYPE, &
+          MOVIE_VOLUME_TYPE,MOVIE_START,MOVIE_STOP,NOISE_TOMOGRAPHY
+
+  double precision DT,ROCEAN,RMIDDLE_CRUST, &
+          RMOHO,R80,R220,R400,R600,R670,R771,RTOPDDOUBLEPRIME,RCMB,RICB, &
+          RHO_TOP_OC,RHO_BOTTOM_OC,RHO_OCEANS,HDUR_MOVIE, &
+          MOVIE_TOP,MOVIE_BOTTOM,MOVIE_WEST,MOVIE_EAST,MOVIE_NORTH,MOVIE_SOUTH, &
+          ANGULAR_WIDTH_XI_IN_DEGREES
+
+  logical ONE_CRUST,TOPOGRAPHY,MOVIE_SURFACE,MOVIE_VOLUME,MOVIE_COARSE, &
+          RECEIVERS_CAN_BE_BURIED,PRINT_SOURCE_TIME_FUNCTION, &
+          SAVE_MESH_FILES,ABSORBING_CONDITIONS,INCLUDE_CENTRAL_CUBE,SAVE_FORWARD, &
+          OUTPUT_SEISMOS_ASCII_TEXT,OUTPUT_SEISMOS_SAC_ALPHANUM,OUTPUT_SEISMOS_SAC_BINARY, &
+          ROTATE_SEISMOGRAMS_RT,HONOR_1D_SPHERICAL_MOHO,WRITE_SEISMOGRAMS_BY_MASTER,&
+          SAVE_ALL_SEISMOS_IN_ONE_FILE,USE_BINARY_FOR_LARGE_FILE
+
+  character(len=150) OUTPUT_FILES,LOCAL_PATH,MODEL
+  ! process/partition name
+  character(len=150) :: prname
+
+
+  !-----------------------------------------------------------------
+  ! mesh
+  !-----------------------------------------------------------------
+
+  ! this is for all the regions
+  integer, dimension(MAX_NUM_REGIONS) :: NSPEC2DMAX_XMIN_XMAX,NSPEC2DMAX_YMIN_YMAX, &
+    NSPEC2D_BOTTOM,NSPEC2D_TOP, &
+    NGLOB1D_RADIAL, &
+    NGLOB2DMAX_XMIN_XMAX,NGLOB2DMAX_YMIN_YMAX
+
+  ! computed in read_compute_parameters
+  integer, dimension(MAX_NUMBER_OF_MESH_LAYERS) :: ner,ratio_sampling_array
+  integer, dimension(MAX_NUMBER_OF_MESH_LAYERS) :: doubling_index
+  double precision, dimension(MAX_NUMBER_OF_MESH_LAYERS) :: r_bottom,r_top
+  logical, dimension(MAX_NUMBER_OF_MESH_LAYERS) :: this_region_has_a_doubling
+  double precision, dimension(MAX_NUMBER_OF_MESH_LAYERS) :: rmins,rmaxs
+
+  ! temporary arrays for elements on slices or edges
+  logical, dimension(:),allocatable :: is_on_a_slice_edge_crust_mantle, &
+    is_on_a_slice_edge_inner_core,is_on_a_slice_edge_outer_core
+
+  !-----------------------------------------------------------------
+  ! MPI partitions
+  !-----------------------------------------------------------------
+
+  ! for addressing of the slices
+  integer, dimension(NCHUNKS_VAL,0:NPROC_XI_VAL-1,0:NPROC_ETA_VAL-1) :: addressing
+  integer, dimension(0:NPROCTOT_VAL-1) :: ichunk_slice,iproc_xi_slice,iproc_eta_slice
+
+  ! proc numbers for MPI
+  integer :: myrank
+  integer :: ichunk,iproc_xi,iproc_eta
+
+  ! time loop timing
+  double precision :: time_start,tCPU
+
+  !-----------------------------------------------------------------
+  ! assembly
+  !-----------------------------------------------------------------
+
+  ! ---- arrays to assemble between chunks
+  ! communication pattern for faces between chunks
+  integer, dimension(NUMMSGS_FACES_VAL) :: iprocfrom_faces,iprocto_faces,imsg_type
+  ! communication pattern for corners between chunks
+  integer, dimension(NCORNERSCHUNKS_VAL) :: iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners
+
+  ! indirect addressing for each message for faces and corners of the chunks
+  ! a given slice can belong to at most one corner and at most two faces
+  integer :: NGLOB2DMAX_XY
+
+  ! this is for non-blocking MPI
+
+  ! buffers for send and receive between faces of the slices and the chunks
+  ! we use the same buffers to assemble scalars and vectors because vectors are
+  ! always three times bigger and therefore scalars can use the first part
+  ! of the vector buffer in memory even if it has an additional index here
+  integer :: npoin2D_max_all_CM_IC
+  real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: buffer_send_faces,buffer_received_faces, &
+                                                           b_buffer_send_faces,b_buffer_received_faces
+
+  ! buffers for send and receive between corners of the chunks
+  real(kind=CUSTOM_REAL), dimension(NGLOB1D_RADIAL_CM) :: &
+    buffer_send_chunkcorn_scalar,buffer_recv_chunkcorn_scalar, &
+    b_buffer_send_chunkcorn_scalar,b_buffer_recv_chunkcorn_scalar
+
+  ! size of buffers is the sum of two sizes because we handle two regions in the same MPI call
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB1D_RADIAL_CM + NGLOB1D_RADIAL_IC) :: &
+     buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector, &
+     b_buffer_send_chunkcorn_vector,b_buffer_recv_chunkcorn_vector
+
+  ! request ids for non-blocking MPI
+  integer :: request_send,request_receive
+  integer, dimension(NUMFACES_SHARED) :: request_send_array,request_receive_array
+  integer :: request_send_cc,request_receive_cc
+  integer, dimension(NPROC_XI_VAL+4) :: request_send_array_cc,request_receive_array_cc
+
+  integer :: b_request_send,b_request_receive
+  integer, dimension(NUMFACES_SHARED) :: b_request_send_array,b_request_receive_array
+  integer :: b_request_send_cc,b_request_receive_cc
+  integer, dimension(NPROC_XI_VAL+4) :: b_request_send_array_cc,b_request_receive_array_cc
+
+
+  logical, dimension(NGLOB_CRUST_MANTLE) :: mask_ibool
+
+  ! number of faces between chunks
+  integer :: NUMMSGS_FACES
+
+  ! number of corners between chunks
+  integer :: NCORNERSCHUNKS
+
+  ! number of message types
+  integer :: NUM_MSG_TYPES
+
+
+  ! collected MPI interfaces
+  ! MPI crust/mantle mesh
+  integer :: num_interfaces_crust_mantle
+  integer :: max_nibool_interfaces_crust_mantle
+  integer, dimension(:), allocatable :: my_neighbours_crust_mantle,nibool_interfaces_crust_mantle
+  integer, dimension(:,:), allocatable :: ibool_interfaces_crust_mantle
+
+  real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: buffer_send_vector_crust_mantle,buffer_recv_vector_crust_mantle
+  real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: b_buffer_send_vector_crust_mantle,b_buffer_recv_vector_crust_mantle
+
+  integer, dimension(:), allocatable :: request_send_vector_crust_mantle,request_recv_vector_crust_mantle
+  integer, dimension(:), allocatable :: b_request_send_vector_crust_mantle,b_request_recv_vector_crust_mantle
+
+  ! MPI inner core mesh
+  integer :: num_interfaces_inner_core
+  integer :: max_nibool_interfaces_inner_core
+  integer, dimension(:), allocatable :: my_neighbours_inner_core,nibool_interfaces_inner_core
+  integer, dimension(:,:), allocatable :: ibool_interfaces_inner_core
+
+  real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: buffer_send_vector_inner_core,buffer_recv_vector_inner_core
+  real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: b_buffer_send_vector_inner_core,b_buffer_recv_vector_inner_core
+
+  integer, dimension(:), allocatable :: request_send_vector_inner_core,request_recv_vector_inner_core
+  integer, dimension(:), allocatable :: b_request_send_vector_inner_core,b_request_recv_vector_inner_core
+
+  ! MPI outer core mesh
+  integer :: num_interfaces_outer_core
+  integer :: max_nibool_interfaces_outer_core
+  integer, dimension(:), allocatable :: my_neighbours_outer_core,nibool_interfaces_outer_core
+  integer, dimension(:,:), allocatable :: ibool_interfaces_outer_core  
+
+  real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: buffer_send_scalar_outer_core,buffer_recv_scalar_outer_core
+  real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: b_buffer_send_scalar_outer_core,b_buffer_recv_scalar_outer_core
+  
+  integer, dimension(:), allocatable :: request_send_scalar_outer_core,request_recv_scalar_outer_core
+  integer, dimension(:), allocatable :: b_request_send_scalar_outer_core,b_request_recv_scalar_outer_core
+
+  !-----------------------------------------------------------------
+  ! gpu
+  !-----------------------------------------------------------------
+
+  ! CUDA mesh pointer<->integer wrapper
+  integer(kind=8) :: Mesh_pointer
+  logical :: GPU_MODE
+
+end module specfem_par
+
+
+!=====================================================================
+
+module specfem_par_crustmantle
+
+! parameter module for elastic solver in crust/mantle region
+
+  use constants
+  implicit none
+
+  ! ----------------- crust, mantle and oceans ---------------------
+  ! mesh parameters
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: ibool_crust_mantle
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: &
+    xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle,&
+    etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
+    gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle
+
+  real(kind=CUSTOM_REAL), dimension(NGLOB_CRUST_MANTLE) :: &
+    xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle
+
+  ! arrays for isotropic elements stored only where needed to save space
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ISO_MANTLE) :: &
+    rhostore_crust_mantle,kappavstore_crust_mantle,muvstore_crust_mantle
+
+  ! arrays for anisotropic elements stored only where needed to save space
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_TISO_MANTLE) :: &
+    kappahstore_crust_mantle,muhstore_crust_mantle,eta_anisostore_crust_mantle
+
+  ! arrays for full anisotropy only when needed
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ANISO_MANTLE) :: &
+    c11store_crust_mantle,c12store_crust_mantle,c13store_crust_mantle, &
+    c14store_crust_mantle,c15store_crust_mantle,c16store_crust_mantle, &
+    c22store_crust_mantle,c23store_crust_mantle,c24store_crust_mantle, &
+    c25store_crust_mantle,c26store_crust_mantle,c33store_crust_mantle, &
+    c34store_crust_mantle,c35store_crust_mantle,c36store_crust_mantle, &
+    c44store_crust_mantle,c45store_crust_mantle,c46store_crust_mantle, &
+    c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle
+
+  ! local to global mapping
+  !  integer, dimension(NSPEC_CRUST_MANTLE) :: idoubling_crust_mantle
+  logical, dimension(NSPEC_CRUST_MANTLE) :: ispec_is_tiso_crust_mantle
+
+  ! mass matrix
+  real(kind=CUSTOM_REAL), dimension(NGLOB_CRUST_MANTLE) :: rmass_crust_mantle
+
+  ! displacement, velocity, acceleration
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: &
+     displ_crust_mantle,veloc_crust_mantle,accel_crust_mantle
+  ! ADJOINT
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE_ADJOINT) :: &
+    b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle
+
+  ! memory variables and standard linear solids for attenuation
+  real(kind=CUSTOM_REAL), dimension(ATT1,ATT2,ATT3,ATT4) :: &
+    one_minus_sum_beta_crust_mantle, factor_scale_crust_mantle
+  real(kind=CUSTOM_REAL), dimension(N_SLS,ATT1,ATT2,ATT3,ATT4) :: &
+    factor_common_crust_mantle
+!  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUAT) :: &
+!    R_memory_crust_mantle
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUAT) :: &
+    R_xx_crust_mantle,R_yy_crust_mantle,R_xy_crust_mantle,R_xz_crust_mantle,R_yz_crust_mantle
+
+!  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_OR_ATT) :: &
+!    epsilondev_crust_mantle
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_OR_ATT) :: &
+    epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+    epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STRAIN_ONLY) :: &
+    eps_trace_over_3_crust_mantle
+
+  ! ADJOINT
+!  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_AND_ATT) :: &
+!    b_R_memory_crust_mantle
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_AND_ATT) :: &
+    b_R_xx_crust_mantle,b_R_yy_crust_mantle,b_R_xy_crust_mantle,b_R_xz_crust_mantle,b_R_yz_crust_mantle
+
+!  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ADJOINT) :: &
+!    b_epsilondev_crust_mantle
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ADJOINT) :: &
+    b_epsilondev_xx_crust_mantle,b_epsilondev_yy_crust_mantle,b_epsilondev_xy_crust_mantle, &
+    b_epsilondev_xz_crust_mantle,b_epsilondev_yz_crust_mantle
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ADJOINT) :: &
+    b_eps_trace_over_3_crust_mantle
+
+  ! for crust/oceans coupling
+  integer, dimension(NSPEC2DMAX_XMIN_XMAX_CM) :: ibelm_xmin_crust_mantle,ibelm_xmax_crust_mantle
+  integer, dimension(NSPEC2DMAX_YMIN_YMAX_CM) :: ibelm_ymin_crust_mantle,ibelm_ymax_crust_mantle
+  integer, dimension(NSPEC2D_BOTTOM_CM) :: ibelm_bottom_crust_mantle
+  integer, dimension(NSPEC2D_TOP_CM) :: ibelm_top_crust_mantle
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_BOTTOM_CM) :: &
+    jacobian2D_bottom_crust_mantle
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_TOP_CM) :: &
+    jacobian2D_top_crust_mantle
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ,NSPEC2DMAX_XMIN_XMAX_CM) :: &
+    jacobian2D_xmin_crust_mantle,jacobian2D_xmax_crust_mantle
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ,NSPEC2DMAX_YMIN_YMAX_CM) :: &
+    jacobian2D_ymin_crust_mantle,jacobian2D_ymax_crust_mantle
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLY,NGLLZ,NSPEC2DMAX_XMIN_XMAX_CM) :: &
+    normal_xmin_crust_mantle,normal_xmax_crust_mantle
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2DMAX_YMIN_YMAX_CM) :: &
+    normal_ymin_crust_mantle,normal_ymax_crust_mantle
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_BOTTOM_CM) :: &
+    normal_bottom_crust_mantle
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_TOP_CM) :: &
+    normal_top_crust_mantle
+
+  ! Stacey
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STACEY) :: &
+    rho_vp_crust_mantle,rho_vs_crust_mantle
+  integer :: nspec2D_xmin_crust_mantle,nspec2D_xmax_crust_mantle, &
+            nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle
+  integer, dimension(2,NSPEC2DMAX_YMIN_YMAX_CM) :: nimin_crust_mantle,nimax_crust_mantle,nkmin_eta_crust_mantle
+  integer, dimension(2,NSPEC2DMAX_XMIN_XMAX_CM) :: njmin_crust_mantle,njmax_crust_mantle,nkmin_xi_crust_mantle
+
+  ! daniel: not sure why name ...5 and dimensions with one additional?
+  !real(kind=CUSTOM_REAL), dimension(:,:,:,:,:), allocatable :: absorb_xmin_crust_mantle5, &
+  !  absorb_xmax_crust_mantle5, absorb_ymin_crust_mantle5, absorb_ymax_crust_mantle5
+  real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: absorb_xmin_crust_mantle, &
+    absorb_xmax_crust_mantle, absorb_ymin_crust_mantle, absorb_ymax_crust_mantle
+
+  integer :: nabs_xmin_cm,nabs_xmax_cm,nabs_ymin_cm,nabs_ymax_cm
+
+  integer :: reclen_xmin_crust_mantle, reclen_xmax_crust_mantle, &
+            reclen_ymin_crust_mantle,reclen_ymax_crust_mantle
+
+  ! assembly
+  integer :: npoin2D_faces_crust_mantle(NUMFACES_SHARED)
+  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle
+
+  ! indirect addressing for each corner of the chunks
+  integer, dimension(NGLOB1D_RADIAL_CM,NUMCORNERS_SHARED) :: iboolcorner_crust_mantle
+
+  ! 2-D addressing and buffers for summation between slices
+  integer, dimension(NGLOB2DMAX_XMIN_XMAX_CM) :: iboolleft_xi_crust_mantle,iboolright_xi_crust_mantle
+  integer, dimension(NGLOB2DMAX_YMIN_YMAX_CM) :: iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle
+
+  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_crust_mantle
+
+  ! kernels
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ADJOINT) :: &
+    rho_kl_crust_mantle,beta_kl_crust_mantle, alpha_kl_crust_mantle, Sigma_kl_crust_mantle
+  ! For anisotropic kernels (see compute_kernels.f90 for a definition of the array)
+  real(kind=CUSTOM_REAL), dimension(21,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ADJOINT) :: &
+    cijkl_kl_crust_mantle
+  ! approximate Hessian
+  real(kind=CUSTOM_REAL), dimension(:,:,:,:),allocatable :: hess_kl_crust_mantle
+
+  ! Boundary Mesh and Kernels
+  integer :: k_top,k_bot,iregion_code
+  integer, dimension(NSPEC2D_MOHO) :: ibelm_moho_top,ibelm_moho_bot
+  integer, dimension(NSPEC2D_400) :: ibelm_400_top,ibelm_400_bot
+  integer, dimension(NSPEC2D_670) :: ibelm_670_top,ibelm_670_bot
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_MOHO) :: normal_moho
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_400) :: normal_400
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_670) :: normal_670
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_MOHO) :: moho_kl, moho_kl_top, moho_kl_bot
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_400) :: d400_kl, d400_kl_top, d400_kl_bot
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_670) ::  d670_kl, d670_kl_top, d670_kl_bot
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_CMB) :: cmb_kl, cmb_kl_top, cmb_kl_bot
+
+  ! NOISE_TOMOGRAPHY
+  real(kind=CUSTOM_REAL), dimension(:,:,:,:,:), allocatable :: noise_sourcearray
+  real(kind=CUSTOM_REAL), dimension(:), allocatable :: &
+    normal_x_noise,normal_y_noise,normal_z_noise, mask_noise
+  real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: noise_surface_movie
+  integer :: irec_master_noise
+
+  ! inner / outer elements crust/mantle region
+  integer :: num_phase_ispec_crust_mantle
+  integer :: nspec_inner_crust_mantle,nspec_outer_crust_mantle
+  integer, dimension(:,:), allocatable :: phase_ispec_inner_crust_mantle
+
+end module specfem_par_crustmantle
+
+!=====================================================================
+
+module specfem_par_innercore
+
+! parameter module for elastic solver in inner core region
+
+  use constants
+  implicit none
+
+  ! ----------------- inner core ---------------------
+  ! mesh parameters
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: ibool_inner_core
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE) :: &
+    xix_inner_core,xiy_inner_core,xiz_inner_core,&
+    etax_inner_core,etay_inner_core,etaz_inner_core, &
+    gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
+    rhostore_inner_core, kappavstore_inner_core,muvstore_inner_core
+
+  real(kind=CUSTOM_REAL), dimension(NGLOB_INNER_CORE) :: &
+    xstore_inner_core,ystore_inner_core,zstore_inner_core
+
+  ! arrays for inner-core anisotropy only when needed
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ANISO_IC) :: &
+    c11store_inner_core,c33store_inner_core,c12store_inner_core, &
+    c13store_inner_core,c44store_inner_core
+
+  ! local to global mapping
+  integer, dimension(NSPEC_INNER_CORE) :: idoubling_inner_core
+  ! only needed for compute_boundary_kernel() routine
+  !logical, dimension(NSPEC_INNER_CORE) :: ispec_is_tiso_inner_core
+
+  ! mass matrix
+  real(kind=CUSTOM_REAL), dimension(NGLOB_INNER_CORE) :: rmass_inner_core
+
+  ! displacement, velocity, acceleration
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE) :: &
+    displ_inner_core,veloc_inner_core,accel_inner_core
+  ! ADJOINT
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_INNER_CORE_ADJOINT) :: &
+    b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core
+
+  ! memory variables and standard linear solids for attenuation
+  real(kind=CUSTOM_REAL), dimension(ATT1,ATT2,ATT3,ATT5) :: &
+    one_minus_sum_beta_inner_core, factor_scale_inner_core
+  real(kind=CUSTOM_REAL), dimension(N_SLS,ATT1,ATT2,ATT3,ATT5) :: &
+    factor_common_inner_core
+!  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ATTENUATION) :: &
+!    R_memory_inner_core
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ATTENUATION) :: &
+    R_xx_inner_core,R_yy_inner_core,R_xy_inner_core,R_xz_inner_core,R_yz_inner_core
+
+!  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STR_OR_ATT) :: &
+!    epsilondev_inner_core
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STR_OR_ATT) :: &
+    epsilondev_xx_inner_core,epsilondev_yy_inner_core,epsilondev_xy_inner_core, &
+    epsilondev_xz_inner_core,epsilondev_yz_inner_core
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STRAIN_ONLY) :: &
+    eps_trace_over_3_inner_core
+  ! ADJOINT
+!  real(kind=CUSTOM_REAL), dimension(5,N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STR_AND_ATT) :: &
+!    b_R_memory_inner_core
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STR_AND_ATT) :: &
+    b_R_xx_inner_core,b_R_yy_inner_core,b_R_xy_inner_core,b_R_xz_inner_core,b_R_yz_inner_core
+
+
+!  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ADJOINT) :: &
+!    b_epsilondev_inner_core
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ADJOINT) :: &
+    b_epsilondev_xx_inner_core,b_epsilondev_yy_inner_core,b_epsilondev_xy_inner_core, &
+    b_epsilondev_xz_inner_core,b_epsilondev_yz_inner_core
+
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ADJOINT) :: &
+    b_eps_trace_over_3_inner_core
+
+  ! assembly
+  ! for matching with central cube in inner core
+  integer, dimension(:), allocatable :: sender_from_slices_to_cube
+  integer, dimension(:,:), allocatable :: ibool_central_cube
+  double precision, dimension(:,:), allocatable :: buffer_slices,b_buffer_slices,buffer_slices2
+  double precision, dimension(:,:,:), allocatable :: buffer_all_cube_from_slices,b_buffer_all_cube_from_slices
+  integer nb_msgs_theor_in_cube,non_zero_nb_msgs_theor_in_cube,npoin2D_cube_from_slices,receiver_cube_from_slices
+
+  integer :: nspec2D_xmin_inner_core,nspec2D_xmax_inner_core, &
+            nspec2D_ymin_inner_core,nspec2D_ymax_inner_core
+
+  integer, dimension(NSPEC2DMAX_XMIN_XMAX_IC) :: ibelm_xmin_inner_core,ibelm_xmax_inner_core
+  integer, dimension(NSPEC2DMAX_YMIN_YMAX_IC) :: ibelm_ymin_inner_core,ibelm_ymax_inner_core
+  integer, dimension(NSPEC2D_BOTTOM_IC) :: ibelm_bottom_inner_core
+  integer, dimension(NSPEC2D_TOP_IC) :: ibelm_top_inner_core
+
+  integer :: npoin2D_faces_inner_core(NUMFACES_SHARED)
+  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_inner_core,npoin2D_eta_inner_core
+
+  ! indirect addressing for each corner of the chunks
+  integer, dimension(NGLOB1D_RADIAL_IC,NUMCORNERS_SHARED) :: iboolcorner_inner_core
+
+  ! 2-D addressing and buffers for summation between slices
+  integer, dimension(NGLOB2DMAX_XMIN_XMAX_IC) :: iboolleft_xi_inner_core,iboolright_xi_inner_core
+  integer, dimension(NGLOB2DMAX_YMIN_YMAX_IC) :: iboolleft_eta_inner_core,iboolright_eta_inner_core
+
+  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_inner_core
+
+  ! adjoint kernels
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_ADJOINT) :: &
+    rho_kl_inner_core,beta_kl_inner_core, alpha_kl_inner_core
+
+  ! Boundary Mesh and Kernels
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_ICB) :: icb_kl, icb_kl_top, icb_kl_bot
+  logical :: fluid_solid_boundary
+
+  ! inner / outer elements inner core region
+  integer :: num_phase_ispec_inner_core
+  integer :: nspec_inner_inner_core,nspec_outer_inner_core
+  integer, dimension(:,:), allocatable :: phase_ispec_inner_inner_core
+
+
+end module specfem_par_innercore
+
+!=====================================================================
+
+module specfem_par_outercore
+
+! parameter module for acoustic solver in outer core region
+
+  use constants
+  implicit none
+
+  ! ----------------- outer core ---------------------
+  ! mesh parameters
+  ! local-to-global index mapping for the GLL points of each spectral element
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE) :: ibool_outer_core
+
+  ! partial derivatives of the reference-cube coordinates (xi,eta,gamma)
+  ! with respect to (x,y,z) at every GLL point (inverse Jacobian terms)
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE) :: &
+    xix_outer_core,xiy_outer_core,xiz_outer_core,&
+    etax_outer_core,etay_outer_core,etaz_outer_core, &
+    gammax_outer_core,gammay_outer_core,gammaz_outer_core
+
+  ! global point coordinates
+  real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE) :: &
+    xstore_outer_core,ystore_outer_core,zstore_outer_core
+
+  ! material properties (density and bulk modulus; fluid region, no shear)
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE) :: &
+    rhostore_outer_core,kappavstore_outer_core
+
+  ! local to global mapping
+  !integer, dimension(NSPEC_OUTER_CORE) :: idoubling_outer_core
+  ! only needed for compute_boundary_kernel()
+  !logical, dimension(NSPEC_OUTER_CORE) :: ispec_is_tiso_outer_core
+
+  ! mass matrix
+  real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE) :: rmass_outer_core
+
+  ! velocity potential
+  ! note: scalar fields (no NDIM leading dimension) -- in the fluid outer core
+  ! the solver evolves a potential and its time derivatives rather than a
+  ! vector displacement field
+  real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE) :: &
+    displ_outer_core,veloc_outer_core,accel_outer_core
+  ! ADJOINT
+  ! backward/reconstructed wavefield counterparts for kernel simulations
+  real(kind=CUSTOM_REAL), dimension(NGLOB_OUTER_CORE_ADJOINT) :: &
+    b_displ_outer_core,b_veloc_outer_core,b_accel_outer_core
+
+
+  ! Stacey
+  ! absorbing-boundary arrays (Stacey conditions on the chunk side faces)
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_STACEY) :: vp_outer_core
+  integer :: nspec2D_xmin_outer_core,nspec2D_xmax_outer_core, &
+            nspec2D_ymin_outer_core,nspec2D_ymax_outer_core
+  ! index bounds of the GLL points that belong to each absorbing face
+  integer, dimension(2,NSPEC2DMAX_YMIN_YMAX_OC) :: nimin_outer_core,nimax_outer_core,nkmin_eta_outer_core
+  integer, dimension(2,NSPEC2DMAX_XMIN_XMAX_OC) :: njmin_outer_core,njmax_outer_core,nkmin_xi_outer_core
+
+  ! saved absorbing-boundary contributions (allocated only when needed)
+  real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: absorb_xmin_outer_core, &
+     absorb_xmax_outer_core, absorb_ymin_outer_core, absorb_ymax_outer_core, &
+     absorb_zmin_outer_core
+  integer :: nabs_xmin_oc,nabs_xmax_oc,nabs_ymin_oc,nabs_ymax_oc,nabs_zmin_oc
+
+  ! record lengths for direct-access I/O of the absorbing-boundary files
+  integer :: reclen_xmin_outer_core, reclen_xmax_outer_core, &
+            reclen_ymin_outer_core, reclen_ymax_outer_core
+  ! NOTE(review): this name lacks the _outer_core suffix used by its siblings
+  integer :: reclen_zmin
+
+  ! vector fields derived from the potential (e.g. for coupling/movie output)
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_OUTER_CORE) :: vector_accel_outer_core,&
+             vector_displ_outer_core, b_vector_displ_outer_core
+
+  ! arrays to couple with the fluid regions by pointwise matching
+  integer, dimension(NSPEC2DMAX_XMIN_XMAX_OC) :: ibelm_xmin_outer_core,ibelm_xmax_outer_core
+  integer, dimension(NSPEC2DMAX_YMIN_YMAX_OC) :: ibelm_ymin_outer_core,ibelm_ymax_outer_core
+  integer, dimension(NSPEC2D_BOTTOM_OC) :: ibelm_bottom_outer_core
+  integer, dimension(NSPEC2D_TOP_OC) :: ibelm_top_outer_core
+
+  ! outward normals on the coupling surfaces
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLY,NGLLZ,NSPEC2DMAX_XMIN_XMAX_OC) :: normal_xmin_outer_core,normal_xmax_outer_core
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLZ,NSPEC2DMAX_YMIN_YMAX_OC) :: normal_ymin_outer_core,normal_ymax_outer_core
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_BOTTOM_OC) :: normal_bottom_outer_core
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NSPEC2D_TOP_OC) :: normal_top_outer_core
+
+  ! 2-D surface Jacobians on the coupling surfaces
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_BOTTOM_OC) :: jacobian2D_bottom_outer_core
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NSPEC2D_TOP_OC) :: jacobian2D_top_outer_core
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ,NSPEC2DMAX_XMIN_XMAX_OC) :: jacobian2D_xmin_outer_core,jacobian2D_xmax_outer_core
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ,NSPEC2DMAX_YMIN_YMAX_OC) :: jacobian2D_ymin_outer_core,jacobian2D_ymax_outer_core
+
+  ! assembly
+  ! MPI buffers / point counts for assembling the global field between slices
+  integer :: npoin2D_faces_outer_core(NUMFACES_SHARED)
+  integer, dimension(NB_SQUARE_EDGES_ONEDIR) :: npoin2D_xi_outer_core,npoin2D_eta_outer_core
+
+  ! indirect addressing for each corner of the chunks
+  integer, dimension(NGLOB1D_RADIAL_OC,NUMCORNERS_SHARED) :: iboolcorner_outer_core
+
+  ! 2-D addressing and buffers for summation between slices
+  integer, dimension(NGLOB2DMAX_XMIN_XMAX_OC) :: iboolleft_xi_outer_core,iboolright_xi_outer_core
+  integer, dimension(NGLOB2DMAX_YMIN_YMAX_OC) :: iboolleft_eta_outer_core,iboolright_eta_outer_core
+
+  integer, dimension(NGLOB2DMAX_XY_VAL,NUMFACES_SHARED) :: iboolfaces_outer_core
+
+  ! adjoint kernels
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_ADJOINT) :: &
+    rho_kl_outer_core,alpha_kl_outer_core
+
+  ! kernel runs
+  ! divergence of displacement for the forward and backward wavefields
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_ADJOINT) :: &
+    div_displ_outer_core
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE_ADJOINT) :: &
+    b_div_displ_outer_core
+
+  ! check for deviatoric kernel for outer core region
+  ! disabled by default (compile-time parameter); arrays allocated only if used
+  real(kind=CUSTOM_REAL), dimension(:,:,:,:),allocatable :: beta_kl_outer_core
+  integer :: nspec_beta_kl_outer_core
+  logical,parameter:: deviatoric_outercore = .false.
+
+  ! inner / outer elements outer core region
+  ! phase split used to overlap computation with MPI communication
+  integer :: num_phase_ispec_outer_core
+  integer :: nspec_inner_outer_core,nspec_outer_outer_core
+  integer, dimension(:,:), allocatable :: phase_ispec_inner_outer_core
+
+
+end module specfem_par_outercore
+
+
+!=====================================================================
+
+module specfem_par_movie
+
+! parameter module for movies/shakemovies
+
+  use constants
+
+  implicit none
+
+  ! to save movie frames
+  ! per-slice surface values (store_val_*) and their gathered-on-master
+  ! counterparts (store_val_*_all); positions (x,y,z) and field values (ux,uy,uz)
+  integer :: nmovie_points,NIT
+  real(kind=CUSTOM_REAL), dimension(:), allocatable :: &
+      store_val_x,store_val_y,store_val_z, &
+      store_val_ux,store_val_uy,store_val_uz
+  real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: &
+      store_val_x_all,store_val_y_all,store_val_z_all, &
+      store_val_ux_all,store_val_uy_all,store_val_uz_all
+
+  ! to save movie volume
+  integer :: npoints_3dmovie,nspecel_3dmovie
+  ! mapping from global point index to 3D-movie point index
+  integer, dimension(NGLOB_CRUST_MANTLE) :: num_ibool_3dmovie
+  ! dimensionalization factor applied to the output field (set by the caller)
+  double precision :: scalingval
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_OR_ATT) :: &
+    muvstore_crust_mantle_3dmovie
+  ! rotation matrices for the movie points
+  real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: nu_3dmovie
+  ! selects which GLL points are written to the 3D movie
+  logical, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STRAIN_ONLY) :: mask_3dmovie
+
+  ! time-integrated deviatoric strain (5 independent components) and
+  ! time-integrated trace/3, used for MOVIE_VOLUME_TYPE 2/3 output
+  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_OR_ATT) :: &
+    Iepsilondev_crust_mantle
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STRAIN_ONLY) :: &
+    Ieps_trace_over_3_crust_mantle
+
+end module specfem_par_movie

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_movie_output.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_movie_output.f90	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_movie_output.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -0,0 +1,142 @@
+!=====================================================================
+!
+!          S p e c f e m 3 D  G l o b e  V e r s i o n  5 . 1
+!          --------------------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!                        Princeton University, USA
+!             and University of Pau / CNRS / INRIA, France
+! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+!                            April 2011
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+  subroutine write_movie_output()
+
+! writes out movie snapshots during the time loop:
+! - surface movie frames every NTSTEP_BETWEEN_FRAMES steps when MOVIE_SURFACE is set
+! - full 3D volume snapshots (strain, time-integrated strain, div/curl,
+!   displacement or velocity, chosen by MOVIE_VOLUME_TYPE) when MOVIE_VOLUME is set
+! all inputs come from the solver parameter modules; no arguments
+
+  use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
+  use specfem_par_movie
+  implicit none
+
+  ! local parameters
+!daniel: debugging
+!  character(len=256) :: filename
+!  logical, parameter :: SNAPSHOT_INNER_CORE = .true.
+
+  ! save movie on surface
+  if( MOVIE_SURFACE ) then
+    if( mod(it,NTSTEP_BETWEEN_FRAMES) == 0) then
+      ! save velocity here to avoid static offset on displacement for movies
+      call write_movie_surface(myrank,nmovie_points,scale_veloc,veloc_crust_mantle, &
+                    scale_displ,displ_crust_mantle, &
+                    xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
+                    store_val_x,store_val_y,store_val_z, &
+                    store_val_x_all,store_val_y_all,store_val_z_all, &
+                    store_val_ux,store_val_uy,store_val_uz, &
+                    store_val_ux_all,store_val_uy_all,store_val_uz_all, &
+                    ibelm_top_crust_mantle,ibool_crust_mantle, &
+                    NSPEC2D_TOP(IREGION_CRUST_MANTLE), &
+                    NIT,it,OUTPUT_FILES,MOVIE_VOLUME_TYPE)
+    endif
+  endif
+
+
+  ! save movie in full 3D mesh
+  ! frames are written every NTSTEP_BETWEEN_FRAMES steps, counted from
+  ! MOVIE_START, and only inside the [MOVIE_START, MOVIE_STOP] window
+  if(MOVIE_VOLUME ) then
+    if( mod(it-MOVIE_START,NTSTEP_BETWEEN_FRAMES) == 0  &
+      .and. it >= MOVIE_START .and. it <= MOVIE_STOP) then
+
+      if (MOVIE_VOLUME_TYPE == 1) then  ! output strains
+
+        call  write_movie_volume_strains(myrank,npoints_3dmovie, &
+                    LOCAL_PATH,MOVIE_VOLUME_TYPE,MOVIE_COARSE, &
+                    it,eps_trace_over_3_crust_mantle, &
+                    epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+                    epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
+                    muvstore_crust_mantle_3dmovie, &
+                    mask_3dmovie,nu_3dmovie)
+
+      else if (MOVIE_VOLUME_TYPE == 2 .or. MOVIE_VOLUME_TYPE == 3) then
+        ! output the Time Integral of Strain, or \mu*TIS
+        ! NOTE(review): this call passes the combined 5-component array
+        ! Iepsilondev_crust_mantle where the type-1 call above passes five
+        ! separate epsilondev_* arrays -- verify this still matches the
+        ! (updated) dummy-argument list of write_movie_volume_strains
+        call  write_movie_volume_strains(myrank,npoints_3dmovie, &
+                    LOCAL_PATH,MOVIE_VOLUME_TYPE,MOVIE_COARSE, &
+                    it,Ieps_trace_over_3_crust_mantle,Iepsilondev_crust_mantle, &
+                    muvstore_crust_mantle_3dmovie, &
+                    mask_3dmovie,nu_3dmovie)
+
+      else if (MOVIE_VOLUME_TYPE == 4) then ! output divergence and curl in whole volume
+
+        call write_movie_volume_divcurl(myrank,it,eps_trace_over_3_crust_mantle,&
+                        div_displ_outer_core, &
+                        accel_outer_core,kappavstore_outer_core,rhostore_outer_core,ibool_outer_core, &
+                        eps_trace_over_3_inner_core, &
+                        epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+                        epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
+                        epsilondev_xx_inner_core,epsilondev_yy_inner_core,epsilondev_xy_inner_core, &
+                        epsilondev_xz_inner_core,epsilondev_yz_inner_core, &
+                        LOCAL_PATH, &
+                        displ_crust_mantle,displ_inner_core,displ_outer_core, &
+                        veloc_crust_mantle,veloc_inner_core,veloc_outer_core, &
+                        accel_crust_mantle,accel_inner_core, &
+                        ibool_crust_mantle,ibool_inner_core)
+
+      else if (MOVIE_VOLUME_TYPE == 5) then !output displacement
+        scalingval = scale_displ
+        call write_movie_volume_vector(myrank,it,npoints_3dmovie, &
+                    LOCAL_PATH,MOVIE_VOLUME_TYPE, &
+                    MOVIE_COARSE,ibool_crust_mantle,displ_crust_mantle, &
+                    scalingval,mask_3dmovie,nu_3dmovie)
+
+      else if (MOVIE_VOLUME_TYPE == 6) then !output velocity
+        scalingval = scale_veloc
+        call write_movie_volume_vector(myrank,it,npoints_3dmovie, &
+                    LOCAL_PATH,MOVIE_VOLUME_TYPE, &
+                    MOVIE_COARSE,ibool_crust_mantle,veloc_crust_mantle, &
+                    scalingval,mask_3dmovie,nu_3dmovie)
+
+      else
+
+        ! invalid movie type: abort the whole run
+        call exit_MPI(myrank, 'MOVIE_VOLUME_TYPE has to be 1,2,3,4,5 or 6')
+
+      endif ! MOVIE_VOLUME_TYPE
+    endif
+  endif ! MOVIE_VOLUME
+
+!daniel: debugging
+!  if( SNAPSHOT_INNER_CORE .and. mod(it-MOVIE_START,NTSTEP_BETWEEN_FRAMES) == 0  &
+!      .and. it >= MOVIE_START .and. it <= MOVIE_STOP) then
+!    ! VTK file output
+!    ! displacement values
+!    !write(prname,'(a,i6.6,a)') trim(LOCAL_PATH)//'/'//'proc',myrank,'_'
+!    !write(filename,'(a,a,i6.6)') prname(1:len_trim(prname)),'reg_3_displ_',it
+!    !call write_VTK_data_cr(idoubling_inner_core,NSPEC_INNER_CORE,NGLOB_INNER_CORE, &
+!    !                    xstore_inner_core,ystore_inner_core,zstore_inner_core,ibool_inner_core, &
+!    !                    displ_inner_core,filename)
+!
+!    write(prname,'(a)') 'OUTPUT_FILES/snapshot_all_'
+!    write(filename,'(a,a,i6.6)') prname(1:len_trim(prname)),'reg_3_displ_',it
+!    call write_VTK_data_cr_all(myrank,idoubling_inner_core, &
+!                        NSPEC_INNER_CORE,NGLOB_INNER_CORE, &
+!                        xstore_inner_core,ystore_inner_core,zstore_inner_core,ibool_inner_core, &
+!                        displ_inner_core,filename)
+!
+!  endif
+
+
+  end subroutine write_movie_output

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_movie_volume.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_movie_volume.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_movie_volume.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -256,8 +256,11 @@
 ! ---------------------------------------------
 
   subroutine write_movie_volume_strains(myrank,npoints_3dmovie,LOCAL_PATH,MOVIE_VOLUME_TYPE,MOVIE_COARSE, &
-                    it,eps_trace_over_3_crust_mantle,epsilondev_crust_mantle,muvstore_crust_mantle_3dmovie, &
-                    mask_3dmovie,nu_3dmovie)
+                                        it,eps_trace_over_3_crust_mantle, &
+                                        epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+                                        epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
+                                        muvstore_crust_mantle_3dmovie, &
+                                        mask_3dmovie,nu_3dmovie)
 
 
   implicit none
@@ -268,7 +271,12 @@
   ! input
   integer :: myrank,npoints_3dmovie,MOVIE_VOLUME_TYPE,it
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: eps_trace_over_3_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: epsilondev_crust_mantle
+
+!  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: epsilondev_crust_mantle
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: &
+    epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+    epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle
+
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: muvstore_crust_mantle_3dmovie
   logical, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: mask_3dmovie
   logical :: MOVIE_COARSE
@@ -316,13 +324,13 @@
       if(mask_3dmovie(i,j,k,ispec)) then
        ipoints_3dmovie=ipoints_3dmovie+1
        muv_3dmovie=muvstore_crust_mantle_3dmovie(i,j,k,ispec)
-       eps_loc(1,1)=eps_trace_over_3_crust_mantle(i,j,k,ispec) + epsilondev_crust_mantle(1,i,j,k,ispec)
-       eps_loc(2,2)=eps_trace_over_3_crust_mantle(i,j,k,ispec) + epsilondev_crust_mantle(2,i,j,k,ispec)
+       eps_loc(1,1)=eps_trace_over_3_crust_mantle(i,j,k,ispec) + epsilondev_xx_crust_mantle(i,j,k,ispec)
+       eps_loc(2,2)=eps_trace_over_3_crust_mantle(i,j,k,ispec) + epsilondev_yy_crust_mantle(i,j,k,ispec)
        eps_loc(3,3)=eps_trace_over_3_crust_mantle(i,j,k,ispec)- &
-                 epsilondev_crust_mantle(1,i,j,k,ispec) - epsilondev_crust_mantle(2,i,j,k,ispec)
-       eps_loc(1,2)=epsilondev_crust_mantle(3,i,j,k,ispec)
-       eps_loc(1,3)=epsilondev_crust_mantle(4,i,j,k,ispec)
-       eps_loc(2,3)=epsilondev_crust_mantle(5,i,j,k,ispec)
+                 epsilondev_xx_crust_mantle(i,j,k,ispec) - epsilondev_yy_crust_mantle(i,j,k,ispec)
+       eps_loc(1,2)=epsilondev_xy_crust_mantle(i,j,k,ispec)
+       eps_loc(1,3)=epsilondev_xz_crust_mantle(i,j,k,ispec)
+       eps_loc(2,3)=epsilondev_yz_crust_mantle(i,j,k,ispec)
        eps_loc(2,1)=eps_loc(1,2)
        eps_loc(3,1)=eps_loc(1,3)
        eps_loc(3,2)=eps_loc(2,3)
@@ -469,15 +477,18 @@
 !--------------------
 
  subroutine write_movie_volume_divcurl(myrank,it,eps_trace_over_3_crust_mantle,&
-                        div_displ_outer_core, &
-                        accel_outer_core,kappavstore_outer_core,rhostore_outer_core,ibool_outer_core, &
-                        eps_trace_over_3_inner_core, &
-                        epsilondev_crust_mantle,epsilondev_inner_core, &
-                        LOCAL_PATH, &
-                        displ_crust_mantle,displ_inner_core,displ_outer_core, &
-                        veloc_crust_mantle,veloc_inner_core,veloc_outer_core, &
-                        accel_crust_mantle,accel_inner_core, &
-                        ibool_crust_mantle,ibool_inner_core)
+                                      div_displ_outer_core, &
+                                      accel_outer_core,kappavstore_outer_core,rhostore_outer_core,ibool_outer_core, &
+                                      eps_trace_over_3_inner_core, &
+                                      epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+                                      epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
+                                      epsilondev_xx_inner_core,epsilondev_yy_inner_core,epsilondev_xy_inner_core, &
+                                      epsilondev_xz_inner_core,epsilondev_yz_inner_core, &
+                                      LOCAL_PATH, &
+                                      displ_crust_mantle,displ_inner_core,displ_outer_core, &
+                                      veloc_crust_mantle,veloc_inner_core,veloc_outer_core, &
+                                      accel_crust_mantle,accel_inner_core, &
+                                      ibool_crust_mantle,ibool_inner_core)
 
   implicit none
   include "constants.h"
@@ -493,9 +504,17 @@
   integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_OUTER_CORE) :: ibool_outer_core
 
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STRAIN_ONLY) :: eps_trace_over_3_inner_core
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: epsilondev_crust_mantle
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STR_OR_ATT) :: epsilondev_inner_core
+!  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: epsilondev_crust_mantle
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: &
+    epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+    epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle
 
+!  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STR_OR_ATT) :: epsilondev_inner_core
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_INNER_CORE_STR_OR_ATT) :: &
+    epsilondev_xx_inner_core,epsilondev_yy_inner_core,epsilondev_xy_inner_core, &
+    epsilondev_xz_inner_core,epsilondev_yz_inner_core
+
+
   character(len=150) LOCAL_PATH
 
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE) :: displ_crust_mantle
@@ -587,12 +606,20 @@
     ! crust mantle
     write(outputname,"('proc',i6.6,'_crust_mantle_epsdev_displ_it',i6.6,'.bin')") myrank,it
     open(unit=27,file=trim(LOCAL_PATH)//trim(outputname),status='unknown',form='unformatted')
-    write(27) epsilondev_crust_mantle
+    write(27) epsilondev_xx_crust_mantle
+    write(27) epsilondev_yy_crust_mantle
+    write(27) epsilondev_xy_crust_mantle
+    write(27) epsilondev_xz_crust_mantle
+    write(27) epsilondev_yz_crust_mantle
     close(27)
     ! inner core
     write(outputname,"('proc',i6.6,'inner_core_epsdev_displ_it',i6.6,'.bin')") myrank,it
     open(unit=27,file=trim(LOCAL_PATH)//trim(outputname),status='unknown',form='unformatted')
-    write(27) epsilondev_inner_core
+    write(27) epsilondev_xx_inner_core
+    write(27) epsilondev_yy_inner_core
+    write(27) epsilondev_xy_inner_core
+    write(27) epsilondev_xz_inner_core
+    write(27) epsilondev_yz_inner_core
     close(27)
   endif
 
@@ -609,11 +636,11 @@
       do k = 1, NGLLZ
         do j = 1, NGLLY
           do i = 1, NGLLX
-            tmp_data(i,j,k,ispec) = sqrt( epsilondev_crust_mantle(1,i,j,k,ispec)**2 &
-                                          + epsilondev_crust_mantle(2,i,j,k,ispec)**2 &
-                                          + epsilondev_crust_mantle(3,i,j,k,ispec)**2 &
-                                          + epsilondev_crust_mantle(4,i,j,k,ispec)**2 &
-                                          + epsilondev_crust_mantle(5,i,j,k,ispec)**2)
+            tmp_data(i,j,k,ispec) = sqrt( epsilondev_xx_crust_mantle(i,j,k,ispec)**2 &
+                                          + epsilondev_yy_crust_mantle(i,j,k,ispec)**2 &
+                                          + epsilondev_xy_crust_mantle(i,j,k,ispec)**2 &
+                                          + epsilondev_xz_crust_mantle(i,j,k,ispec)**2 &
+                                          + epsilondev_yz_crust_mantle(i,j,k,ispec)**2)
           enddo
         enddo
       enddo
@@ -636,11 +663,11 @@
       do k = 1, NGLLZ
         do j = 1, NGLLY
           do i = 1, NGLLX
-            tmp_data(i,j,k,ispec) = sqrt( epsilondev_inner_core(1,i,j,k,ispec)**2 &
-                                          + epsilondev_inner_core(2,i,j,k,ispec)**2 &
-                                          + epsilondev_inner_core(3,i,j,k,ispec)**2 &
-                                          + epsilondev_inner_core(4,i,j,k,ispec)**2 &
-                                          + epsilondev_inner_core(5,i,j,k,ispec)**2)
+            tmp_data(i,j,k,ispec) = sqrt( epsilondev_xx_inner_core(i,j,k,ispec)**2 &
+                                          + epsilondev_yy_inner_core(i,j,k,ispec)**2 &
+                                          + epsilondev_xy_inner_core(i,j,k,ispec)**2 &
+                                          + epsilondev_xz_inner_core(i,j,k,ispec)**2 &
+                                          + epsilondev_yz_inner_core(i,j,k,ispec)**2)
           enddo
         enddo
       enddo

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_seismograms.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_seismograms.f90	2012-02-13 23:52:29 UTC (rev 19621)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_seismograms.f90	2012-02-14 15:11:07 UTC (rev 19622)
@@ -25,8 +25,93 @@
 !
 !=====================================================================
 
+  subroutine write_seismograms()
+
+! computes/stores the seismograms for the receivers located in this slice
+! (forward, adjoint or backward field depending on SIMULATION_TYPE) and
+! flushes them to file every NTSTEP_BETWEEN_OUTPUT_SEISMOS steps or at the
+! end of the run
+! (NOTE(review): the original header comment described a displacement-norm
+! MPI reduction / timestamp check, which does not match this routine --
+! it appears to have been copied from check_stability)
+
+  use specfem_par
+  use specfem_par_crustmantle
+  implicit none
+
+  ! update position in seismograms
+  seismo_current = seismo_current + 1
+
+  ! compute & store the seismograms only if there is at least one receiver located in this slice
+  if (nrec_local > 0) then
+    if (SIMULATION_TYPE == 1) then
+      ! forward run: interpolate displacement at the receivers
+      call compute_seismograms(nrec_local,nrec,displ_crust_mantle, &
+                                nu,hxir_store,hetar_store,hgammar_store, &
+                                scale_displ,ibool_crust_mantle, &
+                                ispec_selected_rec,number_receiver_global, &
+                                seismo_current,NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
+                                seismograms)
+
+    else if (SIMULATION_TYPE == 2) then
+      ! adjoint run: strain-based seismograms and source-parameter derivatives
+      call compute_seismograms_adjoint(NSOURCES,nrec_local,displ_crust_mantle, &
+                    eps_trace_over_3_crust_mantle, &
+                    epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+                    epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
+                    nu_source,Mxx,Myy,Mzz,Mxy,Mxz,Myz, &
+                    hxir_store,hetar_store,hgammar_store, &
+                    hpxir_store,hpetar_store,hpgammar_store, &
+                    tshift_cmt,hdur_gaussian,DT,t0,scale_displ, &
+                    hprime_xx,hprime_yy,hprime_zz, &
+                    xix_crust_mantle,xiy_crust_mantle,xiz_crust_mantle, &
+                    etax_crust_mantle,etay_crust_mantle,etaz_crust_mantle, &
+                    gammax_crust_mantle,gammay_crust_mantle,gammaz_crust_mantle, &
+                    moment_der,sloc_der,stshift_der,shdur_der, &
+                    NTSTEP_BETWEEN_OUTPUT_SEISMOS,seismograms,deltat, &
+                    ibool_crust_mantle,ispec_selected_source,number_receiver_global, &
+                    NSTEP,it,nit_written)
+
+    else if (SIMULATION_TYPE == 3) then
+      ! kernel run: seismograms from the backward/reconstructed field
+      call compute_seismograms_backward(nrec_local,nrec,b_displ_crust_mantle, &
+                                nu,hxir_store,hetar_store,hgammar_store, &
+                                scale_displ,ibool_crust_mantle, &
+                                ispec_selected_rec,number_receiver_global, &
+                                seismo_current,NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
+                                seismograms)
+
+    endif
+  endif ! nrec_local
+
+  ! write the current or final seismograms
+  if(seismo_current == NTSTEP_BETWEEN_OUTPUT_SEISMOS .or. it == it_end) then
+    if (SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then
+      call write_seismograms_to_file(myrank,seismograms,number_receiver_global,station_name, &
+            network_name,stlat,stlon,stele,stbur, &
+            nrec,nrec_local,ANGULAR_WIDTH_XI_IN_DEGREES,NEX_XI,DT,t0,it_end, &
+            yr_SAC,jda_SAC,ho_SAC,mi_SAC,sec_SAC,t_cmt_SAC,t_shift_SAC, &
+            elat_SAC,elon_SAC,depth_SAC,event_name_SAC,cmt_lat_SAC,cmt_lon_SAC,&
+            cmt_depth_SAC,cmt_hdur_SAC,NPROCTOT_VAL, &
+            OUTPUT_SEISMOS_ASCII_TEXT,OUTPUT_SEISMOS_SAC_ALPHANUM, &
+            OUTPUT_SEISMOS_SAC_BINARY,ROTATE_SEISMOGRAMS_RT,NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
+            seismo_offset,seismo_current,WRITE_SEISMOGRAMS_BY_MASTER, &
+            SAVE_ALL_SEISMOS_IN_ONE_FILE,USE_BINARY_FOR_LARGE_FILE,MODEL)
+      if(myrank==0) then
+        write(IMAIN,*)
+        write(IMAIN,*) ' Total number of time steps written: ', it-it_begin+1
+        write(IMAIN,*)
+      endif
+    else
+      ! NOTE(review): the continued logical IF below guards ONLY the call;
+      ! "nit_written = it" is executed unconditionally despite its
+      ! indentation -- confirm whether it was meant to be inside an
+      ! if/then/endif block restricted to nrec_local > 0
+      if( nrec_local > 0 ) &
+        call write_adj_seismograms(seismograms,number_receiver_global, &
+                                  nrec_local,it,nit_written,DT, &
+                                  NSTEP,NTSTEP_BETWEEN_OUTPUT_SEISMOS,t0,LOCAL_PATH)
+        nit_written = it
+    endif
+    ! advance the output window
+    seismo_offset = seismo_offset + seismo_current
+    seismo_current = 0
+  endif
+
+  end subroutine write_seismograms
+
+!=====================================================================
+
 ! write seismograms to files
-  subroutine write_seismograms(myrank,seismograms,number_receiver_global,station_name, &
+  subroutine write_seismograms_to_file(myrank,seismograms,number_receiver_global,station_name, &
             network_name,stlat,stlon,stele,stbur, &
             nrec,nrec_local,ANGULAR_WIDTH_XI_IN_DEGREES,NEX_XI,DT,hdur,it_end, &
             yr,jda,ho,mi,sec,tshift_cmt,t_shift, &
@@ -275,7 +360,7 @@
 
   deallocate(one_seismogram)
 
-  end subroutine write_seismograms
+  end subroutine write_seismograms_to_file
 
 !=====================================================================
 



More information about the CIG-COMMITS mailing list