[cig-commits] r20392 - in seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER: examples/homogeneous_halfspace/in_data_files in_data_files src/cuda src/decompose_mesh_SCOTCH src/generate_databases src/shared src/specfem3D

danielpeter at geodynamics.org danielpeter at geodynamics.org
Tue Jun 19 15:23:02 PDT 2012


Author: danielpeter
Date: 2012-06-19 15:23:01 -0700 (Tue, 19 Jun 2012)
New Revision: 20392

Added:
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/model_ipati.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic_Dev2.f90
Modified:
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/homogeneous_halfspace/in_data_files/Par_file
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/in_data_files/Par_file
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/decompose_mesh_SCOTCH/decompose_mesh_SCOTCH.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/decompose_mesh_SCOTCH/part_decompose_mesh_SCOTCH.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/Makefile.in
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/create_regions_mesh.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/generate_databases.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/get_coupling_surfaces.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/get_model.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/model_external_values.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/model_gll.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/save_arrays_solver.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/check_mesh_resolution.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/combine_vol_data.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/constants.h.in
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/get_element_face.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/read_parameter_file.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/sum_kernels.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/write_c_binary.c
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/Makefile.in
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic_Dev.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/initialize_simulation.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/locate_receivers.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.F90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/write_seismograms.f90
Log:
adds ipati model to mesher; updates formatting; moves higher-order Deville routines into separate file

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/homogeneous_halfspace/in_data_files/Par_file
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/homogeneous_halfspace/in_data_files/Par_file	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/homogeneous_halfspace/in_data_files/Par_file	2012-06-19 22:23:01 UTC (rev 20392)
@@ -54,13 +54,14 @@
 LOCAL_PATH                      = ../in_out_files/DATABASES_MPI
 
 # interval at which we output time step info and max of norm of displacement
-NTSTEP_BETWEEN_OUTPUT_INFO      = 500
+NTSTEP_BETWEEN_OUTPUT_INFO      = 100
 
 # interval in time steps for writing of seismograms
 NTSTEP_BETWEEN_OUTPUT_SEISMOS   = 10000
 
 # interval in time steps for reading adjoint traces
-NTSTEP_BETWEEN_READ_ADJSRC      = 0      # 0 = read the whole adjoint sources at the same time
+# 0 = read the whole adjoint sources at the same time
+NTSTEP_BETWEEN_READ_ADJSRC      = 0      
 
 # print source time function
 PRINT_SOURCE_TIME_FUNCTION      = .false.

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/in_data_files/Par_file
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/in_data_files/Par_file	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/in_data_files/Par_file	2012-06-19 22:23:01 UTC (rev 20392)
@@ -33,6 +33,7 @@
 ATTENUATION                     = .false.
 USE_OLSEN_ATTENUATION           = .false.
 ANISOTROPY                      = .false.
+GRAVITY                         = .false.
 
 # absorbing boundary conditions for a regional simulation
 ABSORBING_CONDITIONS            = .false.

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu	2012-06-19 22:23:01 UTC (rev 20392)
@@ -37,7 +37,6 @@
 #include "mesh_constants_cuda.h"
 // #include "epik_user.h"
 
-
 //  cuda constant arrays
 __device__ realw d_hprime_xx[NGLL2];
 
@@ -54,11 +53,14 @@
 __constant__ realw d_wgll_cube[NGLL3]; // needed only for gravity case
 
 // prototype for the fortran function to do non-blocking mpi send
-extern "C" void assemble_mpi_vector_send_cuda_(void*,void*,void*,void*,void*,void*,void*,void*,void*);
+void FC_FUNC_(assemble_mpi_vector_send_cuda,
+              ASSEMBLE_MPI_VECTOR_SEND_CUDA)(void*,void*,void*,void*,void*,void*,void*,void*,void*);
+
 /* ----------------------------------------------------------------------------------------------- */
 
 // prepares a device array with with all inter-element edge-nodes -- this
 // is followed by a memcpy and MPI operations
+
 __global__ void prepare_boundary_accel_on_device(realw* d_accel, realw* d_send_accel_buffer,
                                                  int num_interfaces_ext_mesh,
                                                  int max_nibool_interfaces_ext_mesh,
@@ -151,9 +153,15 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-extern "C" void FC_FUNC_(transfer_boundary_from_device_asynchronously,TRANSFER_BOUNDARY_FROM_DEVICE_ASYNCHRONOUSLY)(long* Mesh_pointer,int* nspec_outer_elastic) {
+extern "C"
+void FC_FUNC_(transfer_boundary_from_device_a,
+              TRANSFER_BOUNDARY_FROM_DEVICE_A)(long* Mesh_pointer,
+                                               int* nspec_outer_elastic) {
 
-  TRACE("transfer_boundary_from_device_asynchronously");
+// asynchronous transfer from device to host
+
+  TRACE("transfer_boundary_from_device_a");
+
   Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
 
   int num_blocks_x = *nspec_outer_elastic;
@@ -173,7 +181,11 @@
                                                                           mp->d_nibool_interfaces_ext_mesh,
                                                                           mp->d_ibool_interfaces_ext_mesh);
   // wait until kernel is finished before starting async memcpy
+#if CUDA_VERSION >= 4000
   cudaDeviceSynchronize();
+#else
+  cudaThreadSynchronize();
+#endif
 
   cudaMemcpyAsync(mp->h_send_accel_buffer,mp->d_send_accel_buffer,
                   3* mp->max_nibool_interfaces_ext_mesh* mp->num_interfaces_ext_mesh*sizeof(realw),
@@ -229,13 +241,16 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
-void FC_FUNC_(transfer_boundary_to_device_asynchronously,TRANSFER_BOUNDARY_TO_DEVICE_ASYNCHRONOUSLY)(long* Mesh_pointer,
-                                                                           realw* buffer_recv_vector_ext_mesh,
-                                                                           int* num_interfaces_ext_mesh,
-                                                                           int* max_nibool_interfaces_ext_mesh) {
+void FC_FUNC_(transfer_boundary_to_device_a,
+              TRANSFER_BOUNDARY_TO_DEVICE_A)(long* Mesh_pointer,
+                                             realw* buffer_recv_vector_ext_mesh,
+                                             int* num_interfaces_ext_mesh,
+                                             int* max_nibool_interfaces_ext_mesh) {
 
-  TRACE("transfer_boundary_to_device_asynchronously");
+// asynchronous transfer from host to device
 
+  TRACE("transfer_boundary_to_device_a");
+
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   memcpy(mp->h_recv_accel_buffer,buffer_recv_vector_ext_mesh,mp->size_mpi_recv_buffer*sizeof(realw));
@@ -1664,7 +1679,7 @@
     // memory copy is now finished, so non-blocking MPI send can proceed
     // MPI based halo exchange
 
-    assemble_mpi_vector_send_cuda_(&(mp->NPROCS),
+    assemble_mpi_vector_send_cuda(&(mp->NPROCS),
                                    mp->send_buffer, /* "regular" memory */
                                    // mp->h_send_accel_buffer, /* pinned memory **CRASH** */
                                    mp->buffer_recv_vector_ext_mesh,

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2012-06-19 22:23:01 UTC (rev 20392)
@@ -120,15 +120,24 @@
 // leads up to ~ 5% performance increase
 //#define USE_MESH_COLORING_GPU
 
-// use textures for d_displ and d_accel -- 10% performance boost
+// Texture memory usage:
+// requires CUDA version >= 4.0, see check below
+// Use textures for d_displ and d_accel -- 10% performance boost
 #define USE_TEXTURES_FIELDS
-
+//
 // Using texture memory for the hprime-style constants is slower on
 // Fermi generation hardware, but *may* be faster on Kepler
 // generation.
-/* Use textures for hprime_xx */
-/* #define USE_TEXTURES_CONSTANTS */
+// Use textures for hprime_xx
+//#define USE_TEXTURES_CONSTANTS
 
+// CUDA version >= 4.0 needed for cudaTextureType1D and cudaDeviceSynchronize()
+#if CUDA_VERSION < 4000
+#undef USE_TEXTURES_FIELDS
+#undef USE_TEXTURES_CONSTANTS
+#endif
+
+
 // (optional) unrolling loops
 // leads up to ~1% performance increase
 //#define MANUALLY_UNROLLED_LOOPS

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c	2012-06-19 22:23:01 UTC (rev 20392)
@@ -103,7 +103,7 @@
   //char* errorstr;
   if(fp == 0) {
     //errorstr = (char*) strerror(errno);
-    printf("FILE ERROR:%s\n",strerror(errno));
+    printf("FILE ERROR:%s\n",(char*) strerror(errno));
     perror("file error\n");
     exit(1);
   }
@@ -183,7 +183,7 @@
   if(fp_cpu == 0) {
     //errorstr = (char*) strerror(errno);
     //printf("CPU FILE ERROR:%s\n",errorstr);
-    printf("CPU FILE ERROR:%s\n",strerror(errno));
+    printf("CPU FILE ERROR:%s\n",(char*) strerror(errno));
     perror("cpu file error\n");
   }
   FILE* fp_gpu;
@@ -192,7 +192,7 @@
   if(fp_gpu == NULL) {
     //errorstr = (char*) strerror(errno);
     //printf("GPU FILE ERROR:%s\n",errorstr);
-    printf("GPU FILE ERROR:%s\n",strerror(errno));
+    printf("GPU FILE ERROR:%s\n",(char*) strerror(errno));
     perror("gpu file error\n");
   }
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2012-06-19 22:23:01 UTC (rev 20392)
@@ -1,4 +1,4 @@
-/*
+/* 
 !=====================================================================
 !
 !               S p e c f e m 3 D  V e r s i o n  2 . 0
@@ -33,51 +33,51 @@
 
 typedef float realw;
 
+ 
 
-
 //
 // src/cuda/check_fields_cuda.cu
 //
 
 void FC_FUNC_(check_max_norm_displ_gpu,
-              CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {}
+              CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {} 
 
 void FC_FUNC_(check_max_norm_vector,
-              CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {}
+              CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {} 
 
 void FC_FUNC_(check_max_norm_displ,
-              CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {}
+              CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {} 
 
 void FC_FUNC_(check_max_norm_b_displ_gpu,
-              CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {}
+              CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {} 
 
 void FC_FUNC_(check_max_norm_b_accel_gpu,
-              CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {}
+              CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {} 
 
 void FC_FUNC_(check_max_norm_b_veloc_gpu,
-              CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {}
+              CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {} 
 
 void FC_FUNC_(check_max_norm_b_displ,
-              CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {}
+              CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {} 
 
 void FC_FUNC_(check_max_norm_b_accel,
-              CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {}
+              CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {} 
 
 void FC_FUNC_(check_error_vectors,
-              CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {}
+              CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {} 
 
 void FC_FUNC_(get_max_accel,
-              GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {}
+              GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {} 
 
 void FC_FUNC_(get_norm_acoustic_from_device,
               GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
                                                   long* Mesh_pointer_f,
-                                                  int* SIMULATION_TYPE) {}
+                                                  int* SIMULATION_TYPE) {} 
 
 void FC_FUNC_(get_norm_elastic_from_device,
               GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
                                                  long* Mesh_pointer_f,
-                                                 int* SIMULATION_TYPE) {}
+                                                 int* SIMULATION_TYPE) {} 
 
 
 //
@@ -90,7 +90,7 @@
                                                  int* NSOURCESf,
                                                  int* SIMULATION_TYPEf,
                                                  double* h_stf_pre_compute,
-                                                 int* myrankf) {}
+                                                 int* myrankf) {} 
 
 void FC_FUNC_(compute_add_sources_ac_s3_cuda,
               COMPUTE_ADD_SOURCES_AC_s3_CUDA)(long* Mesh_pointer_f,
@@ -98,7 +98,7 @@
                                                       int* NSOURCESf,
                                                       int* SIMULATION_TYPEf,
                                                       double* h_stf_pre_compute,
-                                                      int* myrankf) {}
+                                                      int* myrankf) {} 
 
 void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
               ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer,
@@ -112,7 +112,7 @@
                                                int* time_index,
                                                int* h_islice_selected_rec,
                                                int* nadj_rec_local,
-                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {}
+                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {} 
 
 
 //
@@ -124,21 +124,21 @@
                                             int* phase_is_innerf,
                                             int* NSOURCESf,
                                             double* h_stf_pre_compute,
-                                            int* myrankf) {}
+                                            int* myrankf) {} 
 
 void FC_FUNC_(compute_add_sources_el_s3_cuda,
               COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer,
                                               double* h_stf_pre_compute,
                                               int* NSOURCESf,
                                               int* phase_is_inner,
-                                              int* myrank) {}
+                                              int* myrank) {} 
 
 void FC_FUNC_(add_source_master_rec_noise_cu,
               ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer_f,
                                                 int* myrank_f,
                                                 int* it_f,
                                                 int* irec_master_noise_f,
-                                                int* islice_selected_rec) {}
+                                                int* islice_selected_rec) {} 
 
 void FC_FUNC_(add_sources_el_sim_type_2_or_3,
               ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
@@ -152,7 +152,7 @@
                                                int* time_index,
                                                int* h_islice_selected_rec,
                                                int* nadj_rec_local,
-                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {}
+                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {} 
 
 
 //
@@ -163,13 +163,13 @@
               COMPUTE_COUPLING_AC_EL_CUDA)(long* Mesh_pointer_f,
                                            int* phase_is_innerf,
                                            int* num_coupling_ac_el_facesf,
-                                           int* SIMULATION_TYPEf) {}
+                                           int* SIMULATION_TYPEf) {} 
 
 void FC_FUNC_(compute_coupling_el_ac_cuda,
               COMPUTE_COUPLING_EL_AC_CUDA)(long* Mesh_pointer_f,
                                            int* phase_is_innerf,
                                            int* num_coupling_ac_el_facesf,
-                                           int* SIMULATION_TYPEf) {}
+                                           int* SIMULATION_TYPEf) {} 
 
 
 //
@@ -186,7 +186,7 @@
                                               int* max_nibool_interfaces_ext_mesh,
                                               int* nibool_interfaces_ext_mesh,
                                               int* ibool_interfaces_ext_mesh,
-                                              int* FORWARD_OR_ADJOINT){}
+                                              int* FORWARD_OR_ADJOINT){} 
 
 void FC_FUNC_(transfer_asmbl_pot_to_device,
               TRANSFER_ASMBL_POT_TO_DEVICE)(
@@ -197,31 +197,31 @@
                                                 int* max_nibool_interfaces_ext_mesh,
                                                 int* nibool_interfaces_ext_mesh,
                                                 int* ibool_interfaces_ext_mesh,
-                                                int* FORWARD_OR_ADJOINT) {}
+                                                int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(compute_forces_acoustic_cuda,
               COMPUTE_FORCES_ACOUSTIC_CUDA)(long* Mesh_pointer_f,
                                             int* iphase,
                                             int* nspec_outer_acoustic,
                                             int* nspec_inner_acoustic,
-                                            int* SIMULATION_TYPE) {}
+                                            int* SIMULATION_TYPE) {} 
 
 void FC_FUNC_(kernel_3_a_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)(
                              long* Mesh_pointer,
                              int* size_F,
-                             int* SIMULATION_TYPE) {}
+                             int* SIMULATION_TYPE) {} 
 
 void FC_FUNC_(kernel_3_b_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)(
                                                              long* Mesh_pointer,
                                                              int* size_F,
                                                              realw* deltatover2_F,
                                                              int* SIMULATION_TYPE,
-                                                             realw* b_deltatover2_F) {}
+                                                             realw* b_deltatover2_F) {} 
 
 void FC_FUNC_(acoustic_enforce_free_surf_cuda,
               ACOUSTIC_ENFORCE_FREE_SURF_CUDA)(long* Mesh_pointer_f,
                                                   int* SIMULATION_TYPE,
-                                                  int* ABSORB_FREE_SURFACE) {}
+                                                  int* ABSORB_FREE_SURFACE) {} 
 
 
 //
@@ -235,8 +235,27 @@
                                                     int* max_nibool_interfaces_ext_mesh,
                                                     int* nibool_interfaces_ext_mesh,
                                                     int* ibool_interfaces_ext_mesh,
-                                                    int* FORWARD_OR_ADJOINT){}
+                                                    int* FORWARD_OR_ADJOINT){} 
 
+void FC_FUNC_(transfer_boundary_from_device_a,
+              TRANSFER_BOUNDARY_FROM_DEVICE_A)(long* Mesh_pointer,
+                                               int* nspec_outer_elastic) {} 
+
+void FC_FUNC_(transfer_boundary_to_device_a,
+              TRANSFER_BOUNDARY_TO_DEVICE_A)(long* Mesh_pointer,
+                                             realw* buffer_recv_vector_ext_mesh,
+                                             int* num_interfaces_ext_mesh,
+                                             int* max_nibool_interfaces_ext_mesh) {} 
+
+void FC_FUNC_(assemble_accel_on_device,
+              ASSEMBLE_ACCEL_on_DEVICE)(long* Mesh_pointer, realw* accel,
+                                              realw* buffer_recv_vector_ext_mesh,
+                                              int* num_interfaces_ext_mesh,
+                                              int* max_nibool_interfaces_ext_mesh,
+                                              int* nibool_interfaces_ext_mesh,
+                                              int* ibool_interfaces_ext_mesh,
+                                              int* FORWARD_OR_ADJOINT) {} 
+
 void FC_FUNC_(transfer_asmbl_accel_to_device,
               TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer, realw* accel,
                                                     realw* buffer_recv_vector_ext_mesh,
@@ -244,7 +263,7 @@
                                                     int* max_nibool_interfaces_ext_mesh,
                                                     int* nibool_interfaces_ext_mesh,
                                                     int* ibool_interfaces_ext_mesh,
-                                                    int* FORWARD_OR_ADJOINT) {}
+                                                    int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(compute_forces_elastic_cuda,
               COMPUTE_FORCES_ELASTIC_CUDA)(long* Mesh_pointer_f,
@@ -254,7 +273,7 @@
                                            int* SIMULATION_TYPE,
                                            int* COMPUTE_AND_STORE_STRAIN,
                                            int* ATTENUATION,
-                                           int* ANISOTROPY) {}
+                                           int* ANISOTROPY) {} 
 
 void FC_FUNC_(kernel_3_a_cuda,
               KERNEL_3_A_CUDA)(long* Mesh_pointer,
@@ -262,18 +281,18 @@
                                realw* deltatover2_F,
                                int* SIMULATION_TYPE_f,
                                realw* b_deltatover2_F,
-                               int* OCEANS) {}
+                               int* OCEANS) {} 
 
 void FC_FUNC_(kernel_3_b_cuda,
               KERNEL_3_B_CUDA)(long* Mesh_pointer,
                              int* size_F,
                              realw* deltatover2_F,
                              int* SIMULATION_TYPE_f,
-                             realw* b_deltatover2_F) {}
+                             realw* b_deltatover2_F) {} 
 
 void FC_FUNC_(elastic_ocean_load_cuda,
               ELASTIC_OCEAN_LOAD_CUDA)(long* Mesh_pointer_f,
-                                       int* SIMULATION_TYPE) {}
+                                       int* SIMULATION_TYPE) {} 
 
 
 //
@@ -282,23 +301,23 @@
 
 void FC_FUNC_(compute_kernels_elastic_cuda,
               COMPUTE_KERNELS_ELASTIC_CUDA)(long* Mesh_pointer,
-                                            realw* deltat_f) {}
+                                            realw* deltat_f) {} 
 
 void FC_FUNC_(compute_kernels_strgth_noise_cu,
               COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer,
                                                     realw* h_noise_surface_movie,
-                                                    realw* deltat) {}
+                                                    realw* deltat) {} 
 
 void FC_FUNC_(compute_kernels_acoustic_cuda,
               COMPUTE_KERNELS_ACOUSTIC_CUDA)(
                                              long* Mesh_pointer,
-                                             realw* deltat_f) {}
+                                             realw* deltat_f) {} 
 
 void FC_FUNC_(compute_kernels_hess_cuda,
               COMPUTE_KERNELS_HESS_CUDA)(long* Mesh_pointer,
                                          realw* deltat_f,
                                          int* ELASTIC_SIMULATION,
-                                         int* ACOUSTIC_SIMULATION) {}
+                                         int* ACOUSTIC_SIMULATION) {} 
 
 
 //
@@ -311,7 +330,7 @@
                                     int* phase_is_innerf,
                                     int* SIMULATION_TYPEf,
                                     int* SAVE_FORWARDf,
-                                    realw* h_b_absorb_potential) {}
+                                    realw* h_b_absorb_potential) {} 
 
 
 //
@@ -323,7 +342,7 @@
                                            int* phase_is_innerf,
                                            int* SIMULATION_TYPEf,
                                            int* SAVE_FORWARDf,
-                                           realw* h_b_absorb_field) {}
+                                           realw* h_b_absorb_field) {} 
 
 
 //
@@ -339,7 +358,7 @@
                                                  int* SIMULATION_TYPE,
                                                  realw* b_deltat_F,
                                                  realw* b_deltatsqover2_F,
-                                                 realw* b_deltatover2_F) {}
+                                                 realw* b_deltatover2_F) {} 
 
 void FC_FUNC_(it_update_displacement_ac_cuda,
               it_update_displacement_ac_cuda)(long* Mesh_pointer_f,
@@ -350,53 +369,53 @@
                                                int* SIMULATION_TYPE,
                                                realw* b_deltat_F,
                                                realw* b_deltatsqover2_F,
-                                               realw* b_deltatover2_F) {}
+                                               realw* b_deltatover2_F) {} 
 
 
 //
 // src/cuda/noise_tomography_cuda.cu
 //
 
-void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){}
+void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){} 
 
-void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id) {}
+void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id) {} 
 
-void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) {}
+void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) {} 
 
-void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val) {}
+void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val) {} 
 
-void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) {}
+void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) {} 
 
 void FC_FUNC_(transfer_surface_to_host,
               TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer_f,
-                                        realw* h_noise_surface_movie) {}
+                                        realw* h_noise_surface_movie) {} 
 
 void FC_FUNC_(noise_read_add_surface_movie_cu,
               NOISE_READ_ADD_SURFACE_MOVIE_CU)(long* Mesh_pointer_f,
                                                realw* h_noise_surface_movie,
-                                               int* NOISE_TOMOGRAPHYf) {}
+                                               int* NOISE_TOMOGRAPHYf) {} 
 
 
 //
 // src/cuda/prepare_mesh_constants_cuda.cu
 //
 
-void FC_FUNC_(pause_for_debug,PAUSE_FOR_DEBUG)() {}
+void FC_FUNC_(pause_for_debug,PAUSE_FOR_DEBUG)() {} 
 
 void FC_FUNC_(output_free_device_memory,
-              OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {}
+              OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {} 
 
 void FC_FUNC_(show_free_device_memory,
-              SHOW_FREE_DEVICE_MEMORY)() {}
+              SHOW_FREE_DEVICE_MEMORY)() {} 
 
 void FC_FUNC_(get_free_device_memory,
-              get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {}
+              get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {} 
 
 void FC_FUNC_(prepare_cuda_device,
-              PREPARE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) {
+              PREPARE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) { 
  fprintf(stderr,"ERROR: GPU_MODE enabled without GPU/CUDA Support. To enable GPU support, reconfigure with --with-cuda flag.\n");
  exit(1);
-}
+} 
 
 void FC_FUNC_(prepare_constants_device,
               PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
@@ -431,7 +450,12 @@
                                         int* nrec_local_f,
                                         int* SIMULATION_TYPE,
                                         int* USE_MESH_COLORING_GPU_f,
-                                        int* nspec_acoustic,int* nspec_elastic) {}
+                                        int* nspec_acoustic,int* nspec_elastic,
+                                        int* my_neighbours_ext_mesh,
+                                        int* request_send_vector_ext_mesh,
+                                        int* request_recv_vector_ext_mesh,
+                                        realw* buffer_recv_vector_ext_mesh
+                                        ) {} 
 
 void FC_FUNC_(prepare_fields_acoustic_device,
               PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f,
@@ -456,12 +480,12 @@
                                               realw* coupling_ac_el_jacobian2Dw,
                                               int* num_colors_outer_acoustic,
                                               int* num_colors_inner_acoustic,
-                                              int* num_elem_colors_acoustic) {}
+                                              int* num_elem_colors_acoustic) {} 
 
 void FC_FUNC_(prepare_fields_acoustic_adj_dev,
               PREPARE_FIELDS_ACOUSTIC_ADJ_DEV)(long* Mesh_pointer_f,
                                               int* SIMULATION_TYPE,
-                                              int* APPROXIMATE_HESS_KL) {}
+                                              int* APPROXIMATE_HESS_KL) {} 
 
 void FC_FUNC_(prepare_fields_elastic_device,
               PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer_f,
@@ -516,7 +540,7 @@
                                              realw *c46store,
                                              realw *c55store,
                                              realw *c56store,
-                                             realw *c66store){}
+                                             realw *c66store){} 
 
 void FC_FUNC_(prepare_fields_elastic_adj_dev,
               PREPARE_FIELDS_ELASTIC_ADJ_DEV)(long* Mesh_pointer_f,
@@ -531,7 +555,7 @@
                                              int* R_size,
                                              realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
                                              realw* b_alphaval,realw* b_betaval,realw* b_gammaval,
-                                             int* APPROXIMATE_HESS_KL){}
+                                             int* APPROXIMATE_HESS_KL){} 
 
 void FC_FUNC_(prepare_sim2_or_3_const_device,
               PREPARE_SIM2_OR_3_CONST_DEVICE)(
@@ -540,7 +564,7 @@
                                               int* islice_selected_rec_size,
                                               int* nadj_rec_local,
                                               int* nrec,
-                                              int* myrank) {}
+                                              int* myrank) {} 
 
 void FC_FUNC_(prepare_fields_noise_device,
               PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f,
@@ -556,7 +580,7 @@
                                            realw* normal_y_noise,
                                            realw* normal_z_noise,
                                            realw* mask_noise,
-                                           realw* free_surface_jacobian2Dw) {}
+                                           realw* free_surface_jacobian2Dw) {} 
 
 void FC_FUNC_(prepare_fields_gravity_device,
               PREPARE_FIELDS_gravity_DEVICE)(long* Mesh_pointer_f,
@@ -565,8 +589,11 @@
                                              realw* minus_g,
                                              realw* h_wgll_cube,
                                              int* ACOUSTIC_SIMULATION,
-                                             realw* rhostore) {}
+                                             realw* rhostore) {} 
 
+void FC_FUNC_(prepare_seismogram_fields,
+              PREPARE_SEISMOGRAM_FIELDS)(long* Mesh_pointer,int* nrec_local, double* nu, double* hxir, double* hetar, double* hgammar) {} 
+
 void FC_FUNC_(prepare_cleanup_device,
               PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f,
                                       int* SIMULATION_TYPE,
@@ -579,7 +606,7 @@
                                       int* ATTENUATION,
                                       int* ANISOTROPY,
                                       int* OCEANS,
-                                      int* APPROXIMATE_HESS_KL) {}
+                                      int* APPROXIMATE_HESS_KL) {} 
 
 
 //
@@ -587,41 +614,41 @@
 //
 
 void FC_FUNC_(transfer_fields_el_to_device,
-              TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_fields_el_from_device,
-              TRANSFER_FIELDS_EL_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_FIELDS_EL_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_fields_to_device,
               TRANSFER_B_FIELDS_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
-                                           long* Mesh_pointer_f) {}
+                                           long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_fields_from_device,
-              TRANSFER_B_FIELDS_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,long* Mesh_pointer_f) {}
+              TRANSFER_B_FIELDS_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_accel_to_device,
-              TRNASFER_ACCEL_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+              TRNASFER_ACCEL_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_accel_from_device,
-              TRANSFER_ACCEL_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_ACCEL_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_accel_from_device,
-              TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {}
+              TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_sigma_from_device,
-              TRANSFER_SIGMA_FROM_DEVICE)(int* size, realw* sigma_kl,long* Mesh_pointer_f) {}
+              TRANSFER_SIGMA_FROM_DEVICE)(int* size, realw* sigma_kl,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_displ_from_device,
-              TRANSFER_B_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {}
+              TRANSFER_B_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_displ_from_device,
-              TRANSFER_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {}
+              TRANSFER_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_compute_kernel_answers_from_device,
               TRANSFER_COMPUTE_KERNEL_ANSWERS_FROM_DEVICE)(long* Mesh_pointer,
                                                            realw* rho_kl,int* size_rho,
                                                            realw* mu_kl, int* size_mu,
-                                                           realw* kappa_kl, int* size_kappa) {}
+                                                           realw* kappa_kl, int* size_kappa) {} 
 
 void FC_FUNC_(transfer_compute_kernel_fields_from_device,
               TRANSFER_COMPUTE_KERNEL_FIELDS_FROM_DEVICE)(long* Mesh_pointer,
@@ -644,7 +671,7 @@
                                                           realw* kappa_kl, int* size_kappa,
                                                           realw* epsilon_trace_over_3,
                                                           realw* b_epsilon_trace_over_3,
-                                                          int* size_epsilon_trace_over_3) {}
+                                                          int* size_epsilon_trace_over_3) {} 
 
 void FC_FUNC_(transfer_b_fields_att_to_device,
               TRANSFER_B_FIELDS_ATT_TO_DEVICE)(long* Mesh_pointer,
@@ -655,7 +682,7 @@
                                              realw* b_epsilondev_xy,
                                              realw* b_epsilondev_xz,
                                              realw* b_epsilondev_yz,
-                                             int* size_epsilondev) {}
+                                             int* size_epsilondev) {} 
 
 void FC_FUNC_(transfer_fields_att_from_device,
               TRANSFER_FIELDS_ATT_FROM_DEVICE)(long* Mesh_pointer,
@@ -666,19 +693,19 @@
                                                realw* epsilondev_xy,
                                                realw* epsilondev_xz,
                                                realw* epsilondev_yz,
-                                               int* size_epsilondev) {}
+                                               int* size_epsilondev) {} 
 
 void FC_FUNC_(transfer_kernels_el_to_host,
               TRANSFER_KERNELS_EL_TO_HOST)(long* Mesh_pointer,
                                                     realw* h_rho_kl,
                                                     realw* h_mu_kl,
                                                     realw* h_kappa_kl,
-                                                    int* NSPEC_AB) {}
+                                                    int* NSPEC_AB) {} 
 
 void FC_FUNC_(transfer_kernels_noise_to_host,
               TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer,
                                                           realw* h_Sigma_kl,
-                                                          int* NSPEC_AB) {}
+                                                          int* NSPEC_AB) {} 
 
 void FC_FUNC_(transfer_fields_ac_to_device,
               TRANSFER_FIELDS_AC_TO_DEVICE)(
@@ -686,7 +713,7 @@
                                                   realw* potential_acoustic,
                                                   realw* potential_dot_acoustic,
                                                   realw* potential_dot_dot_acoustic,
-                                                  long* Mesh_pointer_f) {}
+                                                  long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_fields_ac_to_device,
               TRANSFER_B_FIELDS_AC_TO_DEVICE)(
@@ -694,14 +721,14 @@
                                                     realw* b_potential_acoustic,
                                                     realw* b_potential_dot_acoustic,
                                                     realw* b_potential_dot_dot_acoustic,
-                                                    long* Mesh_pointer_f) {}
+                                                    long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_fields_ac_from_device,
               TRANSFER_FIELDS_AC_FROM_DEVICE)(int* size,
                                               realw* potential_acoustic,
                                               realw* potential_dot_acoustic,
                                               realw* potential_dot_dot_acoustic,
-                                              long* Mesh_pointer_f) {}
+                                              long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_fields_ac_from_device,
               TRANSFER_B_FIELDS_AC_FROM_DEVICE)(
@@ -709,41 +736,50 @@
                                                       realw* b_potential_acoustic,
                                                       realw* b_potential_dot_acoustic,
                                                       realw* b_potential_dot_dot_acoustic,
-                                                      long* Mesh_pointer_f) {}
+                                                      long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_dot_dot_from_device,
-              TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, realw* potential_dot_dot_acoustic,long* Mesh_pointer_f) {}
+              TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, realw* potential_dot_dot_acoustic,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_dot_dot_from_device,
-              TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, realw* b_potential_dot_dot_acoustic,long* Mesh_pointer_f) {}
+              TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, realw* b_potential_dot_dot_acoustic,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_kernels_ac_to_host,
               TRANSFER_KERNELS_AC_TO_HOST)(long* Mesh_pointer,
                                                              realw* h_rho_ac_kl,
                                                              realw* h_kappa_ac_kl,
-                                                             int* NSPEC_AB) {}
+                                                             int* NSPEC_AB) {} 
 
 void FC_FUNC_(transfer_kernels_hess_el_tohost,
               TRANSFER_KERNELS_HESS_EL_TOHOST)(long* Mesh_pointer,
                                               realw* h_hess_kl,
-                                              int* NSPEC_AB) {}
+                                              int* NSPEC_AB) {} 
 
 void FC_FUNC_(transfer_kernels_hess_ac_tohost,
               TRANSFER_KERNELS_HESS_AC_TOHOST)(long* Mesh_pointer,
                                              realw* h_hess_ac_kl,
-                                             int* NSPEC_AB) {}
+                                             int* NSPEC_AB) {} 
 
 
 //
 // src/cuda/write_seismograms_cuda.cu
 //
 
+void FC_FUNC_(transfer_seismograms_el_from_d,
+              TRANSFER_SEISMOGRAMS_EL_FROM_D)(int* nrec_local,
+                                              long* Mesh_pointer_f,
+                                              int* SIMULATION_TYPEf,
+                                              realw* seismograms_d,
+                                              realw* seismograms_v,
+                                              realw* seismograms_a,
+                                              int* it) {} 
+
 void FC_FUNC_(transfer_station_el_from_device,
               TRANSFER_STATION_EL_FROM_DEVICE)(realw* displ,realw* veloc,realw* accel,
                                                    realw* b_displ, realw* b_veloc, realw* b_accel,
                                                    long* Mesh_pointer_f,int* number_receiver_global,
                                                    int* ispec_selected_rec,int* ispec_selected_source,
-                                                   int* ibool,int* SIMULATION_TYPEf) {}
+                                                   int* ibool,int* SIMULATION_TYPEf) {} 
 
 void FC_FUNC_(transfer_station_ac_from_device,
               TRANSFER_STATION_AC_FROM_DEVICE)(
@@ -758,5 +794,5 @@
                                                 int* ispec_selected_rec,
                                                 int* ispec_selected_source,
                                                 int* ibool,
-                                                int* SIMULATION_TYPEf) {}
+                                                int* SIMULATION_TYPEf) {} 
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu	2012-06-19 22:23:01 UTC (rev 20392)
@@ -59,7 +59,7 @@
 // Initially sets the blocks_x to be the num_blocks, and adds rows as
 // needed. If an additional row is added, the row length is cut in
 // half. If the block count is odd, there will be 1 too many blocks,
-// which must be managed at runtime with an if statement. 
+// which must be managed at runtime with an if statement.
 void get_blocks_xy(int num_blocks,int* num_blocks_x,int* num_blocks_y) {
   *num_blocks_x = num_blocks;
   *num_blocks_y = 1;
@@ -70,6 +70,8 @@
   return;
 }
 
+/* ----------------------------------------------------------------------------------------------- */
+
 __device__ double atomicAdd(double* address, double val)
 {
     unsigned long long int* address_as_ull =
@@ -84,20 +86,22 @@
     return __longlong_as_double(old);
 }
 
+/* ----------------------------------------------------------------------------------------------- */
+
 __global__ void compute_interpolated_dva_plus_seismogram(int nrec_local,
-							 realw* displ, realw* veloc, realw* accel,
-							 int* ibool,
-							 double* hxir, double* hetar, double* hgammar,
-							 realw* seismograms_d, realw* seismograms_v, realw* seismograms_a,
-							 double* nu,
-							 int* number_receiver_global,
-							 int* ispec_selected_rec) {
+               realw* displ, realw* veloc, realw* accel,
+               int* ibool,
+               double* hxir, double* hetar, double* hgammar,
+               realw* seismograms_d, realw* seismograms_v, realw* seismograms_a,
+               double* nu,
+               int* number_receiver_global,
+               int* ispec_selected_rec) {
   int irec_local = blockIdx.x + blockIdx.y*gridDim.x;
   int i = threadIdx.x;
   int j = threadIdx.y;
   int k = threadIdx.z;
   int ijk = i+5*(j+5*(k));
-  
+
   // we do the **d variable reduction in shared memory, because the
   // atomicAdd() should be faster on the shared memory registers
   // according to
@@ -111,7 +115,7 @@
   __shared__ double sh_axd[NGLL3];
   __shared__ double sh_ayd[NGLL3];
   __shared__ double sh_azd[NGLL3];
-  
+
   if(irec_local < nrec_local) {
     int irec = number_receiver_global[irec_local]-1;
     int ispec = ispec_selected_rec[irec]-1;
@@ -127,14 +131,14 @@
 
     sh_axd[ijk] = hlagrange*accel[0+3*iglob];
     sh_ayd[ijk] = hlagrange*accel[1+3*iglob];
-    sh_azd[ijk] = hlagrange*accel[2+3*iglob];        
+    sh_azd[ijk] = hlagrange*accel[2+3*iglob];
 
     // the reduction has to skip the first element (we don't need to
     // add element 0 to itself) This reduction serializes the code,
     // but it should be fast enough --- it can be made faster with a
     // proper reduction algorithm.
     __syncthreads();
-    
+
     // if(ijk>0) {
     // reduction needs to be done atomically to avoid race conditions
       // atomicAdd(&sh_dxd[0],sh_dxd[ijk]);
@@ -153,53 +157,55 @@
     if(ijk==0) {
       // a loop in thread 0 is 4 times faster than atomic operations
       for(int i=1;i<125;i++) {
-      	sh_dxd[0] += sh_dxd[i];
-      	sh_dyd[0] += sh_dyd[i];
-      	sh_dzd[0] += sh_dzd[i];
-		  
-      	sh_vxd[0] += sh_vxd[i];
-      	sh_vyd[0] += sh_vyd[i];
-      	sh_vzd[0] += sh_vzd[i];
-		  
-      	sh_axd[0] += sh_axd[i];
-      	sh_ayd[0] += sh_ayd[i];
-      	sh_azd[0] += sh_azd[i];
-	
+        sh_dxd[0] += sh_dxd[i];
+        sh_dyd[0] += sh_dyd[i];
+        sh_dzd[0] += sh_dzd[i];
+
+        sh_vxd[0] += sh_vxd[i];
+        sh_vyd[0] += sh_vyd[i];
+        sh_vzd[0] += sh_vzd[i];
+
+        sh_axd[0] += sh_axd[i];
+        sh_ayd[0] += sh_ayd[i];
+        sh_azd[0] += sh_azd[i];
+
       }
-      
+
       seismograms_d[0+3*irec_local] = nu[0+3*(0+3*irec)]*sh_dxd[0] + nu[0+3*(1+3*irec)]*sh_dyd[0] + nu[0+3*(2+3*irec)]*sh_dzd[0];
       seismograms_d[1+3*irec_local] = nu[1+3*(0+3*irec)]*sh_dxd[0] + nu[1+3*(1+3*irec)]*sh_dyd[0] + nu[1+3*(2+3*irec)]*sh_dzd[0];
       seismograms_d[2+3*irec_local] = nu[2+3*(0+3*irec)]*sh_dxd[0] + nu[2+3*(1+3*irec)]*sh_dyd[0] + nu[2+3*(2+3*irec)]*sh_dzd[0];
-            
+
       seismograms_v[0+3*irec_local] = nu[0+3*(0+3*irec)]*sh_vxd[0] + nu[0+3*(1+3*irec)]*sh_vyd[0] + nu[0+3*(2+3*irec)]*sh_vzd[0];
       seismograms_v[1+3*irec_local] = nu[1+3*(0+3*irec)]*sh_vxd[0] + nu[1+3*(1+3*irec)]*sh_vyd[0] + nu[1+3*(2+3*irec)]*sh_vzd[0];
       seismograms_v[2+3*irec_local] = nu[2+3*(0+3*irec)]*sh_vxd[0] + nu[2+3*(1+3*irec)]*sh_vyd[0] + nu[2+3*(2+3*irec)]*sh_vzd[0];
-      
+
       seismograms_a[0+3*irec_local] = nu[0+3*(0+3*irec)]*sh_axd[0] + nu[0+3*(1+3*irec)]*sh_ayd[0] + nu[0+3*(2+3*irec)]*sh_azd[0];
       seismograms_a[1+3*irec_local] = nu[1+3*(0+3*irec)]*sh_axd[0] + nu[1+3*(1+3*irec)]*sh_ayd[0] + nu[1+3*(2+3*irec)]*sh_azd[0];
-      seismograms_a[2+3*irec_local] = nu[2+3*(0+3*irec)]*sh_axd[0] + nu[2+3*(1+3*irec)]*sh_ayd[0] + nu[2+3*(2+3*irec)]*sh_azd[0];      
-      
+      seismograms_a[2+3*irec_local] = nu[2+3*(0+3*irec)]*sh_axd[0] + nu[2+3*(1+3*irec)]*sh_ayd[0] + nu[2+3*(2+3*irec)]*sh_azd[0];
+
     }
-  }    
+  }
 }
-							 
 
 
+/* ----------------------------------------------------------------------------------------------- */
+
 extern "C"
-void FC_FUNC_(transfer_seismograms_el_from_device,
-              TRANSFER_SEISMOGRAMS_EL_FROM_DEVICE)(int* nrec_local,
-                                                   long* Mesh_pointer_f,int* SIMULATION_TYPEf,
-						   realw* seismograms_d,
-						   realw* seismograms_v,
-						   realw* seismograms_a,
-						   int* it) {
-  
+void FC_FUNC_(transfer_seismograms_el_from_d,
+              TRANSFER_SEISMOGRAMS_EL_FROM_D)(int* nrec_local,
+                                              long* Mesh_pointer_f,
+                                              int* SIMULATION_TYPEf,
+                                              realw* seismograms_d,
+                                              realw* seismograms_v,
+                                              realw* seismograms_a,
+                                              int* it) {
 
+// transfers seismograms from device to host
 
-  TRACE("transfer_seismograms_el_from_device");
+  TRACE("transfer_seismograms_el_from_d");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
-  
+
   int num_blocks_x, num_blocks_y;
   get_blocks_xy(*nrec_local,&num_blocks_x,&num_blocks_y);
   dim3 grid(num_blocks_x,num_blocks_y);
@@ -214,21 +220,21 @@
   // cudaEventCreate(&start);
   // cudaEventCreate(&stop);
   // cudaEventRecord( start, 0 );
-  
+
   compute_interpolated_dva_plus_seismogram<<<grid,threads,0,mp->compute_stream>>>(*nrec_local,
-										  mp->d_displ,mp->d_veloc,mp->d_accel,
-										  mp->d_ibool,
-										  mp->d_hxir, mp->d_hetar, mp->d_hgammar,
-										  mp->d_seismograms_d,
-										  mp->d_seismograms_v,
-										  mp->d_seismograms_a,
-										  mp->d_nu,
-										  mp->d_number_receiver_global,
-										  mp->d_ispec_selected_rec
-										  );
+                      mp->d_displ,mp->d_veloc,mp->d_accel,
+                      mp->d_ibool,
+                      mp->d_hxir, mp->d_hetar, mp->d_hgammar,
+                      mp->d_seismograms_d,
+                      mp->d_seismograms_v,
+                      mp->d_seismograms_a,
+                      mp->d_nu,
+                      mp->d_number_receiver_global,
+                      mp->d_ispec_selected_rec
+                      );
 
   // cudaMemcpy(h_debug,d_debug,125*sizeof(double),cudaMemcpyDeviceToHost);
-  
+
   cudaMemcpy(mp->h_seismograms_d_it,mp->d_seismograms_d,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost);
   cudaMemcpy(mp->h_seismograms_v_it,mp->d_seismograms_v,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost);
   cudaMemcpy(mp->h_seismograms_a_it,mp->d_seismograms_a,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost);
@@ -239,23 +245,24 @@
   // cudaEventDestroy( start );
   // cudaEventDestroy( stop );
   // printf("seismogram Execution Time: %f ms\n",time);
-  
+
   // if(abs(mp->h_seismograms_d_it[0]) < 1e-25) printf("seismo1_x=%e\n",mp->h_seismograms_d_it[0]);
   // if(abs(mp->h_seismograms_d_it[1]) < 1e-25) printf("seismo1_y=%e\n",mp->h_seismograms_d_it[1]);
-  
+
   // if(abs(mp->h_seismograms_d_it[2]) < 1e-25) {
-  
+
   // printf("%d:seismo1_z=%e\n",*it,mp->h_seismograms_d_it[2]);
-  
+
   // }
-  
-  
+
+
   memcpy(&seismograms_d[3**nrec_local*(*it-1)],mp->h_seismograms_d_it,3* *nrec_local*sizeof(realw));
   memcpy(&seismograms_v[3**nrec_local*(*it-1)],mp->h_seismograms_v_it,3* *nrec_local*sizeof(realw));
-  memcpy(&seismograms_a[3**nrec_local*(*it-1)],mp->h_seismograms_a_it,3* *nrec_local*sizeof(realw));  
+  memcpy(&seismograms_a[3**nrec_local*(*it-1)],mp->h_seismograms_a_it,3* *nrec_local*sizeof(realw));
 
 }
 
+/* ----------------------------------------------------------------------------------------------- */
 
 __global__ void transfer_stations_fields_from_device_kernel(int* number_receiver_global,
                                                             int* ispec_selected_rec,

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/decompose_mesh_SCOTCH/decompose_mesh_SCOTCH.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/decompose_mesh_SCOTCH/decompose_mesh_SCOTCH.f90	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/decompose_mesh_SCOTCH/decompose_mesh_SCOTCH.f90	2012-06-19 22:23:01 UTC (rev 20392)
@@ -683,6 +683,7 @@
     integer, dimension(:),allocatable  :: num_material
     integer :: ier
 
+    ! starts from 0
     elmnts(:,:) = elmnts(:,:) - 1
 
     ! determines maximum neighbors based on 1 common node
@@ -725,8 +726,8 @@
     call acoustic_elastic_poro_load(elmnts_load,nspec,count_def_mat,count_undef_mat, &
                                   num_material,mat_prop,undef_mat_prop)
 
-    deallocate(num_material)
 
+
     ! SCOTCH partitioning
 
     ! we use default strategy for partitioning, thus omit specifing explicit strategy .
@@ -811,25 +812,24 @@
        stop 'ERROR : MAIN : Cannot destroy strat'
     endif
 
-  ! re-partitioning puts poroelastic-elastic coupled elements into same partition
-  !  integer  :: nfaces_coupled
-  !  integer, dimension(:,:), pointer  :: faces_coupled
+    ! re-partitioning puts poroelastic-elastic coupled elements into same partition
+    !  integer  :: nfaces_coupled
+    !  integer, dimension(:,:), pointer  :: faces_coupled
 
     ! TODO: supposed to rebalance, but currently broken
-    ! call poro_elastic_repartitioning (nspec, nnodes, elmnts, &
-    ! count_def_mat, mat(1,:) , mat_prop, &
-    ! sup_neighbour, nsize, &
-    ! nparts, part)
+    call poro_elastic_repartitioning (nspec, nnodes, elmnts, &
+                     count_def_mat, num_material , mat_prop, &
+                     sup_neighbour, nsize, &
+                     nparts, part)
 
-    !nparts, part, nfaces_coupled, faces_coupled)
+    deallocate(num_material)
 
     ! re-partitioning puts moho-surface coupled elements into same partition
-    ! call moho_surface_repartitioning (nspec, nnodes, elmnts, &
-    ! sup_neighbour, nsize, nparts, part, &
-    ! nspec2D_moho,ibelm_moho,nodes_ibelm_moho )
+    call moho_surface_repartitioning (nspec, nnodes, elmnts, &
+                     sup_neighbour, nsize, nparts, part, &
+                     nspec2D_moho,ibelm_moho,nodes_ibelm_moho )
 
-
-  ! local number of each element for each partition
+    ! local number of each element for each partition
     call build_glob2loc_elmnts(nspec, part, glob2loc_elmnts,nparts)
 
     ! local number of each node for each partition
@@ -887,13 +887,13 @@
                                   glob2loc_nodes_nparts, glob2loc_nodes_parts, &
                                   glob2loc_nodes, nnodes, 1)
 
-
-
        call write_partition_database(IIN_database, ipart, nspec_local, nspec, elmnts, &
                                   glob2loc_elmnts, glob2loc_nodes_nparts, &
                                   glob2loc_nodes_parts, glob2loc_nodes, part, mat, ngnod, 1)
 
-       print*, ipart,": nspec_local=",nspec_local, " nnodes_local=", nnodes_loc
+       !debug
+       !print*, ipart,": nspec_local=",nspec_local, " nnodes_local=", nnodes_loc
+
        ! writes out node coordinate locations
        !write(IIN_database,*) nnodes_loc
        write(IIN_database) nnodes_loc

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/decompose_mesh_SCOTCH/part_decompose_mesh_SCOTCH.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/decompose_mesh_SCOTCH/part_decompose_mesh_SCOTCH.f90	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/decompose_mesh_SCOTCH/part_decompose_mesh_SCOTCH.f90	2012-06-19 22:23:01 UTC (rev 20392)
@@ -1385,12 +1385,11 @@
                         nb_materials, num_material, mat_prop, &
                         sup_neighbour, nsize, &
                         nproc, part)
-                        !nproc, part, nfaces_coupled, faces_coupled)
 
     implicit none
 
-    integer,intent(in) :: nspec
-    integer, intent(in)  :: nnodes, nproc, nb_materials
+    integer, intent(in) :: nspec
+    integer, intent(in) :: nnodes, nproc, nb_materials
     integer, intent(in) :: sup_neighbour
     integer, intent(in) :: nsize
 
@@ -1401,11 +1400,10 @@
     integer, dimension(0:nspec-1)  :: part
     integer, dimension(0:esize*nspec-1)  :: elmnts
 
+    ! local parameters
     integer  :: nfaces_coupled
-    !integer, intent(out)  :: nfaces_coupled
     integer, dimension(:,:), pointer  :: faces_coupled
 
-
     logical, dimension(nb_materials)  :: is_poroelastic, is_elastic
 
     ! neighbors
@@ -1432,8 +1430,8 @@
     enddo
 
     ! checks if any poroelastic/elastic elements are set
-    !if( .not. any(is_poroelastic) ) return
-    !if( .not. any(is_elastic) ) return
+    if( .not. any(is_poroelastic) ) return
+    if( .not. any(is_elastic) ) return
 
     ! gets neighbors by 4 common nodes (face)
     allocate(xadj(0:nspec),stat=ier)
@@ -1464,6 +1462,7 @@
     ! coupled elements
     allocate(faces_coupled(2,nfaces_coupled),stat=ier)
     if( ier /= 0 ) stop 'error allocating array faces_coupled'
+    faces_coupled(:,:) = -1
 
     ! stores elements indices
     nfaces_coupled = 0
@@ -1497,8 +1496,11 @@
        endif
     enddo
 
- end subroutine poro_elastic_repartitioning
+    deallocate(xadj,adjncy,nnodes_elmnts,nodes_elmnts)
+    deallocate(faces_coupled)
 
+  end subroutine poro_elastic_repartitioning
+
   !--------------------------------------------------
   ! Repartitioning : two coupled moho surface elements are transfered to the same partition
   !--------------------------------------------------
@@ -1627,6 +1629,7 @@
     ! coupled elements
     allocate(faces_coupled(2,nfaces_coupled),stat=ier)
     if( ier /= 0 ) stop 'error allocating array faces_coupled'
+    faces_coupled(:,:) = -1
 
     ! stores elements indices
     nfaces_coupled = 0
@@ -1661,6 +1664,10 @@
        endif
     enddo
 
+    deallocate(is_moho,node_is_moho)
+    deallocate(xadj,adjncy,nnodes_elmnts,nodes_elmnts)
+    deallocate(faces_coupled)
+
  end subroutine moho_surface_repartitioning
 
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/Makefile.in
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/Makefile.in	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/Makefile.in	2012-06-19 22:23:01 UTC (rev 20392)
@@ -99,6 +99,7 @@
 	$O/model_aniso.o \
 	$O/model_default.o \
 	$O/model_external_values.o \
+	$O/model_ipati.o \
 	$O/model_gll.o \
 	$O/model_salton_trough.o \
 	$O/model_tomography.o \
@@ -228,6 +229,9 @@
 $O/model_external_values.o:  ${SHARED}/constants.h model_external_values.f90
 	${FCCOMPILE_CHECK} -c -o $O/model_external_values.o model_external_values.f90
 
+$O/model_ipati.o:  ${SHARED}/constants.h model_ipati.f90
+	${FCCOMPILE_CHECK} -c -o $O/model_ipati.o model_ipati.f90
+
 $O/model_gll.o:  ${SHARED}/constants.h model_gll.f90
 	${FCCOMPILE_CHECK} -c -o $O/model_gll.o model_gll.f90
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/create_regions_mesh.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/create_regions_mesh.f90	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/create_regions_mesh.f90	2012-06-19 22:23:01 UTC (rev 20392)
@@ -201,26 +201,6 @@
 !  integer,dimension(:),allocatable :: itest_flag
 !  integer, dimension(:), allocatable :: elem_flag
 
-! For Piero Basini :
-! integer :: doubling_value_found_for_Piero
-!   double precision :: xmesh,ymesh,zmesh
-!   double precision :: rho,vp,vs
-
-!   integer,dimension(nspec) ::  idoubling
-!   integer :: doubling_value_found_for_Piero
-!   integer, parameter :: NUMBER_OF_STATIONS = 6
-!   double precision, parameter :: RADIUS_TO_EXCLUDE = 250.d0
-!   double precision, dimension(NUMBER_OF_STATIONS) :: utm_x_station,utm_y_station
-
-!   logical :: is_around_a_station
-!   integer :: istation
-
-! ! store bedrock values
-!   integer ::  icornerlat,icornerlong
-!   double precision ::  lat,long,elevation_bedrock
-!   double precision ::  lat_corner,long_corner,ratio_xi,ratio_eta
-!real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: ibedrock
-
 ! initializes arrays
   call sync_all()
   if( myrank == 0) then
@@ -231,7 +211,6 @@
                         nspec2D_xmin,nspec2D_xmax,nspec2D_ymin,nspec2D_ymax, &
                         nspec2D_bottom,nspec2D_top,ANISOTROPY)
 
-
 ! fills location and weights for Gauss-Lobatto-Legendre points, shape and derivations,
 ! returns jacobianstore,xixstore,...gammazstore
 ! and GLL-point locations in xstore,ystore,zstore
@@ -266,7 +245,6 @@
                         num_interfaces_ext_mesh,max_interface_size_ext_mesh,&
                         my_neighbours_ext_mesh,NPROC)
 
-
 ! sets up absorbing/free surface boundaries
   call sync_all()
   if( myrank == 0) then
@@ -280,6 +258,18 @@
                             nspec2D_xmin,nspec2D_xmax,nspec2D_ymin,nspec2D_ymax, &
                             nspec2D_bottom,nspec2D_top)
 
+! sets up up Moho surface
+  NSPEC2D_MOHO = 0
+  if( SAVE_MOHO_MESH ) then
+    call sync_all()
+    if( myrank == 0) then
+      write(IMAIN,*) '  ...setting up Moho surface'
+    endif
+    call crm_setup_moho(myrank,nspec, &
+                      nspec2D_moho_ext,ibelm_moho,nodes_ibelm_moho, &
+                      nodes_coords_ext_mesh,nnodes_ext_mesh,ibool )
+  endif
+
 ! sets material velocities
   call sync_all()
   if( myrank == 0) then
@@ -288,9 +278,8 @@
   call get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, &
                         materials_ext_mesh,nmat_ext_mesh, &
                         undef_mat_prop,nundefMat_ext_mesh, &
-                        ANISOTROPY,LOCAL_PATH)
+                        ANISOTROPY)
 
-
 ! sets up acoustic-elastic-poroelastic coupling surfaces
   call sync_all()
   if( myrank == 0) then
@@ -302,18 +291,27 @@
                         num_interfaces_ext_mesh,max_interface_size_ext_mesh, &
                         my_neighbours_ext_mesh)
 
-! sets up up Moho surface
-  NSPEC2D_MOHO = 0
-  if( SAVE_MOHO_MESH ) then
-    call sync_all()
-    if( myrank == 0) then
-      write(IMAIN,*) '  ...setting up Moho surface'
-    endif
-    call crm_setup_moho(myrank,nspec, &
-                      nspec2D_moho_ext,ibelm_moho,nodes_ibelm_moho, &
-                      nodes_coords_ext_mesh,nnodes_ext_mesh,ibool )
+! locates inner and outer elements
+  call sync_all()
+  if( myrank == 0) then
+    write(IMAIN,*) '  ...element inner/outer separation '
   endif
+  call crm_setup_inner_outer_elemnts(myrank,nspec, &
+                                    num_interfaces_ext_mesh,max_interface_size_ext_mesh, &
+                                    nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                                    ibool,SAVE_MESH_FILES)
 
+! colors mesh if requested
+  call sync_all()
+  if( myrank == 0) then
+    write(IMAIN,*) '  ...element mesh coloring '
+  endif
+  call crm_setup_color_perm(myrank,nspec,nglob,ibool,ANISOTROPY,SAVE_MESH_FILES)
+
+! overwrites material parameters from external binary files
+  call sync_all()
+  call get_model_binaries(myrank,nspec,LOCAL_PATH)
+
 ! creates mass matrix
   call sync_all()
   if( myrank == 0) then
@@ -330,23 +328,6 @@
                                       UTM_PROJECTION_ZONE,SUPPRESS_UTM_PROJECTION, &
                                       NX_TOPO,NY_TOPO,itopo_bathy)
 
-! locates inner and outer elements
-  call sync_all()
-  if( myrank == 0) then
-    write(IMAIN,*) '  ...element inner/outer separation '
-  endif
-  call crm_setup_inner_outer_elemnts(myrank,nspec, &
-                                    num_interfaces_ext_mesh,max_interface_size_ext_mesh, &
-                                    nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                                    ibool,SAVE_MESH_FILES)
-
-! colors mesh if requested
-  call sync_all()
-  if( myrank == 0) then
-    write(IMAIN,*) '  ...element mesh coloring '
-  endif
-  call crm_setup_color_perm(myrank,nspec,nglob,ibool,ANISOTROPY,SAVE_MESH_FILES)
-
 ! saves the binary mesh files
   call sync_all()
   if( myrank == 0) then
@@ -354,48 +335,52 @@
   endif
   !call create_name_database(prname,myrank,LOCAL_PATH)
   call save_arrays_solver_ext_mesh(nspec,nglob_dummy, &
-                        xixstore,xiystore,xizstore,etaxstore,etaystore,etazstore,&
-                        gammaxstore,gammaystore,gammazstore, &
-                        jacobianstore, rho_vp,rho_vs,qmu_attenuation_store, &
-                        rhostore,kappastore,mustore, &
-                        rhoarraystore,kappaarraystore,etastore,phistore,tortstore,permstore, &
-                        rho_vpI,rho_vpII,rho_vsI, &
-                        rmass,rmass_acoustic,rmass_solid_poroelastic,rmass_fluid_poroelastic, &
-                        OCEANS,rmass_ocean_load,NGLOB_OCEAN, &
+!                        xixstore,xiystore,xizstore,etaxstore,etaystore,etazstore,&
+!                        gammaxstore,gammaystore,gammazstore, &
+!                        jacobianstore, rho_vp,rho_vs,qmu_attenuation_store, &
+!                        rhostore,kappastore,mustore, &
+!                        rhoarraystore,kappaarraystore,etastore,phistore,tortstore,permstore, &
+!                        rho_vpI,rho_vpII,rho_vsI, &
+!                        rmass,rmass_acoustic,rmass_solid_poroelastic,rmass_fluid_poroelastic, &
+                        OCEANS, &
+!                        rmass_ocean_load,NGLOB_OCEAN, &
                         ibool, &
-                        xstore_dummy,ystore_dummy,zstore_dummy, &
-                        abs_boundary_normal,abs_boundary_jacobian2Dw, &
-                        abs_boundary_ijk,abs_boundary_ispec,num_abs_boundary_faces, &
-                        free_surface_normal,free_surface_jacobian2Dw, &
-                        free_surface_ijk,free_surface_ispec, &
-                        num_free_surface_faces, &
-                        coupling_ac_el_normal,coupling_ac_el_jacobian2Dw, &
-                        coupling_ac_el_ijk,coupling_ac_el_ispec, &
-                        num_coupling_ac_el_faces, &
-                        coupling_ac_po_normal,coupling_ac_po_jacobian2Dw, &
-                        coupling_ac_po_ijk,coupling_ac_po_ispec, &
-                        num_coupling_ac_po_faces, &
-                        coupling_el_po_normal,coupling_el_po_jacobian2Dw, &
-                        coupling_el_po_ijk,coupling_po_el_ijk,coupling_el_po_ispec, &
-                        coupling_po_el_ispec,num_coupling_el_po_faces, &
+!                        xstore_dummy,ystore_dummy,zstore_dummy, &
+!                        abs_boundary_normal,abs_boundary_jacobian2Dw, &
+!                        abs_boundary_ijk,abs_boundary_ispec,num_abs_boundary_faces, &
+!                        free_surface_normal,free_surface_jacobian2Dw, &
+!                        free_surface_ijk,free_surface_ispec, &
+!                        num_free_surface_faces, &
+!                        coupling_ac_el_normal,coupling_ac_el_jacobian2Dw, &
+!                        coupling_ac_el_ijk,coupling_ac_el_ispec, &
+!                        num_coupling_ac_el_faces, &
+!                        coupling_ac_po_normal,coupling_ac_po_jacobian2Dw, &
+!                        coupling_ac_po_ijk,coupling_ac_po_ispec, &
+!                        num_coupling_ac_po_faces, &
+!                        coupling_el_po_normal,coupling_el_po_jacobian2Dw, &
+!                        coupling_el_po_ijk,coupling_po_el_ijk,coupling_el_po_ispec, &
+!                        coupling_po_el_ispec,num_coupling_el_po_faces, &
                         num_interfaces_ext_mesh,my_neighbours_ext_mesh,nibool_interfaces_ext_mesh, &
                         max_interface_size_ext_mesh,ibool_interfaces_ext_mesh, &
-                        prname,SAVE_MESH_FILES, &
-                        ANISOTROPY,NSPEC_ANISO, &
-                        c11store,c12store,c13store,c14store,c15store,c16store, &
-                        c22store,c23store,c24store,c25store,c26store,c33store, &
-                        c34store,c35store,c36store,c44store,c45store,c46store, &
-                        c55store,c56store,c66store, &
-                        ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic, &
-                        ispec_is_inner,nspec_inner_acoustic,nspec_inner_elastic,nspec_inner_poroelastic, &
-                        nspec_outer_acoustic,nspec_outer_elastic,nspec_outer_poroelastic, &
-                        num_phase_ispec_acoustic,phase_ispec_inner_acoustic, &
-                        num_phase_ispec_elastic,phase_ispec_inner_elastic, &
-                        num_phase_ispec_poroelastic,phase_ispec_inner_poroelastic, &
-                        num_colors_outer_acoustic,num_colors_inner_acoustic, &
-                        num_elem_colors_acoustic, &
-                        num_colors_outer_elastic,num_colors_inner_elastic, &
-                        num_elem_colors_elastic)
+!                        prname, &
+                        SAVE_MESH_FILES, &
+                        ANISOTROPY &
+!                        NSPEC_ANISO, &
+!                        c11store,c12store,c13store,c14store,c15store,c16store, &
+!                        c22store,c23store,c24store,c25store,c26store,c33store, &
+!                        c34store,c35store,c36store,c44store,c45store,c46store, &
+!                        c55store,c56store,c66store, &
+!                        ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic, &
+!                        ispec_is_inner,nspec_inner_acoustic,nspec_inner_elastic,nspec_inner_poroelastic, &
+!                        nspec_outer_acoustic,nspec_outer_elastic,nspec_outer_poroelastic, &
+!                        num_phase_ispec_acoustic,phase_ispec_inner_acoustic, &
+!                        num_phase_ispec_elastic,phase_ispec_inner_elastic, &
+!                        num_phase_ispec_poroelastic,phase_ispec_inner_poroelastic, &
+!                        num_colors_outer_acoustic,num_colors_inner_acoustic, &
+!                        num_elem_colors_acoustic, &
+!                        num_colors_outer_elastic,num_colors_inner_elastic, &
+!                        num_elem_colors_elastic, &
+                      )
 
 ! saves moho surface
   if( SAVE_MOHO_MESH ) then
@@ -410,12 +395,21 @@
 
 ! checks the mesh, stability and resolved period
   call sync_all()
-!chris: check for poro: At the moment cpI & cpII are for eta=0
-  call check_mesh_resolution_poro(myrank,nspec,nglob_dummy,ibool,&
+
+  if( POROELASTIC_SIMULATION ) then
+    !chris: check for poro: At the moment cpI & cpII are for eta=0
+    call check_mesh_resolution_poro(myrank,nspec,nglob_dummy,ibool,&
                             xstore_dummy,ystore_dummy,zstore_dummy, &
                             -1.0d0, model_speed_max,min_resolved_period, &
                             phistore,tortstore,rhoarraystore,rho_vpI,rho_vpII,rho_vsI, &
                             LOCAL_PATH,SAVE_MESH_FILES )
+  else
+    call check_mesh_resolution(myrank,nspec,nglob_dummy, &
+                              ibool,xstore_dummy,ystore_dummy,zstore_dummy, &
+                              kappastore,mustore,rho_vp,rho_vs, &
+                              -1.0d0,model_speed_max,min_resolved_period, &
+                              LOCAL_PATH,SAVE_MESH_FILES)
+  endif
 
 ! saves binary mesh files for attenuation
   if( ATTENUATION ) then
@@ -1188,6 +1182,8 @@
   character(len=256) :: filename
   logical,dimension(:),allocatable :: iglob_is_inner
 
+  logical,parameter :: DEBUG = .false.
+
   ! allocates arrays
   allocate(ispec_is_inner(nspec),stat=ier)
   if( ier /= 0 ) stop 'error allocating array ispec_is_inner'
@@ -1221,7 +1217,7 @@
   ! frees temporary array
   deallocate( iglob_is_inner )
 
-  if( SAVE_MESH_FILES ) then
+  if( SAVE_MESH_FILES .and. DEBUG ) then
     filename = prname(1:len_trim(prname))//'ispec_is_inner'
     call write_VTK_data_elem_l(nspec,nglob_dummy, &
                         xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
@@ -1513,6 +1509,8 @@
   character(len=2),dimension(2) :: str_domain = (/ "ac", "el" /)
   character(len=256) :: filename
 
+  logical, parameter :: DEBUG = .false.
+
   !!!! David Michea: detection of the edges, coloring and permutation separately
 
   ! implement mesh coloring for GPUs if needed, to create subsets of disconnected elements
@@ -1576,7 +1574,7 @@
   endif
 
   ! debug: file output
-  if( SAVE_MESH_FILES ) then
+  if( SAVE_MESH_FILES .and. DEBUG ) then
     filename = prname(1:len_trim(prname))//'color_'//str_domain(idomain)
     call write_VTK_data_elem_i(nspec,nglob, &
                               xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
@@ -1588,7 +1586,7 @@
   deallocate(color)
 
   ! debug: no mesh coloring, only creates dummy coloring arrays
-  if( .false. ) then
+  if( DEBUG ) then
     nb_colors_outer_elements = 0
     nb_colors_inner_elements = 0
     ispec_counter = 0
@@ -1631,7 +1629,7 @@
   endif ! debug
 
   ! debug: saves mesh coloring numbers into files
-  if( SAVE_MESH_FILES ) then
+  if( DEBUG ) then
     ! debug file output
     filename = prname(1:len_trim(prname))//'num_of_elems_in_this_color_'//str_domain(idomain)//'.dat'
     open(unit=99,file=trim(filename),status='unknown',iostat=ier)
@@ -1732,12 +1730,12 @@
   endif
 
   ! debug: outputs permutation array as vtk file
-  !if( SAVE_MESH_FILES ) then
-  !  filename = prname(1:len_trim(prname))//'perm_'//str_domain(idomain)
-  !  call write_VTK_data_elem_i(nspec,nglob, &
-  !                      xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
-  !                      perm,filename)
-  !endif
+  if( DEBUG ) then
+    filename = prname(1:len_trim(prname))//'perm_'//str_domain(idomain)
+    call write_VTK_data_elem_i(nspec,nglob, &
+                        xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
+                        perm,filename)
+  endif
 
   deallocate(num_of_elems_in_this_color)
 
@@ -1776,6 +1774,8 @@
 
   character(len=256) :: filename
 
+  logical,parameter :: DEBUG = .false.
+
   ! sorts array according to permutation
   allocate(temp_perm_global(nspec),stat=ier)
   if( ier /= 0 ) stop 'error temp_perm_global array'
@@ -1894,7 +1894,7 @@
   endif
 
   ! outputs permutation array as vtk file
-  if( SAVE_MESH_FILES ) then
+  if( SAVE_MESH_FILES .and. DEBUG ) then
     filename = prname(1:len_trim(prname))//'perm_global'
     call write_VTK_data_elem_i(nspec,nglob, &
                         xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
@@ -1933,6 +1933,7 @@
   call permute_elements_real(gammazstore,temp_array_real,perm,nspec)
   call permute_elements_real(jacobianstore,temp_array_real,perm,nspec)
 
+  ! material parameters
   call permute_elements_real(kappastore,temp_array_real,perm,nspec)
   call permute_elements_real(mustore,temp_array_real,perm,nspec)
 
@@ -1970,6 +1971,11 @@
   endif
   deallocate(temp_array_real)
 
+  ! poroelastic arrays
+  if( POROELASTIC_SIMULATION ) then
+    stop 'mesh permutation for poroelastic simulations not supported yet'
+  endif
+
   ! boundary surface
   if( num_abs_boundary_faces > 0 ) then
     do iface = 1,num_abs_boundary_faces
@@ -1996,6 +2002,25 @@
       coupling_ac_el_ispec(iface) = new_ispec
     enddo
   endif
+  if( num_coupling_ac_po_faces > 0 ) then
+    do iface = 1,num_coupling_ac_po_faces
+      old_ispec = coupling_ac_po_ispec(iface)
+      new_ispec = perm(old_ispec)
+      coupling_ac_po_ispec(iface) = new_ispec
+    enddo
+  endif
+  if( num_coupling_el_po_faces > 0 ) then
+    do iface = 1,num_coupling_el_po_faces
+      ! elastic-poroelastic
+      old_ispec = coupling_el_po_ispec(iface)
+      new_ispec = perm(old_ispec)
+      coupling_el_po_ispec(iface) = new_ispec
+      ! poroelastic-elastic
+      old_ispec = coupling_po_el_ispec(iface)
+      new_ispec = perm(old_ispec)
+      coupling_po_el_ispec(iface) = new_ispec
+    enddo
+  endif
 
   ! moho surface
   if( NSPEC2D_MOHO > 0 ) then

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/generate_databases.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/generate_databases.f90	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/generate_databases.f90	2012-06-19 22:23:01 UTC (rev 20392)
@@ -437,6 +437,8 @@
     write(IMAIN,'(a)',advance='yes') '  tomo'
     case( IMODEL_USER_EXTERNAL )
     write(IMAIN,'(a)',advance='yes') '  external'
+    case( IMODEL_IPATI )
+    write(IMAIN,'(a)',advance='yes') '  ipati'
     end select
 
     write(IMAIN,*)

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/get_coupling_surfaces.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/get_coupling_surfaces.f90	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/get_coupling_surfaces.f90	2012-06-19 22:23:01 UTC (rev 20392)
@@ -32,6 +32,7 @@
                         my_neighbours_ext_mesh)
 
 ! determines coupling surface for acoustic-elastic domains
+! based on ispec_is_acoustic, ispec_is_elastic and ispec_is_poroelastic arrays
 
   use create_regions_mesh_ext_par
   implicit none
@@ -49,43 +50,114 @@
             ibool_interfaces_ext_mesh
   integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
 
+  ! local parameters
+  integer, dimension(:), allocatable :: elastic_flag,acoustic_flag,poroelastic_flag
+  integer, dimension(:,:), allocatable :: ibool_interfaces_ext_mesh_dummy
+  integer :: max_nibool_interfaces_ext_mesh
+  integer :: count_elastic,count_acoustic,count_poroelastic
+  integer :: ispec,i,j,k,iglob,ier,inum
+
   ! initializes number of coupling faces
   num_coupling_ac_el_faces = 0
   num_coupling_ac_po_faces = 0
   num_coupling_el_po_faces = 0
 
-  ! acoustic - elastic domain coupling
-  call get_coupling_surfaces_ac_el(myrank, &
-                        nspec,ibool,NPROC, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                        num_interfaces_ext_mesh,max_interface_size_ext_mesh, &
+  ! sets flags for acoustic / elastic / poroelastic on global points
+  allocate(acoustic_flag(nglob_dummy),stat=ier)
+  if( ier /= 0 ) stop 'error allocating array acoustic_flag'
+  allocate(elastic_flag(nglob_dummy),stat=ier)
+  if( ier /= 0 ) stop 'error allocating array elastic_flag'
+  allocate(poroelastic_flag(nglob_dummy),stat=ier)
+  if( ier /= 0 ) stop 'error allocating array poroelastic_flag'
+
+  acoustic_flag(:) = 0
+  elastic_flag(:) = 0
+  poroelastic_flag(:) = 0
+
+  count_acoustic = 0
+  count_elastic = 0
+  count_poroelastic = 0
+
+  do ispec = 1, nspec
+    ! counts elements
+    if( ispec_is_acoustic(ispec) ) count_acoustic = count_acoustic + 1
+    if( ispec_is_elastic(ispec) ) count_elastic = count_elastic + 1
+    if( ispec_is_poroelastic(ispec) ) count_poroelastic = count_poroelastic + 1
+
+    ! sets flags on global points
+    do k = 1, NGLLZ
+      do j = 1, NGLLY
+        do i = 1, NGLLX
+          ! global index
+          iglob = ibool(i,j,k,ispec)
+          ! sets acoustic flag
+          if( ispec_is_acoustic(ispec) ) acoustic_flag(iglob) =  myrank+1
+          ! sets elastic flag
+          if( ispec_is_elastic(ispec) ) elastic_flag(iglob) =  myrank+1
+          ! sets poroelastic flag
+          if( ispec_is_poroelastic(ispec) ) poroelastic_flag(iglob) =  myrank+1
+        enddo
+      enddo
+    enddo
+  enddo
+  call sum_all_i(count_acoustic,inum)
+  if( myrank == 0 ) then
+    write(IMAIN,*) '     total acoustic elements   :',inum
+  endif
+  call sum_all_i(count_elastic,inum)
+  if( myrank == 0 ) then
+    write(IMAIN,*) '     total elastic elements    :',inum
+  endif
+  call sum_all_i(count_poroelastic,inum)
+  if( myrank == 0 ) then
+    write(IMAIN,*) '     total poroelastic elements:',inum
+  endif
+
+  ! collects contributions from different MPI partitions
+  ! sets up MPI communications
+  max_nibool_interfaces_ext_mesh = maxval( nibool_interfaces_ext_mesh(:) )
+  allocate(ibool_interfaces_ext_mesh_dummy(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
+  if( ier /= 0 ) stop 'error allocating array ibool_interfaces_ext_mesh_dummy'
+  do i = 1, num_interfaces_ext_mesh
+     ibool_interfaces_ext_mesh_dummy(:,i) = ibool_interfaces_ext_mesh(1:max_nibool_interfaces_ext_mesh,i)
+  enddo
+  ! sums acoustic flags
+  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,acoustic_flag, &
+                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
                         my_neighbours_ext_mesh)
+  ! sums elastic flags
+  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,elastic_flag, &
+                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
+                        my_neighbours_ext_mesh)
+  ! sums poroelastic flags
+  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,poroelastic_flag, &
+                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
+                        my_neighbours_ext_mesh)
 
+
+  ! determines common faces between different domains
+  ! acoustic - elastic domain coupling
+  call get_coupling_surfaces_ac_el(myrank,nspec,ibool,elastic_flag)
+
   ! acoustic - poroelastic domain coupling
-  call get_coupling_surfaces_ac_poro(myrank, &
-                        nspec,ibool,NPROC, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                        num_interfaces_ext_mesh,max_interface_size_ext_mesh, &
-                        my_neighbours_ext_mesh)
+  call get_coupling_surfaces_ac_poro(myrank,nspec,ibool,acoustic_flag)
 
   ! elastic - poroelastic domain coupling
-  call get_coupling_surfaces_el_poro(myrank, &
-                        nspec,ibool,NPROC, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                        num_interfaces_ext_mesh,max_interface_size_ext_mesh, &
-                        my_neighbours_ext_mesh)
+  call get_coupling_surfaces_el_poro(myrank,nspec,ibool,elastic_flag)
 
+  ! frees temporary arrays
+  deallocate(acoustic_flag,elastic_flag,poroelastic_flag)
+
   end subroutine get_coupling_surfaces
 
 !
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine get_coupling_surfaces_ac_el(myrank, &
-                        nspec,ibool,NPROC, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                        num_interfaces_ext_mesh,max_interface_size_ext_mesh, &
-                        my_neighbours_ext_mesh)
+  subroutine get_coupling_surfaces_ac_el(myrank,nspec,ibool,elastic_flag)
 
 ! determines coupling surface for acoustic-elastic domains
 
@@ -93,17 +165,12 @@
   implicit none
 
 ! number of spectral elements in each block
-  integer :: myrank,nspec,NPROC
+  integer :: myrank,nspec
 
 ! arrays with the mesh
   integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
 
-! MPI communication
-  integer :: num_interfaces_ext_mesh,max_interface_size_ext_mesh
-  integer, dimension(num_interfaces_ext_mesh) :: my_neighbours_ext_mesh
-  integer, dimension(NGLLX*NGLLX*max_interface_size_ext_mesh,num_interfaces_ext_mesh) :: &
-            ibool_interfaces_ext_mesh
-  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
+  integer,dimension(nglob_dummy) :: elastic_flag
 
 ! local parameters
   ! (assumes NGLLX=NGLLY=NGLLZ)
@@ -116,17 +183,10 @@
   integer,dimension(:,:,:),allocatable :: tmp_ijk
   integer,dimension(:),allocatable :: tmp_ispec
 
-  integer,dimension(NGNOD2D) :: iglob_corners_ref !,iglob_corners
-  integer :: ispec,i,j,k,igll,ier,iglob
-  integer :: inum,iface_ref,icorner,iglob_midpoint ! iface,ispec_neighbor
-  integer :: count_elastic,count_acoustic
+  integer,dimension(NGNOD2D) :: iglob_corners_ref
+  integer :: ispec,i,j,igll,ier
+  integer :: inum,iface_ref
 
-  ! mpi interface communication
-  integer, dimension(:), allocatable :: elastic_flag,acoustic_flag,test_flag
-  integer, dimension(:,:), allocatable :: ibool_interfaces_ext_mesh_dummy
-  integer :: max_nibool_interfaces_ext_mesh
-  logical, dimension(:), allocatable :: mask_ibool
-
   ! corners indices of reference cube faces
   integer,dimension(3,4),parameter :: iface1_corner_ijk = &
              reshape( (/ 1,1,1, 1,NGLLY,1, 1,NGLLY,NGLLZ, 1,1,NGLLZ /),(/3,4/))   ! xmin
@@ -153,7 +213,7 @@
   !integer,dimension(NGLLX,NGLLY,NGLLZ,NSPEC) :: gll_data
   !character(len=256):: prname_file
 
-! allocates temporary arrays
+  ! allocates temporary arrays
   allocate(tmp_normal(NDIM,NGLLSQUARE,nspec*6),stat=ier)
   if( ier /= 0 ) stop 'error allocating array tmp_normal'
   allocate(tmp_jacobian2Dw(NGLLSQUARE,nspec*6),stat=ier)
@@ -167,78 +227,8 @@
   tmp_normal(:,:,:) = 0.0
   tmp_jacobian2Dw(:,:) = 0.0
 
-  ! sets flags for acoustic / elastic on global points
-  allocate(elastic_flag(nglob_dummy),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array elastic_flag'
-  allocate(acoustic_flag(nglob_dummy),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array acoustic_flag'
-  allocate(test_flag(nglob_dummy),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array test_flag'
-  allocate(mask_ibool(nglob_dummy),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array mask_ibool'
-  elastic_flag(:) = 0
-  acoustic_flag(:) = 0
-  !test_flag(:) = 0
-  count_elastic = 0
-  count_acoustic = 0
-  do ispec = 1, nspec
-    ! counts elements
-    if( ispec_is_elastic(ispec) ) count_elastic = count_elastic + 1
-    if( ispec_is_acoustic(ispec) ) count_acoustic = count_acoustic + 1
-
-    ! sets flags on global points
-    do k = 1, NGLLZ
-      do j = 1, NGLLY
-        do i = 1, NGLLX
-          ! global index
-          iglob = ibool(i,j,k,ispec)
-          ! sets elastic flag
-          if( ispec_is_elastic(ispec) ) elastic_flag(iglob) =  myrank+1
-          ! sets acoustic flag
-          if( ispec_is_acoustic(ispec) ) acoustic_flag(iglob) =  myrank+1
-          ! sets test flag
-          !test_flag(iglob) = myrank+1
-        enddo
-      enddo
-    enddo
-  enddo
-  call sum_all_i(count_acoustic,inum)
-  if( myrank == 0 ) then
-    write(IMAIN,*) '     total acoustic elements:',inum
-  endif
-  call sum_all_i(count_elastic,inum)
-  if( myrank == 0 ) then
-    write(IMAIN,*) '     total elastic elements :',inum
-  endif
-
-  ! collects contributions from different MPI partitions
-  ! sets up MPI communications
-  max_nibool_interfaces_ext_mesh = maxval( nibool_interfaces_ext_mesh(:) )
-  allocate(ibool_interfaces_ext_mesh_dummy(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array ibool_interfaces_ext_mesh_dummy'
-  do i = 1, num_interfaces_ext_mesh
-     ibool_interfaces_ext_mesh_dummy(:,i) = ibool_interfaces_ext_mesh(1:max_nibool_interfaces_ext_mesh,i)
-  enddo
-  ! sums elastic flags
-  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,elastic_flag, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-                        my_neighbours_ext_mesh)
-  ! sums acoustic flags
-  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,acoustic_flag, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-                        my_neighbours_ext_mesh)
-
-  ! sums test flags
-  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,test_flag, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-                        my_neighbours_ext_mesh)
-
   ! loops over all element faces and
   ! counts number of coupling faces between acoustic and elastic elements
-  mask_ibool(:) = .false.
   inum = 0
 
   ! coupling surfaces: takes point of view from acoustic elements, i.e. if element is acoustic
@@ -248,112 +238,67 @@
   ! note: we use acoustic elements as reference elements because we will need
   !          density from acoustic element when coupling pressure in case of gravity
   do ispec=1,nspec
+
     if( ispec_is_acoustic(ispec) ) then
 
       ! loops over each face
       do iface_ref= 1, 6
 
         ! takes indices of corners of reference face
-        do icorner = 1,NGNOD2D
-          i = iface_all_corner_ijk(1,icorner,iface_ref)
-          j = iface_all_corner_ijk(2,icorner,iface_ref)
-          k = iface_all_corner_ijk(3,icorner,iface_ref)
-          ! global reference indices
-          iglob_corners_ref(icorner) = ibool(i,j,k,ispec)
+        call get_element_corners(ispec,iface_ref,xcoord,ycoord,zcoord,iglob_corners_ref, &
+                                ibool,nspec,nglob_dummy,xstore_dummy,ystore_dummy,zstore_dummy, &
+                                iface_all_corner_ijk)
 
-          ! reference corner coordinates
-          xcoord(icorner) = xstore_dummy(iglob_corners_ref(icorner))
-          ycoord(icorner) = ystore_dummy(iglob_corners_ref(icorner))
-          zcoord(icorner) = zstore_dummy(iglob_corners_ref(icorner))
-        enddo
+        ! checks if face is has an elastic side
+        if( elastic_flag( iglob_corners_ref(1) ) >= 1 .and. &
+           elastic_flag( iglob_corners_ref(2) ) >= 1 .and. &
+           elastic_flag( iglob_corners_ref(3) ) >= 1 .and. &
+           elastic_flag( iglob_corners_ref(4) ) >= 1) then
 
-        ! checks if face has acoustic side
-        if( acoustic_flag( iglob_corners_ref(1) ) >= 1 .and. &
-           acoustic_flag( iglob_corners_ref(2) ) >= 1 .and. &
-           acoustic_flag( iglob_corners_ref(3) ) >= 1 .and. &
-           acoustic_flag( iglob_corners_ref(4) ) >= 1) then
-          ! checks if face is has an elastic side
-          if( elastic_flag( iglob_corners_ref(1) ) >= 1 .and. &
-             elastic_flag( iglob_corners_ref(2) ) >= 1 .and. &
-             elastic_flag( iglob_corners_ref(3) ) >= 1 .and. &
-             elastic_flag( iglob_corners_ref(4) ) >= 1) then
+          ! gets face GLL points i,j,k indices from element face
+          call get_element_face_gll_indices(iface_ref,ijk_face,NGLLX,NGLLY)
 
-            ! reference midpoint on face (used to avoid redundant face counting)
-            i = iface_all_midpointijk(1,iface_ref)
-            j = iface_all_midpointijk(2,iface_ref)
-            k = iface_all_midpointijk(3,iface_ref)
-            iglob_midpoint = ibool(i,j,k,ispec)
+          ! gets face GLL 2Djacobian, weighted from element face
+          call get_jacobian_boundary_face(myrank,nspec, &
+                    xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, &
+                    dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, &
+                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                    ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY)
 
-            ! checks if points on this face are masked already
-            if( .not. mask_ibool(iglob_midpoint) .and. &
-                ( acoustic_flag(iglob_midpoint) >= 1 .and. elastic_flag(iglob_midpoint) >= 1) ) then
+          ! normal convention: points away from acoustic, reference element
+          !                                switch normal direction if necessary
+          do j=1,NGLLY
+            do i=1,NGLLX
+                ! directs normals such that they point outwards of element
+                call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, &
+                                            ibool,nspec,nglob_dummy, &
+                                            xstore_dummy,ystore_dummy,zstore_dummy, &
+                                            normal_face(:,i,j) )
+                ! makes sure that it always points away from acoustic element,
+                ! otherwise switch direction
+                ! note: this should not happen, since we only loop over acoustic elements
+                if( ispec_is_elastic(ispec) ) stop 'error acoustic-elastic coupling surface'
+            enddo
+          enddo
 
-              ! gets face GLL points i,j,k indices from element face
-              call get_element_face_gll_indices(iface_ref,ijk_face,NGLLX,NGLLY)
+          ! stores informations about this face
+          inum = inum + 1
+          tmp_ispec(inum) = ispec
+          igll = 0
+          do j=1,NGLLY
+            do i=1,NGLLX
+              ! adds all gll points on this face
+              igll = igll + 1
 
+              ! do we need to store local i,j,k,ispec info? or only global indices iglob?
+              tmp_ijk(:,igll,inum) = ijk_face(:,i,j)
 
-              ! gets face GLL 2Djacobian, weighted from element face
-              call get_jacobian_boundary_face(myrank,nspec, &
-                        xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, &
-                        dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, &
-                        wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                        ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY)
-
-              ! normal convention: points away from acoustic, reference element
-              !                                switch normal direction if necessary
-              do j=1,NGLLY
-                do i=1,NGLLX
-                    ! directs normals such that they point outwards of element
-                    call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, &
-                                                ibool,nspec,nglob_dummy, &
-                                                xstore_dummy,ystore_dummy,zstore_dummy, &
-                                                normal_face(:,i,j) )
-                    ! makes sure that it always points away from acoustic element,
-                    ! otherwise switch direction
-                    ! note: this should not happen, since we only loop over acoustic elements
-                    !if( ispec_is_elastic(ispec) ) normal_face(:,i,j) = - normal_face(:,i,j)
-                    if( ispec_is_elastic(ispec) ) stop 'error acoustic-elastic coupling surface'
-
-                enddo
-              enddo
-
-              ! stores informations about this face
-              inum = inum + 1
-              tmp_ispec(inum) = ispec
-              igll = 0
-              do j=1,NGLLY
-                do i=1,NGLLX
-                  ! adds all gll points on this face
-                  igll = igll + 1
-
-                  ! do we need to store local i,j,k,ispec info? or only global indices iglob?
-                  tmp_ijk(:,igll,inum) = ijk_face(:,i,j)
-
-                  ! stores weighted jacobian and normals
-                  tmp_jacobian2Dw(igll,inum) = jacobian2Dw_face(i,j)
-                  tmp_normal(:,igll,inum) = normal_face(:,i,j)
-
-                  ! masks global points ( to avoid redundant counting of faces)
-                  iglob = ibool(ijk_face(1,i,j),ijk_face(2,i,j),ijk_face(3,i,j),ispec)
-                  mask_ibool(iglob) = .true.
-                enddo
-              enddo
-
-              ! test_flags shouldn't matter, there is only 1 acoustic element touching a coupled surface
-              ! which will be considered in the MPI partition which contains it
-              !else
-              !  ! assumes to be already collected by lower rank process, masks face points
-              !  do j=1,NGLLY
-              !    do i=1,NGLLX
-              !      iglob = ibool(ijk_face(1,i,j),ijk_face(2,i,j),ijk_face(3,i,j),ispec)
-              !      mask_ibool(iglob) = .true.
-              !    enddo
-              !  enddo
-              !endif ! test_flag
-
-            endif ! mask_ibool
-          endif ! elastic_flag
-        endif ! acoustic_flag
+              ! stores weighted jacobian and normals
+              tmp_jacobian2Dw(igll,inum) = jacobian2Dw_face(i,j)
+              tmp_normal(:,igll,inum) = normal_face(:,i,j)
+            enddo
+          enddo
+        endif ! elastic_flag
       enddo ! iface_ref
     endif ! ispec_is_acoustic
   enddo ! ispec
@@ -381,8 +326,7 @@
 ! user output
   call sum_all_i(num_coupling_ac_el_faces,inum)
   if( myrank == 0 ) then
-    write(IMAIN,*) '     acoustic-elastic coupling:'
-    write(IMAIN,*) '     total number of faces = ',inum
+    write(IMAIN,*) '     acoustic-elastic coupling    : total number of faces = ',inum
   endif
 
   end subroutine get_coupling_surfaces_ac_el
@@ -392,11 +336,7 @@
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine get_coupling_surfaces_ac_poro(myrank, &
-                        nspec,ibool,NPROC, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                        num_interfaces_ext_mesh,max_interface_size_ext_mesh, &
-                        my_neighbours_ext_mesh)
+  subroutine get_coupling_surfaces_ac_poro(myrank,nspec,ibool,acoustic_flag)
 
 ! determines coupling surface for acoustic-poroelastic domains
 
@@ -404,17 +344,12 @@
   implicit none
 
 ! number of spectral elements in each block
-  integer :: myrank,nspec,NPROC
+  integer :: myrank,nspec
 
 ! arrays with the mesh
   integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
 
-! MPI communication
-  integer :: num_interfaces_ext_mesh,max_interface_size_ext_mesh
-  integer, dimension(num_interfaces_ext_mesh) :: my_neighbours_ext_mesh
-  integer, dimension(NGLLX*NGLLX*max_interface_size_ext_mesh,num_interfaces_ext_mesh) :: &
-            ibool_interfaces_ext_mesh
-  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
+  integer,dimension(nglob_dummy) :: acoustic_flag
 
 ! local parameters
   ! (assumes NGLLX=NGLLY=NGLLZ)
@@ -428,15 +363,9 @@
   integer,dimension(:),allocatable :: tmp_ispec
 
   integer,dimension(NGNOD2D) :: iglob_corners_ref
-  integer :: ispec,i,j,k,igll,ier,iglob
-  integer :: inum,iface_ref,icorner
-  integer :: count_poroelastic,count_acoustic
+  integer :: ispec,i,j,igll,ier
+  integer :: inum,iface_ref
 
-  ! mpi interface communication
-  integer, dimension(:), allocatable :: poroelastic_flag,acoustic_flag,test_flag
-  integer, dimension(:,:), allocatable :: ibool_interfaces_ext_mesh_dummy
-  integer :: max_nibool_interfaces_ext_mesh
-
   ! corners indices of reference cube faces
   integer,dimension(3,4),parameter :: iface1_corner_ijk = &
              reshape( (/ 1,1,1, 1,NGLLY,1, 1,NGLLY,NGLLZ, 1,1,NGLLZ /),(/3,4/))   ! xmin
@@ -463,7 +392,7 @@
   !integer,dimension(NGLLX,NGLLY,NGLLZ,NSPEC) :: gll_data
   !character(len=256):: prname_file
 
-! allocates temporary arrays
+  ! allocates temporary arrays
   allocate(tmp_normal(NDIM,NGLLSQUARE,nspec*6),stat=ier)
   if( ier /= 0 ) stop 'error allocating array tmp_normal'
   allocate(tmp_jacobian2Dw(NGLLSQUARE,nspec*6),stat=ier)
@@ -477,147 +406,71 @@
   tmp_normal(:,:,:) = 0.0
   tmp_jacobian2Dw(:,:) = 0.0
 
-  ! sets flags for acoustic / poroelastic on global points
-  allocate(poroelastic_flag(nglob_dummy),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array poroelastic_flag'
-  allocate(acoustic_flag(nglob_dummy),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array acoustic_flag'
-  allocate(test_flag(nglob_dummy),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array test_flag'
-  poroelastic_flag(:) = 0
-  acoustic_flag(:) = 0
-  test_flag(:) = 0
-  count_poroelastic = 0
-  count_acoustic = 0
-  do ispec = 1, nspec
-    ! counts elements
-    if( ispec_is_poroelastic(ispec) ) count_poroelastic = count_poroelastic + 1
-    if( ispec_is_acoustic(ispec) ) count_acoustic = count_acoustic + 1
-
-    ! sets flags on global points
-    do k = 1, NGLLZ
-      do j = 1, NGLLY
-        do i = 1, NGLLX
-          ! global index
-          iglob = ibool(i,j,k,ispec)
-          ! sets poroelastic flag
-          if( ispec_is_poroelastic(ispec) ) poroelastic_flag(iglob) =  myrank+1
-          ! sets acoustic flag
-          if( ispec_is_acoustic(ispec) ) acoustic_flag(iglob) =  myrank+1
-          ! sets test flag
-          test_flag(iglob) = myrank+1
-        enddo
-      enddo
-    enddo
-  enddo
-  call sum_all_i(count_acoustic,inum)
-  if( myrank == 0 ) then
-    write(IMAIN,*) '     total acoustic elements:',inum
-  endif
-  call sum_all_i(count_poroelastic,inum)
-  if( myrank == 0 ) then
-    write(IMAIN,*) '     total poroelastic elements :',inum
-  endif
-
-  ! collects contributions from different MPI partitions
-  ! sets up MPI communications
-  max_nibool_interfaces_ext_mesh = maxval( nibool_interfaces_ext_mesh(:) )
-  allocate(ibool_interfaces_ext_mesh_dummy(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array ibool_interfaces_ext_mesh_dummy'
-  do i = 1, num_interfaces_ext_mesh
-     ibool_interfaces_ext_mesh_dummy(:,i) = ibool_interfaces_ext_mesh(1:max_nibool_interfaces_ext_mesh,i)
-  enddo
-  ! sums poroelastic flags
-  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,poroelastic_flag, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-                        my_neighbours_ext_mesh)
-  ! sums acoustic flags
-  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,acoustic_flag, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-                        my_neighbours_ext_mesh)
-
-  ! sums test flags
-  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,test_flag, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-                        my_neighbours_ext_mesh)
-
   ! loops over all element faces and
   ! counts number of coupling faces between acoustic and poroelastic elements
   inum = 0
   do ispec=1,nspec
 
-   if(ispec_is_poroelastic(ispec)) then
+    if(ispec_is_poroelastic(ispec)) then
 
-    ! loops over each face
-    do iface_ref= 1, 6
+      ! loops over each face
+      do iface_ref= 1, 6
 
-      ! takes indices of corners of reference face
-      do icorner = 1,NGNOD2D
-        i = iface_all_corner_ijk(1,icorner,iface_ref)
-        j = iface_all_corner_ijk(2,icorner,iface_ref)
-        k = iface_all_corner_ijk(3,icorner,iface_ref)
-        ! global reference indices
-        iglob_corners_ref(icorner) = ibool(i,j,k,ispec)
+        ! takes indices of corners of reference face
+        call get_element_corners(ispec,iface_ref,xcoord,ycoord,zcoord,iglob_corners_ref, &
+                                ibool,nspec,nglob_dummy,xstore_dummy,ystore_dummy,zstore_dummy, &
+                                iface_all_corner_ijk)
 
-        ! reference corner coordinates
-        xcoord(icorner) = xstore_dummy(iglob_corners_ref(icorner))
-        ycoord(icorner) = ystore_dummy(iglob_corners_ref(icorner))
-        zcoord(icorner) = zstore_dummy(iglob_corners_ref(icorner))
-      enddo
+        ! checks if face has acoustic side
+        if( acoustic_flag( iglob_corners_ref(1) ) >= 1 .and. &
+           acoustic_flag( iglob_corners_ref(2) ) >= 1 .and. &
+           acoustic_flag( iglob_corners_ref(3) ) >= 1 .and. &
+           acoustic_flag( iglob_corners_ref(4) ) >= 1) then
 
-      ! checks if face has acoustic side
-      if( acoustic_flag( iglob_corners_ref(1) ) >= 1 .and. &
-         acoustic_flag( iglob_corners_ref(2) ) >= 1 .and. &
-         acoustic_flag( iglob_corners_ref(3) ) >= 1 .and. &
-         acoustic_flag( iglob_corners_ref(4) ) >= 1) then
+          ! gets face GLL points i,j,k indices from poroelastic element face
+          call get_element_face_gll_indices(iface_ref,ijk_face,NGLLX,NGLLY)
 
-            ! gets face GLL points i,j,k indices from poroelastic element face
-            call get_element_face_gll_indices(iface_ref,ijk_face,NGLLX,NGLLY)
+          ! gets face GLL 2Djacobian, weighted from element face
+          call get_jacobian_boundary_face(myrank,nspec, &
+                    xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, &
+                    dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, &
+                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                    ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY)
 
-              ! gets face GLL 2Djacobian, weighted from element face
-              call get_jacobian_boundary_face(myrank,nspec, &
-                        xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, &
-                        dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, &
-                        wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                        ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY)
+          ! normal convention: points away from acoustic, reference element
+          !                                switch normal direction if necessary
+          do j=1,NGLLY
+            do i=1,NGLLX
+              ! directs normals such that they point outwards of element
+              call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, &
+                                          ibool,nspec,nglob_dummy, &
+                                          xstore_dummy,ystore_dummy,zstore_dummy, &
+                                          normal_face(:,i,j) )
+              ! reverse the sign, we know we are in a poroelastic element
+              normal_face(:,i,j) = - normal_face(:,i,j)
+            enddo
+          enddo
 
-              ! normal convention: points away from acoustic, reference element
-              !                                switch normal direction if necessary
-              do j=1,NGLLY
-                do i=1,NGLLX
-                    ! directs normals such that they point outwards of element
-                    call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, &
-                                                ibool,nspec,nglob_dummy, &
-                                                xstore_dummy,ystore_dummy,zstore_dummy, &
-                                                normal_face(:,i,j) )
-                    ! reverse the sign, we know we are in a poroelastic element
-                    normal_face(:,i,j) = - normal_face(:,i,j)
-                enddo
-              enddo
+          ! stores informations about this face
+          inum = inum + 1
+          tmp_ispec(inum) = ispec
+          igll = 0
+          do j=1,NGLLY
+            do i=1,NGLLX
+              ! adds all gll points on this face
+              igll = igll + 1
 
-              ! stores informations about this face
-              inum = inum + 1
-              tmp_ispec(inum) = ispec
-              igll = 0
-              do j=1,NGLLY
-                do i=1,NGLLX
-                  ! adds all gll points on this face
-                  igll = igll + 1
+              ! we need to store local i,j,k,ispec info
+              tmp_ijk(:,igll,inum) = ijk_face(:,i,j)
 
-                  ! we need to store local i,j,k,ispec info
-                  tmp_ijk(:,igll,inum) = ijk_face(:,i,j)
-
-                  ! stores weighted jacobian and normals
-                  tmp_jacobian2Dw(igll,inum) = jacobian2Dw_face(i,j)
-                  tmp_normal(:,igll,inum) = normal_face(:,i,j)
-                enddo
-              enddo
-      endif ! acoustic_flag
-    enddo ! iface_ref
-   endif ! ispec_is_poroelastic
+              ! stores weighted jacobian and normals
+              tmp_jacobian2Dw(igll,inum) = jacobian2Dw_face(i,j)
+              tmp_normal(:,igll,inum) = normal_face(:,i,j)
+            enddo
+          enddo
+        endif ! acoustic_flag
+      enddo ! iface_ref
+    endif ! ispec_is_poroelastic
   enddo ! ispec
 
 ! stores completed coupling face informations
@@ -644,8 +497,7 @@
 ! user output
   call sum_all_i(num_coupling_ac_po_faces,inum)
   if( myrank == 0 ) then
-    write(IMAIN,*) '     acoustic-poroelastic coupling:'
-    write(IMAIN,*) '     total number of faces = ',inum
+    write(IMAIN,*) '     acoustic-poroelastic coupling: total number of faces = ',inum
   endif
 
   end subroutine get_coupling_surfaces_ac_poro
@@ -654,11 +506,7 @@
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine get_coupling_surfaces_el_poro(myrank, &
-                        nspec,ibool,NPROC, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                        num_interfaces_ext_mesh,max_interface_size_ext_mesh, &
-                        my_neighbours_ext_mesh)
+  subroutine get_coupling_surfaces_el_poro(myrank,nspec,ibool,elastic_flag)
 
 ! determines coupling surface for elastic-poroelastic domains
 
@@ -666,17 +514,12 @@
   implicit none
 
 ! number of spectral elements in each block
-  integer :: myrank,nspec,NPROC
+  integer :: myrank,nspec
 
 ! arrays with the mesh
   integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
 
-! MPI communication
-  integer :: num_interfaces_ext_mesh,max_interface_size_ext_mesh
-  integer, dimension(num_interfaces_ext_mesh) :: my_neighbours_ext_mesh
-  integer, dimension(NGLLX*NGLLX*max_interface_size_ext_mesh,num_interfaces_ext_mesh) :: &
-            ibool_interfaces_ext_mesh
-  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
+  integer,dimension(nglob_dummy) :: elastic_flag
 
 ! local parameters
   ! (assumes NGLLX=NGLLY=NGLLZ)
@@ -690,15 +533,10 @@
   integer,dimension(:),allocatable :: tmp_ispec,tmp_ispec_el
 
   integer,dimension(NGNOD2D) :: iglob_corners_ref,iglob_corners_ref_el
-  integer :: ispec,i,j,k,igll,ier,iglob,ispec_el,ispec_ref_el
+  integer :: ispec,i,j,k,igll,ier
+  integer :: ispec_el,ispec_ref_el
   integer :: inum,iface_ref,iface_ref_el,iface_el,icorner
-  integer :: count_poroelastic,count_elastic
 
-  ! mpi interface communication
-  integer, dimension(:), allocatable :: poroelastic_flag,elastic_flag,test_flag
-  integer, dimension(:,:), allocatable :: ibool_interfaces_ext_mesh_dummy
-  integer :: max_nibool_interfaces_ext_mesh
-
   ! corners indices of reference cube faces
   integer,dimension(3,4),parameter :: iface1_corner_ijk = &
              reshape( (/ 1,1,1, 1,NGLLY,1, 1,NGLLY,NGLLZ, 1,1,NGLLZ /),(/3,4/))   ! xmin
@@ -745,175 +583,100 @@
   tmp_normal(:,:,:) = 0.0
   tmp_jacobian2Dw(:,:) = 0.0
 
-  ! sets flags for elastic / poroelastic on global points
-  allocate(poroelastic_flag(nglob_dummy),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array poroelastic_flag'
-  allocate(elastic_flag(nglob_dummy),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array elastic_flag'
-  allocate(test_flag(nglob_dummy),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array test_flag'
-  poroelastic_flag(:) = 0
-  elastic_flag(:) = 0
-  test_flag(:) = 0
-  count_poroelastic = 0
-  count_elastic = 0
-  do ispec = 1, nspec
-    ! counts elements
-    if( ispec_is_poroelastic(ispec) ) count_poroelastic = count_poroelastic + 1
-    if( ispec_is_elastic(ispec) ) count_elastic = count_elastic + 1
-
-    ! sets flags on global points
-    do k = 1, NGLLZ
-      do j = 1, NGLLY
-        do i = 1, NGLLX
-          ! global index
-          iglob = ibool(i,j,k,ispec)
-          ! sets poroelastic flag
-          if( ispec_is_poroelastic(ispec) ) poroelastic_flag(iglob) =  myrank+1
-          ! sets elastic flag
-          if( ispec_is_elastic(ispec) ) elastic_flag(iglob) =  myrank+1
-          ! sets test flag
-          test_flag(iglob) = myrank+1
-        enddo
-      enddo
-    enddo
-  enddo
-  call sum_all_i(count_elastic,inum)
-  if( myrank == 0 ) then
-    write(IMAIN,*) '     total elastic elements:',inum
-  endif
-  call sum_all_i(count_poroelastic,inum)
-  if( myrank == 0 ) then
-    write(IMAIN,*) '     total poroelastic elements :',inum
-  endif
-
-  ! collects contributions from different MPI partitions
-  ! sets up MPI communications
-  max_nibool_interfaces_ext_mesh = maxval( nibool_interfaces_ext_mesh(:) )
-  allocate(ibool_interfaces_ext_mesh_dummy(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
-  if( ier /= 0 ) stop 'error allocating array ibool_interfaces_ext_mesh_dummy'
-  do i = 1, num_interfaces_ext_mesh
-     ibool_interfaces_ext_mesh_dummy(:,i) = ibool_interfaces_ext_mesh(1:max_nibool_interfaces_ext_mesh,i)
-  enddo
-  ! sums poroelastic flags
-  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,poroelastic_flag, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-                        my_neighbours_ext_mesh)
-  ! sums elastic flags
-  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,elastic_flag, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-                        my_neighbours_ext_mesh)
-
-  ! sums test flags
-  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,test_flag, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-                        my_neighbours_ext_mesh)
-
   ! loops over all element faces and
   ! counts number of coupling faces between elastic and poroelastic elements
   inum = 0
   do ispec=1,nspec
 
-   if(ispec_is_poroelastic(ispec)) then
+    if(ispec_is_poroelastic(ispec)) then
 
-    ! loops over each face
-    do iface_ref= 1, 6
+      ! loops over each face
+      do iface_ref= 1, 6
 
-      ! takes indices of corners of reference face
-      do icorner = 1,NGNOD2D
-        i = iface_all_corner_ijk(1,icorner,iface_ref)
-        j = iface_all_corner_ijk(2,icorner,iface_ref)
-        k = iface_all_corner_ijk(3,icorner,iface_ref)
-        ! global reference indices
-        iglob_corners_ref(icorner) = ibool(i,j,k,ispec)
+        ! takes indices of corners of reference face
+        call get_element_corners(ispec,iface_ref,xcoord,ycoord,zcoord,iglob_corners_ref, &
+                                ibool,nspec,nglob_dummy,xstore_dummy,ystore_dummy,zstore_dummy, &
+                                iface_all_corner_ijk)
 
-        ! reference corner coordinates
-        xcoord(icorner) = xstore_dummy(iglob_corners_ref(icorner))
-        ycoord(icorner) = ystore_dummy(iglob_corners_ref(icorner))
-        zcoord(icorner) = zstore_dummy(iglob_corners_ref(icorner))
+        ! checks if face has elastic side
+        if( elastic_flag( iglob_corners_ref(1) ) >= 1 .and. &
+           elastic_flag( iglob_corners_ref(2) ) >= 1 .and. &
+           elastic_flag( iglob_corners_ref(3) ) >= 1 .and. &
+           elastic_flag( iglob_corners_ref(4) ) >= 1) then
 
-      enddo
-
-      ! checks if face has elastic side
-      if( elastic_flag( iglob_corners_ref(1) ) >= 1 .and. &
-         elastic_flag( iglob_corners_ref(2) ) >= 1 .and. &
-         elastic_flag( iglob_corners_ref(3) ) >= 1 .and. &
-         elastic_flag( iglob_corners_ref(4) ) >= 1) then
-
-      ! need to find elastic element for coupling
+          ! need to find elastic element for coupling
+          !
+          ! note: this assumes that both, elastic and poroelastic element, are in the same
+          !          partition; check with decomposition that this is valid for this mesh partitioning
           do ispec_el=1,nspec
             if(ispec_is_elastic(ispec_el))then
-     do iface_el=6,1,-1
-      ! takes indices of corners of reference face
-      do icorner = 1,NGNOD2D
-        i = iface_all_corner_ijk(1,icorner,iface_el)
-        j = iface_all_corner_ijk(2,icorner,iface_el)
-        k = iface_all_corner_ijk(3,icorner,iface_el)
-        ! global reference indices
-        iglob_corners_ref_el(icorner) = ibool(i,j,k,ispec_el)
+              do iface_el=6,1,-1
+                ! takes indices of corners of reference face
+                do icorner = 1,NGNOD2D
+                  i = iface_all_corner_ijk(1,icorner,iface_el)
+                  j = iface_all_corner_ijk(2,icorner,iface_el)
+                  k = iface_all_corner_ijk(3,icorner,iface_el)
+                  ! global reference indices
+                  iglob_corners_ref_el(icorner) = ibool(i,j,k,ispec_el)
+                enddo
 
-      enddo
+                if ( (iglob_corners_ref(1) == iglob_corners_ref_el(3)) .and. &
+                  (iglob_corners_ref(3) == iglob_corners_ref_el(1)) ) then
 
-      if ( (iglob_corners_ref(1) == iglob_corners_ref_el(3)) .and. &
-      (iglob_corners_ref(3) == iglob_corners_ref_el(1)) ) then
+                  iface_ref_el = iface_el ![CM]: for some reason this shows a wrong orientation
+                                          ! but the calcul is ok.
+                  ispec_ref_el = ispec_el
 
-           iface_ref_el = iface_el ![CM]: for some reason this shows a wrong orientation
-                                   ! but the calcul is ok.
-           ispec_ref_el = ispec_el
+                  ! gets face GLL points i,j,k indices from poroelastic element face
+                  call get_element_face_gll_indices(iface_ref,ijk_face_po,NGLLX,NGLLY)
+                  ! gets face GLL points i,j,k indices from elastic element face
+                  call get_element_face_gll_indices(iface_ref_el,ijk_face_el,NGLLX,NGLLY)
 
-            ! gets face GLL points i,j,k indices from poroelastic element face
-            call get_element_face_gll_indices(iface_ref,ijk_face_po,NGLLX,NGLLY)
-            ! gets face GLL points i,j,k indices from elastic element face
-            call get_element_face_gll_indices(iface_ref_el,ijk_face_el,NGLLX,NGLLY)
+                  ! gets face GLL 2Djacobian, weighted from element face
+                  call get_jacobian_boundary_face(myrank,nspec, &
+                          xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, &
+                          dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, &
+                          wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                          ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY)
 
-              ! gets face GLL 2Djacobian, weighted from element face
-              call get_jacobian_boundary_face(myrank,nspec, &
-                        xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, &
-                        dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, &
-                        wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                        ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY)
+                  ! normal convention: points away from poroelastic, reference element
+                  do j=1,NGLLY
+                    do i=1,NGLLX
+                      ! directs normals such that they point outwards of poroelastic element
+                      call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, &
+                                                  ibool,nspec,nglob_dummy, &
+                                                  xstore_dummy,ystore_dummy,zstore_dummy, &
+                                                  normal_face(:,i,j) )
+                    enddo
+                  enddo
 
-              ! normal convention: points away from poroelastic, reference element
-              do j=1,NGLLY
-                do i=1,NGLLX
-                    ! directs normals such that they point outwards of poroelastic element
-                    call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, &
-                                                ibool,nspec,nglob_dummy, &
-                                                xstore_dummy,ystore_dummy,zstore_dummy, &
-                                                normal_face(:,i,j) )
-                enddo
-              enddo
+                  ! stores informations about this face
+                  inum = inum + 1
+                  tmp_ispec(inum) = ispec
+                  tmp_ispec_el(inum) = ispec_ref_el
+                  igll = 0
+                  do j=1,NGLLY
+                    do i=1,NGLLX
+                      ! adds all gll points on this face
+                      igll = igll + 1
 
-              ! stores informations about this face
-              inum = inum + 1
-              tmp_ispec(inum) = ispec
-              tmp_ispec_el(inum) = ispec_ref_el
-              igll = 0
-              do j=1,NGLLY
-                do i=1,NGLLX
-                  ! adds all gll points on this face
-                  igll = igll + 1
+                      ! we need to store local i,j,k,ispec info
+                      tmp_ijk(:,igll,inum) = ijk_face_po(:,i,j)
+                      tmp_ijk_el(:,igll,inum) = ijk_face_el(:,NGLLY-j+1,NGLLX-i+1)
 
-                  ! we need to store local i,j,k,ispec info
-                  tmp_ijk(:,igll,inum) = ijk_face_po(:,i,j)
-                  tmp_ijk_el(:,igll,inum) = ijk_face_el(:,NGLLY-j+1,NGLLX-i+1)
+                      ! stores weighted jacobian and normals
+                      tmp_jacobian2Dw(igll,inum) = jacobian2Dw_face(i,j)
+                      tmp_normal(:,igll,inum) = normal_face(:,i,j)
+                    enddo
+                  enddo
+                endif ! if
 
-                  ! stores weighted jacobian and normals
-                  tmp_jacobian2Dw(igll,inum) = jacobian2Dw_face(i,j)
-                  tmp_normal(:,igll,inum) = normal_face(:,i,j)
-                enddo
-              enddo
-       endif ! if
-
-     enddo ! do iface_ref_el=1,6
-             endif ! if(ispec_is_elastic(ispec_el))then
+              enddo ! do iface_ref_el=1,6
+            endif ! if(ispec_is_elastic(ispec_el))then
           enddo ! do ispec_el=1,nspec
-      endif ! elastic_flag
-    enddo ! iface_ref
-   endif ! ispec_is_poroelastic
+        endif ! elastic_flag
+      enddo ! iface_ref
+    endif ! ispec_is_poroelastic
   enddo ! ispec
 
 ! stores completed coupling face informations
@@ -946,575 +709,8 @@
 ! user output
   call sum_all_i(num_coupling_el_po_faces,inum)
   if( myrank == 0 ) then
-    write(IMAIN,*) '     elastic-poroelastic coupling:'
-    write(IMAIN,*) '     total number of faces = ',inum
+    write(IMAIN,*) '     elastic-poroelastic coupling : total number of faces = ',inum
   endif
 
   end subroutine get_coupling_surfaces_el_poro
 
-!
-!-------------------------------------------------------------------------------------------------
-!
-
-! not working properly yet...
-
-!  subroutine get_coupling_surfaces_comb(myrank, &
-!                        nspec,ibool,NPROC, &
-!                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-!                        num_interfaces_ext_mesh,max_interface_size_ext_mesh, &
-!                        my_neighbours_ext_mesh)
-!
-!! determines coupling surface for acoustic-elastic-poroelastic domains
-!
-!  use create_regions_mesh_ext_par
-!  implicit none
-!
-!! number of spectral elements in each block
-!  integer :: myrank,nspec,NPROC
-!
-!! arrays with the mesh
-!  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
-!
-!! MPI communication
-!  integer :: num_interfaces_ext_mesh,max_interface_size_ext_mesh
-!  integer, dimension(num_interfaces_ext_mesh) :: my_neighbours_ext_mesh
-!  integer, dimension(NGLLX*NGLLX*max_interface_size_ext_mesh,num_interfaces_ext_mesh) :: &
-!            ibool_interfaces_ext_mesh
-!  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
-!
-!! local parameters
-!  ! (assumes NGLLX=NGLLY=NGLLZ)
-!  real(kind=CUSTOM_REAL),dimension(NGNOD2D) :: xcoord,ycoord,zcoord
-!  real(kind=CUSTOM_REAL) :: jacobian2Dw_face(NGLLX,NGLLY)
-!  real(kind=CUSTOM_REAL) :: normal_face(NDIM,NGLLX,NGLLY)
-!  real(kind=CUSTOM_REAL),dimension(:,:,:),allocatable :: tmp_normal
-!  real(kind=CUSTOM_REAL),dimension(:,:),allocatable :: tmp_jacobian2Dw
-!  integer :: ijk_face(3,NGLLX,NGLLY)
-!  integer,dimension(:,:,:),allocatable :: tmp_ijk
-!  integer,dimension(:),allocatable :: tmp_ispec
-!
-!  integer,dimension(NGNOD2D) :: iglob_corners_ref !,iglob_corners
-!  integer :: ispec,i,j,k,igll,ier,iglob
-!  integer :: inum,inum_ac,inum_el,inum_po,iface_ref,icorner,iglob_midpoint ! iface,ispec_neighbor
-!  integer :: inum_ac_el,inum_el_po,inum_ac_po
-!  integer :: count_elastic,count_acoustic,count_poroelastic
-!
-!  ! mpi interface communication
-!  integer, dimension(:), allocatable :: elastic_flag,acoustic_flag,poroelastic_flag,test_flag
-!  integer, dimension(:,:), allocatable :: ibool_interfaces_ext_mesh_dummy
-!  integer :: max_nibool_interfaces_ext_mesh
-!  logical, dimension(:), allocatable :: mask_ibool_ac_el,mask_ibool_ac_po,mask_ibool_el_po
-!
-!  ! corners indices of reference cube faces
-!  integer,dimension(3,4),parameter :: iface1_corner_ijk = &
-!             reshape( (/ 1,1,1, 1,NGLLY,1, 1,NGLLY,NGLLZ, 1,1,NGLLZ /),(/3,4/))   ! xmin
-!  integer,dimension(3,4),parameter :: iface2_corner_ijk = &
-!             reshape( (/ NGLLX,1,1, NGLLX,NGLLY,1, NGLLX,NGLLY,NGLLZ, NGLLX,1,NGLLZ  /),(/3,4/))   ! xmax
-!  integer,dimension(3,4),parameter :: iface3_corner_ijk = &
-!             reshape( (/ 1,1,1, 1,1,NGLLZ, NGLLX,1,NGLLZ, NGLLX,1,1  /),(/3,4/))   ! ymin
-!  integer,dimension(3,4),parameter :: iface4_corner_ijk = &
-!             reshape( (/ 1,NGLLY,1, NGLLX,NGLLY,1, NGLLX,NGLLY,NGLLZ, 1,NGLLY,NGLLZ /),(/3,4/))   ! ymax
-!  integer,dimension(3,4),parameter :: iface5_corner_ijk = &
-!             reshape( (/ 1,1,1, 1,NGLLY,1, NGLLX,NGLLY,1, NGLLX,1,1 /),(/3,4/))  ! bottom
-!  integer,dimension(3,4),parameter :: iface6_corner_ijk = &
-!             reshape( (/ 1,1,NGLLZ, NGLLX,1,NGLLZ, NGLLX,NGLLY,NGLLZ, 1,NGLLY,NGLLZ  /),(/3,4/))   ! top
-!  integer,dimension(3,4,6),parameter :: iface_all_corner_ijk = &
-!             reshape( (/ iface1_corner_ijk,iface2_corner_ijk, &
-!                 iface3_corner_ijk,iface4_corner_ijk, &
-!                 iface5_corner_ijk,iface6_corner_ijk /),(/3,4,6/))   ! all faces
-!  ! midpoint indices for each face (xmin,xmax,ymin,ymax,zmin,zmax)
-!  integer,dimension(3,6),parameter :: iface_all_midpointijk = &
-!             reshape( (/ 1,2,2, NGLLX,2,2, 2,1,2, 2,NGLLY,2, 2,2,1, 2,2,NGLLZ  /),(/3,6/))   ! top
-!
-!
-!  ! test vtk output
-!  !integer,dimension(NGLLX,NGLLY,NGLLZ,NSPEC) :: gll_data
-!  !character(len=256):: prname_file
-!
-!  ! initializes number of coupling faces
-!  num_coupling_ac_el_faces = 0
-!  num_coupling_ac_po_faces = 0
-!  num_coupling_el_po_faces = 0
-!
-!  ! allocates temporary arrays
-!  allocate(tmp_normal(NDIM,NGLLSQUARE,nspec*6),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array tmp_normal'
-!  allocate(tmp_jacobian2Dw(NGLLSQUARE,nspec*6),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array tmp_jacobian2Dw'
-!  allocate(tmp_ijk(3,NGLLSQUARE,nspec*6),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array tmp_ijk'
-!  allocate(tmp_ispec(nspec*6),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array tmp_ispec'
-!  tmp_ispec(:) = 0
-!  tmp_ijk(:,:,:) = 0
-!  tmp_normal(:,:,:) = 0.0
-!  tmp_jacobian2Dw(:,:) = 0.0
-!
-!  ! sets flags for acoustic / elastic /poroelastic on global points
-!  allocate(elastic_flag(nglob_dummy),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array elastic_flag'
-!  allocate(acoustic_flag(nglob_dummy),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array acoustic_flag'
-!  allocate(poroelastic_flag(nglob_dummy),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array poroelastic_flag'
-!  allocate(test_flag(nglob_dummy),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array test_flag'
-!  allocate(mask_ibool_ac_el(nglob_dummy),stat=ier)
-!  allocate(mask_ibool_ac_po(nglob_dummy),stat=ier)
-!  allocate(mask_ibool_el_po(nglob_dummy),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array mask_ibool'
-!  elastic_flag(:) = 0
-!  acoustic_flag(:) = 0
-!  poroelastic_flag(:) = 0
-!  test_flag(:) = 0
-!  count_elastic = 0
-!  count_acoustic = 0
-!  count_poroelastic = 0
-!  do ispec = 1, nspec
-!    ! counts elements
-!    if( ispec_is_elastic(ispec) ) count_elastic = count_elastic + 1
-!    if( ispec_is_acoustic(ispec) ) count_acoustic = count_acoustic + 1
-!    if( ispec_is_poroelastic(ispec) ) count_poroelastic = count_poroelastic + 1
-!
-!    ! sets flags on global points
-!    do k = 1, NGLLZ
-!      do j = 1, NGLLY
-!        do i = 1, NGLLX
-!          ! global index
-!          iglob = ibool(i,j,k,ispec)
-!          ! sets elastic flag
-!          if( ispec_is_elastic(ispec) ) elastic_flag(iglob) =  myrank+1
-!          ! sets acoustic flag
-!          if( ispec_is_acoustic(ispec) ) acoustic_flag(iglob) =  myrank+1
-!          ! sets poroelastic flag
-!          if( ispec_is_poroelastic(ispec) ) poroelastic_flag(iglob) =  myrank+1
-!          ! sets test flag
-!          test_flag(iglob) = myrank+1
-!        enddo
-!      enddo
-!    enddo
-!  enddo
-!  call sum_all_i(count_acoustic,inum_ac)
-!  if( myrank == 0 ) then
-!    write(IMAIN,*) '     total acoustic elements    :',inum_ac
-!  endif
-!  call sum_all_i(count_elastic,inum_el)
-!  if( myrank == 0 ) then
-!    write(IMAIN,*) '     total elastic elements     :',inum_el
-!  endif
-!  call sum_all_i(count_poroelastic,inum_po)
-!  if( myrank == 0 ) then
-!    write(IMAIN,*) '     total poroelastic elements :',inum_po
-!  endif
-!
-!  ! collects contributions from different MPI partitions
-!  ! sets up MPI communications
-!  max_nibool_interfaces_ext_mesh = maxval( nibool_interfaces_ext_mesh(:) )
-!  allocate(ibool_interfaces_ext_mesh_dummy(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array ibool_interfaces_ext_mesh_dummy'
-!  do i = 1, num_interfaces_ext_mesh
-!     ibool_interfaces_ext_mesh_dummy(:,i) = ibool_interfaces_ext_mesh(1:max_nibool_interfaces_ext_mesh,i)
-!  enddo
-!  ! sums elastic flags
-!  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,elastic_flag, &
-!                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-!                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-!                        my_neighbours_ext_mesh)
-!  ! sums acoustic flags
-!  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,acoustic_flag, &
-!                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-!                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-!                        my_neighbours_ext_mesh)
-!  ! sums poroelastic flags
-!  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,poroelastic_flag, &
-!                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-!                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-!                        my_neighbours_ext_mesh)
-!  ! sums test flags
-!  call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,test_flag, &
-!                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-!                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
-!                        my_neighbours_ext_mesh)
-!
-!!----------------------
-!! acoustic-elastic
-!!----------------------
-!  inum_ac_el = 0
-!  !if (inum_el >0 .and. inum_ac >0) then
-!  ! loops over all element faces and
-!  ! counts number of coupling faces between acoustic and elastic elements
-!  mask_ibool_ac_el(:) = .false.
-!  do ispec=1,nspec
-!
-!    ! loops over each face
-!    do iface_ref= 1, 6
-!
-!      ! takes indices of corners of reference face
-!      do icorner = 1,NGNOD2D
-!        i = iface_all_corner_ijk(1,icorner,iface_ref)
-!        j = iface_all_corner_ijk(2,icorner,iface_ref)
-!        k = iface_all_corner_ijk(3,icorner,iface_ref)
-!        ! global reference indices
-!        iglob_corners_ref(icorner) = ibool(i,j,k,ispec)
-!
-!        ! reference corner coordinates
-!        xcoord(icorner) = xstore_dummy(iglob_corners_ref(icorner))
-!        ycoord(icorner) = ystore_dummy(iglob_corners_ref(icorner))
-!        zcoord(icorner) = zstore_dummy(iglob_corners_ref(icorner))
-!      enddo
-!
-!      ! checks if face has acoustic side
-!      if( acoustic_flag( iglob_corners_ref(1) ) >= 1 .and. &
-!         acoustic_flag( iglob_corners_ref(2) ) >= 1 .and. &
-!         acoustic_flag( iglob_corners_ref(3) ) >= 1 .and. &
-!         acoustic_flag( iglob_corners_ref(4) ) >= 1) then
-!        ! checks if face is has an elastic side
-!        if( elastic_flag( iglob_corners_ref(1) ) >= 1 .and. &
-!           elastic_flag( iglob_corners_ref(2) ) >= 1 .and. &
-!           elastic_flag( iglob_corners_ref(3) ) >= 1 .and. &
-!           elastic_flag( iglob_corners_ref(4) ) >= 1) then
-!
-!          ! reference midpoint on face (used to avoid redundant face counting)
-!          i = iface_all_midpointijk(1,iface_ref)
-!          j = iface_all_midpointijk(2,iface_ref)
-!          k = iface_all_midpointijk(3,iface_ref)
-!          iglob_midpoint = ibool(i,j,k,ispec)
-!
-!          ! checks if points on this face are masked already
-!          if( .not. mask_ibool_ac_el(iglob_midpoint) ) then
-!
-!            ! gets face GLL points i,j,k indices from element face
-!            call get_element_face_gll_indices(iface_ref,ijk_face,NGLLX,NGLLY)
-!
-!            ! takes each element face only once, if it lies on an MPI interface
-!            ! note: this is not exactly load balanced
-!            !          lowest rank process collects as many faces as possible, second lowest as so forth
-!            if( (test_flag(iglob_midpoint) == myrank+1) .or. &
-!               (test_flag(iglob_midpoint) > 2*(myrank+1)) ) then
-!
-!              ! gets face GLL 2Djacobian, weighted from element face
-!              call get_jacobian_boundary_face(myrank,nspec, &
-!                        xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, &
-!                        dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, &
-!                        wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-!                        ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY)
-!
-!              ! normal convention: points away from acoustic, reference element
-!              !                                switch normal direction if necessary
-!              do j=1,NGLLY
-!                do i=1,NGLLX
-!                    ! directs normals such that they point outwards of element
-!                    call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, &
-!                                                ibool,nspec,nglob_dummy, &
-!                                                xstore_dummy,ystore_dummy,zstore_dummy, &
-!                                                normal_face(:,i,j) )
-!                    ! makes sure that it always points away from acoustic element,
-!                    ! otherwise switch direction
-!                    if( ispec_is_elastic(ispec) ) normal_face(:,i,j) = - normal_face(:,i,j)
-!                enddo
-!              enddo
-!
-!              ! stores informations about this face
-!              inum_ac_el = inum_ac_el + 1
-!              tmp_ispec(inum_ac_el) = ispec
-!              igll = 0
-!              do j=1,NGLLY
-!                do i=1,NGLLX
-!                  ! adds all gll points on this face
-!                  igll = igll + 1
-!
-!                  ! do we need to store local i,j,k,ispec info? or only global indices iglob?
-!                  tmp_ijk(:,igll,inum_ac_el) = ijk_face(:,i,j)
-!
-!                  ! stores weighted jacobian and normals
-!                  tmp_jacobian2Dw(igll,inum_ac_el) = jacobian2Dw_face(i,j)
-!                  tmp_normal(:,igll,inum_ac_el) = normal_face(:,i,j)
-!
-!                  ! masks global points ( to avoid redundant counting of faces)
-!                  iglob = ibool(ijk_face(1,i,j),ijk_face(2,i,j),ijk_face(3,i,j),ispec)
-!                  mask_ibool_ac_el(iglob) = .true.
-!                enddo
-!              enddo
-!            else
-!              ! assumes to be already collected by lower rank process, masks face points
-!              do j=1,NGLLY
-!                do i=1,NGLLX
-!                  iglob = ibool(ijk_face(1,i,j),ijk_face(2,i,j),ijk_face(3,i,j),ispec)
-!                  mask_ibool_ac_el(iglob) = .true.
-!                enddo
-!              enddo
-!            endif ! test_flag
-!          endif ! mask_ibool
-!        endif ! elastic_flag
-!      endif ! acoustic_flag
-!    enddo ! iface_ref
-!  enddo ! ispec
-!
-!  !endif !if (count_elastic >0 .and. count_acoustic >0)
-!
-!! stores completed coupling face informations
-!!
-!! note: no need to store material parameters on these coupling points
-!!          for acoustic-elastic interface
-!  num_coupling_ac_el_faces = inum_ac_el
-!  allocate(coupling_ac_el_normal(NDIM,NGLLSQUARE,num_coupling_ac_el_faces),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array coupling_ac_el_normal'
-!  allocate(coupling_ac_el_jacobian2Dw(NGLLSQUARE,num_coupling_ac_el_faces),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array coupling_ac_el_jacobian2Dw'
-!  allocate(coupling_ac_el_ijk(3,NGLLSQUARE,num_coupling_ac_el_faces),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array coupling_ac_el_ijk'
-!  allocate(coupling_ac_el_ispec(num_coupling_ac_el_faces),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array coupling_ac_el_ispec'
-!  do inum = 1,num_coupling_ac_el_faces
-!    coupling_ac_el_normal(:,:,inum) = tmp_normal(:,:,inum)
-!    coupling_ac_el_jacobian2Dw(:,inum) = tmp_jacobian2Dw(:,inum)
-!    coupling_ac_el_ijk(:,:,inum) = tmp_ijk(:,:,inum)
-!    coupling_ac_el_ispec(inum) = tmp_ispec(inum)
-!  enddo
-!
-!! user output
-!! makes sure processes are synchronized
-!  call sum_all_i(num_coupling_ac_el_faces,inum_ac_el)
-!  if( myrank == 0 ) then
-!    write(IMAIN,*) '     acoustic-elastic coupling:'
-!    write(IMAIN,*) '     total number of faces = ',inum_ac_el
-!  endif
-!
-!
-!!----------------------
-!! acoustic-poroelastic
-!!----------------------
-!  tmp_ispec(:) = 0
-!  tmp_ijk(:,:,:) = 0
-!  tmp_normal(:,:,:) = 0.0
-!  tmp_jacobian2Dw(:,:) = 0.0
-!  inum_ac_po = 0
-!  !if (inum_po >0 .and. inum_ac >0) then
-!  ! loops over all element faces and
-!  ! counts number of coupling faces between acoustic and poroelastic elements
-!  do ispec=1,nspec
-!
-!   if(ispec_is_poroelastic(ispec)) then
-!
-!    ! loops over each face
-!    do iface_ref= 1, 6
-!
-!      ! takes indices of corners of reference face
-!      do icorner = 1,NGNOD2D
-!        i = iface_all_corner_ijk(1,icorner,iface_ref)
-!        j = iface_all_corner_ijk(2,icorner,iface_ref)
-!        k = iface_all_corner_ijk(3,icorner,iface_ref)
-!        ! global reference indices
-!        iglob_corners_ref(icorner) = ibool(i,j,k,ispec)
-!
-!        ! reference corner coordinates
-!        xcoord(icorner) = xstore_dummy(iglob_corners_ref(icorner))
-!        ycoord(icorner) = ystore_dummy(iglob_corners_ref(icorner))
-!        zcoord(icorner) = zstore_dummy(iglob_corners_ref(icorner))
-!      enddo
-!
-!      ! checks if face has acoustic side
-!      if( acoustic_flag( iglob_corners_ref(1) ) >= 1 .and. &
-!         acoustic_flag( iglob_corners_ref(2) ) >= 1 .and. &
-!         acoustic_flag( iglob_corners_ref(3) ) >= 1 .and. &
-!         acoustic_flag( iglob_corners_ref(4) ) >= 1) then
-!
-!
-!            ! gets face GLL points i,j,k indices from element face
-!            call get_element_face_gll_indices(iface_ref,ijk_face,NGLLX,NGLLY)
-!
-!            ! gets face GLL 2Djacobian, weighted from element face
-!              call get_jacobian_boundary_face(myrank,nspec, &
-!                        xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, &
-!                        dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, &
-!                        wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-!                        ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY)
-!
-!              ! normal convention: points away from acoustic, reference element
-!              !                                switch normal direction if necessary
-!              do j=1,NGLLY
-!                do i=1,NGLLX
-!                    ! directs normals such that they point outwards of element
-!                    call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, &
-!                                                ibool,nspec,nglob_dummy, &
-!                                                xstore_dummy,ystore_dummy,zstore_dummy, &
-!                                                normal_face(:,i,j) )
-!                    ! makes sure that it always points away from acoustic element,
-!                    ! otherwise switch direction
-!                    !if( ispec_is_poroelastic(ispec) ) normal_face(:,i,j) = - normal_face(:,i,j)
-!                    normal_face(:,i,j) = - normal_face(:,i,j)
-!                enddo
-!              enddo
-!
-!              ! stores informations about this face
-!              inum_ac_po = inum_ac_po + 1
-!              tmp_ispec(inum_ac_po) = ispec
-!              igll = 0
-!              do j=1,NGLLY
-!                do i=1,NGLLX
-!                  ! adds all gll points on this face
-!                  igll = igll + 1
-!
-!                  ! do we need to store local i,j,k,ispec info? or only global indices iglob?
-!                  tmp_ijk(:,igll,inum_ac_po) = ijk_face(:,i,j)
-!
-!                  ! stores weighted jacobian and normals
-!                  tmp_jacobian2Dw(igll,inum_ac_po) = jacobian2Dw_face(i,j)
-!                  tmp_normal(:,igll,inum_ac_po) = normal_face(:,i,j)
-!
-!                enddo
-!              enddo
-!      endif ! acoustic_flag
-!    enddo ! iface_ref
-!   endif ! ispec_is_poroelastic
-!  enddo ! ispec
-!
-!  !endif !if (count_poroelastic >0 .and. count_acoustic >0)
-!
-!! stores completed coupling face informations
-!!
-!! note: for this coupling we need to have access to porous properties. The construction is such
-!! that i,j,k, & face correspond to poroelastic interface. Note that the normal is pointing outward the
-!! acoustic element
-!  num_coupling_ac_po_faces = inum_ac_po
-!  allocate(coupling_ac_po_normal(NDIM,NGLLSQUARE,num_coupling_ac_po_faces),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array coupling_ac_po_normal'
-!  allocate(coupling_ac_po_jacobian2Dw(NGLLSQUARE,num_coupling_ac_po_faces),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array coupling_ac_po_jacobian2Dw'
-!  allocate(coupling_ac_po_ijk(3,NGLLSQUARE,num_coupling_ac_po_faces),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array coupling_ac_po_ijk'
-!  allocate(coupling_ac_po_ispec(num_coupling_ac_po_faces),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array coupling_ac_po_ispec'
-!  do inum = 1,num_coupling_ac_po_faces
-!    coupling_ac_po_normal(:,:,inum) = tmp_normal(:,:,inum)
-!    coupling_ac_po_jacobian2Dw(:,inum) = tmp_jacobian2Dw(:,inum)
-!    coupling_ac_po_ijk(:,:,inum) = tmp_ijk(:,:,inum)
-!    coupling_ac_po_ispec(inum) = tmp_ispec(inum)
-!  enddo
-!
-!! user output
-!! makes sure processes are synchronized
-!  call sum_all_i(num_coupling_ac_po_faces,inum_ac_po)
-!  if( myrank == 0 ) then
-!    write(IMAIN,*) '     acoustic-poroelastic coupling:'
-!    write(IMAIN,*) '     total number of faces = ',inum_ac_po
-!  endif
-!
-!
-!!----------------------
-!! elastic-poroelastic
-!!----------------------
-!  tmp_ispec(:) = 0
-!  tmp_ijk(:,:,:) = 0
-!  tmp_normal(:,:,:) = 0.0
-!  tmp_jacobian2Dw(:,:) = 0.0
-!  inum_el_po = 0
-!  !if (inum_el >0 .and. inum_po >0) then
-!  ! loops over all element faces and
-!  ! counts number of coupling faces between elastic and poroelastic elements
-!  do ispec=1,nspec
-!
-!   if(ispec_is_poroelastic(ispec)) then
-!
-!    ! loops over each face
-!    do iface_ref= 1, 6
-!
-!      ! takes indices of corners of reference face
-!      do icorner = 1,NGNOD2D
-!        i = iface_all_corner_ijk(1,icorner,iface_ref)
-!        j = iface_all_corner_ijk(2,icorner,iface_ref)
-!        k = iface_all_corner_ijk(3,icorner,iface_ref)
-!        ! global reference indices
-!        iglob_corners_ref(icorner) = ibool(i,j,k,ispec)
-!
-!        ! reference corner coordinates
-!        xcoord(icorner) = xstore_dummy(iglob_corners_ref(icorner))
-!        ycoord(icorner) = ystore_dummy(iglob_corners_ref(icorner))
-!        zcoord(icorner) = zstore_dummy(iglob_corners_ref(icorner))
-!      enddo
-!
-!      ! checks if face has elastic side
-!      if( elastic_flag( iglob_corners_ref(1) ) >= 1 .and. &
-!         elastic_flag( iglob_corners_ref(2) ) >= 1 .and. &
-!         elastic_flag( iglob_corners_ref(3) ) >= 1 .and. &
-!         elastic_flag( iglob_corners_ref(4) ) >= 1) then
-!
-!            ! gets face GLL points i,j,k indices from element face
-!            call get_element_face_gll_indices(iface_ref,ijk_face,NGLLX,NGLLY)
-!
-!              ! gets face GLL 2Djacobian, weighted from element face
-!              call get_jacobian_boundary_face(myrank,nspec, &
-!                        xstore_dummy,ystore_dummy,zstore_dummy,ibool,nglob_dummy, &
-!                        dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, &
-!                        wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-!                        ispec,iface_ref,jacobian2Dw_face,normal_face,NGLLX,NGLLY)
-!
-!              ! normal convention: points away from poroelastic, reference element
-!              do j=1,NGLLY
-!                do i=1,NGLLX
-!                    ! directs normals such that they point outwards of poroelastic element
-!                    call get_element_face_normal(ispec,iface_ref,xcoord,ycoord,zcoord, &
-!                                                ibool,nspec,nglob_dummy, &
-!                                                xstore_dummy,ystore_dummy,zstore_dummy, &
-!                                                normal_face(:,i,j) )
-!                enddo
-!              enddo
-!
-!              ! stores informations about this face
-!              inum_el_po = inum_el_po + 1
-!              tmp_ispec(inum_el_po) = ispec
-!              igll = 0
-!              do j=1,NGLLY
-!                do i=1,NGLLX
-!                  ! adds all gll points on this face
-!                  igll = igll + 1
-!
-!                  ! do we need to store local i,j,k,ispec info? or only global indices iglob?
-!                  tmp_ijk(:,igll,inum_el_po) = ijk_face(:,i,j)
-!
-!                  ! stores weighted jacobian and normals
-!                  tmp_jacobian2Dw(igll,inum_el_po) = jacobian2Dw_face(i,j)
-!                  tmp_normal(:,igll,inum_el_po) = normal_face(:,i,j)
-!
-!                enddo
-!              enddo
-!        endif ! elastic_flag
-!    enddo ! iface_ref
-!   endif ! ispec_is_poroelastic
-!  enddo ! ispec
-!
-!  !endif !if (count_elastic >0 .and. count_poroelastic >0)
-!
-!! stores completed coupling face informations
-!!
-!! note: for this coupling we need to have access to porous properties. The construction is such
-!! that i,j,k, & face correspond to poroelastic interface. Note that the normal is pointing outward the
-!! poroelastic element
-!  num_coupling_el_po_faces = inum_el_po
-!  allocate(coupling_el_po_normal(NDIM,NGLLSQUARE,num_coupling_el_po_faces),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array coupling_el_po_normal'
-!  allocate(coupling_el_po_jacobian2Dw(NGLLSQUARE,num_coupling_el_po_faces),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array coupling_el_po_jacobian2Dw'
-!  allocate(coupling_el_po_ijk(3,NGLLSQUARE,num_coupling_el_po_faces),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array coupling_el_po_ijk'
-!  allocate(coupling_el_po_ispec(num_coupling_el_po_faces),stat=ier)
-!  if( ier /= 0 ) stop 'error allocating array coupling_el_po_ispec'
-!  do inum = 1,num_coupling_el_po_faces
-!    coupling_el_po_normal(:,:,inum) = tmp_normal(:,:,inum)
-!    coupling_el_po_jacobian2Dw(:,inum) = tmp_jacobian2Dw(:,inum)
-!    coupling_el_po_ijk(:,:,inum) = tmp_ijk(:,:,inum)
-!    coupling_el_po_ispec(inum) = tmp_ispec(inum)
-!  enddo
-!
-!! user output
-!! makes sure processes are synchronized
-!  call sum_all_i(num_coupling_el_po_faces,inum_el_po)
-!  if( myrank == 0 ) then
-!    write(IMAIN,*) '     elastic-poroelastic coupling:'
-!    write(IMAIN,*) '     total number of faces = ',inum_el_po
-!  endif
-!
-!
-!
-!  end subroutine get_coupling_surfaces_comb
-

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/get_model.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/get_model.f90	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/get_model.f90	2012-06-19 22:23:01 UTC (rev 20392)
@@ -28,7 +28,7 @@
   subroutine get_model(myrank,nspec,ibool,mat_ext_mesh,nelmnts_ext_mesh, &
                         materials_ext_mesh,nmat_ext_mesh, &
                         undef_mat_prop,nundefMat_ext_mesh, &
-                        ANISOTROPY,LOCAL_PATH)
+                        ANISOTROPY)
 
   use generate_databases_par,only: IMODEL
   use create_regions_mesh_ext_par
@@ -51,8 +51,6 @@
   ! anisotropy
   logical :: ANISOTROPY
 
-  character(len=256) LOCAL_PATH
-
   ! local parameters
   real(kind=CUSTOM_REAL) :: vp,vs,rho,qmu_atten
   real(kind=CUSTOM_REAL) :: c11,c12,c13,c14,c15,c16,c22,c23,c24,c25, &
@@ -73,12 +71,18 @@
   double precision :: xmesh,ymesh,zmesh
   integer :: iglob
 
+  ! timing
+  double precision, external :: wtime
+  double precision :: time_start,tCPU
+
   ! initializes element domain flags
   ispec_is_acoustic(:) = .false.
   ispec_is_elastic(:) = .false.
   ispec_is_poroelastic(:) = .false.
 
-  print*,"nundefMat_ext_mesh:",nundefMat_ext_mesh
+  !debug
+  !print*,"nundefMat_ext_mesh:",nundefMat_ext_mesh
+
 ! prepares tomography model if needed for elements with undefined material definitions
   ! TODO: Max -- somehow this code is breaking when I try to run
   ! Piero's PREM
@@ -98,6 +102,8 @@
 ! in case, see file model_interface_bedrock.f90:
 !  call model_bedrock_broadcast(myrank)
 
+  ! get MPI starting time
+  time_start = wtime()
 
   ! material properties on all GLL points: taken from material values defined for
   ! each spectral element in input mesh
@@ -296,6 +302,17 @@
         enddo
       enddo
     enddo
+
+    ! user output
+    if(myrank == 0 ) then
+      if( mod(ispec,nspec/10) == 0 ) then
+        tCPU = wtime() - time_start
+        ! remaining
+        tCPU = (10.0-ispec/(nspec/10.0))/ispec/(nspec/10.0)*tCPU
+        write(IMAIN,*) "    ",ispec/(nspec/10) * 10," %", &
+                      " time remaining:", tCPU,"s"
+      endif
+    endif
   enddo
 
   ! checks material domains
@@ -324,16 +341,6 @@
     endif
   enddo
 
-  ! GLL model
-  ! variables for importing models from files in SPECFEM format, e.g.,  proc000000_vp.bin etc.
-  ! can be used for importing updated model in iterative inversions
-  if( IMODEL == IMODEL_GLL ) then
-    ! note:
-    ! import the model from files in SPECFEM format
-    ! note that those those files should be saved in LOCAL_PATH
-    call model_gll(myrank,nspec,LOCAL_PATH)
-  endif
-
   end subroutine get_model
 
 !
@@ -393,7 +400,7 @@
   ! selects chosen velocity model
   select case( IMODEL )
 
-  case( IMODEL_DEFAULT, IMODEL_GLL )
+  case( IMODEL_DEFAULT,IMODEL_GLL,IMODEL_IPATI )
     ! material values determined by mesh properties
     call model_default(materials_ext_mesh,nmat_ext_mesh, &
                           undef_mat_prop,nundefMat_ext_mesh, &
@@ -459,3 +466,39 @@
   endif
 
   end subroutine get_model_values
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine get_model_binaries(myrank,nspec,LOCAL_PATH)
+
+! reads in material parameters from external binary files
+
+  use generate_databases_par,only: IMODEL
+  use create_regions_mesh_ext_par
+  implicit none
+
+  ! number of spectral elements in each block
+  integer :: myrank,nspec
+  character(len=256) :: LOCAL_PATH
+
+  ! external GLL models
+  ! variables for importing models from files in SPECFEM format, e.g.,  proc000000_vp.bin etc.
+  ! can be used for importing updated model in iterative inversions
+
+  ! note: we read in these binary files after mesh coloring, since mesh coloring is permuting arrays.
+  !          here, the ordering in **_vp.bin etc. can be permuted as they are outputted when saving mesh files
+
+  select case( IMODEL )
+  case( IMODEL_GLL )
+    ! note:
+    ! import the model from files in SPECFEM format
+    ! note that those those files should be saved in LOCAL_PATH
+    call model_gll(myrank,nspec,LOCAL_PATH)
+  case( IMODEL_IPATI )
+    ! import the model from modified files in SPECFEM format
+    call model_ipati(myrank,nspec,LOCAL_PATH)
+  end select
+
+  end subroutine get_model_binaries

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/model_external_values.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/model_external_values.f90	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/model_external_values.f90	2012-06-19 22:23:01 UTC (rev 20392)
@@ -172,7 +172,7 @@
                                   nspec,nglob_dummy,ibool,xstore_dummy,ystore_dummy,zstore_dummy, &
                                   num_free_surface_faces,free_surface_ispec,free_surface_ijk)
 
-                    
+
   ! depth in Z-direction
   if( distmin < HUGEVAL ) then
     depth = elevation - z

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/model_gll.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/model_gll.f90	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/model_gll.f90	2012-06-19 22:23:01 UTC (rev 20392)
@@ -60,7 +60,11 @@
   if( ier /= 0 ) stop 'error allocating array rho_read'
 
   filename = prname_lp(1:len_trim(prname_lp))//'rho.bin'
-  open(unit=28,file=trim(filename),status='unknown',action='read',form='unformatted')
+  open(unit=28,file=trim(filename),status='old',action='read',form='unformatted',iostat=ier)
+  if( ier /= 0 ) then
+    print*,'error opening file: ',trim(filename)
+    stop 'error reading rho.bin file'
+  endif
 
   read(28) rho_read
   close(28)
@@ -70,7 +74,11 @@
   if( ier /= 0 ) stop 'error allocating array vp_read'
 
   filename = prname_lp(1:len_trim(prname_lp))//'vp.bin'
-  open(unit=28,file=trim(filename),status='unknown',action='read',form='unformatted')
+  open(unit=28,file=trim(filename),status='old',action='read',form='unformatted',iostat=ier)
+  if( ier /= 0 ) then
+    print*,'error opening file: ',trim(filename)
+    stop 'error reading vp.bin file'
+  endif
 
   read(28) vp_read
   close(28)
@@ -80,7 +88,11 @@
   if( ier /= 0 ) stop 'error allocating array vs_read'
 
   filename = prname_lp(1:len_trim(prname_lp))//'vs.bin'
-  open(unit=28,file=trim(filename),status='unknown',action='read',form='unformatted')
+  open(unit=28,file=trim(filename),status='old',action='read',form='unformatted',iostat=ier)
+  if( ier /= 0 ) then
+    print*,'error opening file: ',trim(filename)
+    stop 'error reading vs.bin file'
+  endif
 
   read(28) vs_read
   close(28)

Added: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/model_ipati.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/model_ipati.f90	                        (rev 0)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/model_ipati.f90	2012-06-19 22:23:01 UTC (rev 20392)
@@ -0,0 +1,113 @@
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  2 . 0
+!               ---------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Princeton University, USA and University of Pau / CNRS / INRIA
+! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+!                            April 2011
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+
+!--------------------------------------------------------------------------------------------------
+!
+! IPATI
+!
+! based on given rho and vp structure for GLL files
+!
+!--------------------------------------------------------------------------------------------------
+
+  subroutine model_ipati(myrank,nspec,LOCAL_PATH)
+
+  use create_regions_mesh_ext_par
+  implicit none
+
+  integer, intent(in) :: myrank,nspec
+  character(len=256) :: LOCAL_PATH
+
+  ! local parameters
+  real, dimension(:,:,:,:),allocatable :: vp_read,vs_read,rho_read
+  integer :: ier
+  character(len=256) :: prname_lp,filename
+
+  ! -----------------------------------------------------------------------------
+
+  ! note: vp not vs structure is available (as is often the case in exploration seismology),
+  ! scaling factor
+  real, parameter :: SCALING_FACTOR = 1.0/1.8
+
+  ! -----------------------------------------------------------------------------
+
+  ! user output
+  if (myrank==0) then
+    write(IMAIN,*)
+    write(IMAIN,*) 'using external IPATI model from:',trim(LOCAL_PATH)
+    write(IMAIN,*) 'scaling factor: ',SCALING_FACTOR
+    write(IMAIN,*)
+  endif
+
+  ! processors name
+  write(prname_lp,'(a,i6.6,a)') trim(LOCAL_PATH)//'proc',myrank,'_'
+
+  ! density
+  allocate( rho_read(NGLLX,NGLLY,NGLLZ,nspec),stat=ier)
+  if( ier /= 0 ) stop 'error allocating array rho_read'
+
+  filename = prname_lp(1:len_trim(prname_lp))//'rho.bin'
+  open(unit=28,file=trim(filename),status='old',action='read',form='unformatted',iostat=ier)
+  if( ier /= 0 ) then
+    print*,'error opening file: ',trim(filename)
+    stop 'error reading rho.bin file'
+  endif
+
+  read(28) rho_read
+  close(28)
+
+  ! vp
+  allocate( vp_read(NGLLX,NGLLY,NGLLZ,nspec),stat=ier)
+  if( ier /= 0 ) stop 'error allocating array vp_read'
+
+  filename = prname_lp(1:len_trim(prname_lp))//'vp.bin'
+  open(unit=28,file=trim(filename),status='old',action='read',form='unformatted',iostat=ier)
+  if( ier /= 0 ) then
+    print*,'error opening file: ',trim(filename)
+    stop 'error reading vp.bin file'
+  endif
+
+  read(28) vp_read
+  close(28)
+
+  ! vs scaled from vp
+  allocate( vs_read(NGLLX,NGLLY,NGLLZ,nspec),stat=ier)
+  if( ier /= 0 ) stop 'error allocating array vs_read'
+
+  ! scaling
+  vs_read = vp_read * SCALING_FACTOR
+
+  ! isotropic model parameters
+  rhostore    = rho_read
+  kappastore  = rhostore * ( vp_read * vp_read - FOUR_THIRDS * vs_read * vs_read )
+  mustore     = rhostore * vs_read * vs_read
+  rho_vp = rhostore * vp_read
+  rho_vs = rhostore * vs_read
+
+  ! free memory
+  deallocate( rho_read,vp_read,vs_read)
+
+  end subroutine model_ipati

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/save_arrays_solver.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/save_arrays_solver.f90	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/save_arrays_solver.f90	2012-06-19 22:23:01 UTC (rev 20392)
@@ -28,118 +28,124 @@
 ! for external mesh
 
   subroutine save_arrays_solver_ext_mesh(nspec,nglob, &
-                    xixstore,xiystore,xizstore,etaxstore,etaystore,etazstore, &
-                    gammaxstore,gammaystore,gammazstore, &
-                    jacobianstore, rho_vp,rho_vs,qmu_attenuation_store, &
-                    rhostore,kappastore,mustore, &
-                    rhoarraystore,kappaarraystore,etastore,phistore,tortstore,permstore, &
-                    rho_vpI,rho_vpII,rho_vsI, &
-                    rmass,rmass_acoustic,rmass_solid_poroelastic,rmass_fluid_poroelastic, &
-                    OCEANS,rmass_ocean_load,NGLOB_OCEAN,&
+!                    xixstore,xiystore,xizstore,etaxstore,etaystore,etazstore, &
+!                    gammaxstore,gammaystore,gammazstore, &
+!                    jacobianstore, rho_vp,rho_vs,qmu_attenuation_store, &
+!                    rhostore,kappastore,mustore, &
+!                    rhoarraystore,kappaarraystore,etastore,phistore,tortstore,permstore, &
+!                    rho_vpI,rho_vpII,rho_vsI, &
+!                    rmass,rmass_acoustic,rmass_solid_poroelastic,rmass_fluid_poroelastic, &
+                    OCEANS, &
+!                    rmass_ocean_load,NGLOB_OCEAN,&
                     ibool, &
-                    xstore_dummy,ystore_dummy,zstore_dummy, &
-                    abs_boundary_normal,abs_boundary_jacobian2Dw, &
-                    abs_boundary_ijk,abs_boundary_ispec, &
-                    num_abs_boundary_faces, &
-                    free_surface_normal,free_surface_jacobian2Dw, &
-                    free_surface_ijk,free_surface_ispec, &
-                    num_free_surface_faces, &
-                    coupling_ac_el_normal,coupling_ac_el_jacobian2Dw, &
-                    coupling_ac_el_ijk,coupling_ac_el_ispec, &
-                    num_coupling_ac_el_faces, &
-                    coupling_ac_po_normal,coupling_ac_po_jacobian2Dw, &
-                    coupling_ac_po_ijk,coupling_ac_po_ispec, &
-                    num_coupling_ac_po_faces, &
-                    coupling_el_po_normal,coupling_el_po_jacobian2Dw, &
-                    coupling_el_po_ijk,coupling_po_el_ijk,coupling_el_po_ispec, &
-                    coupling_po_el_ispec,num_coupling_el_po_faces, &
+!                    xstore_dummy,ystore_dummy,zstore_dummy, &
+!                    abs_boundary_normal,abs_boundary_jacobian2Dw, &
+!                    abs_boundary_ijk,abs_boundary_ispec, &
+!                    num_abs_boundary_faces, &
+!                    free_surface_normal,free_surface_jacobian2Dw, &
+!                    free_surface_ijk,free_surface_ispec, &
+!                    num_free_surface_faces, &
+!                    coupling_ac_el_normal,coupling_ac_el_jacobian2Dw, &
+!                    coupling_ac_el_ijk,coupling_ac_el_ispec, &
+!                    num_coupling_ac_el_faces, &
+!                    coupling_ac_po_normal,coupling_ac_po_jacobian2Dw, &
+!                    coupling_ac_po_ijk,coupling_ac_po_ispec, &
+!                    num_coupling_ac_po_faces, &
+!                    coupling_el_po_normal,coupling_el_po_jacobian2Dw, &
+!                    coupling_el_po_ijk,coupling_po_el_ijk,coupling_el_po_ispec, &
+!                    coupling_po_el_ispec,num_coupling_el_po_faces, &
                     num_interfaces_ext_mesh,my_neighbours_ext_mesh,nibool_interfaces_ext_mesh, &
                     max_interface_size_ext_mesh,ibool_interfaces_ext_mesh, &
-                    prname,SAVE_MESH_FILES, &
-                    ANISOTROPY,NSPEC_ANISO, &
-                    c11store,c12store,c13store,c14store,c15store,c16store, &
-                    c22store,c23store,c24store,c25store,c26store,c33store, &
-                    c34store,c35store,c36store,c44store,c45store,c46store, &
-                    c55store,c56store,c66store, &
-                    ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic, &
-                    ispec_is_inner,nspec_inner_acoustic,nspec_inner_elastic,nspec_inner_poroelastic, &
-                    nspec_outer_acoustic,nspec_outer_elastic,nspec_outer_poroelastic, &
-                    num_phase_ispec_acoustic,phase_ispec_inner_acoustic, &
-                    num_phase_ispec_elastic,phase_ispec_inner_elastic, &
-                    num_phase_ispec_poroelastic,phase_ispec_inner_poroelastic, &
-                    num_colors_outer_acoustic,num_colors_inner_acoustic, &
-                    num_elem_colors_acoustic, &
-                    num_colors_outer_elastic,num_colors_inner_elastic, &
-                    num_elem_colors_elastic)
+!                    prname, &
+                    SAVE_MESH_FILES, &
+                    ANISOTROPY &
+!                    NSPEC_ANISO, &
+!                    c11store,c12store,c13store,c14store,c15store,c16store, &
+!                    c22store,c23store,c24store,c25store,c26store,c33store, &
+!                    c34store,c35store,c36store,c44store,c45store,c46store, &
+!                    c55store,c56store,c66store, &
+!                    ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic, &
+!                    ispec_is_inner,nspec_inner_acoustic,nspec_inner_elastic,nspec_inner_poroelastic, &
+!                    nspec_outer_acoustic,nspec_outer_elastic,nspec_outer_poroelastic, &
+!                    num_phase_ispec_acoustic,phase_ispec_inner_acoustic, &
+!                    num_phase_ispec_elastic,phase_ispec_inner_elastic, &
+!                    num_phase_ispec_poroelastic,phase_ispec_inner_poroelastic, &
+!                    num_colors_outer_acoustic,num_colors_inner_acoustic, &
+!                    num_elem_colors_acoustic, &
+!                    num_colors_outer_elastic,num_colors_inner_elastic, &
+!                    num_elem_colors_elastic, &
+              )
 
+  use create_regions_mesh_ext_par
+
   implicit none
 
-  include "constants.h"
+!  include "constants.h"
 
   integer :: nspec,nglob
 
 ! jacobian
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: xixstore,xiystore,xizstore, &
-            etaxstore,etaystore,etazstore,gammaxstore,gammaystore,gammazstore,jacobianstore
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: rho_vp,rho_vs
+!  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: xixstore,xiystore,xizstore, &
+!            etaxstore,etaystore,etazstore,gammaxstore,gammaystore,gammazstore,jacobianstore
+!  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: rho_vp,rho_vs
 
 ! attenuation
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: qmu_attenuation_store
+!  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: qmu_attenuation_store
 
 ! material
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: rhostore,kappastore,mustore
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: etastore,phistore,tortstore
-  real(kind=CUSTOM_REAL), dimension(2,NGLLX,NGLLY,NGLLZ,nspec) :: rhoarraystore
-  real(kind=CUSTOM_REAL), dimension(3,NGLLX,NGLLY,NGLLZ,nspec) :: kappaarraystore
-  real(kind=CUSTOM_REAL), dimension(6,NGLLX,NGLLY,NGLLZ,nspec) :: permstore
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: rho_vpI,rho_vpII,rho_vsI
-  real(kind=CUSTOM_REAL), dimension(nglob) :: rmass,rmass_acoustic, &
-            rmass_solid_poroelastic,rmass_fluid_poroelastic
+!  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: rhostore,kappastore,mustore
+!  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: etastore,phistore,tortstore
+!  real(kind=CUSTOM_REAL), dimension(2,NGLLX,NGLLY,NGLLZ,nspec) :: rhoarraystore
+!  real(kind=CUSTOM_REAL), dimension(3,NGLLX,NGLLY,NGLLZ,nspec) :: kappaarraystore
+!  real(kind=CUSTOM_REAL), dimension(6,NGLLX,NGLLY,NGLLZ,nspec) :: permstore
+!  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec) :: rho_vpI,rho_vpII,rho_vsI
+!  real(kind=CUSTOM_REAL), dimension(nglob) :: rmass,rmass_acoustic, &
+!            rmass_solid_poroelastic,rmass_fluid_poroelastic
 ! ocean load
   logical :: OCEANS
-  integer :: NGLOB_OCEAN
-  real(kind=CUSTOM_REAL),dimension(NGLOB_OCEAN) :: rmass_ocean_load
+!  integer :: NGLOB_OCEAN
+!  real(kind=CUSTOM_REAL),dimension(NGLOB_OCEAN) :: rmass_ocean_load
 
 ! mesh coordinates
   integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
-  real(kind=CUSTOM_REAL), dimension(nglob) :: xstore_dummy,ystore_dummy,zstore_dummy
+!  real(kind=CUSTOM_REAL), dimension(nglob) :: xstore_dummy,ystore_dummy,zstore_dummy
 
 ! absorbing boundary surface
-  integer :: num_abs_boundary_faces
-  real(kind=CUSTOM_REAL) :: abs_boundary_normal(NDIM,NGLLSQUARE,num_abs_boundary_faces)
-  real(kind=CUSTOM_REAL) :: abs_boundary_jacobian2Dw(NGLLSQUARE,num_abs_boundary_faces)
-  integer :: abs_boundary_ijk(3,NGLLSQUARE,num_abs_boundary_faces)
-  integer :: abs_boundary_ispec(num_abs_boundary_faces)
+!  integer :: num_abs_boundary_faces
+!  real(kind=CUSTOM_REAL) :: abs_boundary_normal(NDIM,NGLLSQUARE,num_abs_boundary_faces)
+!  real(kind=CUSTOM_REAL) :: abs_boundary_jacobian2Dw(NGLLSQUARE,num_abs_boundary_faces)
+!  integer :: abs_boundary_ijk(3,NGLLSQUARE,num_abs_boundary_faces)
+!  integer :: abs_boundary_ispec(num_abs_boundary_faces)
 
 ! free surface
-  integer :: num_free_surface_faces
-  real(kind=CUSTOM_REAL) :: free_surface_normal(NDIM,NGLLSQUARE,num_free_surface_faces)
-  real(kind=CUSTOM_REAL) :: free_surface_jacobian2Dw(NGLLSQUARE,num_free_surface_faces)
-  integer :: free_surface_ijk(3,NGLLSQUARE,num_free_surface_faces)
-  integer :: free_surface_ispec(num_free_surface_faces)
+!  integer :: num_free_surface_faces
+!  real(kind=CUSTOM_REAL) :: free_surface_normal(NDIM,NGLLSQUARE,num_free_surface_faces)
+!  real(kind=CUSTOM_REAL) :: free_surface_jacobian2Dw(NGLLSQUARE,num_free_surface_faces)
+!  integer :: free_surface_ijk(3,NGLLSQUARE,num_free_surface_faces)
+!  integer :: free_surface_ispec(num_free_surface_faces)
 
 ! acoustic-elastic coupling surface
-  integer :: num_coupling_ac_el_faces
-  real(kind=CUSTOM_REAL) :: coupling_ac_el_normal(NDIM,NGLLSQUARE,num_coupling_ac_el_faces)
-  real(kind=CUSTOM_REAL) :: coupling_ac_el_jacobian2Dw(NGLLSQUARE,num_coupling_ac_el_faces)
-  integer :: coupling_ac_el_ijk(3,NGLLSQUARE,num_coupling_ac_el_faces)
-  integer :: coupling_ac_el_ispec(num_coupling_ac_el_faces)
+!  integer :: num_coupling_ac_el_faces
+!  real(kind=CUSTOM_REAL) :: coupling_ac_el_normal(NDIM,NGLLSQUARE,num_coupling_ac_el_faces)
+!  real(kind=CUSTOM_REAL) :: coupling_ac_el_jacobian2Dw(NGLLSQUARE,num_coupling_ac_el_faces)
+!  integer :: coupling_ac_el_ijk(3,NGLLSQUARE,num_coupling_ac_el_faces)
+!  integer :: coupling_ac_el_ispec(num_coupling_ac_el_faces)
 
 ! acoustic-poroelastic coupling surface
-  integer :: num_coupling_ac_po_faces
-  real(kind=CUSTOM_REAL) :: coupling_ac_po_normal(NDIM,NGLLSQUARE,num_coupling_ac_po_faces)
-  real(kind=CUSTOM_REAL) :: coupling_ac_po_jacobian2Dw(NGLLSQUARE,num_coupling_ac_po_faces)
-  integer :: coupling_ac_po_ijk(3,NGLLSQUARE,num_coupling_ac_po_faces)
-  integer :: coupling_ac_po_ispec(num_coupling_ac_po_faces)
+!  integer :: num_coupling_ac_po_faces
+!  real(kind=CUSTOM_REAL) :: coupling_ac_po_normal(NDIM,NGLLSQUARE,num_coupling_ac_po_faces)
+!  real(kind=CUSTOM_REAL) :: coupling_ac_po_jacobian2Dw(NGLLSQUARE,num_coupling_ac_po_faces)
+!  integer :: coupling_ac_po_ijk(3,NGLLSQUARE,num_coupling_ac_po_faces)
+!  integer :: coupling_ac_po_ispec(num_coupling_ac_po_faces)
 
 ! elastic-poroelastic coupling surface
-  integer :: num_coupling_el_po_faces
-  real(kind=CUSTOM_REAL) :: coupling_el_po_normal(NDIM,NGLLSQUARE,num_coupling_el_po_faces)
-  real(kind=CUSTOM_REAL) :: coupling_el_po_jacobian2Dw(NGLLSQUARE,num_coupling_el_po_faces)
-  integer :: coupling_el_po_ijk(3,NGLLSQUARE,num_coupling_el_po_faces)
-  integer :: coupling_po_el_ijk(3,NGLLSQUARE,num_coupling_el_po_faces)
-  integer :: coupling_el_po_ispec(num_coupling_el_po_faces)
-  integer :: coupling_po_el_ispec(num_coupling_el_po_faces)
+!  integer :: num_coupling_el_po_faces
+!  real(kind=CUSTOM_REAL) :: coupling_el_po_normal(NDIM,NGLLSQUARE,num_coupling_el_po_faces)
+!  real(kind=CUSTOM_REAL) :: coupling_el_po_jacobian2Dw(NGLLSQUARE,num_coupling_el_po_faces)
+!  integer :: coupling_el_po_ijk(3,NGLLSQUARE,num_coupling_el_po_faces)
+!  integer :: coupling_po_el_ijk(3,NGLLSQUARE,num_coupling_el_po_faces)
+!  integer :: coupling_el_po_ispec(num_coupling_el_po_faces)
+!  integer :: coupling_po_el_ispec(num_coupling_el_po_faces)
 
 ! MPI interfaces
   integer :: num_interfaces_ext_mesh
@@ -150,43 +156,43 @@
   integer :: max_nibool_interfaces_ext_mesh
 
 ! file name
-  character(len=256) prname
+!  character(len=256) prname
   logical :: SAVE_MESH_FILES
 
 ! anisotropy
   logical :: ANISOTROPY
-  integer :: NSPEC_ANISO
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
-            c11store,c12store,c13store,c14store,c15store,c16store, &
-            c22store,c23store,c24store,c25store,c26store,c33store, &
-            c34store,c35store,c36store,c44store,c45store,c46store, &
-            c55store,c56store,c66store
+!  integer :: NSPEC_ANISO
+!  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
+!            c11store,c12store,c13store,c14store,c15store,c16store, &
+!            c22store,c23store,c24store,c25store,c26store,c33store, &
+!            c34store,c35store,c36store,c44store,c45store,c46store, &
+!            c55store,c56store,c66store
 
 ! material domain flags
-  logical, dimension(nspec) :: ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic
+!  logical, dimension(nspec) :: ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic
 
 ! inner/outer elements
-  logical,dimension(nspec) :: ispec_is_inner
-  integer :: nspec_inner_acoustic,nspec_outer_acoustic
-  integer :: nspec_inner_elastic,nspec_outer_elastic
-  integer :: nspec_inner_poroelastic,nspec_outer_poroelastic
+!  logical,dimension(nspec) :: ispec_is_inner
+!  integer :: nspec_inner_acoustic,nspec_outer_acoustic
+!  integer :: nspec_inner_elastic,nspec_outer_elastic
+!  integer :: nspec_inner_poroelastic,nspec_outer_poroelastic
 
-  integer :: num_phase_ispec_acoustic
-  integer,dimension(num_phase_ispec_acoustic,2) :: phase_ispec_inner_acoustic
+!  integer :: num_phase_ispec_acoustic
+!  integer,dimension(num_phase_ispec_acoustic,2) :: phase_ispec_inner_acoustic
 
-  integer :: num_phase_ispec_elastic
-  integer,dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
+!  integer :: num_phase_ispec_elastic
+!  integer,dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
 
-  integer :: num_phase_ispec_poroelastic
-  integer,dimension(num_phase_ispec_poroelastic,2) :: phase_ispec_inner_poroelastic
+!  integer :: num_phase_ispec_poroelastic
+!  integer,dimension(num_phase_ispec_poroelastic,2) :: phase_ispec_inner_poroelastic
 
   ! mesh coloring
-  integer :: num_colors_outer_acoustic,num_colors_inner_acoustic
-  integer, dimension(num_colors_outer_acoustic + num_colors_inner_acoustic) :: &
-    num_elem_colors_acoustic
-  integer :: num_colors_outer_elastic,num_colors_inner_elastic
-  integer, dimension(num_colors_outer_elastic + num_colors_inner_elastic) :: &
-    num_elem_colors_elastic
+!  integer :: num_colors_outer_acoustic,num_colors_inner_acoustic
+!  integer, dimension(num_colors_outer_acoustic + num_colors_inner_acoustic) :: &
+!    num_elem_colors_acoustic
+!  integer :: num_colors_outer_elastic,num_colors_inner_elastic
+!  integer, dimension(num_colors_outer_elastic + num_colors_inner_elastic) :: &
+!    num_elem_colors_elastic
 
 ! local parameters
   real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: v_tmp
@@ -195,13 +201,15 @@
   !real(kind=CUSTOM_REAL) :: minimum(1)
   integer, dimension(:,:), allocatable :: ibool_interfaces_ext_mesh_dummy
   integer :: ier,i
-  logical :: ACOUSTIC_SIMULATION,ELASTIC_SIMULATION,POROELASTIC_SIMULATION
+!  logical :: ACOUSTIC_SIMULATION,ELASTIC_SIMULATION,POROELASTIC_SIMULATION
   character(len=256) :: filename
 
   integer, dimension(:), allocatable :: iglob_tmp
   integer :: j,inum
 
-! saves mesh file proc***_external_mesh.bin
+  logical,parameter :: DEBUG = .false.
+
+  ! saves mesh file proc***_external_mesh.bin
   filename = prname(1:len_trim(prname))//'external_mesh.bin'
   open(unit=IOUT,file=trim(filename),status='unknown',action='write',form='unformatted',iostat=ier)
   if( ier /= 0 ) stop 'error opening database proc######_external_mesh.bin'
@@ -487,181 +495,182 @@
                         xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
                         qmu_attenuation_store,filename)
 
+    deallocate(v_tmp)
+
     ! VTK file output
-    ! acoustic-elastic domains
-    if( ACOUSTIC_SIMULATION .and. ELASTIC_SIMULATION ) then
-      ! saves points on acoustic-elastic coupling interface
-      allocate( iglob_tmp(NGLLSQUARE*num_coupling_ac_el_faces),stat=ier)
-      if( ier /= 0 ) stop 'error allocating array iglob_tmp'
-      inum = 0
-      iglob_tmp(:) = 0
-      do i=1,num_coupling_ac_el_faces
-        do j=1,NGLLSQUARE
-          inum = inum+1
-          iglob_tmp(inum) = ibool(coupling_ac_el_ijk(1,j,i), &
-                                  coupling_ac_el_ijk(2,j,i), &
-                                  coupling_ac_el_ijk(3,j,i), &
-                                  coupling_ac_el_ispec(i) )
+    if( DEBUG ) then
+      ! acoustic-elastic domains
+      if( ACOUSTIC_SIMULATION .and. ELASTIC_SIMULATION ) then
+        ! saves points on acoustic-elastic coupling interface
+        allocate( iglob_tmp(NGLLSQUARE*num_coupling_ac_el_faces),stat=ier)
+        if( ier /= 0 ) stop 'error allocating array iglob_tmp'
+        inum = 0
+        iglob_tmp(:) = 0
+        do i=1,num_coupling_ac_el_faces
+          do j=1,NGLLSQUARE
+            inum = inum+1
+            iglob_tmp(inum) = ibool(coupling_ac_el_ijk(1,j,i), &
+                                    coupling_ac_el_ijk(2,j,i), &
+                                    coupling_ac_el_ijk(3,j,i), &
+                                    coupling_ac_el_ispec(i) )
+          enddo
         enddo
-      enddo
-      filename = prname(1:len_trim(prname))//'coupling_acoustic_elastic'
-      call write_VTK_data_points(nglob, &
-                        xstore_dummy,ystore_dummy,zstore_dummy, &
-                        iglob_tmp,NGLLSQUARE*num_coupling_ac_el_faces, &
-                        filename)
+        filename = prname(1:len_trim(prname))//'coupling_acoustic_elastic'
+        call write_VTK_data_points(nglob, &
+                          xstore_dummy,ystore_dummy,zstore_dummy, &
+                          iglob_tmp,NGLLSQUARE*num_coupling_ac_el_faces, &
+                          filename)
 
-      ! saves acoustic/elastic flag
-      allocate(v_tmp_i(nspec),stat=ier)
-      if( ier /= 0 ) stop 'error allocating array v_tmp_i'
-      do i=1,nspec
-        if( ispec_is_acoustic(i) ) then
-          v_tmp_i(i) = 1
-        else if( ispec_is_elastic(i) ) then
-          v_tmp_i(i) = 2
-        else
-          v_tmp_i(i) = 0
-        endif
-      enddo
-      filename = prname(1:len_trim(prname))//'acoustic_elastic_flag'
-      call write_VTK_data_elem_i(nspec,nglob, &
-                        xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
-                        v_tmp_i,filename)
+        ! saves acoustic/elastic flag
+        allocate(v_tmp_i(nspec),stat=ier)
+        if( ier /= 0 ) stop 'error allocating array v_tmp_i'
+        do i=1,nspec
+          if( ispec_is_acoustic(i) ) then
+            v_tmp_i(i) = 1
+          else if( ispec_is_elastic(i) ) then
+            v_tmp_i(i) = 2
+          else
+            v_tmp_i(i) = 0
+          endif
+        enddo
+        filename = prname(1:len_trim(prname))//'acoustic_elastic_flag'
+        call write_VTK_data_elem_i(nspec,nglob, &
+                          xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
+                          v_tmp_i,filename)
 
-      deallocate(iglob_tmp,v_tmp_i)
-    endif
+        deallocate(iglob_tmp,v_tmp_i)
+      endif
 
-    ! saves free surface points
-    if( num_free_surface_faces > 0 ) then
-      ! saves free surface interface points
-      allocate( iglob_tmp(NGLLSQUARE*num_free_surface_faces),stat=ier)
-      if( ier /= 0 ) stop 'error allocating array iglob_tmp'
-      inum = 0
-      iglob_tmp(:) = 0
-      do i=1,num_free_surface_faces
-        do j=1,NGLLSQUARE
-          inum = inum+1
-          iglob_tmp(inum) = ibool(free_surface_ijk(1,j,i), &
-                                  free_surface_ijk(2,j,i), &
-                                  free_surface_ijk(3,j,i), &
-                                  free_surface_ispec(i) )
+      ! saves free surface points
+      if( num_free_surface_faces > 0 ) then
+        ! saves free surface interface points
+        allocate( iglob_tmp(NGLLSQUARE*num_free_surface_faces),stat=ier)
+        if( ier /= 0 ) stop 'error allocating array iglob_tmp'
+        inum = 0
+        iglob_tmp(:) = 0
+        do i=1,num_free_surface_faces
+          do j=1,NGLLSQUARE
+            inum = inum+1
+            iglob_tmp(inum) = ibool(free_surface_ijk(1,j,i), &
+                                    free_surface_ijk(2,j,i), &
+                                    free_surface_ijk(3,j,i), &
+                                    free_surface_ispec(i) )
+          enddo
         enddo
-      enddo
-      filename = prname(1:len_trim(prname))//'free_surface'
-      call write_VTK_data_points(nglob, &
-                        xstore_dummy,ystore_dummy,zstore_dummy, &
-                        iglob_tmp,NGLLSQUARE*num_free_surface_faces, &
-                        filename)
+        filename = prname(1:len_trim(prname))//'free_surface'
+        call write_VTK_data_points(nglob, &
+                          xstore_dummy,ystore_dummy,zstore_dummy, &
+                          iglob_tmp,NGLLSQUARE*num_free_surface_faces, &
+                          filename)
 
-      deallocate(iglob_tmp)
-    endif
+        deallocate(iglob_tmp)
+      endif
 
-    ! debug: saves 1. MPI interface
-    !if( num_interfaces_ext_mesh >= 1 ) then
-    !  filename = prname(1:len_trim(prname))//'MPI_1_points'
-    !  call write_VTK_data_points(nglob, &
-    !                    xstore_dummy,ystore_dummy,zstore_dummy, &
-    !                    ibool_interfaces_ext_mesh_dummy(1:nibool_interfaces_ext_mesh(1),1), &
-    !                    nibool_interfaces_ext_mesh(1), &
-    !                    filename)
-    !endif
+      ! debug: saves 1. MPI interface
+      !if( num_interfaces_ext_mesh >= 1 ) then
+      !  filename = prname(1:len_trim(prname))//'MPI_1_points'
+      !  call write_VTK_data_points(nglob, &
+      !                    xstore_dummy,ystore_dummy,zstore_dummy, &
+      !                    ibool_interfaces_ext_mesh_dummy(1:nibool_interfaces_ext_mesh(1),1), &
+      !                    nibool_interfaces_ext_mesh(1), &
+      !                    filename)
+      !endif
 
-    ! acoustic-poroelastic domains
-    if( ACOUSTIC_SIMULATION .and. POROELASTIC_SIMULATION ) then
-      ! saves points on acoustic-poroelastic coupling interface
-      allocate( iglob_tmp(NGLLSQUARE*num_coupling_ac_po_faces),stat=ier)
-      if( ier /= 0 ) stop 'error allocating array iglob_tmp'
-      inum = 0
-      iglob_tmp(:) = 0
-      do i=1,num_coupling_ac_po_faces
-        do j=1,NGLLSQUARE
-          inum = inum+1
-          iglob_tmp(inum) = ibool(coupling_ac_po_ijk(1,j,i), &
-                                  coupling_ac_po_ijk(2,j,i), &
-                                  coupling_ac_po_ijk(3,j,i), &
-                                  coupling_ac_po_ispec(i) )
+      ! acoustic-poroelastic domains
+      if( ACOUSTIC_SIMULATION .and. POROELASTIC_SIMULATION ) then
+        ! saves points on acoustic-poroelastic coupling interface
+        allocate( iglob_tmp(NGLLSQUARE*num_coupling_ac_po_faces),stat=ier)
+        if( ier /= 0 ) stop 'error allocating array iglob_tmp'
+        inum = 0
+        iglob_tmp(:) = 0
+        do i=1,num_coupling_ac_po_faces
+          do j=1,NGLLSQUARE
+            inum = inum+1
+            iglob_tmp(inum) = ibool(coupling_ac_po_ijk(1,j,i), &
+                                    coupling_ac_po_ijk(2,j,i), &
+                                    coupling_ac_po_ijk(3,j,i), &
+                                    coupling_ac_po_ispec(i) )
+          enddo
         enddo
-      enddo
-      filename = prname(1:len_trim(prname))//'coupling_acoustic_poroelastic'
-      call write_VTK_data_points(nglob, &
-                        xstore_dummy,ystore_dummy,zstore_dummy, &
-                        iglob_tmp,NGLLSQUARE*num_coupling_ac_po_faces, &
-                        filename)
+        filename = prname(1:len_trim(prname))//'coupling_acoustic_poroelastic'
+        call write_VTK_data_points(nglob, &
+                          xstore_dummy,ystore_dummy,zstore_dummy, &
+                          iglob_tmp,NGLLSQUARE*num_coupling_ac_po_faces, &
+                          filename)
 
-      ! saves acoustic/poroelastic flag
-      allocate(v_tmp_i(nspec),stat=ier)
-      if( ier /= 0 ) stop 'error allocating array v_tmp_i'
-      do i=1,nspec
-        if( ispec_is_acoustic(i) ) then
-          v_tmp_i(i) = 1
-        else if( ispec_is_poroelastic(i) ) then
-          v_tmp_i(i) = 2
-        else
-          v_tmp_i(i) = 0
-        endif
-      enddo
-      filename = prname(1:len_trim(prname))//'acoustic_poroelastic_flag'
-      call write_VTK_data_elem_i(nspec,nglob, &
-                        xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
-                        v_tmp_i,filename)
+        ! saves acoustic/poroelastic flag
+        allocate(v_tmp_i(nspec),stat=ier)
+        if( ier /= 0 ) stop 'error allocating array v_tmp_i'
+        do i=1,nspec
+          if( ispec_is_acoustic(i) ) then
+            v_tmp_i(i) = 1
+          else if( ispec_is_poroelastic(i) ) then
+            v_tmp_i(i) = 2
+          else
+            v_tmp_i(i) = 0
+          endif
+        enddo
+        filename = prname(1:len_trim(prname))//'acoustic_poroelastic_flag'
+        call write_VTK_data_elem_i(nspec,nglob, &
+                          xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
+                          v_tmp_i,filename)
 
-    deallocate(v_tmp_i,iglob_tmp)
-    endif !if( ACOUSTIC_SIMULATION .and. POROELASTIC_SIMULATION )
+      deallocate(v_tmp_i,iglob_tmp)
+      endif !if( ACOUSTIC_SIMULATION .and. POROELASTIC_SIMULATION )
 
-    ! elastic-poroelastic domains
-    if( ELASTIC_SIMULATION .and. POROELASTIC_SIMULATION ) then
-      ! saves points on elastic-poroelastic coupling interface
-      allocate( iglob_tmp(NGLLSQUARE*num_coupling_el_po_faces),stat=ier)
-      if( ier /= 0 ) stop 'error allocating array iglob_tmp'
-      inum = 0
-      iglob_tmp(:) = 0
-      do i=1,num_coupling_el_po_faces
-        do j=1,NGLLSQUARE
-          inum = inum+1
-          iglob_tmp(inum) = ibool(coupling_el_po_ijk(1,j,i), &
-                                  coupling_el_po_ijk(2,j,i), &
-                                  coupling_el_po_ijk(3,j,i), &
-                                  coupling_el_po_ispec(i) )
+      ! elastic-poroelastic domains
+      if( ELASTIC_SIMULATION .and. POROELASTIC_SIMULATION ) then
+        ! saves points on elastic-poroelastic coupling interface
+        allocate( iglob_tmp(NGLLSQUARE*num_coupling_el_po_faces),stat=ier)
+        if( ier /= 0 ) stop 'error allocating array iglob_tmp'
+        inum = 0
+        iglob_tmp(:) = 0
+        do i=1,num_coupling_el_po_faces
+          do j=1,NGLLSQUARE
+            inum = inum+1
+            iglob_tmp(inum) = ibool(coupling_el_po_ijk(1,j,i), &
+                                    coupling_el_po_ijk(2,j,i), &
+                                    coupling_el_po_ijk(3,j,i), &
+                                    coupling_el_po_ispec(i) )
+          enddo
         enddo
-      enddo
-      filename = prname(1:len_trim(prname))//'coupling_elastic_poroelastic'
-      call write_VTK_data_points(nglob, &
-                        xstore_dummy,ystore_dummy,zstore_dummy, &
-                        iglob_tmp,NGLLSQUARE*num_coupling_el_po_faces, &
-                        filename)
+        filename = prname(1:len_trim(prname))//'coupling_elastic_poroelastic'
+        call write_VTK_data_points(nglob, &
+                          xstore_dummy,ystore_dummy,zstore_dummy, &
+                          iglob_tmp,NGLLSQUARE*num_coupling_el_po_faces, &
+                          filename)
 
-      ! saves elastic/poroelastic flag
-      allocate(v_tmp_i(nspec),stat=ier)
-      if( ier /= 0 ) stop 'error allocating array v_tmp_i'
-      do i=1,nspec
-        if( ispec_is_elastic(i) ) then
-          v_tmp_i(i) = 1
-        else if( ispec_is_poroelastic(i) ) then
-          v_tmp_i(i) = 2
-        else
-          v_tmp_i(i) = 0
-        endif
-      enddo
-      filename = prname(1:len_trim(prname))//'elastic_poroelastic_flag'
-      call write_VTK_data_elem_i(nspec,nglob, &
-                        xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
-                        v_tmp_i,filename)
+        ! saves elastic/poroelastic flag
+        allocate(v_tmp_i(nspec),stat=ier)
+        if( ier /= 0 ) stop 'error allocating array v_tmp_i'
+        do i=1,nspec
+          if( ispec_is_elastic(i) ) then
+            v_tmp_i(i) = 1
+          else if( ispec_is_poroelastic(i) ) then
+            v_tmp_i(i) = 2
+          else
+            v_tmp_i(i) = 0
+          endif
+        enddo
+        filename = prname(1:len_trim(prname))//'elastic_poroelastic_flag'
+        call write_VTK_data_elem_i(nspec,nglob, &
+                          xstore_dummy,ystore_dummy,zstore_dummy,ibool, &
+                          v_tmp_i,filename)
 
-    deallocate(v_tmp_i,iglob_tmp)
-    endif !if( ACOUSTIC_SIMULATION .and. POROELASTIC_SIMULATION
+      deallocate(v_tmp_i,iglob_tmp)
+      endif !if( ACOUSTIC_SIMULATION .and. POROELASTIC_SIMULATION
 
-    !debug: saves 1. MPI interface
-    if( num_interfaces_ext_mesh >= 1 ) then
-      filename = prname(1:len_trim(prname))//'MPI_1_points'
-      call write_VTK_data_points(nglob, &
-                        xstore_dummy,ystore_dummy,zstore_dummy, &
-                        ibool_interfaces_ext_mesh_dummy(1:nibool_interfaces_ext_mesh(1),1), &
-                        nibool_interfaces_ext_mesh(1), &
-                        filename)
-    endif
+      !debug: saves 1. MPI interface
+      !if( num_interfaces_ext_mesh >= 1 ) then
+      !  filename = prname(1:len_trim(prname))//'MPI_1_points'
+      !  call write_VTK_data_points(nglob, &
+      !                    xstore_dummy,ystore_dummy,zstore_dummy, &
+      !                    ibool_interfaces_ext_mesh_dummy(1:nibool_interfaces_ext_mesh(1),1), &
+      !                    nibool_interfaces_ext_mesh(1), &
+      !                    filename)
+      !endif
+    endif ! DEBUG
 
-
-    deallocate(v_tmp)
-
   endif ! SAVE_MESH_FILES
 
 ! cleanup

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/check_mesh_resolution.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/check_mesh_resolution.f90	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/check_mesh_resolution.f90	2012-06-19 22:23:01 UTC (rev 20392)
@@ -683,11 +683,12 @@
       model_speed_max = vsmax_glob
     endif
   endif
-  call bcast_all_cr(model_speed_max,1)
+  tmp_val(1) = model_speed_max
+  call bcast_all_cr(tmp_val,1)
+  model_speed_max = tmp_val(1)
 
   ! returns minimum period
   if( myrank == 0 ) min_resolved_period = pmax_glob
-
   tmp_val(1) = min_resolved_period
   call bcast_all_cr(tmp_val,1)
   min_resolved_period = tmp_val(1)

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/combine_vol_data.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/combine_vol_data.f90	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/combine_vol_data.f90	2012-06-19 22:23:01 UTC (rev 20392)
@@ -38,7 +38,7 @@
 
     ! maximum number of slices
     integer,parameter :: MAX_NUM_NODES = 600
-    
+
   end module vtk
 
 !
@@ -76,9 +76,9 @@
 
   integer :: i, ios, it, ier
   integer :: iproc, proc1, proc2, num_node
-  
+
   integer,dimension(MAX_NUM_NODES) :: node_list
-  
+
   integer :: np, ne, npp, nee, nelement, njunk
 
   character(len=256) :: sline, arg(6), filename, indir, outdir
@@ -382,9 +382,9 @@
   include 'constants.h'
 
   integer,intent(in) :: num_node
-  integer,dimension(MAX_NUM_NODES),intent(in) :: node_list  
+  integer,dimension(MAX_NUM_NODES),intent(in) :: node_list
   character(len=256),intent(in) :: LOCAL_PATH
-  
+
   integer,intent(out) :: npp,nee
   logical,intent(in) :: HIGH_RESOLUTION_MESH
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/constants.h.in
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/constants.h.in	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/constants.h.in	2012-06-19 22:23:01 UTC (rev 20392)
@@ -77,8 +77,9 @@
 !!-----------------------------------------------------------
 ! ouput format of seismograms, ASCII or binary
   logical, parameter :: SEISMOGRAMS_BINARY = .false.
+
 ! output format of seismograms, Seismic Unix (binary with 240-byte-headers)
-  logical, parameter :: SU_FORMAT=.false.
+  logical, parameter :: SU_FORMAT = .false.
 
 ! input, output and main MPI I/O files
   integer, parameter :: ISTANDARD_OUTPUT = 6
@@ -256,7 +257,7 @@
 
 ! add mesh coloring for the GPU + MPI implementation
   logical, parameter :: USE_MESH_COLORING_GPU = .false.
-  integer, parameter :: MAX_NUMBER_OF_COLORS = 10000
+  integer, parameter :: MAX_NUMBER_OF_COLORS = 1000
 
 ! enhanced coloring:
 !
@@ -425,3 +426,4 @@
   integer, parameter :: IMODEL_GLL              = 7
   integer, parameter :: IMODEL_SALTON_TROUGH    = 8
   integer, parameter :: IMODEL_1D_PREM_PB       = 9
+  integer, parameter :: IMODEL_IPATI            = 10

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/get_element_face.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/get_element_face.f90	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/get_element_face.f90	2012-06-19 22:23:01 UTC (rev 20392)
@@ -24,7 +24,8 @@
 !
 !=====================================================================
 
-subroutine get_element_face_id(ispec,xcoord,ycoord,zcoord,&
+
+  subroutine get_element_face_id(ispec,xcoord,ycoord,zcoord,&
                               ibool,nspec,nglob, &
                               xstore_dummy,ystore_dummy,zstore_dummy, &
                               iface_id )
@@ -174,13 +175,13 @@
 
   endif
 
-end subroutine get_element_face_id
+  end subroutine get_element_face_id
 
 !
-!----
+!-------------------------------------------------------------------------------------------------
 !
 
-subroutine get_element_face_gll_indices(iface,ijk_face,NGLLA,NGLLB )
+  subroutine get_element_face_gll_indices(iface,ijk_face,NGLLA,NGLLB )
 
 ! returns local indices in ijk_face for specified face
 
@@ -368,10 +369,10 @@
 end subroutine get_element_face_gll_indices
 
 !
-!----
+!-------------------------------------------------------------------------------------------------
 !
 
-subroutine get_element_face_normal(ispec,iface,xcoord,ycoord,zcoord, &
+  subroutine get_element_face_normal(ispec,iface,xcoord,ycoord,zcoord, &
                                 ibool,nspec,nglob, &
                                 xstore_dummy,ystore_dummy,zstore_dummy, &
                                 normal)
@@ -463,13 +464,13 @@
   endif
   !print*,'face ',iface,'scalarproduct:',tmp
 
-end subroutine get_element_face_normal
+  end subroutine get_element_face_normal
 
 !
-!----
+!-------------------------------------------------------------------------------------------------
 !
 
-subroutine get_element_face_normal_idirect(ispec,iface,xcoord,ycoord,zcoord, &
+  subroutine get_element_face_normal_idirect(ispec,iface,xcoord,ycoord,zcoord, &
                                 ibool,nspec,nglob, &
                                 xstore_dummy,ystore_dummy,zstore_dummy, &
                                 normal,idirect)
@@ -565,5 +566,50 @@
     idirect = 1
   endif
 
-end subroutine get_element_face_normal_idirect
+  end subroutine get_element_face_normal_idirect
 
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine get_element_corners(ispec,iface_ref,xcoord,ycoord,zcoord,iglob_corners_ref, &
+                                ibool,nspec,nglob,xstore_dummy,ystore_dummy,zstore_dummy, &
+                                iface_all_corner_ijk)
+
+  implicit none
+
+  include "constants.h"
+
+  integer,intent(in) :: ispec,iface_ref,nspec,nglob
+
+  ! face corner locations
+  real(kind=CUSTOM_REAL),dimension(NGNOD2D),intent(out) :: xcoord,ycoord,zcoord
+  integer,dimension(NGNOD2D),intent(out):: iglob_corners_ref
+
+  ! index array
+  integer, dimension(NGLLX,NGLLY,NGLLZ,nspec) :: ibool
+  ! global point locations
+  real(kind=CUSTOM_REAL) :: xstore_dummy(nglob),ystore_dummy(nglob),zstore_dummy(nglob)
+
+  ! assumes NGNOD2D == 4
+  integer,dimension(3,4,6) :: iface_all_corner_ijk
+
+  ! local parameters
+  integer :: icorner,i,j,k
+
+  ! loops over corners
+  do icorner = 1,NGNOD2D
+    i = iface_all_corner_ijk(1,icorner,iface_ref)
+    j = iface_all_corner_ijk(2,icorner,iface_ref)
+    k = iface_all_corner_ijk(3,icorner,iface_ref)
+
+    ! global reference indices
+    iglob_corners_ref(icorner) = ibool(i,j,k,ispec)
+
+    ! reference corner coordinates
+    xcoord(icorner) = xstore_dummy(iglob_corners_ref(icorner))
+    ycoord(icorner) = ystore_dummy(iglob_corners_ref(icorner))
+    zcoord(icorner) = zstore_dummy(iglob_corners_ref(icorner))
+  enddo
+
+  end subroutine get_element_corners

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/read_parameter_file.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/read_parameter_file.f90	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/read_parameter_file.f90	2012-06-19 22:23:01 UTC (rev 20392)
@@ -243,17 +243,21 @@
     IMODEL = IMODEL_1D_CASCADIA
 
   ! user models
+  case( '1d_prem_pb' )
+    IMODEL = IMODEL_1D_PREM_PB
+  case( 'aniso' )
+    IMODEL = IMODEL_DEFAULT
+    ANISOTROPY = .true.
+  case( 'external' )
+    IMODEL = IMODEL_USER_EXTERNAL
+  case( 'ipati' )
+    IMODEL = IMODEL_IPATI
+  case( 'gll' )
+    IMODEL = IMODEL_GLL
   case( 'salton_trough')
     IMODEL = IMODEL_SALTON_TROUGH
   case( 'tomo' )
     IMODEL = IMODEL_TOMO
-  case( 'external' )
-    IMODEL = IMODEL_USER_EXTERNAL
-  case( 'aniso' )
-    IMODEL = IMODEL_DEFAULT
-    ANISOTROPY = .true.
-  case( '1d_prem_pb' )
-    IMODEL = IMODEL_1D_PREM_PB
 
   case default
     print*
@@ -263,6 +267,10 @@
     IMODEL = IMODEL_DEFAULT
   end select
 
+  ! check
+  if( IMODEL == IMODEL_IPATI ) then
+    if( USE_RICKER_IPATI .eqv. .false. ) stop 'please set USE_RICKER_IPATI to true in shared/constants.h and recompile'
+  endif
 
   end subroutine read_parameter_file
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/sum_kernels.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/sum_kernels.f90	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/sum_kernels.f90	2012-06-19 22:23:01 UTC (rev 20392)
@@ -24,7 +24,7 @@
 module sum_par
 
   include 'constants.h'
-  
+
   ! USER PARAMETERS
 
   ! by default, this algorithm uses transverse isotropic (bulk,bulk_betav,bulk_betah,eta) kernels to sum up
@@ -38,13 +38,13 @@
 
   ! 1 permille of maximum for inverting hessian
   real(kind=CUSTOM_REAL),parameter :: THRESHOLD_HESS = 1.e-3
-  
+
   ! sums all hessians before inverting and preconditioning
   logical, parameter :: USE_HESS_SUM = .true.
-  
+
   ! uses source mask to blend out source elements
   logical, parameter :: USE_SOURCE_MASK = .false.
-  
+
   ! maximum number of kernels listed
   integer, parameter :: MAX_NUM_NODES = 1000
 
@@ -53,7 +53,7 @@
 
   ! mesh size
   integer :: NSPEC_AB, NGLOB_AB
-    
+
 end module sum_par
 
 !
@@ -64,7 +64,7 @@
 
   use sum_par
   implicit none
-  
+
   include 'mpif.h'
   include 'precision.h'
 
@@ -102,7 +102,7 @@
     write(*,*) 'reading kernel list: '
   endif
   call mpi_barrier(MPI_COMM_WORLD,ier)
-  
+
   ! reads in event list
   nker=0
   open(unit = 20, file = trim(kernel_file_list), status = 'old',iostat = ios)
@@ -150,7 +150,7 @@
   ! reads mesh file
   !
   ! needs to get array dimensions
-    
+
   ! opens external mesh file
   write(prname_lp,'(a,i6.6,a)') trim(LOCAL_PATH)//'/proc',myrank,'_'//'external_mesh.bin'
   open(unit=27,file=trim(prname_lp),&
@@ -161,7 +161,7 @@
     call exit_mpi(myrank, 'error reading external mesh file')
   endif
 
-  ! gets number of elements and global points for this partition          
+  ! gets number of elements and global points for this partition
   read(27) NSPEC_AB
   read(27) NGLOB_AB
 
@@ -177,7 +177,7 @@
 
   ! synchronizes
   call mpi_barrier(MPI_COMM_WORLD,ier)
-  
+
   ! sums up kernels
   if( USE_ISO_KERNELS ) then
 
@@ -260,18 +260,18 @@
   allocate(kernel(NGLLX,NGLLY,NGLLZ,NSPEC_AB), &
           hess(NGLLX,NGLLY,NGLLZ,NSPEC_AB), &
           total_kernel(NGLLX,NGLLY,NGLLZ,NSPEC_AB))
-  
-  
-  if( USE_HESS_SUM ) then  
+
+
+  if( USE_HESS_SUM ) then
     allocate( total_hess(NGLLX,NGLLY,NGLLZ,NSPEC_AB) )
-    total_hess(:,:,:,:) = 0.0_CUSTOM_REAL    
+    total_hess(:,:,:,:) = 0.0_CUSTOM_REAL
   endif
-  
-  if( USE_SOURCE_MASK ) then  
+
+  if( USE_SOURCE_MASK ) then
     allocate( mask_source(NGLLX,NGLLY,NGLLZ,NSPEC_AB) )
-    mask_source(:,:,:,:) = 1.0_CUSTOM_REAL      
+    mask_source(:,:,:,:) = 1.0_CUSTOM_REAL
   endif
-  
+
   ! loops over all event kernels
   total_kernel = 0._CUSTOM_REAL
   do iker = 1, nker
@@ -295,7 +295,7 @@
     read(12) kernel
     close(12)
 
-    ! outputs norm of kernel 
+    ! outputs norm of kernel
     norm = sum( kernel * kernel )
     call mpi_reduce(norm,norm_sum,1,CUSTOM_MPI_TYPE,MPI_SUM,0,MPI_COMM_WORLD,ier)
     if( myrank == 0 ) then
@@ -326,7 +326,7 @@
 
     ! note: we take absolute values for hessian (as proposed by Yang)
     hess = abs(hess)
-    
+
     ! source mask
     if( USE_SOURCE_MASK ) then
       ! reads in mask
@@ -339,10 +339,10 @@
       endif
       read(12) mask_source
       close(12)
-      
+
       ! masks source elements
       kernel = kernel * mask_source
-      
+
     endif
 
     ! precondition
@@ -355,26 +355,26 @@
 
       ! inverts hessian
       call invert_hess( myrank,hess )
-      
+
       ! preconditions each event kernel with its hessian
       kernel = kernel * hess
 
     endif
-    
+
     ! sums all kernels from each event
     total_kernel = total_kernel + kernel
-      
+
   enddo
 
   ! preconditions summed kernels with summed hessians
   if( USE_HESS_SUM ) then
-  
+
       ! inverts hessian matrix
       call invert_hess( myrank,total_hess )
-      
+
       ! preconditions kernel
       total_kernel = total_kernel * total_hess
-  
+
   endif
 
   ! stores summed kernels
@@ -396,7 +396,7 @@
   deallocate(kernel,hess,total_kernel)
   if( USE_HESS_SUM ) deallocate(total_hess)
   if( USE_SOURCE_MASK ) deallocate(mask_source)
-  
+
 end subroutine sum_kernel_pre
 
 !
@@ -409,7 +409,7 @@
 ! the approximate hessian is only defined for diagonal elements: like
 ! H_nn = \frac{ \partial^2 \chi }{ \partial \rho_n \partial \rho_n }
 ! on all GLL points, which are indexed (i,j,k,ispec)
-  
+
   use sum_par
   implicit none
 
@@ -423,31 +423,31 @@
   ! local parameters
   real(kind=CUSTOM_REAL) :: maxh,maxh_all
   integer :: ier
-  
+
   ! maximum value of hessian
   maxh = maxval( abs(hess_matrix) )
 
   ! determines maximum from all slices on master
   call mpi_allreduce(maxh,maxh_all,1,CUSTOM_MPI_TYPE,MPI_MAX,MPI_COMM_WORLD,ier)
-  
+
   ! user output
   if( myrank == 0 ) then
     print*
     print*,'hessian maximum: ',maxh_all
     print*
   endif
-  
-  ! normalizes hessian 
+
+  ! normalizes hessian
   if( maxh_all < 1.e-18 ) then
-    ! hessian is zero, re-initializes 
+    ! hessian is zero, re-initializes
     hess_matrix = 1.0_CUSTOM_REAL
     !call exit_mpi(myrank,'error hessian too small')
   else
     ! since hessian has absolute values, this scales between [0,1]
-    hess_matrix = hess_matrix / maxh_all  
+    hess_matrix = hess_matrix / maxh_all
   endif
-  
 
+
   ! inverts hessian values
   where( abs(hess_matrix(:,:,:,:)) > THRESHOLD_HESS )
     hess_matrix = 1.0_CUSTOM_REAL / hess_matrix
@@ -457,5 +457,5 @@
 
   ! rescales hessian
   !hess_matrix = hess_matrix * maxh_all
-  
+
 end subroutine invert_hess

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/write_c_binary.c
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/write_c_binary.c	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/write_c_binary.c	2012-06-19 22:23:01 UTC (rev 20392)
@@ -242,7 +242,7 @@
 
   FILE *ft;
   int itemlen,remlen,donelen,ret;
-  void *buf;
+  char *buf;
 
   // file pointer
   ft = fp_abs[*fid];
@@ -308,7 +308,7 @@
   FILE *ft;
   int ret,itemlen,remlen,donelen;
   long long pos;
-  void *buf;
+  char *buf;
 
   // file pointer
   ft = fp_abs[*fid];

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/Makefile.in
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/Makefile.in	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/Makefile.in	2012-06-19 22:23:01 UTC (rev 20392)
@@ -172,6 +172,7 @@
 	$O/compute_forces_acoustic_PML.o \
 	$O/compute_forces_elastic.o \
 	$O/compute_forces_elastic_Dev.o \
+	$O/compute_forces_elastic_Dev2.o \
 	$O/compute_forces_elastic_noDev.o \
 	$O/compute_forces_poroelastic.o \
 	$O/compute_forces_solid.o \

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90	2012-06-19 22:23:01 UTC (rev 20392)
@@ -426,36 +426,37 @@
 
   integer :: NPROC
   integer(kind=8) :: Mesh_pointer
-! array to assemble
 
+  ! array to assemble
   integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
 
   real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
        buffer_recv_vector_ext_mesh
 
-  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
-  integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
-  integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
+  integer, dimension(num_interfaces_ext_mesh) :: request_recv_vector_ext_mesh
 
-  integer iinterface ! ipoin
-  integer FORWARD_OR_ADJOINT
+  ! local parameters
+  !integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
+  !integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
+  !integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh
+  integer :: iinterface
 
-! here we have to assemble all the contributions between partitions using MPI
+  ! here we have to assemble all the contributions between partitions using MPI
 
-! assemble only if more than one partition
+  ! assemble only if more than one partition
   if(NPROC > 1) then
 
-! wait for communications completion (recv)
-
-  write(IMAIN,*) "sending MPI_wait"
+  ! wait for communications completion (recv)
+  !write(IMAIN,*) "sending MPI_wait"
   do iinterface = 1, num_interfaces_ext_mesh
     call wait_req(request_recv_vector_ext_mesh(iinterface))
   enddo
 
-! send contributions to GPU
-  call transfer_boundary_to_device_asynchronously(Mesh_pointer, buffer_recv_vector_ext_mesh, &
-       num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh)
+  ! send contributions to GPU
+  call transfer_boundary_to_device_a(Mesh_pointer, buffer_recv_vector_ext_mesh, &
+                                    num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh)
   endif
+
   ! This step is done via previous function transfer_and_assemble...
   ! do iinterface = 1, num_interfaces_ext_mesh
   !   do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
@@ -464,65 +465,77 @@
   !   enddo
   ! enddo
 
-end subroutine transfer_boundary_to_device
+  end subroutine transfer_boundary_to_device
 
-subroutine assemble_MPI_vector_write_cuda_no_transfer(NPROC,NGLOB_AB,array_val, Mesh_pointer, &
-                                            buffer_recv_vector_ext_mesh, &
-                                            num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                                            nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                                            request_send_vector_ext_mesh,request_recv_vector_ext_mesh, &
-                                            FORWARD_OR_ADJOINT )
+!
+!-------------------------------------------------------------------------------------------------
+!
 
-implicit none
+! not used...
+!  subroutine assemble_MPI_vector_write_cuda_no_transfer(NPROC,NGLOB_AB,array_val, Mesh_pointer, &
+!                                            buffer_recv_vector_ext_mesh, &
+!                                            num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+!                                            nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+!                                            request_send_vector_ext_mesh,request_recv_vector_ext_mesh, &
+!                                            FORWARD_OR_ADJOINT )
+!
+!  implicit none
+!
+!  include "constants.h"
+!
+!  integer :: NPROC
+!  integer :: NGLOB_AB
+!  integer(kind=8) :: Mesh_pointer
+!  ! array to assemble
+!  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: array_val
+!
+!  integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
+!
+!  real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
+!       buffer_recv_vector_ext_mesh
+!
+!  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
+!  integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
+!  integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh
+!  !integer, dimension(num_interfaces_ext_mesh) :: request_recv_vector_ext_mesh
+!
+!  integer :: FORWARD_OR_ADJOINT
+!
+!  ! local parameters
+!  integer :: iinterface
+!
+!  ! here we have to assemble all the contributions between partitions using MPI
+!
+!  ! assemble only if more than one partition
+!  if(NPROC > 1) then
+!
+!     ! adding contributions of neighbours
+!     call assemble_accel_on_device(Mesh_pointer, array_val, buffer_recv_vector_ext_mesh, &
+!          num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, &
+!          nibool_interfaces_ext_mesh,&
+!          ibool_interfaces_ext_mesh,FORWARD_OR_ADJOINT)
+!
+!     ! This step is done via previous function transfer_and_assemble...
+!     ! do iinterface = 1, num_interfaces_ext_mesh
+!     !   do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
+!     !     array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
+!     !          array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_vector_ext_mesh(:,ipoin,iinterface)
+!     !   enddo
+!     ! enddo
+!
+!     ! wait for communications completion (send)
+!     do iinterface = 1, num_interfaces_ext_mesh
+!        call wait_req(request_send_vector_ext_mesh(iinterface))
+!     enddo
+!  endif
+!
+!  end subroutine assemble_MPI_vector_write_cuda_no_transfer
 
-  include "constants.h"
+!
+!-------------------------------------------------------------------------------------------------
+!
 
-  integer :: NPROC
-  integer :: NGLOB_AB
-  integer(kind=8) :: Mesh_pointer
-! array to assemble
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: array_val
 
-  integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
-
-  real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
-       buffer_recv_vector_ext_mesh
-
-  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
-  integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
-  integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
-
-  integer iinterface ! ipoin
-  integer FORWARD_OR_ADJOINT
-
-! here we have to assemble all the contributions between partitions using MPI
-
-  ! assemble only if more than one partition
-  if(NPROC > 1) then
-
-     ! adding contributions of neighbours
-     call assemble_accel_on_device(Mesh_pointer, array_val, buffer_recv_vector_ext_mesh, &
-          num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, &
-          nibool_interfaces_ext_mesh,&
-          ibool_interfaces_ext_mesh,FORWARD_OR_ADJOINT)
-
-     ! This step is done via previous function transfer_and_assemble...
-     ! do iinterface = 1, num_interfaces_ext_mesh
-     !   do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
-     !     array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
-     !          array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_vector_ext_mesh(:,ipoin,iinterface)
-     !   enddo
-     ! enddo
-
-     ! wait for communications completion (send)
-     do iinterface = 1, num_interfaces_ext_mesh
-        call wait_req(request_send_vector_ext_mesh(iinterface))
-     enddo
-  endif
-
-  end subroutine assemble_MPI_vector_write_cuda_no_transfer
-
-
   subroutine assemble_MPI_vector_send_cuda(NPROC, &
                                           buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, &
                                           num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
@@ -530,48 +543,49 @@
                                           my_neighbours_ext_mesh, &
                                           request_send_vector_ext_mesh,request_recv_vector_ext_mesh)
 
-    ! sends data
-    ! note: array to assemble already filled into buffer_send_vector_ext_mesh array
+! sends data
+! note: array to assemble already filled into buffer_send_vector_ext_mesh array
 
-    implicit none
+  implicit none
 
-    include "constants.h"
+  include "constants.h"
 
-    integer :: NPROC
+  integer :: NPROC
 
-    integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
+  integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
 
-    real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
-         buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh
+  real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
+       buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh
 
-    integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh,my_neighbours_ext_mesh
-    integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
+  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh,my_neighbours_ext_mesh
+  integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
 
-    integer iinterface
+  ! local parameters
+  integer :: iinterface
 
-    ! note: preparation of the contribution between partitions using MPI
-    !          already done in transfer_boun_accel routine
+  ! note: preparation of the contribution between partitions using MPI
+  !          already done in transfer_boun_accel routine
 
-    ! send only if more than one partition
-    if(NPROC > 1) then
+  ! send only if more than one partition
+  if(NPROC > 1) then
 
-       ! send messages
-       do iinterface = 1, num_interfaces_ext_mesh
-          call isend_cr(buffer_send_vector_ext_mesh(1,1,iinterface), &
-               NDIM*nibool_interfaces_ext_mesh(iinterface), &
-               my_neighbours_ext_mesh(iinterface), &
-               itag, &
-               request_send_vector_ext_mesh(iinterface) &
-               )
-          call irecv_cr(buffer_recv_vector_ext_mesh(1,1,iinterface), &
-               NDIM*nibool_interfaces_ext_mesh(iinterface), &
-               my_neighbours_ext_mesh(iinterface), &
-               itag, &
-               request_recv_vector_ext_mesh(iinterface) &
-               )
-       enddo
+     ! send messages
+     do iinterface = 1, num_interfaces_ext_mesh
+        call isend_cr(buffer_send_vector_ext_mesh(1,1,iinterface), &
+             NDIM*nibool_interfaces_ext_mesh(iinterface), &
+             my_neighbours_ext_mesh(iinterface), &
+             itag, &
+             request_send_vector_ext_mesh(iinterface) &
+             )
+        call irecv_cr(buffer_recv_vector_ext_mesh(1,1,iinterface), &
+             NDIM*nibool_interfaces_ext_mesh(iinterface), &
+             my_neighbours_ext_mesh(iinterface), &
+             itag, &
+             request_recv_vector_ext_mesh(iinterface) &
+             )
+     enddo
 
-    endif
+  endif
 
   end subroutine assemble_MPI_vector_send_cuda
 
@@ -608,37 +622,39 @@
   integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
   integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
 
-  integer iinterface ! ipoin
-  integer FORWARD_OR_ADJOINT
+  integer :: FORWARD_OR_ADJOINT
 
-! here we have to assemble all the contributions between partitions using MPI
+  ! local parameters
+  integer :: iinterface
 
-! assemble only if more than one partition
+  ! here we have to assemble all the contributions between partitions using MPI
+
+  ! assemble only if more than one partition
   if(NPROC > 1) then
 
-! wait for communications completion (recv)
-  do iinterface = 1, num_interfaces_ext_mesh
-    call wait_req(request_recv_vector_ext_mesh(iinterface))
-  enddo
+    ! wait for communications completion (recv)
+    do iinterface = 1, num_interfaces_ext_mesh
+      call wait_req(request_recv_vector_ext_mesh(iinterface))
+    enddo
 
-! adding contributions of neighbours
-  call transfer_asmbl_accel_to_device(Mesh_pointer, array_val, buffer_recv_vector_ext_mesh, &
-                                    num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, &
-                                    nibool_interfaces_ext_mesh,&
-                                    ibool_interfaces_ext_mesh,FORWARD_OR_ADJOINT)
+    ! adding contributions of neighbours
+    call transfer_asmbl_accel_to_device(Mesh_pointer, array_val, buffer_recv_vector_ext_mesh, &
+                                      num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, &
+                                      nibool_interfaces_ext_mesh,&
+                                      ibool_interfaces_ext_mesh,FORWARD_OR_ADJOINT)
 
-  ! This step is done via previous function transfer_and_assemble...
-  ! do iinterface = 1, num_interfaces_ext_mesh
-  !   do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
-  !     array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
-  !          array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_vector_ext_mesh(:,ipoin,iinterface)
-  !   enddo
-  ! enddo
+    ! This step is done via previous function transfer_and_assemble...
+    ! do iinterface = 1, num_interfaces_ext_mesh
+    !   do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
+    !     array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
+    !          array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_vector_ext_mesh(:,ipoin,iinterface)
+    !   enddo
+    ! enddo
 
-! wait for communications completion (send)
-  do iinterface = 1, num_interfaces_ext_mesh
-    call wait_req(request_send_vector_ext_mesh(iinterface))
-  enddo
+    ! wait for communications completion (send)
+    do iinterface = 1, num_interfaces_ext_mesh
+      call wait_req(request_send_vector_ext_mesh(iinterface))
+    enddo
 
   endif
 
@@ -673,9 +689,10 @@
   integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh,my_neighbours_ext_mesh
   integer, dimension(num_interfaces_ext_mesh) :: request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh
 
-  integer iinterface
+  ! local parameters
+  integer :: iinterface
 
-! sends only if more than one partition
+  ! sends only if more than one partition
   if(NPROC > 1) then
 
     ! note: partition border copy into the buffer has already been done
@@ -728,7 +745,7 @@
 
   integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
 
-! array to assemble
+  ! array to assemble
   real(kind=CUSTOM_REAL), dimension(NGLOB_AB) :: array_val
 
 
@@ -743,7 +760,7 @@
 
   integer iinterface ! ipoin
 
-! assemble only if more than one partition
+  ! assemble only if more than one partition
   if(NPROC > 1) then
 
     ! wait for communications completion (recv)

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90	2012-06-19 22:23:01 UTC (rev 20392)
@@ -135,7 +135,7 @@
               num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh,&
               request_recv_vector_ext_mesh)
       endif ! inner elements
-      
+
    endif ! GPU_MODE
 
 
@@ -247,8 +247,8 @@
           ! MPI-send is done from within compute_forces_elastic_cuda,
           ! once the inner element kernels are launched, and the
           ! memcpy has finished. see compute_forces_elastic_cuda:1655
-          call transfer_boundary_from_device_asynchronously(Mesh_pointer,nspec_outer_elastic)
-                    
+          call transfer_boundary_from_device_a(Mesh_pointer,nspec_outer_elastic)
+
        endif ! GPU_MODE
 
        ! adjoint simulations

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic_Dev.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic_Dev.f90	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic_Dev.f90	2012-06-19 22:23:01 UTC (rev 20392)
@@ -24,8 +24,9 @@
 !
 !=====================================================================
 
+! Deville routine for NGLL == 5 (default)
 
-subroutine compute_forces_elastic_Dev_5p( iphase ,NSPEC_AB,NGLOB_AB, &
+  subroutine compute_forces_elastic_Dev_5p( iphase ,NSPEC_AB,NGLOB_AB, &
                                     displ,accel, &
                                     xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
                                     hprime_xx,hprime_xxT, &
@@ -655,3454 +656,5 @@
 
   enddo  ! spectral element loop
 
-end subroutine compute_forces_elastic_Dev_5p
+  end subroutine compute_forces_elastic_Dev_5p
 
-!
-!=====================================================================
-!
-
-subroutine compute_forces_elastic_Dev_6p( iphase ,NSPEC_AB,NGLOB_AB, &
-                                    displ,accel, &
-                                    xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                                    hprime_xx,hprime_xxT, &
-                                    hprimewgll_xx,hprimewgll_xxT, &
-                                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                                    kappastore,mustore,jacobian,ibool, &
-                                    ATTENUATION, &
-                                    one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,&
-                                    NSPEC_ATTENUATION_AB, &
-                                    R_xx,R_yy,R_xy,R_xz,R_yz, &
-                                    epsilondev_xx,epsilondev_yy,epsilondev_xy, &
-                                    epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
-                                    ANISOTROPY,NSPEC_ANISO, &
-                                    c11store,c12store,c13store,c14store,c15store,c16store,&
-                                    c22store,c23store,c24store,c25store,c26store,c33store,&
-                                    c34store,c35store,c36store,c44store,c45store,c46store,&
-                                    c55store,c56store,c66store, &
-                                    SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
-                                    NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
-                                    is_moho_top,is_moho_bot, &
-                                    dsdx_top,dsdx_bot, &
-                                    ispec2D_moho_top,ispec2D_moho_bot, &
-                                    num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
-                                    phase_ispec_inner_elastic)
-
-
-! computes elastic tensor term
-
-  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, &
-                      N_SLS,SAVE_MOHO_MESH, &
-                      ONE_THIRD,FOUR_THIRDS,m1,m2
-  implicit none
-
-  integer :: NSPEC_AB,NGLOB_AB
-
-! displacement and acceleration
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
-
-! arrays with mesh parameters per slice
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        kappastore,mustore,jacobian
-
-! array with derivatives of Lagrange polynomials and precalculated products
-  real(kind=CUSTOM_REAL), dimension(NGLLX,6) :: hprime_xx,hprimewgll_xxT
-  real(kind=CUSTOM_REAL), dimension(6,NGLLX) :: hprime_xxT,hprimewgll_xx
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
-
-! memory variables and standard linear solids for attenuation
-  logical :: ATTENUATION
-  logical :: COMPUTE_AND_STORE_STRAIN
-  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
-  integer :: NSPEC_ATTENUATION_AB
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
-      R_xx,R_yy,R_xy,R_xz,R_yz
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
-       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
-  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
-
-! anisotropy
-  logical :: ANISOTROPY
-  integer :: NSPEC_ANISO
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
-            c11store,c12store,c13store,c14store,c15store,c16store, &
-            c22store,c23store,c24store,c25store,c26store,c33store, &
-            c34store,c35store,c36store,c44store,c45store,c46store, &
-            c55store,c56store,c66store
-
-  integer :: iphase
-  integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
-  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
-
-! adjoint simulations
-  integer :: SIMULATION_TYPE
-  integer :: NSPEC_BOUN,NSPEC2D_MOHO
-
-  ! moho kernel
-  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
-    dsdx_top,dsdx_bot
-  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
-  integer :: ispec2D_moho_top, ispec2D_moho_bot
-
-! local parameters
-  real(kind=CUSTOM_REAL), dimension(6,6,6) :: dummyx_loc,dummyy_loc,dummyz_loc, &
-    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
-  real(kind=CUSTOM_REAL), dimension(6,6,6) :: &
-    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
-
-  ! manually inline the calls to the Deville et al. (2002) routines
-  real(kind=CUSTOM_REAL), dimension(6,36) :: B1_m1_m2_6points,B2_m1_m2_6points,B3_m1_m2_6points
-  real(kind=CUSTOM_REAL), dimension(6,36) :: C1_m1_m2_6points,C2_m1_m2_6points,C3_m1_m2_6points
-  real(kind=CUSTOM_REAL), dimension(6,36) :: E1_m1_m2_6points,E2_m1_m2_6points,E3_m1_m2_6points
-
-  equivalence(dummyx_loc,B1_m1_m2_6points)
-  equivalence(dummyy_loc,B2_m1_m2_6points)
-  equivalence(dummyz_loc,B3_m1_m2_6points)
-  equivalence(tempx1,C1_m1_m2_6points)
-  equivalence(tempy1,C2_m1_m2_6points)
-  equivalence(tempz1,C3_m1_m2_6points)
-  equivalence(newtempx1,E1_m1_m2_6points)
-  equivalence(newtempy1,E2_m1_m2_6points)
-  equivalence(newtempz1,E3_m1_m2_6points)
-
-  real(kind=CUSTOM_REAL), dimension(36,6) :: &
-    A1_mxm_m2_m1_6points,A2_mxm_m2_m1_6points,A3_mxm_m2_m1_6points
-  real(kind=CUSTOM_REAL), dimension(36,6) :: &
-    C1_mxm_m2_m1_6points,C2_mxm_m2_m1_6points,C3_mxm_m2_m1_6points
-  real(kind=CUSTOM_REAL), dimension(36,6) :: &
-    E1_mxm_m2_m1_6points,E2_mxm_m2_m1_6points,E3_mxm_m2_m1_6points
-
-  equivalence(dummyx_loc,A1_mxm_m2_m1_6points)
-  equivalence(dummyy_loc,A2_mxm_m2_m1_6points)
-  equivalence(dummyz_loc,A3_mxm_m2_m1_6points)
-  equivalence(tempx3,C1_mxm_m2_m1_6points)
-  equivalence(tempy3,C2_mxm_m2_m1_6points)
-  equivalence(tempz3,C3_mxm_m2_m1_6points)
-  equivalence(newtempx3,E1_mxm_m2_m1_6points)
-  equivalence(newtempy3,E2_mxm_m2_m1_6points)
-  equivalence(newtempz3,E3_mxm_m2_m1_6points)
-
-  ! local attenuation parameters
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
-       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
-  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
-  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
-  real(kind=CUSTOM_REAL) Sn,Snp1
-  real(kind=CUSTOM_REAL) templ
-
-  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
-  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
-
-  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
-  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
-
-  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
-
-  real(kind=CUSTOM_REAL) fac1,fac2,fac3
-
-  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
-  real(kind=CUSTOM_REAL) kappal
-
-  ! local anisotropy parameters
-  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
-                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
-
-  integer i_SLS,imodulo_N_SLS
-  integer ispec,iglob,ispec_p,num_elements
-  integer i,j,k
-
-  imodulo_N_SLS = mod(N_SLS,3)
-
-  ! choses inner/outer elements
-  if( iphase == 1 ) then
-    num_elements = nspec_outer_elastic
-  else
-    num_elements = nspec_inner_elastic
-  endif
-
-  do ispec_p = 1,num_elements
-
-        ! returns element id from stored element list
-        ispec = phase_ispec_inner_elastic(ispec_p,iphase)
-
-        ! adjoint simulations: moho kernel
-        if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then
-          if (is_moho_top(ispec)) then
-            ispec2D_moho_top = ispec2D_moho_top + 1
-          else if (is_moho_bot(ispec)) then
-            ispec2D_moho_bot = ispec2D_moho_bot + 1
-          endif
-        endif ! adjoint
-
-        ! stores displacment values in local array
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-                iglob = ibool(i,j,k,ispec)
-                dummyx_loc(i,j,k) = displ(1,iglob)
-                dummyy_loc(i,j,k) = displ(2,iglob)
-                dummyz_loc(i,j,k) = displ(3,iglob)
-            enddo
-          enddo
-        enddo
-
-    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-    ! for incompressible fluid flow, Cambridge University Press (2002),
-    ! pages 386 and 389 and Figure 8.3.1
-        ! call mxm_m1_m2_6points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
-        do j=1,m2
-          do i=1,m1
-            C1_m1_m2_6points(i,j) = hprime_xx(i,1)*B1_m1_m2_6points(1,j) + &
-                                  hprime_xx(i,2)*B1_m1_m2_6points(2,j) + &
-                                  hprime_xx(i,3)*B1_m1_m2_6points(3,j) + &
-                                  hprime_xx(i,4)*B1_m1_m2_6points(4,j) + &
-                                  hprime_xx(i,5)*B1_m1_m2_6points(5,j) + &
-                                  hprime_xx(i,6)*B1_m1_m2_6points(6,j)
-            C2_m1_m2_6points(i,j) = hprime_xx(i,1)*B2_m1_m2_6points(1,j) + &
-                                  hprime_xx(i,2)*B2_m1_m2_6points(2,j) + &
-                                  hprime_xx(i,3)*B2_m1_m2_6points(3,j) + &
-                                  hprime_xx(i,4)*B2_m1_m2_6points(4,j) + &
-                                  hprime_xx(i,5)*B2_m1_m2_6points(5,j) + &
-                                  hprime_xx(i,6)*B2_m1_m2_6points(6,j)
-            C3_m1_m2_6points(i,j) = hprime_xx(i,1)*B3_m1_m2_6points(1,j) + &
-                                  hprime_xx(i,2)*B3_m1_m2_6points(2,j) + &
-                                  hprime_xx(i,3)*B3_m1_m2_6points(3,j) + &
-                                  hprime_xx(i,4)*B3_m1_m2_6points(4,j) + &
-                                  hprime_xx(i,5)*B3_m1_m2_6points(5,j) + &
-                                  hprime_xx(i,6)*B3_m1_m2_6points(6,j)
-          enddo
-        enddo
-
-        !   call mxm_m1_m1_6points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
-        !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
-        do j=1,m1
-          do i=1,m1
-            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-            do k = 1,NGLLX
-              tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyx_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyx_loc(i,6,k)*hprime_xxT(6,j)
-              tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyy_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyy_loc(i,6,k)*hprime_xxT(6,j)
-              tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyz_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyz_loc(i,6,k)*hprime_xxT(6,j)
-            enddo
-          enddo
-        enddo
-
-        ! call mxm_m2_m1_6points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
-        do j=1,m1
-          do i=1,m2
-            C1_mxm_m2_m1_6points(i,j) = A1_mxm_m2_m1_6points(i,1)*hprime_xxT(1,j) + &
-                                      A1_mxm_m2_m1_6points(i,2)*hprime_xxT(2,j) + &
-                                      A1_mxm_m2_m1_6points(i,3)*hprime_xxT(3,j) + &
-                                      A1_mxm_m2_m1_6points(i,4)*hprime_xxT(4,j) + &
-                                      A1_mxm_m2_m1_6points(i,5)*hprime_xxT(5,j) + &
-                                      A1_mxm_m2_m1_6points(i,6)*hprime_xxT(6,j)
-            C2_mxm_m2_m1_6points(i,j) = A2_mxm_m2_m1_6points(i,1)*hprime_xxT(1,j) + &
-                                      A2_mxm_m2_m1_6points(i,2)*hprime_xxT(2,j) + &
-                                      A2_mxm_m2_m1_6points(i,3)*hprime_xxT(3,j) + &
-                                      A2_mxm_m2_m1_6points(i,4)*hprime_xxT(4,j) + &
-                                      A2_mxm_m2_m1_6points(i,5)*hprime_xxT(5,j) + &
-                                      A2_mxm_m2_m1_6points(i,6)*hprime_xxT(6,j)
-            C3_mxm_m2_m1_6points(i,j) = A3_mxm_m2_m1_6points(i,1)*hprime_xxT(1,j) + &
-                                      A3_mxm_m2_m1_6points(i,2)*hprime_xxT(2,j) + &
-                                      A3_mxm_m2_m1_6points(i,3)*hprime_xxT(3,j) + &
-                                      A3_mxm_m2_m1_6points(i,4)*hprime_xxT(4,j) + &
-                                      A3_mxm_m2_m1_6points(i,5)*hprime_xxT(5,j) + &
-                                      A3_mxm_m2_m1_6points(i,6)*hprime_xxT(6,j)
-          enddo
-        enddo
-
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-              ! get derivatives of ux, uy and uz with respect to x, y and z
-              xixl = xix(i,j,k,ispec)
-              xiyl = xiy(i,j,k,ispec)
-              xizl = xiz(i,j,k,ispec)
-              etaxl = etax(i,j,k,ispec)
-              etayl = etay(i,j,k,ispec)
-              etazl = etaz(i,j,k,ispec)
-              gammaxl = gammax(i,j,k,ispec)
-              gammayl = gammay(i,j,k,ispec)
-              gammazl = gammaz(i,j,k,ispec)
-              jacobianl = jacobian(i,j,k,ispec)
-
-              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
-              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
-              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
-
-              duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
-              duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
-              duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
-
-              duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
-              duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
-              duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
-
-              ! save strain on the Moho boundary
-              if (SAVE_MOHO_MESH ) then
-                if (is_moho_top(ispec)) then
-                  dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
-                  dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
-                  dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
-                  dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
-                  dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
-                  dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
-                  dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
-                  dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
-                  dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
-                else if (is_moho_bot(ispec)) then
-                  dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
-                  dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
-                  dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
-                  dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
-                  dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
-                  dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
-                  dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
-                  dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
-                  dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
-                endif
-              endif
-
-              ! precompute some sums to save CPU time
-              duxdxl_plus_duydyl = duxdxl + duydyl
-              duxdxl_plus_duzdzl = duxdxl + duzdzl
-              duydyl_plus_duzdzl = duydyl + duzdzl
-              duxdyl_plus_duydxl = duxdyl + duydxl
-              duzdxl_plus_duxdzl = duzdxl + duxdzl
-              duzdyl_plus_duydzl = duzdyl + duydzl
-
-              ! computes deviatoric strain attenuation and/or for kernel calculations
-              if (COMPUTE_AND_STORE_STRAIN) then
-                templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
-                if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
-                epsilondev_xx_loc(i,j,k) = duxdxl - templ
-                epsilondev_yy_loc(i,j,k) = duydyl - templ
-                epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
-                epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
-                epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
-              endif
-
-              kappal = kappastore(i,j,k,ispec)
-              mul = mustore(i,j,k,ispec)
-
-              ! attenuation
-              if(ATTENUATION) then
-                ! use unrelaxed parameters if attenuation
-                mul  = mul * one_minus_sum_beta(i,j,k,ispec)
-              endif
-
-  ! full anisotropic case, stress calculations
-              if(ANISOTROPY) then
-                c11 = c11store(i,j,k,ispec)
-                c12 = c12store(i,j,k,ispec)
-                c13 = c13store(i,j,k,ispec)
-                c14 = c14store(i,j,k,ispec)
-                c15 = c15store(i,j,k,ispec)
-                c16 = c16store(i,j,k,ispec)
-                c22 = c22store(i,j,k,ispec)
-                c23 = c23store(i,j,k,ispec)
-                c24 = c24store(i,j,k,ispec)
-                c25 = c25store(i,j,k,ispec)
-                c26 = c26store(i,j,k,ispec)
-                c33 = c33store(i,j,k,ispec)
-                c34 = c34store(i,j,k,ispec)
-                c35 = c35store(i,j,k,ispec)
-                c36 = c36store(i,j,k,ispec)
-                c44 = c44store(i,j,k,ispec)
-                c45 = c45store(i,j,k,ispec)
-                c46 = c46store(i,j,k,ispec)
-                c55 = c55store(i,j,k,ispec)
-                c56 = c56store(i,j,k,ispec)
-                c66 = c66store(i,j,k,ispec)
-
-                sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
-                          c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
-                sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
-                          c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
-                sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
-                          c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
-                sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
-                          c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
-                sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
-                          c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
-                sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
-                          c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
-
-              else
-
-  ! isotropic case
-                lambdalplus2mul = kappal + FOUR_THIRDS * mul
-                lambdal = lambdalplus2mul - 2.*mul
-
-                ! compute stress sigma
-                sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
-                sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
-                sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
-
-                sigma_xy = mul*duxdyl_plus_duydxl
-                sigma_xz = mul*duzdxl_plus_duxdzl
-                sigma_yz = mul*duzdyl_plus_duydzl
-
-              endif ! ANISOTROPY
-
-              ! subtract memory variables if attenuation
-              if(ATTENUATION) then
-! way 1
-!                do i_sls = 1,N_SLS
-!                  R_xx_val = R_xx(i,j,k,ispec,i_sls)
-!                  R_yy_val = R_yy(i,j,k,ispec,i_sls)
-!                  sigma_xx = sigma_xx - R_xx_val
-!                  sigma_yy = sigma_yy - R_yy_val
-!                  sigma_zz = sigma_zz + R_xx_val + R_yy_val
-!                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-!                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-!                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-!                enddo
-
-! way 2
-! note: this should help compilers to pipeline the code and make better use of the cache;
-!          depending on compilers, it can further decrease the computation time by ~ 30%.
-!          by default, N_SLS = 3, therefore we take steps of 3
-              if(imodulo_N_SLS >= 1) then
-                do i_sls = 1,imodulo_N_SLS
-                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
-                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
-                  sigma_xx = sigma_xx - R_xx_val1
-                  sigma_yy = sigma_yy - R_yy_val1
-                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-                enddo
-              endif
-
-              if(N_SLS >= imodulo_N_SLS+1) then
-                do i_sls = imodulo_N_SLS+1,N_SLS,3
-                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
-                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
-                  sigma_xx = sigma_xx - R_xx_val1
-                  sigma_yy = sigma_yy - R_yy_val1
-                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-
-                  R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1)
-                  R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1)
-                  sigma_xx = sigma_xx - R_xx_val2
-                  sigma_yy = sigma_yy - R_yy_val2
-                  sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1)
-
-                  R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2)
-                  R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2)
-                  sigma_xx = sigma_xx - R_xx_val3
-                  sigma_yy = sigma_yy - R_yy_val3
-                  sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2)
-                enddo
-              endif
-
-
-              endif
-
-            ! define symmetric components of sigma
-            sigma_yx = sigma_xy
-            sigma_zx = sigma_xz
-            sigma_zy = sigma_yz
-
-            ! form dot product with test vector, non-symmetric form (which is useful in the case of PML)
-            tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
-            tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
-            tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
-
-            tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
-            tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
-            tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
-
-            tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
-            tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
-            tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
-
-            enddo
-          enddo
-        enddo
-
-    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-    ! for incompressible fluid flow, Cambridge University Press (2002),
-    ! pages 386 and 389 and Figure 8.3.1
-        ! call mxm_m1_m2_6points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
-        do j=1,m2
-          do i=1,m1
-            E1_m1_m2_6points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_6points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C1_m1_m2_6points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C1_m1_m2_6points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C1_m1_m2_6points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C1_m1_m2_6points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C1_m1_m2_6points(6,j)
-            E2_m1_m2_6points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_6points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C2_m1_m2_6points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C2_m1_m2_6points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C2_m1_m2_6points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C2_m1_m2_6points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C2_m1_m2_6points(6,j)
-            E3_m1_m2_6points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_6points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C3_m1_m2_6points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C3_m1_m2_6points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C3_m1_m2_6points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C3_m1_m2_6points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C3_m1_m2_6points(6,j)
-          enddo
-        enddo
-
-        !   call mxm_m1_m1_6points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
-        !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
-        do i=1,m1
-          do j=1,m1
-            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-            do k = 1,NGLLX
-              newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempx2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempx2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempx2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempx2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempx2(i,6,k)*hprimewgll_xx(6,j)
-              newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempy2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempy2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempy2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempy2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempy2(i,6,k)*hprimewgll_xx(6,j)
-              newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempz2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempz2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempz2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempz2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempz2(i,6,k)*hprimewgll_xx(6,j)
-            enddo
-          enddo
-        enddo
-
-        ! call mxm_m2_m1_6points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
-        do j=1,m1
-          do i=1,m2
-            E1_mxm_m2_m1_6points(i,j) = C1_mxm_m2_m1_6points(i,1)*hprimewgll_xx(1,j) + &
-                                      C1_mxm_m2_m1_6points(i,2)*hprimewgll_xx(2,j) + &
-                                      C1_mxm_m2_m1_6points(i,3)*hprimewgll_xx(3,j) + &
-                                      C1_mxm_m2_m1_6points(i,4)*hprimewgll_xx(4,j) + &
-                                      C1_mxm_m2_m1_6points(i,5)*hprimewgll_xx(5,j) + &
-                                      C1_mxm_m2_m1_6points(i,6)*hprimewgll_xx(6,j)
-            E2_mxm_m2_m1_6points(i,j) = C2_mxm_m2_m1_6points(i,1)*hprimewgll_xx(1,j) + &
-                                      C2_mxm_m2_m1_6points(i,2)*hprimewgll_xx(2,j) + &
-                                      C2_mxm_m2_m1_6points(i,3)*hprimewgll_xx(3,j) + &
-                                      C2_mxm_m2_m1_6points(i,4)*hprimewgll_xx(4,j) + &
-                                      C2_mxm_m2_m1_6points(i,5)*hprimewgll_xx(5,j) + &
-                                      C2_mxm_m2_m1_6points(i,6)*hprimewgll_xx(6,j)
-            E3_mxm_m2_m1_6points(i,j) = C3_mxm_m2_m1_6points(i,1)*hprimewgll_xx(1,j) + &
-                                      C3_mxm_m2_m1_6points(i,2)*hprimewgll_xx(2,j) + &
-                                      C3_mxm_m2_m1_6points(i,3)*hprimewgll_xx(3,j) + &
-                                      C3_mxm_m2_m1_6points(i,4)*hprimewgll_xx(4,j) + &
-                                      C3_mxm_m2_m1_6points(i,5)*hprimewgll_xx(5,j) + &
-                                      C3_mxm_m2_m1_6points(i,6)*hprimewgll_xx(6,j)
-          enddo
-        enddo
-
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-
-              fac1 = wgllwgll_yz(j,k)
-              fac2 = wgllwgll_xz(i,k)
-              fac3 = wgllwgll_xy(i,j)
-
-              ! sum contributions from each element to the global mesh using indirect addressing
-              iglob = ibool(i,j,k,ispec)
-              accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - &
-                                fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k)
-              accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - &
-                                fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k)
-              accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - &
-                                fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k)
-
-              !  update memory variables based upon the Runge-Kutta scheme
-              if(ATTENUATION) then
-
-                 ! use Runge-Kutta scheme to march in time
-                 do i_sls = 1,N_SLS
-
-                    factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
-
-                    alphaval_loc = alphaval(i_sls)
-                    betaval_loc = betaval(i_sls)
-                    gammaval_loc = gammaval(i_sls)
-
-                    ! term in xx
-                    Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
-                    R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in yy
-                    Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
-                    R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in zz not computed since zero trace
-                    ! term in xy
-                    Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
-                    R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in xz
-                    Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
-                    R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in yz
-                    Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
-                    R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-
-                 enddo   ! end of loop on memory variables
-
-              endif  !  end attenuation
-
-            enddo
-          enddo
-        enddo
-
-        ! save deviatoric strain for Runge-Kutta scheme
-        if ( COMPUTE_AND_STORE_STRAIN ) then
-          epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
-          epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
-          epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
-          epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
-          epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
-        endif
-
-  enddo  ! spectral element loop
-
-end subroutine compute_forces_elastic_Dev_6p
-
-!
-!=====================================================================
-!
-
-subroutine compute_forces_elastic_Dev_7p( iphase ,NSPEC_AB,NGLOB_AB, &
-                                    displ,accel, &
-                                    xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                                    hprime_xx,hprime_xxT, &
-                                    hprimewgll_xx,hprimewgll_xxT, &
-                                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                                    kappastore,mustore,jacobian,ibool, &
-                                    ATTENUATION, &
-                                    one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,&
-                                    NSPEC_ATTENUATION_AB, &
-                                    R_xx,R_yy,R_xy,R_xz,R_yz, &
-                                    epsilondev_xx,epsilondev_yy,epsilondev_xy, &
-                                    epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
-                                    ANISOTROPY,NSPEC_ANISO, &
-                                    c11store,c12store,c13store,c14store,c15store,c16store,&
-                                    c22store,c23store,c24store,c25store,c26store,c33store,&
-                                    c34store,c35store,c36store,c44store,c45store,c46store,&
-                                    c55store,c56store,c66store, &
-                                    SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
-                                    NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
-                                    is_moho_top,is_moho_bot, &
-                                    dsdx_top,dsdx_bot, &
-                                    ispec2D_moho_top,ispec2D_moho_bot, &
-                                    num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
-                                    phase_ispec_inner_elastic)
-
-
-! computes elastic tensor term
-
-  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, &
-                      N_SLS,SAVE_MOHO_MESH, &
-                      ONE_THIRD,FOUR_THIRDS,m1,m2
-  implicit none
-
-  integer :: NSPEC_AB,NGLOB_AB
-
-! displacement and acceleration
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
-
-! arrays with mesh parameters per slice
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        kappastore,mustore,jacobian
-
-! array with derivatives of Lagrange polynomials and precalculated products
-  real(kind=CUSTOM_REAL), dimension(NGLLX,7) :: hprime_xx,hprimewgll_xxT
-  real(kind=CUSTOM_REAL), dimension(7,NGLLX) :: hprime_xxT,hprimewgll_xx
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
-
-! memory variables and standard linear solids for attenuation
-  logical :: ATTENUATION
-  logical :: COMPUTE_AND_STORE_STRAIN
-  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
-  integer :: NSPEC_ATTENUATION_AB
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
-      R_xx,R_yy,R_xy,R_xz,R_yz
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
-       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
-  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
-
-! anisotropy
-  logical :: ANISOTROPY
-  integer :: NSPEC_ANISO
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
-            c11store,c12store,c13store,c14store,c15store,c16store, &
-            c22store,c23store,c24store,c25store,c26store,c33store, &
-            c34store,c35store,c36store,c44store,c45store,c46store, &
-            c55store,c56store,c66store
-
-  integer :: iphase
-  integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
-  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
-
-! adjoint simulations
-  integer :: SIMULATION_TYPE
-  integer :: NSPEC_BOUN,NSPEC2D_MOHO
-
-  ! moho kernel
-  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
-    dsdx_top,dsdx_bot
-  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
-  integer :: ispec2D_moho_top, ispec2D_moho_bot
-
-! local parameters
-  real(kind=CUSTOM_REAL), dimension(7,7,7) :: dummyx_loc,dummyy_loc,dummyz_loc, &
-    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
-  real(kind=CUSTOM_REAL), dimension(7,7,7) :: &
-    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
-
-  ! manually inline the calls to the Deville et al. (2002) routines
-  real(kind=CUSTOM_REAL), dimension(7,49) :: B1_m1_m2_7points,B2_m1_m2_7points,B3_m1_m2_7points
-  real(kind=CUSTOM_REAL), dimension(7,49) :: C1_m1_m2_7points,C2_m1_m2_7points,C3_m1_m2_7points
-  real(kind=CUSTOM_REAL), dimension(7,49) :: E1_m1_m2_7points,E2_m1_m2_7points,E3_m1_m2_7points
-
-  equivalence(dummyx_loc,B1_m1_m2_7points)
-  equivalence(dummyy_loc,B2_m1_m2_7points)
-  equivalence(dummyz_loc,B3_m1_m2_7points)
-  equivalence(tempx1,C1_m1_m2_7points)
-  equivalence(tempy1,C2_m1_m2_7points)
-  equivalence(tempz1,C3_m1_m2_7points)
-  equivalence(newtempx1,E1_m1_m2_7points)
-  equivalence(newtempy1,E2_m1_m2_7points)
-  equivalence(newtempz1,E3_m1_m2_7points)
-
-  real(kind=CUSTOM_REAL), dimension(49,7) :: &
-    A1_mxm_m2_m1_7points,A2_mxm_m2_m1_7points,A3_mxm_m2_m1_7points
-  real(kind=CUSTOM_REAL), dimension(49,7) :: &
-    C1_mxm_m2_m1_7points,C2_mxm_m2_m1_7points,C3_mxm_m2_m1_7points
-  real(kind=CUSTOM_REAL), dimension(49,7) :: &
-    E1_mxm_m2_m1_7points,E2_mxm_m2_m1_7points,E3_mxm_m2_m1_7points
-
-  equivalence(dummyx_loc,A1_mxm_m2_m1_7points)
-  equivalence(dummyy_loc,A2_mxm_m2_m1_7points)
-  equivalence(dummyz_loc,A3_mxm_m2_m1_7points)
-  equivalence(tempx3,C1_mxm_m2_m1_7points)
-  equivalence(tempy3,C2_mxm_m2_m1_7points)
-  equivalence(tempz3,C3_mxm_m2_m1_7points)
-  equivalence(newtempx3,E1_mxm_m2_m1_7points)
-  equivalence(newtempy3,E2_mxm_m2_m1_7points)
-  equivalence(newtempz3,E3_mxm_m2_m1_7points)
-
-  ! local attenuation parameters
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
-       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
-  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
-  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
-  real(kind=CUSTOM_REAL) Sn,Snp1
-  real(kind=CUSTOM_REAL) templ
-
-  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
-  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
-
-  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
-  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
-
-  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
-
-  real(kind=CUSTOM_REAL) fac1,fac2,fac3
-
-  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
-  real(kind=CUSTOM_REAL) kappal
-
-  ! local anisotropy parameters
-  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
-                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
-
-  integer i_SLS,imodulo_N_SLS
-  integer ispec,iglob,ispec_p,num_elements
-  integer i,j,k
-
-  imodulo_N_SLS = mod(N_SLS,3)
-
-  ! choses inner/outer elements
-  if( iphase == 1 ) then
-    num_elements = nspec_outer_elastic
-  else
-    num_elements = nspec_inner_elastic
-  endif
-
-  do ispec_p = 1,num_elements
-
-        ! returns element id from stored element list
-        ispec = phase_ispec_inner_elastic(ispec_p,iphase)
-
-        ! adjoint simulations: moho kernel
-        if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then
-          if (is_moho_top(ispec)) then
-            ispec2D_moho_top = ispec2D_moho_top + 1
-          else if (is_moho_bot(ispec)) then
-            ispec2D_moho_bot = ispec2D_moho_bot + 1
-          endif
-        endif ! adjoint
-
-        ! stores displacment values in local array
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-                iglob = ibool(i,j,k,ispec)
-                dummyx_loc(i,j,k) = displ(1,iglob)
-                dummyy_loc(i,j,k) = displ(2,iglob)
-                dummyz_loc(i,j,k) = displ(3,iglob)
-            enddo
-          enddo
-        enddo
-
-    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-    ! for incompressible fluid flow, Cambridge University Press (2002),
-    ! pages 386 and 389 and Figure 8.3.1
-        ! call mxm_m1_m2_7points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
-        do j=1,m2
-          do i=1,m1
-            C1_m1_m2_7points(i,j) = hprime_xx(i,1)*B1_m1_m2_7points(1,j) + &
-                                  hprime_xx(i,2)*B1_m1_m2_7points(2,j) + &
-                                  hprime_xx(i,3)*B1_m1_m2_7points(3,j) + &
-                                  hprime_xx(i,4)*B1_m1_m2_7points(4,j) + &
-                                  hprime_xx(i,5)*B1_m1_m2_7points(5,j) + &
-                                  hprime_xx(i,6)*B1_m1_m2_7points(6,j) + &
-                                  hprime_xx(i,7)*B1_m1_m2_7points(7,j)
-            C2_m1_m2_7points(i,j) = hprime_xx(i,1)*B2_m1_m2_7points(1,j) + &
-                                  hprime_xx(i,2)*B2_m1_m2_7points(2,j) + &
-                                  hprime_xx(i,3)*B2_m1_m2_7points(3,j) + &
-                                  hprime_xx(i,4)*B2_m1_m2_7points(4,j) + &
-                                  hprime_xx(i,5)*B2_m1_m2_7points(5,j) + &
-                                  hprime_xx(i,6)*B2_m1_m2_7points(6,j) + &
-                                  hprime_xx(i,7)*B2_m1_m2_7points(7,j)
-            C3_m1_m2_7points(i,j) = hprime_xx(i,1)*B3_m1_m2_7points(1,j) + &
-                                  hprime_xx(i,2)*B3_m1_m2_7points(2,j) + &
-                                  hprime_xx(i,3)*B3_m1_m2_7points(3,j) + &
-                                  hprime_xx(i,4)*B3_m1_m2_7points(4,j) + &
-                                  hprime_xx(i,5)*B3_m1_m2_7points(5,j) + &
-                                  hprime_xx(i,6)*B3_m1_m2_7points(6,j) + &
-                                  hprime_xx(i,7)*B3_m1_m2_7points(7,j)
-          enddo
-        enddo
-
-        !   call mxm_m1_m1_7points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
-        !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
-        do j=1,m1
-          do i=1,m1
-            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-            do k = 1,NGLLX
-              tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyx_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyx_loc(i,6,k)*hprime_xxT(6,j) + &
-                            dummyx_loc(i,7,k)*hprime_xxT(7,j)
-              tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyy_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyy_loc(i,6,k)*hprime_xxT(6,j) + &
-                            dummyy_loc(i,7,k)*hprime_xxT(7,j)
-              tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyz_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyz_loc(i,6,k)*hprime_xxT(6,j) + &
-                            dummyz_loc(i,7,k)*hprime_xxT(7,j)
-            enddo
-          enddo
-        enddo
-
-        ! call mxm_m2_m1_7points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
-        do j=1,m1
-          do i=1,m2
-            C1_mxm_m2_m1_7points(i,j) = A1_mxm_m2_m1_7points(i,1)*hprime_xxT(1,j) + &
-                                      A1_mxm_m2_m1_7points(i,2)*hprime_xxT(2,j) + &
-                                      A1_mxm_m2_m1_7points(i,3)*hprime_xxT(3,j) + &
-                                      A1_mxm_m2_m1_7points(i,4)*hprime_xxT(4,j) + &
-                                      A1_mxm_m2_m1_7points(i,5)*hprime_xxT(5,j) + &
-                                      A1_mxm_m2_m1_7points(i,6)*hprime_xxT(6,j) + &
-                                      A1_mxm_m2_m1_7points(i,7)*hprime_xxT(7,j)
-            C2_mxm_m2_m1_7points(i,j) = A2_mxm_m2_m1_7points(i,1)*hprime_xxT(1,j) + &
-                                      A2_mxm_m2_m1_7points(i,2)*hprime_xxT(2,j) + &
-                                      A2_mxm_m2_m1_7points(i,3)*hprime_xxT(3,j) + &
-                                      A2_mxm_m2_m1_7points(i,4)*hprime_xxT(4,j) + &
-                                      A2_mxm_m2_m1_7points(i,5)*hprime_xxT(5,j) + &
-                                      A2_mxm_m2_m1_7points(i,6)*hprime_xxT(6,j) + &
-                                      A2_mxm_m2_m1_7points(i,7)*hprime_xxT(7,j)
-            C3_mxm_m2_m1_7points(i,j) = A3_mxm_m2_m1_7points(i,1)*hprime_xxT(1,j) + &
-                                      A3_mxm_m2_m1_7points(i,2)*hprime_xxT(2,j) + &
-                                      A3_mxm_m2_m1_7points(i,3)*hprime_xxT(3,j) + &
-                                      A3_mxm_m2_m1_7points(i,4)*hprime_xxT(4,j) + &
-                                      A3_mxm_m2_m1_7points(i,5)*hprime_xxT(5,j) + &
-                                      A3_mxm_m2_m1_7points(i,6)*hprime_xxT(6,j) + &
-                                      A3_mxm_m2_m1_7points(i,7)*hprime_xxT(7,j)
-          enddo
-        enddo
-
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-              ! get derivatives of ux, uy and uz with respect to x, y and z
-              xixl = xix(i,j,k,ispec)
-              xiyl = xiy(i,j,k,ispec)
-              xizl = xiz(i,j,k,ispec)
-              etaxl = etax(i,j,k,ispec)
-              etayl = etay(i,j,k,ispec)
-              etazl = etaz(i,j,k,ispec)
-              gammaxl = gammax(i,j,k,ispec)
-              gammayl = gammay(i,j,k,ispec)
-              gammazl = gammaz(i,j,k,ispec)
-              jacobianl = jacobian(i,j,k,ispec)
-
-              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
-              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
-              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
-
-              duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
-              duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
-              duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
-
-              duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
-              duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
-              duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
-
-              ! save strain on the Moho boundary
-              if (SAVE_MOHO_MESH ) then
-                if (is_moho_top(ispec)) then
-                  dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
-                  dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
-                  dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
-                  dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
-                  dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
-                  dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
-                  dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
-                  dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
-                  dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
-                else if (is_moho_bot(ispec)) then
-                  dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
-                  dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
-                  dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
-                  dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
-                  dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
-                  dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
-                  dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
-                  dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
-                  dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
-                endif
-              endif
-
-              ! precompute some sums to save CPU time
-              duxdxl_plus_duydyl = duxdxl + duydyl
-              duxdxl_plus_duzdzl = duxdxl + duzdzl
-              duydyl_plus_duzdzl = duydyl + duzdzl
-              duxdyl_plus_duydxl = duxdyl + duydxl
-              duzdxl_plus_duxdzl = duzdxl + duxdzl
-              duzdyl_plus_duydzl = duzdyl + duydzl
-
-              ! computes deviatoric strain attenuation and/or for kernel calculations
-              if (COMPUTE_AND_STORE_STRAIN) then
-                templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
-                if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
-                epsilondev_xx_loc(i,j,k) = duxdxl - templ
-                epsilondev_yy_loc(i,j,k) = duydyl - templ
-                epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
-                epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
-                epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
-              endif
-
-              kappal = kappastore(i,j,k,ispec)
-              mul = mustore(i,j,k,ispec)
-
-              ! attenuation
-              if(ATTENUATION) then
-                ! use unrelaxed parameters if attenuation
-                mul  = mul * one_minus_sum_beta(i,j,k,ispec)
-              endif
-
-  ! full anisotropic case, stress calculations
-              if(ANISOTROPY) then
-                c11 = c11store(i,j,k,ispec)
-                c12 = c12store(i,j,k,ispec)
-                c13 = c13store(i,j,k,ispec)
-                c14 = c14store(i,j,k,ispec)
-                c15 = c15store(i,j,k,ispec)
-                c16 = c16store(i,j,k,ispec)
-                c22 = c22store(i,j,k,ispec)
-                c23 = c23store(i,j,k,ispec)
-                c24 = c24store(i,j,k,ispec)
-                c25 = c25store(i,j,k,ispec)
-                c26 = c26store(i,j,k,ispec)
-                c33 = c33store(i,j,k,ispec)
-                c34 = c34store(i,j,k,ispec)
-                c35 = c35store(i,j,k,ispec)
-                c36 = c36store(i,j,k,ispec)
-                c44 = c44store(i,j,k,ispec)
-                c45 = c45store(i,j,k,ispec)
-                c46 = c46store(i,j,k,ispec)
-                c55 = c55store(i,j,k,ispec)
-                c56 = c56store(i,j,k,ispec)
-                c66 = c66store(i,j,k,ispec)
-
-                sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
-                          c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
-                sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
-                          c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
-                sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
-                          c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
-                sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
-                          c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
-                sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
-                          c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
-                sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
-                          c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
-
-              else
-
-  ! isotropic case
-                lambdalplus2mul = kappal + FOUR_THIRDS * mul
-                lambdal = lambdalplus2mul - 2.*mul
-
-                ! compute stress sigma
-                sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
-                sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
-                sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
-
-                sigma_xy = mul*duxdyl_plus_duydxl
-                sigma_xz = mul*duzdxl_plus_duxdzl
-                sigma_yz = mul*duzdyl_plus_duydzl
-
-              endif ! ANISOTROPY
-
-              ! subtract memory variables if attenuation
-              if(ATTENUATION) then
-! way 1
-!                do i_sls = 1,N_SLS
-!                  R_xx_val = R_xx(i,j,k,ispec,i_sls)
-!                  R_yy_val = R_yy(i,j,k,ispec,i_sls)
-!                  sigma_xx = sigma_xx - R_xx_val
-!                  sigma_yy = sigma_yy - R_yy_val
-!                  sigma_zz = sigma_zz + R_xx_val + R_yy_val
-!                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-!                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-!                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-!                enddo
-
-! way 2
-! note: this should help compilers to pipeline the code and make better use of the cache;
-!          depending on compilers, it can further decrease the computation time by ~ 30%.
-!          by default, N_SLS = 3, therefore we take steps of 3
-              if(imodulo_N_SLS >= 1) then
-                do i_sls = 1,imodulo_N_SLS
-                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
-                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
-                  sigma_xx = sigma_xx - R_xx_val1
-                  sigma_yy = sigma_yy - R_yy_val1
-                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-                enddo
-              endif
-
-              if(N_SLS >= imodulo_N_SLS+1) then
-                do i_sls = imodulo_N_SLS+1,N_SLS,3
-                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
-                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
-                  sigma_xx = sigma_xx - R_xx_val1
-                  sigma_yy = sigma_yy - R_yy_val1
-                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-
-                  R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1)
-                  R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1)
-                  sigma_xx = sigma_xx - R_xx_val2
-                  sigma_yy = sigma_yy - R_yy_val2
-                  sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1)
-
-                  R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2)
-                  R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2)
-                  sigma_xx = sigma_xx - R_xx_val3
-                  sigma_yy = sigma_yy - R_yy_val3
-                  sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2)
-                enddo
-              endif
-
-
-              endif
-
-            ! define symmetric components of sigma
-            sigma_yx = sigma_xy
-            sigma_zx = sigma_xz
-            sigma_zy = sigma_yz
-
-            ! form dot product with test vector, non-symmetric form (which is useful in the case of PML)
-            tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
-            tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
-            tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
-
-            tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
-            tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
-            tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
-
-            tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
-            tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
-            tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
-
-            enddo
-          enddo
-        enddo
-
-    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-    ! for incompressible fluid flow, Cambridge University Press (2002),
-    ! pages 386 and 389 and Figure 8.3.1
-        ! call mxm_m1_m2_7points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
-        do j=1,m2
-          do i=1,m1
-            E1_m1_m2_7points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_7points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C1_m1_m2_7points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C1_m1_m2_7points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C1_m1_m2_7points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C1_m1_m2_7points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C1_m1_m2_7points(6,j) + &
-                                  hprimewgll_xxT(i,7)*C1_m1_m2_7points(7,j)
-            E2_m1_m2_7points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_7points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C2_m1_m2_7points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C2_m1_m2_7points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C2_m1_m2_7points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C2_m1_m2_7points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C2_m1_m2_7points(6,j) + &
-                                  hprimewgll_xxT(i,7)*C2_m1_m2_7points(7,j)
-            E3_m1_m2_7points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_7points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C3_m1_m2_7points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C3_m1_m2_7points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C3_m1_m2_7points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C3_m1_m2_7points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C3_m1_m2_7points(6,j) + &
-                                  hprimewgll_xxT(i,7)*C3_m1_m2_7points(7,j)
-          enddo
-        enddo
-
-        !   call mxm_m1_m1_7points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
-        !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
-        do i=1,m1
-          do j=1,m1
-            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-            do k = 1,NGLLX
-              newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempx2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempx2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempx2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempx2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempx2(i,6,k)*hprimewgll_xx(6,j) + &
-                               tempx2(i,7,k)*hprimewgll_xx(7,j)
-              newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempy2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempy2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempy2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempy2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempy2(i,6,k)*hprimewgll_xx(6,j) + &
-                               tempy2(i,7,k)*hprimewgll_xx(7,j)
-              newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempz2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempz2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempz2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempz2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempz2(i,6,k)*hprimewgll_xx(6,j) + &
-                               tempz2(i,7,k)*hprimewgll_xx(7,j)
-            enddo
-          enddo
-        enddo
-
-        ! call mxm_m2_m1_7points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
-        do j=1,m1
-          do i=1,m2
-            E1_mxm_m2_m1_7points(i,j) = C1_mxm_m2_m1_7points(i,1)*hprimewgll_xx(1,j) + &
-                                      C1_mxm_m2_m1_7points(i,2)*hprimewgll_xx(2,j) + &
-                                      C1_mxm_m2_m1_7points(i,3)*hprimewgll_xx(3,j) + &
-                                      C1_mxm_m2_m1_7points(i,4)*hprimewgll_xx(4,j) + &
-                                      C1_mxm_m2_m1_7points(i,5)*hprimewgll_xx(5,j) + &
-                                      C1_mxm_m2_m1_7points(i,6)*hprimewgll_xx(6,j) + &
-                                      C1_mxm_m2_m1_7points(i,7)*hprimewgll_xx(7,j)
-            E2_mxm_m2_m1_7points(i,j) = C2_mxm_m2_m1_7points(i,1)*hprimewgll_xx(1,j) + &
-                                      C2_mxm_m2_m1_7points(i,2)*hprimewgll_xx(2,j) + &
-                                      C2_mxm_m2_m1_7points(i,3)*hprimewgll_xx(3,j) + &
-                                      C2_mxm_m2_m1_7points(i,4)*hprimewgll_xx(4,j) + &
-                                      C2_mxm_m2_m1_7points(i,5)*hprimewgll_xx(5,j) + &
-                                      C2_mxm_m2_m1_7points(i,6)*hprimewgll_xx(6,j) + &
-                                      C2_mxm_m2_m1_7points(i,7)*hprimewgll_xx(7,j)
-            E3_mxm_m2_m1_7points(i,j) = C3_mxm_m2_m1_7points(i,1)*hprimewgll_xx(1,j) + &
-                                      C3_mxm_m2_m1_7points(i,2)*hprimewgll_xx(2,j) + &
-                                      C3_mxm_m2_m1_7points(i,3)*hprimewgll_xx(3,j) + &
-                                      C3_mxm_m2_m1_7points(i,4)*hprimewgll_xx(4,j) + &
-                                      C3_mxm_m2_m1_7points(i,5)*hprimewgll_xx(5,j) + &
-                                      C3_mxm_m2_m1_7points(i,6)*hprimewgll_xx(6,j) + &
-                                      C3_mxm_m2_m1_7points(i,7)*hprimewgll_xx(7,j)
-          enddo
-        enddo
-
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-
-              fac1 = wgllwgll_yz(j,k)
-              fac2 = wgllwgll_xz(i,k)
-              fac3 = wgllwgll_xy(i,j)
-
-              ! sum contributions from each element to the global mesh using indirect addressing
-              iglob = ibool(i,j,k,ispec)
-              accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - &
-                                fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k)
-              accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - &
-                                fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k)
-              accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - &
-                                fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k)
-
-              !  update memory variables based upon the Runge-Kutta scheme
-              if(ATTENUATION) then
-
-                 ! use Runge-Kutta scheme to march in time
-                 do i_sls = 1,N_SLS
-
-                    factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
-
-                    alphaval_loc = alphaval(i_sls)
-                    betaval_loc = betaval(i_sls)
-                    gammaval_loc = gammaval(i_sls)
-
-                    ! term in xx
-                    Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
-                    R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in yy
-                    Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
-                    R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in zz not computed since zero trace
-                    ! term in xy
-                    Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
-                    R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in xz
-                    Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
-                    R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in yz
-                    Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
-                    R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-
-                 enddo   ! end of loop on memory variables
-
-              endif  !  end attenuation
-
-            enddo
-          enddo
-        enddo
-
-        ! save deviatoric strain for Runge-Kutta scheme
-        if ( COMPUTE_AND_STORE_STRAIN ) then
-          epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
-          epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
-          epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
-          epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
-          epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
-        endif
-
-  enddo  ! spectral element loop
-
-end subroutine compute_forces_elastic_Dev_7p
-
-!
-!=====================================================================
-!
-
-subroutine compute_forces_elastic_Dev_8p( iphase ,NSPEC_AB,NGLOB_AB, &
-                                    displ,accel, &
-                                    xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                                    hprime_xx,hprime_xxT, &
-                                    hprimewgll_xx,hprimewgll_xxT, &
-                                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                                    kappastore,mustore,jacobian,ibool, &
-                                    ATTENUATION, &
-                                    one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,&
-                                    NSPEC_ATTENUATION_AB, &
-                                    R_xx,R_yy,R_xy,R_xz,R_yz, &
-                                    epsilondev_xx,epsilondev_yy,epsilondev_xy, &
-                                    epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
-                                    ANISOTROPY,NSPEC_ANISO, &
-                                    c11store,c12store,c13store,c14store,c15store,c16store,&
-                                    c22store,c23store,c24store,c25store,c26store,c33store,&
-                                    c34store,c35store,c36store,c44store,c45store,c46store,&
-                                    c55store,c56store,c66store, &
-                                    SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
-                                    NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
-                                    is_moho_top,is_moho_bot, &
-                                    dsdx_top,dsdx_bot, &
-                                    ispec2D_moho_top,ispec2D_moho_bot, &
-                                    num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
-                                    phase_ispec_inner_elastic)
-
-
-! computes elastic tensor term
-
-  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, &
-                      N_SLS,SAVE_MOHO_MESH, &
-                      ONE_THIRD,FOUR_THIRDS,m1,m2
-  implicit none
-
-  integer :: NSPEC_AB,NGLOB_AB
-
-! displacement and acceleration
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
-
-! arrays with mesh parameters per slice
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        kappastore,mustore,jacobian
-
-! array with derivatives of Lagrange polynomials and precalculated products
-  real(kind=CUSTOM_REAL), dimension(NGLLX,8) :: hprime_xx,hprimewgll_xxT
-  real(kind=CUSTOM_REAL), dimension(8,NGLLX) :: hprime_xxT,hprimewgll_xx
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
-
-! memory variables and standard linear solids for attenuation
-  logical :: ATTENUATION
-  logical :: COMPUTE_AND_STORE_STRAIN
-  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
-  integer :: NSPEC_ATTENUATION_AB
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
-      R_xx,R_yy,R_xy,R_xz,R_yz
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
-       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
-  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
-
-! anisotropy
-  logical :: ANISOTROPY
-  integer :: NSPEC_ANISO
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
-            c11store,c12store,c13store,c14store,c15store,c16store, &
-            c22store,c23store,c24store,c25store,c26store,c33store, &
-            c34store,c35store,c36store,c44store,c45store,c46store, &
-            c55store,c56store,c66store
-
-  integer :: iphase
-  integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
-  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
-
-! adjoint simulations
-  integer :: SIMULATION_TYPE
-  integer :: NSPEC_BOUN,NSPEC2D_MOHO
-
-  ! moho kernel
-  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
-    dsdx_top,dsdx_bot
-  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
-  integer :: ispec2D_moho_top, ispec2D_moho_bot
-
-! local parameters
-  real(kind=CUSTOM_REAL), dimension(8,8,8) :: dummyx_loc,dummyy_loc,dummyz_loc, &
-    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
-  real(kind=CUSTOM_REAL), dimension(8,8,8) :: &
-    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
-
-  ! manually inline the calls to the Deville et al. (2002) routines
-  real(kind=CUSTOM_REAL), dimension(8,64) :: B1_m1_m2_8points,B2_m1_m2_8points,B3_m1_m2_8points
-  real(kind=CUSTOM_REAL), dimension(8,64) :: C1_m1_m2_8points,C2_m1_m2_8points,C3_m1_m2_8points
-  real(kind=CUSTOM_REAL), dimension(8,64) :: E1_m1_m2_8points,E2_m1_m2_8points,E3_m1_m2_8points
-
-  equivalence(dummyx_loc,B1_m1_m2_8points)
-  equivalence(dummyy_loc,B2_m1_m2_8points)
-  equivalence(dummyz_loc,B3_m1_m2_8points)
-  equivalence(tempx1,C1_m1_m2_8points)
-  equivalence(tempy1,C2_m1_m2_8points)
-  equivalence(tempz1,C3_m1_m2_8points)
-  equivalence(newtempx1,E1_m1_m2_8points)
-  equivalence(newtempy1,E2_m1_m2_8points)
-  equivalence(newtempz1,E3_m1_m2_8points)
-
-  real(kind=CUSTOM_REAL), dimension(64,8) :: &
-    A1_mxm_m2_m1_8points,A2_mxm_m2_m1_8points,A3_mxm_m2_m1_8points
-  real(kind=CUSTOM_REAL), dimension(64,8) :: &
-    C1_mxm_m2_m1_8points,C2_mxm_m2_m1_8points,C3_mxm_m2_m1_8points
-  real(kind=CUSTOM_REAL), dimension(64,8) :: &
-    E1_mxm_m2_m1_8points,E2_mxm_m2_m1_8points,E3_mxm_m2_m1_8points
-
-  equivalence(dummyx_loc,A1_mxm_m2_m1_8points)
-  equivalence(dummyy_loc,A2_mxm_m2_m1_8points)
-  equivalence(dummyz_loc,A3_mxm_m2_m1_8points)
-  equivalence(tempx3,C1_mxm_m2_m1_8points)
-  equivalence(tempy3,C2_mxm_m2_m1_8points)
-  equivalence(tempz3,C3_mxm_m2_m1_8points)
-  equivalence(newtempx3,E1_mxm_m2_m1_8points)
-  equivalence(newtempy3,E2_mxm_m2_m1_8points)
-  equivalence(newtempz3,E3_mxm_m2_m1_8points)
-
-  ! local attenuation parameters
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
-       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
-  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
-  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
-  real(kind=CUSTOM_REAL) Sn,Snp1
-  real(kind=CUSTOM_REAL) templ
-
-  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
-  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
-
-  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
-  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
-
-  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
-
-  real(kind=CUSTOM_REAL) fac1,fac2,fac3
-
-  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
-  real(kind=CUSTOM_REAL) kappal
-
-  ! local anisotropy parameters
-  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
-                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
-
-  integer i_SLS,imodulo_N_SLS
-  integer ispec,iglob,ispec_p,num_elements
-  integer i,j,k
-
-  imodulo_N_SLS = mod(N_SLS,3)
-
-  ! choses inner/outer elements
-  if( iphase == 1 ) then
-    num_elements = nspec_outer_elastic
-  else
-    num_elements = nspec_inner_elastic
-  endif
-
-  do ispec_p = 1,num_elements
-
-        ! returns element id from stored element list
-        ispec = phase_ispec_inner_elastic(ispec_p,iphase)
-
-        ! adjoint simulations: moho kernel
-        if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then
-          if (is_moho_top(ispec)) then
-            ispec2D_moho_top = ispec2D_moho_top + 1
-          else if (is_moho_bot(ispec)) then
-            ispec2D_moho_bot = ispec2D_moho_bot + 1
-          endif
-        endif ! adjoint
-
-        ! stores displacment values in local array
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-                iglob = ibool(i,j,k,ispec)
-                dummyx_loc(i,j,k) = displ(1,iglob)
-                dummyy_loc(i,j,k) = displ(2,iglob)
-                dummyz_loc(i,j,k) = displ(3,iglob)
-            enddo
-          enddo
-        enddo
-
-    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-    ! for incompressible fluid flow, Cambridge University Press (2002),
-    ! pages 386 and 389 and Figure 8.3.1
-        ! call mxm_m1_m2_8points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
-        do j=1,m2
-          do i=1,m1
-            C1_m1_m2_8points(i,j) = hprime_xx(i,1)*B1_m1_m2_8points(1,j) + &
-                                  hprime_xx(i,2)*B1_m1_m2_8points(2,j) + &
-                                  hprime_xx(i,3)*B1_m1_m2_8points(3,j) + &
-                                  hprime_xx(i,4)*B1_m1_m2_8points(4,j) + &
-                                  hprime_xx(i,5)*B1_m1_m2_8points(5,j) + &
-                                  hprime_xx(i,6)*B1_m1_m2_8points(6,j) + &
-                                  hprime_xx(i,7)*B1_m1_m2_8points(7,j) + &
-                                  hprime_xx(i,8)*B1_m1_m2_8points(8,j)
-            C2_m1_m2_8points(i,j) = hprime_xx(i,1)*B2_m1_m2_8points(1,j) + &
-                                  hprime_xx(i,2)*B2_m1_m2_8points(2,j) + &
-                                  hprime_xx(i,3)*B2_m1_m2_8points(3,j) + &
-                                  hprime_xx(i,4)*B2_m1_m2_8points(4,j) + &
-                                  hprime_xx(i,5)*B2_m1_m2_8points(5,j) + &
-                                  hprime_xx(i,6)*B2_m1_m2_8points(6,j) + &
-                                  hprime_xx(i,7)*B2_m1_m2_8points(7,j) + &
-                                  hprime_xx(i,8)*B2_m1_m2_8points(8,j)
-            C3_m1_m2_8points(i,j) = hprime_xx(i,1)*B3_m1_m2_8points(1,j) + &
-                                  hprime_xx(i,2)*B3_m1_m2_8points(2,j) + &
-                                  hprime_xx(i,3)*B3_m1_m2_8points(3,j) + &
-                                  hprime_xx(i,4)*B3_m1_m2_8points(4,j) + &
-                                  hprime_xx(i,5)*B3_m1_m2_8points(5,j) + &
-                                  hprime_xx(i,6)*B3_m1_m2_8points(6,j) + &
-                                  hprime_xx(i,7)*B3_m1_m2_8points(7,j) + &
-                                  hprime_xx(i,8)*B3_m1_m2_8points(8,j)
-          enddo
-        enddo
-
-        !   call mxm_m1_m1_8points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
-        !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
-        do j=1,m1
-          do i=1,m1
-            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-            do k = 1,NGLLX
-              tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyx_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyx_loc(i,6,k)*hprime_xxT(6,j) + &
-                            dummyx_loc(i,7,k)*hprime_xxT(7,j) + &
-                            dummyx_loc(i,8,k)*hprime_xxT(8,j)
-              tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyy_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyy_loc(i,6,k)*hprime_xxT(6,j) + &
-                            dummyy_loc(i,7,k)*hprime_xxT(7,j) + &
-                            dummyy_loc(i,8,k)*hprime_xxT(8,j)
-              tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyz_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyz_loc(i,6,k)*hprime_xxT(6,j) + &
-                            dummyz_loc(i,7,k)*hprime_xxT(7,j) + &
-                            dummyz_loc(i,8,k)*hprime_xxT(8,j)
-            enddo
-          enddo
-        enddo
-
-        ! call mxm_m2_m1_8points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
-        do j=1,m1
-          do i=1,m2
-            C1_mxm_m2_m1_8points(i,j) = A1_mxm_m2_m1_8points(i,1)*hprime_xxT(1,j) + &
-                                      A1_mxm_m2_m1_8points(i,2)*hprime_xxT(2,j) + &
-                                      A1_mxm_m2_m1_8points(i,3)*hprime_xxT(3,j) + &
-                                      A1_mxm_m2_m1_8points(i,4)*hprime_xxT(4,j) + &
-                                      A1_mxm_m2_m1_8points(i,5)*hprime_xxT(5,j) + &
-                                      A1_mxm_m2_m1_8points(i,6)*hprime_xxT(6,j) + &
-                                      A1_mxm_m2_m1_8points(i,7)*hprime_xxT(7,j) + &
-                                      A1_mxm_m2_m1_8points(i,8)*hprime_xxT(8,j)
-            C2_mxm_m2_m1_8points(i,j) = A2_mxm_m2_m1_8points(i,1)*hprime_xxT(1,j) + &
-                                      A2_mxm_m2_m1_8points(i,2)*hprime_xxT(2,j) + &
-                                      A2_mxm_m2_m1_8points(i,3)*hprime_xxT(3,j) + &
-                                      A2_mxm_m2_m1_8points(i,4)*hprime_xxT(4,j) + &
-                                      A2_mxm_m2_m1_8points(i,5)*hprime_xxT(5,j) + &
-                                      A2_mxm_m2_m1_8points(i,6)*hprime_xxT(6,j) + &
-                                      A2_mxm_m2_m1_8points(i,7)*hprime_xxT(7,j) + &
-                                      A2_mxm_m2_m1_8points(i,8)*hprime_xxT(8,j)
-            C3_mxm_m2_m1_8points(i,j) = A3_mxm_m2_m1_8points(i,1)*hprime_xxT(1,j) + &
-                                      A3_mxm_m2_m1_8points(i,2)*hprime_xxT(2,j) + &
-                                      A3_mxm_m2_m1_8points(i,3)*hprime_xxT(3,j) + &
-                                      A3_mxm_m2_m1_8points(i,4)*hprime_xxT(4,j) + &
-                                      A3_mxm_m2_m1_8points(i,5)*hprime_xxT(5,j) + &
-                                      A3_mxm_m2_m1_8points(i,6)*hprime_xxT(6,j) + &
-                                      A3_mxm_m2_m1_8points(i,7)*hprime_xxT(7,j) + &
-                                      A3_mxm_m2_m1_8points(i,8)*hprime_xxT(8,j)
-          enddo
-        enddo
-
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-              ! get derivatives of ux, uy and uz with respect to x, y and z
-              xixl = xix(i,j,k,ispec)
-              xiyl = xiy(i,j,k,ispec)
-              xizl = xiz(i,j,k,ispec)
-              etaxl = etax(i,j,k,ispec)
-              etayl = etay(i,j,k,ispec)
-              etazl = etaz(i,j,k,ispec)
-              gammaxl = gammax(i,j,k,ispec)
-              gammayl = gammay(i,j,k,ispec)
-              gammazl = gammaz(i,j,k,ispec)
-              jacobianl = jacobian(i,j,k,ispec)
-
-              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
-              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
-              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
-
-              duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
-              duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
-              duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
-
-              duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
-              duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
-              duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
-
-              ! save strain on the Moho boundary
-              if (SAVE_MOHO_MESH ) then
-                if (is_moho_top(ispec)) then
-                  dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
-                  dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
-                  dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
-                  dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
-                  dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
-                  dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
-                  dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
-                  dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
-                  dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
-                else if (is_moho_bot(ispec)) then
-                  dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
-                  dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
-                  dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
-                  dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
-                  dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
-                  dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
-                  dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
-                  dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
-                  dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
-                endif
-              endif
-
-              ! precompute some sums to save CPU time
-              duxdxl_plus_duydyl = duxdxl + duydyl
-              duxdxl_plus_duzdzl = duxdxl + duzdzl
-              duydyl_plus_duzdzl = duydyl + duzdzl
-              duxdyl_plus_duydxl = duxdyl + duydxl
-              duzdxl_plus_duxdzl = duzdxl + duxdzl
-              duzdyl_plus_duydzl = duzdyl + duydzl
-
-              ! computes deviatoric strain attenuation and/or for kernel calculations
-              if (COMPUTE_AND_STORE_STRAIN) then
-                templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
-                if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
-                epsilondev_xx_loc(i,j,k) = duxdxl - templ
-                epsilondev_yy_loc(i,j,k) = duydyl - templ
-                epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
-                epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
-                epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
-              endif
-
-              kappal = kappastore(i,j,k,ispec)
-              mul = mustore(i,j,k,ispec)
-
-              ! attenuation
-              if(ATTENUATION) then
-                ! use unrelaxed parameters if attenuation
-                mul  = mul * one_minus_sum_beta(i,j,k,ispec)
-              endif
-
-  ! full anisotropic case, stress calculations
-              if(ANISOTROPY) then
-                c11 = c11store(i,j,k,ispec)
-                c12 = c12store(i,j,k,ispec)
-                c13 = c13store(i,j,k,ispec)
-                c14 = c14store(i,j,k,ispec)
-                c15 = c15store(i,j,k,ispec)
-                c16 = c16store(i,j,k,ispec)
-                c22 = c22store(i,j,k,ispec)
-                c23 = c23store(i,j,k,ispec)
-                c24 = c24store(i,j,k,ispec)
-                c25 = c25store(i,j,k,ispec)
-                c26 = c26store(i,j,k,ispec)
-                c33 = c33store(i,j,k,ispec)
-                c34 = c34store(i,j,k,ispec)
-                c35 = c35store(i,j,k,ispec)
-                c36 = c36store(i,j,k,ispec)
-                c44 = c44store(i,j,k,ispec)
-                c45 = c45store(i,j,k,ispec)
-                c46 = c46store(i,j,k,ispec)
-                c55 = c55store(i,j,k,ispec)
-                c56 = c56store(i,j,k,ispec)
-                c66 = c66store(i,j,k,ispec)
-
-                sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
-                          c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
-                sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
-                          c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
-                sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
-                          c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
-                sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
-                          c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
-                sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
-                          c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
-                sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
-                          c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
-
-              else
-
-  ! isotropic case
-                lambdalplus2mul = kappal + FOUR_THIRDS * mul
-                lambdal = lambdalplus2mul - 2.*mul
-
-                ! compute stress sigma
-                sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
-                sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
-                sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
-
-                sigma_xy = mul*duxdyl_plus_duydxl
-                sigma_xz = mul*duzdxl_plus_duxdzl
-                sigma_yz = mul*duzdyl_plus_duydzl
-
-              endif ! ANISOTROPY
-
-              ! subtract memory variables if attenuation
-              if(ATTENUATION) then
-! way 1
-!                do i_sls = 1,N_SLS
-!                  R_xx_val = R_xx(i,j,k,ispec,i_sls)
-!                  R_yy_val = R_yy(i,j,k,ispec,i_sls)
-!                  sigma_xx = sigma_xx - R_xx_val
-!                  sigma_yy = sigma_yy - R_yy_val
-!                  sigma_zz = sigma_zz + R_xx_val + R_yy_val
-!                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-!                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-!                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-!                enddo
-
-! way 2
-! note: this should help compilers to pipeline the code and make better use of the cache;
-!          depending on compilers, it can further decrease the computation time by ~ 30%.
-!          by default, N_SLS = 3, therefore we take steps of 3
-              if(imodulo_N_SLS >= 1) then
-                do i_sls = 1,imodulo_N_SLS
-                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
-                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
-                  sigma_xx = sigma_xx - R_xx_val1
-                  sigma_yy = sigma_yy - R_yy_val1
-                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-                enddo
-              endif
-
-              if(N_SLS >= imodulo_N_SLS+1) then
-                do i_sls = imodulo_N_SLS+1,N_SLS,3
-                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
-                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
-                  sigma_xx = sigma_xx - R_xx_val1
-                  sigma_yy = sigma_yy - R_yy_val1
-                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-
-                  R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1)
-                  R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1)
-                  sigma_xx = sigma_xx - R_xx_val2
-                  sigma_yy = sigma_yy - R_yy_val2
-                  sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1)
-
-                  R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2)
-                  R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2)
-                  sigma_xx = sigma_xx - R_xx_val3
-                  sigma_yy = sigma_yy - R_yy_val3
-                  sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2)
-                enddo
-              endif
-
-
-              endif
-
-            ! define symmetric components of sigma
-            sigma_yx = sigma_xy
-            sigma_zx = sigma_xz
-            sigma_zy = sigma_yz
-
-            ! form dot product with test vector, non-symmetric form (which is useful in the case of PML)
-            tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
-            tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
-            tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
-
-            tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
-            tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
-            tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
-
-            tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
-            tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
-            tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
-
-            enddo
-          enddo
-        enddo
-
-    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-    ! for incompressible fluid flow, Cambridge University Press (2002),
-    ! pages 386 and 389 and Figure 8.3.1
-        ! call mxm_m1_m2_8points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
-        do j=1,m2
-          do i=1,m1
-            E1_m1_m2_8points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_8points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C1_m1_m2_8points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C1_m1_m2_8points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C1_m1_m2_8points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C1_m1_m2_8points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C1_m1_m2_8points(6,j) + &
-                                  hprimewgll_xxT(i,7)*C1_m1_m2_8points(7,j) + &
-                                  hprimewgll_xxT(i,8)*C1_m1_m2_8points(8,j)
-            E2_m1_m2_8points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_8points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C2_m1_m2_8points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C2_m1_m2_8points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C2_m1_m2_8points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C2_m1_m2_8points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C2_m1_m2_8points(6,j) + &
-                                  hprimewgll_xxT(i,7)*C2_m1_m2_8points(7,j) + &
-                                  hprimewgll_xxT(i,8)*C2_m1_m2_8points(8,j)
-            E3_m1_m2_8points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_8points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C3_m1_m2_8points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C3_m1_m2_8points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C3_m1_m2_8points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C3_m1_m2_8points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C3_m1_m2_8points(6,j) + &
-                                  hprimewgll_xxT(i,7)*C3_m1_m2_8points(7,j) + &
-                                  hprimewgll_xxT(i,8)*C3_m1_m2_8points(8,j)
-          enddo
-        enddo
-
-        !   call mxm_m1_m1_8points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
-        !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
-        do i=1,m1
-          do j=1,m1
-            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-            do k = 1,NGLLX
-              newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempx2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempx2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempx2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempx2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempx2(i,6,k)*hprimewgll_xx(6,j) + &
-                               tempx2(i,7,k)*hprimewgll_xx(7,j) + &
-                               tempx2(i,8,k)*hprimewgll_xx(8,j)
-              newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempy2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempy2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempy2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempy2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempy2(i,6,k)*hprimewgll_xx(6,j) + &
-                               tempy2(i,7,k)*hprimewgll_xx(7,j) + &
-                               tempy2(i,8,k)*hprimewgll_xx(8,j)
-              newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempz2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempz2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempz2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempz2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempz2(i,6,k)*hprimewgll_xx(6,j) + &
-                               tempz2(i,7,k)*hprimewgll_xx(7,j) + &
-                               tempz2(i,8,k)*hprimewgll_xx(8,j)
-            enddo
-          enddo
-        enddo
-
-        ! call mxm_m2_m1_8points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
-        do j=1,m1
-          do i=1,m2
-            E1_mxm_m2_m1_8points(i,j) = C1_mxm_m2_m1_8points(i,1)*hprimewgll_xx(1,j) + &
-                                      C1_mxm_m2_m1_8points(i,2)*hprimewgll_xx(2,j) + &
-                                      C1_mxm_m2_m1_8points(i,3)*hprimewgll_xx(3,j) + &
-                                      C1_mxm_m2_m1_8points(i,4)*hprimewgll_xx(4,j) + &
-                                      C1_mxm_m2_m1_8points(i,5)*hprimewgll_xx(5,j) + &
-                                      C1_mxm_m2_m1_8points(i,6)*hprimewgll_xx(6,j) + &
-                                      C1_mxm_m2_m1_8points(i,7)*hprimewgll_xx(7,j) + &
-                                      C1_mxm_m2_m1_8points(i,8)*hprimewgll_xx(8,j)
-            E2_mxm_m2_m1_8points(i,j) = C2_mxm_m2_m1_8points(i,1)*hprimewgll_xx(1,j) + &
-                                      C2_mxm_m2_m1_8points(i,2)*hprimewgll_xx(2,j) + &
-                                      C2_mxm_m2_m1_8points(i,3)*hprimewgll_xx(3,j) + &
-                                      C2_mxm_m2_m1_8points(i,4)*hprimewgll_xx(4,j) + &
-                                      C2_mxm_m2_m1_8points(i,5)*hprimewgll_xx(5,j) + &
-                                      C2_mxm_m2_m1_8points(i,6)*hprimewgll_xx(6,j) + &
-                                      C2_mxm_m2_m1_8points(i,7)*hprimewgll_xx(7,j) + &
-                                      C2_mxm_m2_m1_8points(i,8)*hprimewgll_xx(8,j)
-            E3_mxm_m2_m1_8points(i,j) = C3_mxm_m2_m1_8points(i,1)*hprimewgll_xx(1,j) + &
-                                      C3_mxm_m2_m1_8points(i,2)*hprimewgll_xx(2,j) + &
-                                      C3_mxm_m2_m1_8points(i,3)*hprimewgll_xx(3,j) + &
-                                      C3_mxm_m2_m1_8points(i,4)*hprimewgll_xx(4,j) + &
-                                      C3_mxm_m2_m1_8points(i,5)*hprimewgll_xx(5,j) + &
-                                      C3_mxm_m2_m1_8points(i,6)*hprimewgll_xx(6,j) + &
-                                      C3_mxm_m2_m1_8points(i,7)*hprimewgll_xx(7,j) + &
-                                      C3_mxm_m2_m1_8points(i,8)*hprimewgll_xx(8,j)
-          enddo
-        enddo
-
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-
-              fac1 = wgllwgll_yz(j,k)
-              fac2 = wgllwgll_xz(i,k)
-              fac3 = wgllwgll_xy(i,j)
-
-              ! sum contributions from each element to the global mesh using indirect addressing
-              iglob = ibool(i,j,k,ispec)
-              accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - &
-                                fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k)
-              accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - &
-                                fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k)
-              accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - &
-                                fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k)
-
-              !  update memory variables based upon the Runge-Kutta scheme
-              if(ATTENUATION) then
-
-                 ! use Runge-Kutta scheme to march in time
-                 do i_sls = 1,N_SLS
-
-                    factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
-
-                    alphaval_loc = alphaval(i_sls)
-                    betaval_loc = betaval(i_sls)
-                    gammaval_loc = gammaval(i_sls)
-
-                    ! term in xx
-                    Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
-                    R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in yy
-                    Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
-                    R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in zz not computed since zero trace
-                    ! term in xy
-                    Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
-                    R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in xz
-                    Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
-                    R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in yz
-                    Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
-                    R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-
-                 enddo   ! end of loop on memory variables
-
-              endif  !  end attenuation
-
-            enddo
-          enddo
-        enddo
-
-        ! save deviatoric strain for Runge-Kutta scheme
-        if ( COMPUTE_AND_STORE_STRAIN ) then
-          epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
-          epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
-          epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
-          epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
-          epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
-        endif
-
-  enddo  ! spectral element loop
-
-end subroutine compute_forces_elastic_Dev_8p
-
-!
-!=====================================================================
-!
-
-subroutine compute_forces_elastic_Dev_9p( iphase ,NSPEC_AB,NGLOB_AB, &
-                                    displ,accel, &
-                                    xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                                    hprime_xx,hprime_xxT, &
-                                    hprimewgll_xx,hprimewgll_xxT, &
-                                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                                    kappastore,mustore,jacobian,ibool, &
-                                    ATTENUATION, &
-                                    one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,&
-                                    NSPEC_ATTENUATION_AB, &
-                                    R_xx,R_yy,R_xy,R_xz,R_yz, &
-                                    epsilondev_xx,epsilondev_yy,epsilondev_xy, &
-                                    epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
-                                    ANISOTROPY,NSPEC_ANISO, &
-                                    c11store,c12store,c13store,c14store,c15store,c16store,&
-                                    c22store,c23store,c24store,c25store,c26store,c33store,&
-                                    c34store,c35store,c36store,c44store,c45store,c46store,&
-                                    c55store,c56store,c66store, &
-                                    SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
-                                    NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
-                                    is_moho_top,is_moho_bot, &
-                                    dsdx_top,dsdx_bot, &
-                                    ispec2D_moho_top,ispec2D_moho_bot, &
-                                    num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
-                                    phase_ispec_inner_elastic)
-
-
-! computes elastic tensor term
-
-  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, &
-                      N_SLS,SAVE_MOHO_MESH, &
-                      ONE_THIRD,FOUR_THIRDS,m1,m2
-  implicit none
-
-  integer :: NSPEC_AB,NGLOB_AB
-
-! displacement and acceleration
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
-
-! arrays with mesh parameters per slice
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        kappastore,mustore,jacobian
-
-! array with derivatives of Lagrange polynomials and precalculated products
-  real(kind=CUSTOM_REAL), dimension(NGLLX,9) :: hprime_xx,hprimewgll_xxT
-  real(kind=CUSTOM_REAL), dimension(9,NGLLX) :: hprime_xxT,hprimewgll_xx
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
-
-! memory variables and standard linear solids for attenuation
-  logical :: ATTENUATION
-  logical :: COMPUTE_AND_STORE_STRAIN
-  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
-  integer :: NSPEC_ATTENUATION_AB
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
-      R_xx,R_yy,R_xy,R_xz,R_yz
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
-       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
-  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
-
-! anisotropy
-  logical :: ANISOTROPY
-  integer :: NSPEC_ANISO
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
-            c11store,c12store,c13store,c14store,c15store,c16store, &
-            c22store,c23store,c24store,c25store,c26store,c33store, &
-            c34store,c35store,c36store,c44store,c45store,c46store, &
-            c55store,c56store,c66store
-
-  integer :: iphase
-  integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
-  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
-
-! adjoint simulations
-  integer :: SIMULATION_TYPE
-  integer :: NSPEC_BOUN,NSPEC2D_MOHO
-
-  ! moho kernel
-  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
-    dsdx_top,dsdx_bot
-  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
-  integer :: ispec2D_moho_top, ispec2D_moho_bot
-
-! local parameters
-  real(kind=CUSTOM_REAL), dimension(9,9,9) :: dummyx_loc,dummyy_loc,dummyz_loc, &
-    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
-  real(kind=CUSTOM_REAL), dimension(9,9,9) :: &
-    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
-
-  ! manually inline the calls to the Deville et al. (2002) routines
-  real(kind=CUSTOM_REAL), dimension(9,81) :: B1_m1_m2_9points,B2_m1_m2_9points,B3_m1_m2_9points
-  real(kind=CUSTOM_REAL), dimension(9,81) :: C1_m1_m2_9points,C2_m1_m2_9points,C3_m1_m2_9points
-  real(kind=CUSTOM_REAL), dimension(9,81) :: E1_m1_m2_9points,E2_m1_m2_9points,E3_m1_m2_9points
-
-  equivalence(dummyx_loc,B1_m1_m2_9points)
-  equivalence(dummyy_loc,B2_m1_m2_9points)
-  equivalence(dummyz_loc,B3_m1_m2_9points)
-  equivalence(tempx1,C1_m1_m2_9points)
-  equivalence(tempy1,C2_m1_m2_9points)
-  equivalence(tempz1,C3_m1_m2_9points)
-  equivalence(newtempx1,E1_m1_m2_9points)
-  equivalence(newtempy1,E2_m1_m2_9points)
-  equivalence(newtempz1,E3_m1_m2_9points)
-
-  real(kind=CUSTOM_REAL), dimension(81,9) :: &
-    A1_mxm_m2_m1_9points,A2_mxm_m2_m1_9points,A3_mxm_m2_m1_9points
-  real(kind=CUSTOM_REAL), dimension(81,9) :: &
-    C1_mxm_m2_m1_9points,C2_mxm_m2_m1_9points,C3_mxm_m2_m1_9points
-  real(kind=CUSTOM_REAL), dimension(81,9) :: &
-    E1_mxm_m2_m1_9points,E2_mxm_m2_m1_9points,E3_mxm_m2_m1_9points
-
-  equivalence(dummyx_loc,A1_mxm_m2_m1_9points)
-  equivalence(dummyy_loc,A2_mxm_m2_m1_9points)
-  equivalence(dummyz_loc,A3_mxm_m2_m1_9points)
-  equivalence(tempx3,C1_mxm_m2_m1_9points)
-  equivalence(tempy3,C2_mxm_m2_m1_9points)
-  equivalence(tempz3,C3_mxm_m2_m1_9points)
-  equivalence(newtempx3,E1_mxm_m2_m1_9points)
-  equivalence(newtempy3,E2_mxm_m2_m1_9points)
-  equivalence(newtempz3,E3_mxm_m2_m1_9points)
-
-  ! local attenuation parameters
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
-       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
-  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
-  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
-  real(kind=CUSTOM_REAL) Sn,Snp1
-  real(kind=CUSTOM_REAL) templ
-
-  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
-  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
-
-  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
-  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
-
-  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
-
-  real(kind=CUSTOM_REAL) fac1,fac2,fac3
-
-  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
-  real(kind=CUSTOM_REAL) kappal
-
-  ! local anisotropy parameters
-  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
-                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
-
-  integer i_SLS,imodulo_N_SLS
-  integer ispec,iglob,ispec_p,num_elements
-  integer i,j,k
-
-  imodulo_N_SLS = mod(N_SLS,3)
-
-  ! choses inner/outer elements
-  if( iphase == 1 ) then
-    num_elements = nspec_outer_elastic
-  else
-    num_elements = nspec_inner_elastic
-  endif
-
-  do ispec_p = 1,num_elements
-
-        ! returns element id from stored element list
-        ispec = phase_ispec_inner_elastic(ispec_p,iphase)
-
-        ! adjoint simulations: moho kernel
-        if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then
-          if (is_moho_top(ispec)) then
-            ispec2D_moho_top = ispec2D_moho_top + 1
-          else if (is_moho_bot(ispec)) then
-            ispec2D_moho_bot = ispec2D_moho_bot + 1
-          endif
-        endif ! adjoint
-
-        ! stores displacment values in local array
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-                iglob = ibool(i,j,k,ispec)
-                dummyx_loc(i,j,k) = displ(1,iglob)
-                dummyy_loc(i,j,k) = displ(2,iglob)
-                dummyz_loc(i,j,k) = displ(3,iglob)
-            enddo
-          enddo
-        enddo
-
-    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-    ! for incompressible fluid flow, Cambridge University Press (2002),
-    ! pages 386 and 389 and Figure 8.3.1
-        ! call mxm_m1_m2_9points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
-        do j=1,m2
-          do i=1,m1
-            C1_m1_m2_9points(i,j) = hprime_xx(i,1)*B1_m1_m2_9points(1,j) + &
-                                  hprime_xx(i,2)*B1_m1_m2_9points(2,j) + &
-                                  hprime_xx(i,3)*B1_m1_m2_9points(3,j) + &
-                                  hprime_xx(i,4)*B1_m1_m2_9points(4,j) + &
-                                  hprime_xx(i,5)*B1_m1_m2_9points(5,j) + &
-                                  hprime_xx(i,6)*B1_m1_m2_9points(6,j) + &
-                                  hprime_xx(i,7)*B1_m1_m2_9points(7,j) + &
-                                  hprime_xx(i,8)*B1_m1_m2_9points(8,j) + &
-                                  hprime_xx(i,9)*B1_m1_m2_9points(9,j)
-            C2_m1_m2_9points(i,j) = hprime_xx(i,1)*B2_m1_m2_9points(1,j) + &
-                                  hprime_xx(i,2)*B2_m1_m2_9points(2,j) + &
-                                  hprime_xx(i,3)*B2_m1_m2_9points(3,j) + &
-                                  hprime_xx(i,4)*B2_m1_m2_9points(4,j) + &
-                                  hprime_xx(i,5)*B2_m1_m2_9points(5,j) + &
-                                  hprime_xx(i,6)*B2_m1_m2_9points(6,j) + &
-                                  hprime_xx(i,7)*B2_m1_m2_9points(7,j) + &
-                                  hprime_xx(i,8)*B2_m1_m2_9points(8,j) + &
-                                  hprime_xx(i,9)*B2_m1_m2_9points(9,j)
-            C3_m1_m2_9points(i,j) = hprime_xx(i,1)*B3_m1_m2_9points(1,j) + &
-                                  hprime_xx(i,2)*B3_m1_m2_9points(2,j) + &
-                                  hprime_xx(i,3)*B3_m1_m2_9points(3,j) + &
-                                  hprime_xx(i,4)*B3_m1_m2_9points(4,j) + &
-                                  hprime_xx(i,5)*B3_m1_m2_9points(5,j) + &
-                                  hprime_xx(i,6)*B3_m1_m2_9points(6,j) + &
-                                  hprime_xx(i,7)*B3_m1_m2_9points(7,j) + &
-                                  hprime_xx(i,8)*B3_m1_m2_9points(8,j) + &
-                                  hprime_xx(i,9)*B3_m1_m2_9points(9,j)
-          enddo
-        enddo
-
-        !   call mxm_m1_m1_9points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
-        !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
-        do j=1,m1
-          do i=1,m1
-            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-            do k = 1,NGLLX
-              tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyx_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyx_loc(i,6,k)*hprime_xxT(6,j) + &
-                            dummyx_loc(i,7,k)*hprime_xxT(7,j) + &
-                            dummyx_loc(i,8,k)*hprime_xxT(8,j) + &
-                            dummyx_loc(i,9,k)*hprime_xxT(9,j)
-              tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyy_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyy_loc(i,6,k)*hprime_xxT(6,j) + &
-                            dummyy_loc(i,7,k)*hprime_xxT(7,j) + &
-                            dummyy_loc(i,8,k)*hprime_xxT(8,j) + &
-                            dummyy_loc(i,9,k)*hprime_xxT(9,j)
-              tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyz_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyz_loc(i,6,k)*hprime_xxT(6,j) + &
-                            dummyz_loc(i,7,k)*hprime_xxT(7,j) + &
-                            dummyz_loc(i,8,k)*hprime_xxT(8,j) + &
-                            dummyz_loc(i,9,k)*hprime_xxT(9,j)
-            enddo
-          enddo
-        enddo
-
-        ! call mxm_m2_m1_9points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
-        do j=1,m1
-          do i=1,m2
-            C1_mxm_m2_m1_9points(i,j) = A1_mxm_m2_m1_9points(i,1)*hprime_xxT(1,j) + &
-                                      A1_mxm_m2_m1_9points(i,2)*hprime_xxT(2,j) + &
-                                      A1_mxm_m2_m1_9points(i,3)*hprime_xxT(3,j) + &
-                                      A1_mxm_m2_m1_9points(i,4)*hprime_xxT(4,j) + &
-                                      A1_mxm_m2_m1_9points(i,5)*hprime_xxT(5,j) + &
-                                      A1_mxm_m2_m1_9points(i,6)*hprime_xxT(6,j) + &
-                                      A1_mxm_m2_m1_9points(i,7)*hprime_xxT(7,j) + &
-                                      A1_mxm_m2_m1_9points(i,8)*hprime_xxT(8,j) + &
-                                      A1_mxm_m2_m1_9points(i,9)*hprime_xxT(9,j)
-            C2_mxm_m2_m1_9points(i,j) = A2_mxm_m2_m1_9points(i,1)*hprime_xxT(1,j) + &
-                                      A2_mxm_m2_m1_9points(i,2)*hprime_xxT(2,j) + &
-                                      A2_mxm_m2_m1_9points(i,3)*hprime_xxT(3,j) + &
-                                      A2_mxm_m2_m1_9points(i,4)*hprime_xxT(4,j) + &
-                                      A2_mxm_m2_m1_9points(i,5)*hprime_xxT(5,j) + &
-                                      A2_mxm_m2_m1_9points(i,6)*hprime_xxT(6,j) + &
-                                      A2_mxm_m2_m1_9points(i,7)*hprime_xxT(7,j) + &
-                                      A2_mxm_m2_m1_9points(i,8)*hprime_xxT(8,j) + &
-                                      A2_mxm_m2_m1_9points(i,9)*hprime_xxT(9,j)
-            C3_mxm_m2_m1_9points(i,j) = A3_mxm_m2_m1_9points(i,1)*hprime_xxT(1,j) + &
-                                      A3_mxm_m2_m1_9points(i,2)*hprime_xxT(2,j) + &
-                                      A3_mxm_m2_m1_9points(i,3)*hprime_xxT(3,j) + &
-                                      A3_mxm_m2_m1_9points(i,4)*hprime_xxT(4,j) + &
-                                      A3_mxm_m2_m1_9points(i,5)*hprime_xxT(5,j) + &
-                                      A3_mxm_m2_m1_9points(i,6)*hprime_xxT(6,j) + &
-                                      A3_mxm_m2_m1_9points(i,7)*hprime_xxT(7,j) + &
-                                      A3_mxm_m2_m1_9points(i,8)*hprime_xxT(8,j) + &
-                                      A3_mxm_m2_m1_9points(i,9)*hprime_xxT(9,j)
-          enddo
-        enddo
-
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-              ! get derivatives of ux, uy and uz with respect to x, y and z
-              xixl = xix(i,j,k,ispec)
-              xiyl = xiy(i,j,k,ispec)
-              xizl = xiz(i,j,k,ispec)
-              etaxl = etax(i,j,k,ispec)
-              etayl = etay(i,j,k,ispec)
-              etazl = etaz(i,j,k,ispec)
-              gammaxl = gammax(i,j,k,ispec)
-              gammayl = gammay(i,j,k,ispec)
-              gammazl = gammaz(i,j,k,ispec)
-              jacobianl = jacobian(i,j,k,ispec)
-
-              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
-              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
-              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
-
-              duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
-              duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
-              duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
-
-              duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
-              duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
-              duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
-
-              ! save strain on the Moho boundary
-              if (SAVE_MOHO_MESH ) then
-                if (is_moho_top(ispec)) then
-                  dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
-                  dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
-                  dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
-                  dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
-                  dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
-                  dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
-                  dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
-                  dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
-                  dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
-                else if (is_moho_bot(ispec)) then
-                  dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
-                  dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
-                  dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
-                  dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
-                  dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
-                  dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
-                  dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
-                  dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
-                  dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
-                endif
-              endif
-
-              ! precompute some sums to save CPU time
-              duxdxl_plus_duydyl = duxdxl + duydyl
-              duxdxl_plus_duzdzl = duxdxl + duzdzl
-              duydyl_plus_duzdzl = duydyl + duzdzl
-              duxdyl_plus_duydxl = duxdyl + duydxl
-              duzdxl_plus_duxdzl = duzdxl + duxdzl
-              duzdyl_plus_duydzl = duzdyl + duydzl
-
-              ! computes deviatoric strain attenuation and/or for kernel calculations
-              if (COMPUTE_AND_STORE_STRAIN) then
-                templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
-                if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
-                epsilondev_xx_loc(i,j,k) = duxdxl - templ
-                epsilondev_yy_loc(i,j,k) = duydyl - templ
-                epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
-                epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
-                epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
-              endif
-
-              kappal = kappastore(i,j,k,ispec)
-              mul = mustore(i,j,k,ispec)
-
-              ! attenuation
-              if(ATTENUATION) then
-                ! use unrelaxed parameters if attenuation
-                mul  = mul * one_minus_sum_beta(i,j,k,ispec)
-              endif
-
-  ! full anisotropic case, stress calculations
-              if(ANISOTROPY) then
-                c11 = c11store(i,j,k,ispec)
-                c12 = c12store(i,j,k,ispec)
-                c13 = c13store(i,j,k,ispec)
-                c14 = c14store(i,j,k,ispec)
-                c15 = c15store(i,j,k,ispec)
-                c16 = c16store(i,j,k,ispec)
-                c22 = c22store(i,j,k,ispec)
-                c23 = c23store(i,j,k,ispec)
-                c24 = c24store(i,j,k,ispec)
-                c25 = c25store(i,j,k,ispec)
-                c26 = c26store(i,j,k,ispec)
-                c33 = c33store(i,j,k,ispec)
-                c34 = c34store(i,j,k,ispec)
-                c35 = c35store(i,j,k,ispec)
-                c36 = c36store(i,j,k,ispec)
-                c44 = c44store(i,j,k,ispec)
-                c45 = c45store(i,j,k,ispec)
-                c46 = c46store(i,j,k,ispec)
-                c55 = c55store(i,j,k,ispec)
-                c56 = c56store(i,j,k,ispec)
-                c66 = c66store(i,j,k,ispec)
-
-                sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
-                          c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
-                sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
-                          c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
-                sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
-                          c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
-                sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
-                          c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
-                sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
-                          c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
-                sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
-                          c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
-
-              else
-
-  ! isotropic case
-                lambdalplus2mul = kappal + FOUR_THIRDS * mul
-                lambdal = lambdalplus2mul - 2.*mul
-
-                ! compute stress sigma
-                sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
-                sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
-                sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
-
-                sigma_xy = mul*duxdyl_plus_duydxl
-                sigma_xz = mul*duzdxl_plus_duxdzl
-                sigma_yz = mul*duzdyl_plus_duydzl
-
-              endif ! ANISOTROPY
-
-              ! subtract memory variables if attenuation
-              if(ATTENUATION) then
-! way 1
-!                do i_sls = 1,N_SLS
-!                  R_xx_val = R_xx(i,j,k,ispec,i_sls)
-!                  R_yy_val = R_yy(i,j,k,ispec,i_sls)
-!                  sigma_xx = sigma_xx - R_xx_val
-!                  sigma_yy = sigma_yy - R_yy_val
-!                  sigma_zz = sigma_zz + R_xx_val + R_yy_val
-!                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-!                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-!                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-!                enddo
-
-! way 2
-! note: this should help compilers to pipeline the code and make better use of the cache;
-!          depending on compilers, it can further decrease the computation time by ~ 30%.
-!          by default, N_SLS = 3, therefore we take steps of 3
-              if(imodulo_N_SLS >= 1) then
-                do i_sls = 1,imodulo_N_SLS
-                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
-                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
-                  sigma_xx = sigma_xx - R_xx_val1
-                  sigma_yy = sigma_yy - R_yy_val1
-                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-                enddo
-              endif
-
-              if(N_SLS >= imodulo_N_SLS+1) then
-                do i_sls = imodulo_N_SLS+1,N_SLS,3
-                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
-                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
-                  sigma_xx = sigma_xx - R_xx_val1
-                  sigma_yy = sigma_yy - R_yy_val1
-                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-
-                  R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1)
-                  R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1)
-                  sigma_xx = sigma_xx - R_xx_val2
-                  sigma_yy = sigma_yy - R_yy_val2
-                  sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1)
-
-                  R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2)
-                  R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2)
-                  sigma_xx = sigma_xx - R_xx_val3
-                  sigma_yy = sigma_yy - R_yy_val3
-                  sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2)
-                enddo
-              endif
-
-
-              endif
-
-            ! define symmetric components of sigma
-            sigma_yx = sigma_xy
-            sigma_zx = sigma_xz
-            sigma_zy = sigma_yz
-
-            ! form dot product with test vector, non-symmetric form (which is useful in the case of PML)
-            tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
-            tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
-            tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
-
-            tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
-            tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
-            tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
-
-            tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
-            tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
-            tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
- 
-            enddo
-          enddo
-        enddo
-
-    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-    ! for incompressible fluid flow, Cambridge University Press (2002),
-    ! pages 386 and 389 and Figure 8.3.1
-        ! call mxm_m1_m2_9points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
-        do j=1,m2
-          do i=1,m1
-            E1_m1_m2_9points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_9points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C1_m1_m2_9points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C1_m1_m2_9points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C1_m1_m2_9points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C1_m1_m2_9points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C1_m1_m2_9points(6,j) + &
-                                  hprimewgll_xxT(i,7)*C1_m1_m2_9points(7,j) + &
-                                  hprimewgll_xxT(i,8)*C1_m1_m2_9points(8,j) + &
-                                  hprimewgll_xxT(i,9)*C1_m1_m2_9points(9,j)
-            E2_m1_m2_9points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_9points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C2_m1_m2_9points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C2_m1_m2_9points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C2_m1_m2_9points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C2_m1_m2_9points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C2_m1_m2_9points(6,j) + &
-                                  hprimewgll_xxT(i,7)*C2_m1_m2_9points(7,j) + &
-                                  hprimewgll_xxT(i,8)*C2_m1_m2_9points(8,j) + &
-                                  hprimewgll_xxT(i,9)*C2_m1_m2_9points(9,j)
-            E3_m1_m2_9points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_9points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C3_m1_m2_9points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C3_m1_m2_9points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C3_m1_m2_9points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C3_m1_m2_9points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C3_m1_m2_9points(6,j) + &
-                                  hprimewgll_xxT(i,7)*C3_m1_m2_9points(7,j) + &
-                                  hprimewgll_xxT(i,8)*C3_m1_m2_9points(8,j) + &
-                                  hprimewgll_xxT(i,9)*C3_m1_m2_9points(9,j)
-          enddo
-        enddo
-
-        !   call mxm_m1_m1_9points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
-        !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
-        do i=1,m1
-          do j=1,m1
-            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-            do k = 1,NGLLX
-              newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempx2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempx2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempx2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempx2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempx2(i,6,k)*hprimewgll_xx(6,j) + &
-                               tempx2(i,7,k)*hprimewgll_xx(7,j) + &
-                               tempx2(i,8,k)*hprimewgll_xx(8,j) + &
-                               tempx2(i,9,k)*hprimewgll_xx(9,j)
-              newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempy2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempy2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempy2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempy2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempy2(i,6,k)*hprimewgll_xx(6,j) + &
-                               tempy2(i,7,k)*hprimewgll_xx(7,j) + &
-                               tempy2(i,8,k)*hprimewgll_xx(8,j) + &
-                               tempy2(i,9,k)*hprimewgll_xx(9,j)
-              newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempz2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempz2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempz2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempz2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempz2(i,6,k)*hprimewgll_xx(6,j) + &
-                               tempz2(i,7,k)*hprimewgll_xx(7,j) + &
-                               tempz2(i,8,k)*hprimewgll_xx(8,j) + &
-                               tempz2(i,9,k)*hprimewgll_xx(9,j)
-            enddo
-          enddo
-        enddo
-
-        ! call mxm_m2_m1_9points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
-        do j=1,m1
-          do i=1,m2
-            E1_mxm_m2_m1_9points(i,j) = C1_mxm_m2_m1_9points(i,1)*hprimewgll_xx(1,j) + &
-                                      C1_mxm_m2_m1_9points(i,2)*hprimewgll_xx(2,j) + &
-                                      C1_mxm_m2_m1_9points(i,3)*hprimewgll_xx(3,j) + &
-                                      C1_mxm_m2_m1_9points(i,4)*hprimewgll_xx(4,j) + &
-                                      C1_mxm_m2_m1_9points(i,5)*hprimewgll_xx(5,j) + &
-                                      C1_mxm_m2_m1_9points(i,6)*hprimewgll_xx(6,j) + &
-                                      C1_mxm_m2_m1_9points(i,7)*hprimewgll_xx(7,j) + &
-                                      C1_mxm_m2_m1_9points(i,8)*hprimewgll_xx(8,j) + &
-                                      C1_mxm_m2_m1_9points(i,9)*hprimewgll_xx(9,j)
-            E2_mxm_m2_m1_9points(i,j) = C2_mxm_m2_m1_9points(i,1)*hprimewgll_xx(1,j) + &
-                                      C2_mxm_m2_m1_9points(i,2)*hprimewgll_xx(2,j) + &
-                                      C2_mxm_m2_m1_9points(i,3)*hprimewgll_xx(3,j) + &
-                                      C2_mxm_m2_m1_9points(i,4)*hprimewgll_xx(4,j) + &
-                                      C2_mxm_m2_m1_9points(i,5)*hprimewgll_xx(5,j) + &
-                                      C2_mxm_m2_m1_9points(i,6)*hprimewgll_xx(6,j) + &
-                                      C2_mxm_m2_m1_9points(i,7)*hprimewgll_xx(7,j) + &
-                                      C2_mxm_m2_m1_9points(i,8)*hprimewgll_xx(8,j) + &
-                                      C2_mxm_m2_m1_9points(i,9)*hprimewgll_xx(9,j)
-            E3_mxm_m2_m1_9points(i,j) = C3_mxm_m2_m1_9points(i,1)*hprimewgll_xx(1,j) + &
-                                      C3_mxm_m2_m1_9points(i,2)*hprimewgll_xx(2,j) + &
-                                      C3_mxm_m2_m1_9points(i,3)*hprimewgll_xx(3,j) + &
-                                      C3_mxm_m2_m1_9points(i,4)*hprimewgll_xx(4,j) + &
-                                      C3_mxm_m2_m1_9points(i,5)*hprimewgll_xx(5,j) + &
-                                      C3_mxm_m2_m1_9points(i,6)*hprimewgll_xx(6,j) + &
-                                      C3_mxm_m2_m1_9points(i,7)*hprimewgll_xx(7,j) + &
-                                      C3_mxm_m2_m1_9points(i,8)*hprimewgll_xx(8,j) + &
-                                      C3_mxm_m2_m1_9points(i,9)*hprimewgll_xx(9,j)
-          enddo
-        enddo
-
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-
-              fac1 = wgllwgll_yz(j,k)
-              fac2 = wgllwgll_xz(i,k)
-              fac3 = wgllwgll_xy(i,j)
-
-              ! sum contributions from each element to the global mesh using indirect addressing
-              iglob = ibool(i,j,k,ispec)
-              accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - &
-                                fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k)
-              accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - &
-                                fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k)
-              accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - &
-                                fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k)
-
-              !  update memory variables based upon the Runge-Kutta scheme
-              if(ATTENUATION) then
-
-                 ! use Runge-Kutta scheme to march in time
-                 do i_sls = 1,N_SLS
-
-                    factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
-
-                    alphaval_loc = alphaval(i_sls)
-                    betaval_loc = betaval(i_sls)
-                    gammaval_loc = gammaval(i_sls)
-
-                    ! term in xx
-                    Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
-                    R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in yy
-                    Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
-                    R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in zz not computed since zero trace
-                    ! term in xy
-                    Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
-                    R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in xz
-                    Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
-                    R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in yz
-                    Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
-                    R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-
-                 enddo   ! end of loop on memory variables
-
-              endif  !  end attenuation
-
-            enddo
-          enddo
-        enddo
-
-        ! save deviatoric strain for Runge-Kutta scheme
-        if ( COMPUTE_AND_STORE_STRAIN ) then
-          epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
-          epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
-          epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
-          epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
-          epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
-        endif
-
-  enddo  ! spectral element loop
-
-end subroutine compute_forces_elastic_Dev_9p
-
-!
-!=====================================================================
-!
-
-subroutine compute_forces_elastic_Dev_10p( iphase ,NSPEC_AB,NGLOB_AB, &
-                                    displ,accel, &
-                                    xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                                    hprime_xx,hprime_xxT, &
-                                    hprimewgll_xx,hprimewgll_xxT, &
-                                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
-                                    kappastore,mustore,jacobian,ibool, &
-                                    ATTENUATION, &
-                                    one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,&
-                                    NSPEC_ATTENUATION_AB, &
-                                    R_xx,R_yy,R_xy,R_xz,R_yz, &
-                                    epsilondev_xx,epsilondev_yy,epsilondev_xy, &
-                                    epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
-                                    ANISOTROPY,NSPEC_ANISO, &
-                                    c11store,c12store,c13store,c14store,c15store,c16store,&
-                                    c22store,c23store,c24store,c25store,c26store,c33store,&
-                                    c34store,c35store,c36store,c44store,c45store,c46store,&
-                                    c55store,c56store,c66store, &
-                                    SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
-                                    NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
-                                    is_moho_top,is_moho_bot, &
-                                    dsdx_top,dsdx_bot, &
-                                    ispec2D_moho_top,ispec2D_moho_bot, &
-                                    num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
-                                    phase_ispec_inner_elastic)
-
-
-! computes elastic tensor term
-
-  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, &
-                      N_SLS,SAVE_MOHO_MESH, &
-                      ONE_THIRD,FOUR_THIRDS,m1,m2
-  implicit none
-
-  integer :: NSPEC_AB,NGLOB_AB
-
-! displacement and acceleration
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
-
-! arrays with mesh parameters per slice
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
-        kappastore,mustore,jacobian
-
-! array with derivatives of Lagrange polynomials and precalculated products
-  real(kind=CUSTOM_REAL), dimension(NGLLX,10) :: hprime_xx,hprimewgll_xxT
-  real(kind=CUSTOM_REAL), dimension(10,NGLLX) :: hprime_xxT,hprimewgll_xx
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
-  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
-
-! memory variables and standard linear solids for attenuation
-  logical :: ATTENUATION
-  logical :: COMPUTE_AND_STORE_STRAIN
-  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
-  integer :: NSPEC_ATTENUATION_AB
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
-      R_xx,R_yy,R_xy,R_xz,R_yz
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
-       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
-  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
-
-! anisotropy
-  logical :: ANISOTROPY
-  integer :: NSPEC_ANISO
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
-            c11store,c12store,c13store,c14store,c15store,c16store, &
-            c22store,c23store,c24store,c25store,c26store,c33store, &
-            c34store,c35store,c36store,c44store,c45store,c46store, &
-            c55store,c56store,c66store
-
-  integer :: iphase
-  integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
-  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
-
-! adjoint simulations
-  integer :: SIMULATION_TYPE
-  integer :: NSPEC_BOUN,NSPEC2D_MOHO
-
-  ! moho kernel
-  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
-    dsdx_top,dsdx_bot
-  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
-  integer :: ispec2D_moho_top, ispec2D_moho_bot
-
-! local parameters
-  real(kind=CUSTOM_REAL), dimension(10,10,10) :: dummyx_loc,dummyy_loc,dummyz_loc, &
-    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
-  real(kind=CUSTOM_REAL), dimension(10,10,10) :: &
-    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
-
-  ! manually inline the calls to the Deville et al. (2002) routines
-  real(kind=CUSTOM_REAL), dimension(10,100) :: B1_m1_m2_10points,B2_m1_m2_10points,B3_m1_m2_10points
-  real(kind=CUSTOM_REAL), dimension(10,100) :: C1_m1_m2_10points,C2_m1_m2_10points,C3_m1_m2_10points
-  real(kind=CUSTOM_REAL), dimension(10,100) :: E1_m1_m2_10points,E2_m1_m2_10points,E3_m1_m2_10points
-
-  equivalence(dummyx_loc,B1_m1_m2_10points)
-  equivalence(dummyy_loc,B2_m1_m2_10points)
-  equivalence(dummyz_loc,B3_m1_m2_10points)
-  equivalence(tempx1,C1_m1_m2_10points)
-  equivalence(tempy1,C2_m1_m2_10points)
-  equivalence(tempz1,C3_m1_m2_10points)
-  equivalence(newtempx1,E1_m1_m2_10points)
-  equivalence(newtempy1,E2_m1_m2_10points)
-  equivalence(newtempz1,E3_m1_m2_10points)
-
-  real(kind=CUSTOM_REAL), dimension(100,10) :: &
-    A1_mxm_m2_m1_10points,A2_mxm_m2_m1_10points,A3_mxm_m2_m1_10points
-  real(kind=CUSTOM_REAL), dimension(100,10) :: &
-    C1_mxm_m2_m1_10points,C2_mxm_m2_m1_10points,C3_mxm_m2_m1_10points
-  real(kind=CUSTOM_REAL), dimension(100,10) :: &
-    E1_mxm_m2_m1_10points,E2_mxm_m2_m1_10points,E3_mxm_m2_m1_10points
-
-  equivalence(dummyx_loc,A1_mxm_m2_m1_10points)
-  equivalence(dummyy_loc,A2_mxm_m2_m1_10points)
-  equivalence(dummyz_loc,A3_mxm_m2_m1_10points)
-  equivalence(tempx3,C1_mxm_m2_m1_10points)
-  equivalence(tempy3,C2_mxm_m2_m1_10points)
-  equivalence(tempz3,C3_mxm_m2_m1_10points)
-  equivalence(newtempx3,E1_mxm_m2_m1_10points)
-  equivalence(newtempy3,E2_mxm_m2_m1_10points)
-  equivalence(newtempz3,E3_mxm_m2_m1_10points)
-
-  ! local attenuation parameters
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
-       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
-  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
-  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
-  real(kind=CUSTOM_REAL) Sn,Snp1
-  real(kind=CUSTOM_REAL) templ
-
-  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
-  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
-
-  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
-  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
-
-  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
-
-  real(kind=CUSTOM_REAL) fac1,fac2,fac3
-
-  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
-  real(kind=CUSTOM_REAL) kappal
-
-  ! local anisotropy parameters
-  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
-                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
-
-  integer i_SLS,imodulo_N_SLS
-  integer ispec,iglob,ispec_p,num_elements
-  integer i,j,k
-
-  imodulo_N_SLS = mod(N_SLS,3)
-
-  ! choses inner/outer elements
-  if( iphase == 1 ) then
-    num_elements = nspec_outer_elastic
-  else
-    num_elements = nspec_inner_elastic
-  endif
-
-  do ispec_p = 1,num_elements
-
-        ! returns element id from stored element list
-        ispec = phase_ispec_inner_elastic(ispec_p,iphase)
-
-        ! adjoint simulations: moho kernel
-        if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then
-          if (is_moho_top(ispec)) then
-            ispec2D_moho_top = ispec2D_moho_top + 1
-          else if (is_moho_bot(ispec)) then
-            ispec2D_moho_bot = ispec2D_moho_bot + 1
-          endif
-        endif ! adjoint
-
-        ! stores displacment values in local array
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-                iglob = ibool(i,j,k,ispec)
-                dummyx_loc(i,j,k) = displ(1,iglob)
-                dummyy_loc(i,j,k) = displ(2,iglob)
-                dummyz_loc(i,j,k) = displ(3,iglob)
-            enddo
-          enddo
-        enddo
-
-    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-    ! for incompressible fluid flow, Cambridge University Press (2002),
-    ! pages 386 and 389 and Figure 8.3.1
-        ! call mxm_m1_m2_10points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
-        do j=1,m2
-          do i=1,m1
-            C1_m1_m2_10points(i,j) = hprime_xx(i,1)*B1_m1_m2_10points(1,j) + &
-                                  hprime_xx(i,2)*B1_m1_m2_10points(2,j) + &
-                                  hprime_xx(i,3)*B1_m1_m2_10points(3,j) + &
-                                  hprime_xx(i,4)*B1_m1_m2_10points(4,j) + &
-                                  hprime_xx(i,5)*B1_m1_m2_10points(5,j) + &
-                                  hprime_xx(i,6)*B1_m1_m2_10points(6,j) + &
-                                  hprime_xx(i,7)*B1_m1_m2_10points(7,j) + &
-                                  hprime_xx(i,8)*B1_m1_m2_10points(8,j) + &
-                                  hprime_xx(i,9)*B1_m1_m2_10points(9,j) + &
-                                  hprime_xx(i,10)*B1_m1_m2_10points(10,j)
-            C2_m1_m2_10points(i,j) = hprime_xx(i,1)*B2_m1_m2_10points(1,j) + &
-                                  hprime_xx(i,2)*B2_m1_m2_10points(2,j) + &
-                                  hprime_xx(i,3)*B2_m1_m2_10points(3,j) + &
-                                  hprime_xx(i,4)*B2_m1_m2_10points(4,j) + &
-                                  hprime_xx(i,5)*B2_m1_m2_10points(5,j) + &
-                                  hprime_xx(i,6)*B2_m1_m2_10points(6,j) + &
-                                  hprime_xx(i,7)*B2_m1_m2_10points(7,j) + &
-                                  hprime_xx(i,8)*B2_m1_m2_10points(8,j) + &
-                                  hprime_xx(i,9)*B2_m1_m2_10points(9,j) + &
-                                  hprime_xx(i,10)*B2_m1_m2_10points(10,j)
-            C3_m1_m2_10points(i,j) = hprime_xx(i,1)*B3_m1_m2_10points(1,j) + &
-                                  hprime_xx(i,2)*B3_m1_m2_10points(2,j) + &
-                                  hprime_xx(i,3)*B3_m1_m2_10points(3,j) + &
-                                  hprime_xx(i,4)*B3_m1_m2_10points(4,j) + &
-                                  hprime_xx(i,5)*B3_m1_m2_10points(5,j) + &
-                                  hprime_xx(i,6)*B3_m1_m2_10points(6,j) + &
-                                  hprime_xx(i,7)*B3_m1_m2_10points(7,j) + &
-                                  hprime_xx(i,8)*B3_m1_m2_10points(8,j) + &
-                                  hprime_xx(i,9)*B3_m1_m2_10points(9,j) + &
-                                  hprime_xx(i,10)*B3_m1_m2_10points(10,j)
-          enddo
-        enddo
-
-        !   call mxm_m1_m1_10points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
-        !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
-        do j=1,m1
-          do i=1,m1
-            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-            do k = 1,NGLLX
-              tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyx_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyx_loc(i,6,k)*hprime_xxT(6,j) + &
-                            dummyx_loc(i,7,k)*hprime_xxT(7,j) + &
-                            dummyx_loc(i,8,k)*hprime_xxT(8,j) + &
-                            dummyx_loc(i,9,k)*hprime_xxT(9,j) + &
-                            dummyx_loc(i,10,k)*hprime_xxT(10,j)
-              tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyy_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyy_loc(i,6,k)*hprime_xxT(6,j) + &
-                            dummyy_loc(i,7,k)*hprime_xxT(7,j) + &
-                            dummyy_loc(i,8,k)*hprime_xxT(8,j) + &
-                            dummyy_loc(i,9,k)*hprime_xxT(9,j) + &
-                            dummyy_loc(i,10,k)*hprime_xxT(10,j)
-              tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
-                            dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
-                            dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
-                            dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
-                            dummyz_loc(i,5,k)*hprime_xxT(5,j) + &
-                            dummyz_loc(i,6,k)*hprime_xxT(6,j) + &
-                            dummyz_loc(i,7,k)*hprime_xxT(7,j) + &
-                            dummyz_loc(i,8,k)*hprime_xxT(8,j) + &
-                            dummyz_loc(i,9,k)*hprime_xxT(9,j) + &
-                            dummyz_loc(i,10,k)*hprime_xxT(10,j)
-            enddo
-          enddo
-        enddo
-
-        ! call mxm_m2_m1_10points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
-        do j=1,m1
-          do i=1,m2
-            C1_mxm_m2_m1_10points(i,j) = A1_mxm_m2_m1_10points(i,1)*hprime_xxT(1,j) + &
-                                      A1_mxm_m2_m1_10points(i,2)*hprime_xxT(2,j) + &
-                                      A1_mxm_m2_m1_10points(i,3)*hprime_xxT(3,j) + &
-                                      A1_mxm_m2_m1_10points(i,4)*hprime_xxT(4,j) + &
-                                      A1_mxm_m2_m1_10points(i,5)*hprime_xxT(5,j) + &
-                                      A1_mxm_m2_m1_10points(i,6)*hprime_xxT(6,j) + &
-                                      A1_mxm_m2_m1_10points(i,7)*hprime_xxT(7,j) + &
-                                      A1_mxm_m2_m1_10points(i,8)*hprime_xxT(8,j) + &
-                                      A1_mxm_m2_m1_10points(i,9)*hprime_xxT(9,j) + &
-                                      A1_mxm_m2_m1_10points(i,10)*hprime_xxT(10,j)
-            C2_mxm_m2_m1_10points(i,j) = A2_mxm_m2_m1_10points(i,1)*hprime_xxT(1,j) + &
-                                      A2_mxm_m2_m1_10points(i,2)*hprime_xxT(2,j) + &
-                                      A2_mxm_m2_m1_10points(i,3)*hprime_xxT(3,j) + &
-                                      A2_mxm_m2_m1_10points(i,4)*hprime_xxT(4,j) + &
-                                      A2_mxm_m2_m1_10points(i,5)*hprime_xxT(5,j) + &
-                                      A2_mxm_m2_m1_10points(i,6)*hprime_xxT(6,j) + &
-                                      A2_mxm_m2_m1_10points(i,7)*hprime_xxT(7,j) + &
-                                      A2_mxm_m2_m1_10points(i,8)*hprime_xxT(8,j) + &
-                                      A2_mxm_m2_m1_10points(i,9)*hprime_xxT(9,j) + &
-                                      A2_mxm_m2_m1_10points(i,10)*hprime_xxT(10,j)
-            C3_mxm_m2_m1_10points(i,j) = A3_mxm_m2_m1_10points(i,1)*hprime_xxT(1,j) + &
-                                      A3_mxm_m2_m1_10points(i,2)*hprime_xxT(2,j) + &
-                                      A3_mxm_m2_m1_10points(i,3)*hprime_xxT(3,j) + &
-                                      A3_mxm_m2_m1_10points(i,4)*hprime_xxT(4,j) + &
-                                      A3_mxm_m2_m1_10points(i,5)*hprime_xxT(5,j) + &
-                                      A3_mxm_m2_m1_10points(i,6)*hprime_xxT(6,j) + &
-                                      A3_mxm_m2_m1_10points(i,7)*hprime_xxT(7,j) + &
-                                      A3_mxm_m2_m1_10points(i,8)*hprime_xxT(8,j) + &
-                                      A3_mxm_m2_m1_10points(i,9)*hprime_xxT(9,j) + &
-                                      A3_mxm_m2_m1_10points(i,10)*hprime_xxT(10,j)
-          enddo
-        enddo
-
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-              ! get derivatives of ux, uy and uz with respect to x, y and z
-              xixl = xix(i,j,k,ispec)
-              xiyl = xiy(i,j,k,ispec)
-              xizl = xiz(i,j,k,ispec)
-              etaxl = etax(i,j,k,ispec)
-              etayl = etay(i,j,k,ispec)
-              etazl = etaz(i,j,k,ispec)
-              gammaxl = gammax(i,j,k,ispec)
-              gammayl = gammay(i,j,k,ispec)
-              gammazl = gammaz(i,j,k,ispec)
-              jacobianl = jacobian(i,j,k,ispec)
-
-              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
-              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
-              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
-
-              duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
-              duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
-              duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
-
-              duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
-              duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
-              duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
-
-              ! save strain on the Moho boundary
-              if (SAVE_MOHO_MESH ) then
-                if (is_moho_top(ispec)) then
-                  dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
-                  dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
-                  dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
-                  dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
-                  dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
-                  dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
-                  dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
-                  dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
-                  dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
-                else if (is_moho_bot(ispec)) then
-                  dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
-                  dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
-                  dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
-                  dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
-                  dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
-                  dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
-                  dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
-                  dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
-                  dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
-                endif
-              endif
-
-              ! precompute some sums to save CPU time
-              duxdxl_plus_duydyl = duxdxl + duydyl
-              duxdxl_plus_duzdzl = duxdxl + duzdzl
-              duydyl_plus_duzdzl = duydyl + duzdzl
-              duxdyl_plus_duydxl = duxdyl + duydxl
-              duzdxl_plus_duxdzl = duzdxl + duxdzl
-              duzdyl_plus_duydzl = duzdyl + duydzl
-
-              ! computes deviatoric strain attenuation and/or for kernel calculations
-              if (COMPUTE_AND_STORE_STRAIN) then
-                templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
-                if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
-                epsilondev_xx_loc(i,j,k) = duxdxl - templ
-                epsilondev_yy_loc(i,j,k) = duydyl - templ
-                epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
-                epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
-                epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
-              endif
-
-              kappal = kappastore(i,j,k,ispec)
-              mul = mustore(i,j,k,ispec)
-
-              ! attenuation
-              if(ATTENUATION) then
-                ! use unrelaxed parameters if attenuation
-                mul  = mul * one_minus_sum_beta(i,j,k,ispec)
-              endif
-
-  ! full anisotropic case, stress calculations
-              if(ANISOTROPY) then
-                c11 = c11store(i,j,k,ispec)
-                c12 = c12store(i,j,k,ispec)
-                c13 = c13store(i,j,k,ispec)
-                c14 = c14store(i,j,k,ispec)
-                c15 = c15store(i,j,k,ispec)
-                c16 = c16store(i,j,k,ispec)
-                c22 = c22store(i,j,k,ispec)
-                c23 = c23store(i,j,k,ispec)
-                c24 = c24store(i,j,k,ispec)
-                c25 = c25store(i,j,k,ispec)
-                c26 = c26store(i,j,k,ispec)
-                c33 = c33store(i,j,k,ispec)
-                c34 = c34store(i,j,k,ispec)
-                c35 = c35store(i,j,k,ispec)
-                c36 = c36store(i,j,k,ispec)
-                c44 = c44store(i,j,k,ispec)
-                c45 = c45store(i,j,k,ispec)
-                c46 = c46store(i,j,k,ispec)
-                c55 = c55store(i,j,k,ispec)
-                c56 = c56store(i,j,k,ispec)
-                c66 = c66store(i,j,k,ispec)
-
-                sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
-                          c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
-                sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
-                          c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
-                sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
-                          c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
-                sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
-                          c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
-                sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
-                          c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
-                sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
-                          c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
-
-              else
-
-  ! isotropic case
-                lambdalplus2mul = kappal + FOUR_THIRDS * mul
-                lambdal = lambdalplus2mul - 2.*mul
-
-                ! compute stress sigma
-                sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
-                sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
-                sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
-
-                sigma_xy = mul*duxdyl_plus_duydxl
-                sigma_xz = mul*duzdxl_plus_duxdzl
-                sigma_yz = mul*duzdyl_plus_duydzl
-
-              endif ! ANISOTROPY
-
-              ! subtract memory variables if attenuation
-              if(ATTENUATION) then
-! way 1
-!                do i_sls = 1,N_SLS
-!                  R_xx_val = R_xx(i,j,k,ispec,i_sls)
-!                  R_yy_val = R_yy(i,j,k,ispec,i_sls)
-!                  sigma_xx = sigma_xx - R_xx_val
-!                  sigma_yy = sigma_yy - R_yy_val
-!                  sigma_zz = sigma_zz + R_xx_val + R_yy_val
-!                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-!                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-!                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-!                enddo
-
-! way 2
-! note: this should help compilers to pipeline the code and make better use of the cache;
-!          depending on compilers, it can further decrease the computation time by ~ 30%.
-!          by default, N_SLS = 3, therefore we take steps of 3
-              if(imodulo_N_SLS >= 1) then
-                do i_sls = 1,imodulo_N_SLS
-                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
-                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
-                  sigma_xx = sigma_xx - R_xx_val1
-                  sigma_yy = sigma_yy - R_yy_val1
-                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-                enddo
-              endif
-
-              if(N_SLS >= imodulo_N_SLS+1) then
-                do i_sls = imodulo_N_SLS+1,N_SLS,3
-                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
-                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
-                  sigma_xx = sigma_xx - R_xx_val1
-                  sigma_yy = sigma_yy - R_yy_val1
-                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
-
-                  R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1)
-                  R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1)
-                  sigma_xx = sigma_xx - R_xx_val2
-                  sigma_yy = sigma_yy - R_yy_val2
-                  sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1)
-
-                  R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2)
-                  R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2)
-                  sigma_xx = sigma_xx - R_xx_val3
-                  sigma_yy = sigma_yy - R_yy_val3
-                  sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
-                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2)
-                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2)
-                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2)
-                enddo
-              endif
-
-
-              endif
-
-            ! define symmetric components of sigma
-            sigma_yx = sigma_xy
-            sigma_zx = sigma_xz
-            sigma_zy = sigma_yz
-
-            ! form dot product with test vector, non-symmetric form (which is useful in the case of PML)
-            tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
-            tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
-            tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
-
-            tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
-            tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
-            tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
-
-            tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
-            tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
-            tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
-
-            enddo
-          enddo
-        enddo
-
-    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-    ! for incompressible fluid flow, Cambridge University Press (2002),
-    ! pages 386 and 389 and Figure 8.3.1
-        ! call mxm_m1_m2_10points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
-        do j=1,m2
-          do i=1,m1
-            E1_m1_m2_10points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_10points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C1_m1_m2_10points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C1_m1_m2_10points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C1_m1_m2_10points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C1_m1_m2_10points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C1_m1_m2_10points(6,j) + &
-                                  hprimewgll_xxT(i,7)*C1_m1_m2_10points(7,j) + &
-                                  hprimewgll_xxT(i,8)*C1_m1_m2_10points(8,j) + &
-                                  hprimewgll_xxT(i,9)*C1_m1_m2_10points(9,j) + &
-                                  hprimewgll_xxT(i,10)*C1_m1_m2_10points(10,j)
-            E2_m1_m2_10points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_10points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C2_m1_m2_10points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C2_m1_m2_10points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C2_m1_m2_10points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C2_m1_m2_10points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C2_m1_m2_10points(6,j) + &
-                                  hprimewgll_xxT(i,7)*C2_m1_m2_10points(7,j) + &
-                                  hprimewgll_xxT(i,8)*C2_m1_m2_10points(8,j) + &
-                                  hprimewgll_xxT(i,9)*C2_m1_m2_10points(9,j) + &
-                                  hprimewgll_xxT(i,10)*C2_m1_m2_10points(10,j)
-            E3_m1_m2_10points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_10points(1,j) + &
-                                  hprimewgll_xxT(i,2)*C3_m1_m2_10points(2,j) + &
-                                  hprimewgll_xxT(i,3)*C3_m1_m2_10points(3,j) + &
-                                  hprimewgll_xxT(i,4)*C3_m1_m2_10points(4,j) + &
-                                  hprimewgll_xxT(i,5)*C3_m1_m2_10points(5,j) + &
-                                  hprimewgll_xxT(i,6)*C3_m1_m2_10points(6,j) + &
-                                  hprimewgll_xxT(i,7)*C3_m1_m2_10points(7,j) + &
-                                  hprimewgll_xxT(i,8)*C3_m1_m2_10points(8,j) + &
-                                  hprimewgll_xxT(i,9)*C3_m1_m2_10points(9,j) + &
-                                  hprimewgll_xxT(i,10)*C3_m1_m2_10points(10,j)
-          enddo
-        enddo
-
-        !   call mxm_m1_m1_10points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
-        !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
-        do i=1,m1
-          do j=1,m1
-            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
-            do k = 1,NGLLX
-              newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempx2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempx2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempx2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempx2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempx2(i,6,k)*hprimewgll_xx(6,j) + &
-                               tempx2(i,7,k)*hprimewgll_xx(7,j) + &
-                               tempx2(i,8,k)*hprimewgll_xx(8,j) + &
-                               tempx2(i,9,k)*hprimewgll_xx(9,j) + &
-                               tempx2(i,10,k)*hprimewgll_xx(10,j)
-              newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempy2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempy2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempy2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempy2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempy2(i,6,k)*hprimewgll_xx(6,j) + &
-                               tempy2(i,7,k)*hprimewgll_xx(7,j) + &
-                               tempy2(i,8,k)*hprimewgll_xx(8,j) + &
-                               tempy2(i,9,k)*hprimewgll_xx(9,j) + &
-                               tempy2(i,10,k)*hprimewgll_xx(10,j)
-              newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
-                               tempz2(i,2,k)*hprimewgll_xx(2,j) + &
-                               tempz2(i,3,k)*hprimewgll_xx(3,j) + &
-                               tempz2(i,4,k)*hprimewgll_xx(4,j) + &
-                               tempz2(i,5,k)*hprimewgll_xx(5,j) + &
-                               tempz2(i,6,k)*hprimewgll_xx(6,j) + &
-                               tempz2(i,7,k)*hprimewgll_xx(7,j) + &
-                               tempz2(i,8,k)*hprimewgll_xx(8,j) + &
-                               tempz2(i,9,k)*hprimewgll_xx(9,j) + &
-                               tempz2(i,10,k)*hprimewgll_xx(10,j)
-            enddo
-          enddo
-        enddo
-
-        ! call mxm_m2_m1_10points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
-        do j=1,m1
-          do i=1,m2
-            E1_mxm_m2_m1_10points(i,j) = C1_mxm_m2_m1_10points(i,1)*hprimewgll_xx(1,j) + &
-                                      C1_mxm_m2_m1_10points(i,2)*hprimewgll_xx(2,j) + &
-                                      C1_mxm_m2_m1_10points(i,3)*hprimewgll_xx(3,j) + &
-                                      C1_mxm_m2_m1_10points(i,4)*hprimewgll_xx(4,j) + &
-                                      C1_mxm_m2_m1_10points(i,5)*hprimewgll_xx(5,j) + &
-                                      C1_mxm_m2_m1_10points(i,6)*hprimewgll_xx(6,j) + &
-                                      C1_mxm_m2_m1_10points(i,7)*hprimewgll_xx(7,j) + &
-                                      C1_mxm_m2_m1_10points(i,8)*hprimewgll_xx(8,j) + &
-                                      C1_mxm_m2_m1_10points(i,9)*hprimewgll_xx(9,j) + &
-                                      C1_mxm_m2_m1_10points(i,10)*hprimewgll_xx(10,j)
-            E2_mxm_m2_m1_10points(i,j) = C2_mxm_m2_m1_10points(i,1)*hprimewgll_xx(1,j) + &
-                                      C2_mxm_m2_m1_10points(i,2)*hprimewgll_xx(2,j) + &
-                                      C2_mxm_m2_m1_10points(i,3)*hprimewgll_xx(3,j) + &
-                                      C2_mxm_m2_m1_10points(i,4)*hprimewgll_xx(4,j) + &
-                                      C2_mxm_m2_m1_10points(i,5)*hprimewgll_xx(5,j) + &
-                                      C2_mxm_m2_m1_10points(i,6)*hprimewgll_xx(6,j) + &
-                                      C2_mxm_m2_m1_10points(i,7)*hprimewgll_xx(7,j) + &
-                                      C2_mxm_m2_m1_10points(i,8)*hprimewgll_xx(8,j) + &
-                                      C2_mxm_m2_m1_10points(i,9)*hprimewgll_xx(9,j) + &
-                                      C2_mxm_m2_m1_10points(i,10)*hprimewgll_xx(10,j)
-            E3_mxm_m2_m1_10points(i,j) = C3_mxm_m2_m1_10points(i,1)*hprimewgll_xx(1,j) + &
-                                      C3_mxm_m2_m1_10points(i,2)*hprimewgll_xx(2,j) + &
-                                      C3_mxm_m2_m1_10points(i,3)*hprimewgll_xx(3,j) + &
-                                      C3_mxm_m2_m1_10points(i,4)*hprimewgll_xx(4,j) + &
-                                      C3_mxm_m2_m1_10points(i,5)*hprimewgll_xx(5,j) + &
-                                      C3_mxm_m2_m1_10points(i,6)*hprimewgll_xx(6,j) + &
-                                      C3_mxm_m2_m1_10points(i,7)*hprimewgll_xx(7,j) + &
-                                      C3_mxm_m2_m1_10points(i,8)*hprimewgll_xx(8,j) + &
-                                      C3_mxm_m2_m1_10points(i,9)*hprimewgll_xx(9,j) + &
-                                      C3_mxm_m2_m1_10points(i,10)*hprimewgll_xx(10,j)
-          enddo
-        enddo
-
-        do k=1,NGLLZ
-          do j=1,NGLLY
-            do i=1,NGLLX
-
-              fac1 = wgllwgll_yz(j,k)
-              fac2 = wgllwgll_xz(i,k)
-              fac3 = wgllwgll_xy(i,j)
-
-              ! sum contributions from each element to the global mesh using indirect addressing
-              iglob = ibool(i,j,k,ispec)
-              accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - &
-                                fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k)
-              accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - &
-                                fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k)
-              accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - &
-                                fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k)
-
-              !  update memory variables based upon the Runge-Kutta scheme
-              if(ATTENUATION) then
-
-                 ! use Runge-Kutta scheme to march in time
-                 do i_sls = 1,N_SLS
-
-                    factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
-
-                    alphaval_loc = alphaval(i_sls)
-                    betaval_loc = betaval(i_sls)
-                    gammaval_loc = gammaval(i_sls)
-
-                    ! term in xx
-                    Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
-                    R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in yy
-                    Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
-                    R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in zz not computed since zero trace
-                    ! term in xy
-                    Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
-                    R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in xz
-                    Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
-                    R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-                    ! term in yz
-                    Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
-                    Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
-                    R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
-                                      betaval_loc * Sn + gammaval_loc * Snp1
-
-                 enddo   ! end of loop on memory variables
-
-              endif  !  end attenuation
-
-            enddo
-          enddo
-        enddo
-
-        ! save deviatoric strain for Runge-Kutta scheme
-        if ( COMPUTE_AND_STORE_STRAIN ) then
-          epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
-          epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
-          epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
-          epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
-          epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
-        endif
-
-  enddo  ! spectral element loop
-
-end subroutine compute_forces_elastic_Dev_10p

Added: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic_Dev2.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic_Dev2.f90	                        (rev 0)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic_Dev2.f90	2012-06-19 22:23:01 UTC (rev 20392)
@@ -0,0 +1,3477 @@
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  2 . 0
+!               ---------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Princeton University, USA and University of Pau / CNRS / INRIA
+! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+!                            April 2011
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+
+! higher-order Deville routines (NGLL == 6 to NGLL == 10 )
+!
+! note: put these routines into this extra file to avoid reaching internal threshold 
+!          for vectorizations when compiling
+
+subroutine compute_forces_elastic_Dev_6p( iphase ,NSPEC_AB,NGLOB_AB, &
+                                    displ,accel, &
+                                    xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                                    hprime_xx,hprime_xxT, &
+                                    hprimewgll_xx,hprimewgll_xxT, &
+                                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                                    kappastore,mustore,jacobian,ibool, &
+                                    ATTENUATION, &
+                                    one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,&
+                                    NSPEC_ATTENUATION_AB, &
+                                    R_xx,R_yy,R_xy,R_xz,R_yz, &
+                                    epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+                                    epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
+                                    ANISOTROPY,NSPEC_ANISO, &
+                                    c11store,c12store,c13store,c14store,c15store,c16store,&
+                                    c22store,c23store,c24store,c25store,c26store,c33store,&
+                                    c34store,c35store,c36store,c44store,c45store,c46store,&
+                                    c55store,c56store,c66store, &
+                                    SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
+                                    NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
+                                    is_moho_top,is_moho_bot, &
+                                    dsdx_top,dsdx_bot, &
+                                    ispec2D_moho_top,ispec2D_moho_bot, &
+                                    num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
+                                    phase_ispec_inner_elastic)
+
+
+! computes elastic tensor term
+
+  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, &
+                      N_SLS,SAVE_MOHO_MESH, &
+                      ONE_THIRD,FOUR_THIRDS,m1,m2
+  implicit none
+
+  integer :: NSPEC_AB,NGLOB_AB
+
+! displacement and acceleration
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
+
+! arrays with mesh parameters per slice
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        kappastore,mustore,jacobian
+
+! array with derivatives of Lagrange polynomials and precalculated products
+  real(kind=CUSTOM_REAL), dimension(NGLLX,6) :: hprime_xx,hprimewgll_xxT
+  real(kind=CUSTOM_REAL), dimension(6,NGLLX) :: hprime_xxT,hprimewgll_xx
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
+
+! memory variables and standard linear solids for attenuation
+  logical :: ATTENUATION
+  logical :: COMPUTE_AND_STORE_STRAIN
+  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
+  integer :: NSPEC_ATTENUATION_AB
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
+  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
+      R_xx,R_yy,R_xy,R_xz,R_yz
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
+       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
+  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
+
+! anisotropy
+  logical :: ANISOTROPY
+  integer :: NSPEC_ANISO
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
+            c11store,c12store,c13store,c14store,c15store,c16store, &
+            c22store,c23store,c24store,c25store,c26store,c33store, &
+            c34store,c35store,c36store,c44store,c45store,c46store, &
+            c55store,c56store,c66store
+
+  integer :: iphase
+  integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
+  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
+
+! adjoint simulations
+  integer :: SIMULATION_TYPE
+  integer :: NSPEC_BOUN,NSPEC2D_MOHO
+
+  ! moho kernel
+  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
+    dsdx_top,dsdx_bot
+  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
+  integer :: ispec2D_moho_top, ispec2D_moho_bot
+
+! local parameters
+  real(kind=CUSTOM_REAL), dimension(6,6,6) :: dummyx_loc,dummyy_loc,dummyz_loc, &
+    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
+  real(kind=CUSTOM_REAL), dimension(6,6,6) :: &
+    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
+
+  ! manually inline the calls to the Deville et al. (2002) routines
+  real(kind=CUSTOM_REAL), dimension(6,36) :: B1_m1_m2_6points,B2_m1_m2_6points,B3_m1_m2_6points
+  real(kind=CUSTOM_REAL), dimension(6,36) :: C1_m1_m2_6points,C2_m1_m2_6points,C3_m1_m2_6points
+  real(kind=CUSTOM_REAL), dimension(6,36) :: E1_m1_m2_6points,E2_m1_m2_6points,E3_m1_m2_6points
+
+  equivalence(dummyx_loc,B1_m1_m2_6points)
+  equivalence(dummyy_loc,B2_m1_m2_6points)
+  equivalence(dummyz_loc,B3_m1_m2_6points)
+  equivalence(tempx1,C1_m1_m2_6points)
+  equivalence(tempy1,C2_m1_m2_6points)
+  equivalence(tempz1,C3_m1_m2_6points)
+  equivalence(newtempx1,E1_m1_m2_6points)
+  equivalence(newtempy1,E2_m1_m2_6points)
+  equivalence(newtempz1,E3_m1_m2_6points)
+
+  real(kind=CUSTOM_REAL), dimension(36,6) :: &
+    A1_mxm_m2_m1_6points,A2_mxm_m2_m1_6points,A3_mxm_m2_m1_6points
+  real(kind=CUSTOM_REAL), dimension(36,6) :: &
+    C1_mxm_m2_m1_6points,C2_mxm_m2_m1_6points,C3_mxm_m2_m1_6points
+  real(kind=CUSTOM_REAL), dimension(36,6) :: &
+    E1_mxm_m2_m1_6points,E2_mxm_m2_m1_6points,E3_mxm_m2_m1_6points
+
+  equivalence(dummyx_loc,A1_mxm_m2_m1_6points)
+  equivalence(dummyy_loc,A2_mxm_m2_m1_6points)
+  equivalence(dummyz_loc,A3_mxm_m2_m1_6points)
+  equivalence(tempx3,C1_mxm_m2_m1_6points)
+  equivalence(tempy3,C2_mxm_m2_m1_6points)
+  equivalence(tempz3,C3_mxm_m2_m1_6points)
+  equivalence(newtempx3,E1_mxm_m2_m1_6points)
+  equivalence(newtempy3,E2_mxm_m2_m1_6points)
+  equivalence(newtempz3,E3_mxm_m2_m1_6points)
+
+  ! local attenuation parameters
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
+       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
+  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
+  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
+  real(kind=CUSTOM_REAL) Sn,Snp1
+  real(kind=CUSTOM_REAL) templ
+
+  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
+
+  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
+  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
+
+  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
+
+  real(kind=CUSTOM_REAL) fac1,fac2,fac3
+
+  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
+  real(kind=CUSTOM_REAL) kappal
+
+  ! local anisotropy parameters
+  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
+                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
+
+  integer i_SLS,imodulo_N_SLS
+  integer ispec,iglob,ispec_p,num_elements
+  integer i,j,k
+
+  imodulo_N_SLS = mod(N_SLS,3)
+
+  ! choses inner/outer elements
+  if( iphase == 1 ) then
+    num_elements = nspec_outer_elastic
+  else
+    num_elements = nspec_inner_elastic
+  endif
+
+  do ispec_p = 1,num_elements
+
+        ! returns element id from stored element list
+        ispec = phase_ispec_inner_elastic(ispec_p,iphase)
+
+        ! adjoint simulations: moho kernel
+        if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then
+          if (is_moho_top(ispec)) then
+            ispec2D_moho_top = ispec2D_moho_top + 1
+          else if (is_moho_bot(ispec)) then
+            ispec2D_moho_bot = ispec2D_moho_bot + 1
+          endif
+        endif ! adjoint
+
+        ! stores displacment values in local array
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+                iglob = ibool(i,j,k,ispec)
+                dummyx_loc(i,j,k) = displ(1,iglob)
+                dummyy_loc(i,j,k) = displ(2,iglob)
+                dummyz_loc(i,j,k) = displ(3,iglob)
+            enddo
+          enddo
+        enddo
+
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+        ! call mxm_m1_m2_6points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
+        do j=1,m2
+          do i=1,m1
+            C1_m1_m2_6points(i,j) = hprime_xx(i,1)*B1_m1_m2_6points(1,j) + &
+                                  hprime_xx(i,2)*B1_m1_m2_6points(2,j) + &
+                                  hprime_xx(i,3)*B1_m1_m2_6points(3,j) + &
+                                  hprime_xx(i,4)*B1_m1_m2_6points(4,j) + &
+                                  hprime_xx(i,5)*B1_m1_m2_6points(5,j) + &
+                                  hprime_xx(i,6)*B1_m1_m2_6points(6,j)
+            C2_m1_m2_6points(i,j) = hprime_xx(i,1)*B2_m1_m2_6points(1,j) + &
+                                  hprime_xx(i,2)*B2_m1_m2_6points(2,j) + &
+                                  hprime_xx(i,3)*B2_m1_m2_6points(3,j) + &
+                                  hprime_xx(i,4)*B2_m1_m2_6points(4,j) + &
+                                  hprime_xx(i,5)*B2_m1_m2_6points(5,j) + &
+                                  hprime_xx(i,6)*B2_m1_m2_6points(6,j)
+            C3_m1_m2_6points(i,j) = hprime_xx(i,1)*B3_m1_m2_6points(1,j) + &
+                                  hprime_xx(i,2)*B3_m1_m2_6points(2,j) + &
+                                  hprime_xx(i,3)*B3_m1_m2_6points(3,j) + &
+                                  hprime_xx(i,4)*B3_m1_m2_6points(4,j) + &
+                                  hprime_xx(i,5)*B3_m1_m2_6points(5,j) + &
+                                  hprime_xx(i,6)*B3_m1_m2_6points(6,j)
+          enddo
+        enddo
+
+        !   call mxm_m1_m1_6points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
+        !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
+        do j=1,m1
+          do i=1,m1
+            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+            do k = 1,NGLLX
+              tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyx_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyx_loc(i,6,k)*hprime_xxT(6,j)
+              tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyy_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyy_loc(i,6,k)*hprime_xxT(6,j)
+              tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyz_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyz_loc(i,6,k)*hprime_xxT(6,j)
+            enddo
+          enddo
+        enddo
+
+        ! call mxm_m2_m1_6points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
+        do j=1,m1
+          do i=1,m2
+            C1_mxm_m2_m1_6points(i,j) = A1_mxm_m2_m1_6points(i,1)*hprime_xxT(1,j) + &
+                                      A1_mxm_m2_m1_6points(i,2)*hprime_xxT(2,j) + &
+                                      A1_mxm_m2_m1_6points(i,3)*hprime_xxT(3,j) + &
+                                      A1_mxm_m2_m1_6points(i,4)*hprime_xxT(4,j) + &
+                                      A1_mxm_m2_m1_6points(i,5)*hprime_xxT(5,j) + &
+                                      A1_mxm_m2_m1_6points(i,6)*hprime_xxT(6,j)
+            C2_mxm_m2_m1_6points(i,j) = A2_mxm_m2_m1_6points(i,1)*hprime_xxT(1,j) + &
+                                      A2_mxm_m2_m1_6points(i,2)*hprime_xxT(2,j) + &
+                                      A2_mxm_m2_m1_6points(i,3)*hprime_xxT(3,j) + &
+                                      A2_mxm_m2_m1_6points(i,4)*hprime_xxT(4,j) + &
+                                      A2_mxm_m2_m1_6points(i,5)*hprime_xxT(5,j) + &
+                                      A2_mxm_m2_m1_6points(i,6)*hprime_xxT(6,j)
+            C3_mxm_m2_m1_6points(i,j) = A3_mxm_m2_m1_6points(i,1)*hprime_xxT(1,j) + &
+                                      A3_mxm_m2_m1_6points(i,2)*hprime_xxT(2,j) + &
+                                      A3_mxm_m2_m1_6points(i,3)*hprime_xxT(3,j) + &
+                                      A3_mxm_m2_m1_6points(i,4)*hprime_xxT(4,j) + &
+                                      A3_mxm_m2_m1_6points(i,5)*hprime_xxT(5,j) + &
+                                      A3_mxm_m2_m1_6points(i,6)*hprime_xxT(6,j)
+          enddo
+        enddo
+
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+              ! get derivatives of ux, uy and uz with respect to x, y and z
+              xixl = xix(i,j,k,ispec)
+              xiyl = xiy(i,j,k,ispec)
+              xizl = xiz(i,j,k,ispec)
+              etaxl = etax(i,j,k,ispec)
+              etayl = etay(i,j,k,ispec)
+              etazl = etaz(i,j,k,ispec)
+              gammaxl = gammax(i,j,k,ispec)
+              gammayl = gammay(i,j,k,ispec)
+              gammazl = gammaz(i,j,k,ispec)
+              jacobianl = jacobian(i,j,k,ispec)
+
+              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
+              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
+              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
+
+              duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
+              duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
+              duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
+
+              duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
+              duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
+              duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
+
+              ! save strain on the Moho boundary
+              if (SAVE_MOHO_MESH ) then
+                if (is_moho_top(ispec)) then
+                  dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
+                  dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
+                  dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
+                  dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
+                  dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
+                  dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
+                  dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
+                  dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
+                  dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
+                else if (is_moho_bot(ispec)) then
+                  dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
+                  dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
+                  dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
+                  dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
+                  dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
+                  dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
+                  dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
+                  dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
+                  dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
+                endif
+              endif
+
+              ! precompute some sums to save CPU time
+              duxdxl_plus_duydyl = duxdxl + duydyl
+              duxdxl_plus_duzdzl = duxdxl + duzdzl
+              duydyl_plus_duzdzl = duydyl + duzdzl
+              duxdyl_plus_duydxl = duxdyl + duydxl
+              duzdxl_plus_duxdzl = duzdxl + duxdzl
+              duzdyl_plus_duydzl = duzdyl + duydzl
+
+              ! computes deviatoric strain attenuation and/or for kernel calculations
+              if (COMPUTE_AND_STORE_STRAIN) then
+                templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
+                if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
+                epsilondev_xx_loc(i,j,k) = duxdxl - templ
+                epsilondev_yy_loc(i,j,k) = duydyl - templ
+                epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
+                epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
+                epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
+              endif
+
+              kappal = kappastore(i,j,k,ispec)
+              mul = mustore(i,j,k,ispec)
+
+              ! attenuation
+              if(ATTENUATION) then
+                ! use unrelaxed parameters if attenuation
+                mul  = mul * one_minus_sum_beta(i,j,k,ispec)
+              endif
+
+  ! full anisotropic case, stress calculations
+              if(ANISOTROPY) then
+                c11 = c11store(i,j,k,ispec)
+                c12 = c12store(i,j,k,ispec)
+                c13 = c13store(i,j,k,ispec)
+                c14 = c14store(i,j,k,ispec)
+                c15 = c15store(i,j,k,ispec)
+                c16 = c16store(i,j,k,ispec)
+                c22 = c22store(i,j,k,ispec)
+                c23 = c23store(i,j,k,ispec)
+                c24 = c24store(i,j,k,ispec)
+                c25 = c25store(i,j,k,ispec)
+                c26 = c26store(i,j,k,ispec)
+                c33 = c33store(i,j,k,ispec)
+                c34 = c34store(i,j,k,ispec)
+                c35 = c35store(i,j,k,ispec)
+                c36 = c36store(i,j,k,ispec)
+                c44 = c44store(i,j,k,ispec)
+                c45 = c45store(i,j,k,ispec)
+                c46 = c46store(i,j,k,ispec)
+                c55 = c55store(i,j,k,ispec)
+                c56 = c56store(i,j,k,ispec)
+                c66 = c66store(i,j,k,ispec)
+
+                sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
+                          c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
+                sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
+                          c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
+                sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
+                          c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
+                sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
+                          c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
+                sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
+                          c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
+                sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
+                          c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
+
+              else
+
+  ! isotropic case
+                lambdalplus2mul = kappal + FOUR_THIRDS * mul
+                lambdal = lambdalplus2mul - 2.*mul
+
+                ! compute stress sigma
+                sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
+                sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
+                sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
+
+                sigma_xy = mul*duxdyl_plus_duydxl
+                sigma_xz = mul*duzdxl_plus_duxdzl
+                sigma_yz = mul*duzdyl_plus_duydzl
+
+              endif ! ANISOTROPY
+
+              ! subtract memory variables if attenuation
+              if(ATTENUATION) then
+! way 1
+!                do i_sls = 1,N_SLS
+!                  R_xx_val = R_xx(i,j,k,ispec,i_sls)
+!                  R_yy_val = R_yy(i,j,k,ispec,i_sls)
+!                  sigma_xx = sigma_xx - R_xx_val
+!                  sigma_yy = sigma_yy - R_yy_val
+!                  sigma_zz = sigma_zz + R_xx_val + R_yy_val
+!                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+!                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+!                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+!                enddo
+
+! way 2
+! note: this should help compilers to pipeline the code and make better use of the cache;
+!          depending on compilers, it can further decrease the computation time by ~ 30%.
+!          by default, N_SLS = 3, therefore we take steps of 3
+              if(imodulo_N_SLS >= 1) then
+                do i_sls = 1,imodulo_N_SLS
+                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                  sigma_xx = sigma_xx - R_xx_val1
+                  sigma_yy = sigma_yy - R_yy_val1
+                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+                enddo
+              endif
+
+              if(N_SLS >= imodulo_N_SLS+1) then
+                do i_sls = imodulo_N_SLS+1,N_SLS,3
+                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                  sigma_xx = sigma_xx - R_xx_val1
+                  sigma_yy = sigma_yy - R_yy_val1
+                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+
+                  R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1)
+                  R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1)
+                  sigma_xx = sigma_xx - R_xx_val2
+                  sigma_yy = sigma_yy - R_yy_val2
+                  sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1)
+
+                  R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2)
+                  R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2)
+                  sigma_xx = sigma_xx - R_xx_val3
+                  sigma_yy = sigma_yy - R_yy_val3
+                  sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2)
+                enddo
+              endif
+
+
+              endif
+
+            ! define symmetric components of sigma
+            sigma_yx = sigma_xy
+            sigma_zx = sigma_xz
+            sigma_zy = sigma_yz
+
+            ! form dot product with test vector, non-symmetric form (which is useful in the case of PML)
+            tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
+            tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
+            tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
+
+            tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
+            tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
+            tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
+
+            tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
+            tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
+            tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
+
+            enddo
+          enddo
+        enddo
+
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+        ! call mxm_m1_m2_6points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
+        do j=1,m2
+          do i=1,m1
+            E1_m1_m2_6points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_6points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C1_m1_m2_6points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C1_m1_m2_6points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C1_m1_m2_6points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C1_m1_m2_6points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C1_m1_m2_6points(6,j)
+            E2_m1_m2_6points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_6points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C2_m1_m2_6points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C2_m1_m2_6points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C2_m1_m2_6points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C2_m1_m2_6points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C2_m1_m2_6points(6,j)
+            E3_m1_m2_6points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_6points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C3_m1_m2_6points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C3_m1_m2_6points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C3_m1_m2_6points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C3_m1_m2_6points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C3_m1_m2_6points(6,j)
+          enddo
+        enddo
+
+        !   call mxm_m1_m1_6points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
+        !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
+        do i=1,m1
+          do j=1,m1
+            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+            do k = 1,NGLLX
+              newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempx2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempx2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempx2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempx2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempx2(i,6,k)*hprimewgll_xx(6,j)
+              newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempy2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempy2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempy2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempy2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempy2(i,6,k)*hprimewgll_xx(6,j)
+              newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempz2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempz2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempz2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempz2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempz2(i,6,k)*hprimewgll_xx(6,j)
+            enddo
+          enddo
+        enddo
+
+        ! call mxm_m2_m1_6points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
+        do j=1,m1
+          do i=1,m2
+            E1_mxm_m2_m1_6points(i,j) = C1_mxm_m2_m1_6points(i,1)*hprimewgll_xx(1,j) + &
+                                      C1_mxm_m2_m1_6points(i,2)*hprimewgll_xx(2,j) + &
+                                      C1_mxm_m2_m1_6points(i,3)*hprimewgll_xx(3,j) + &
+                                      C1_mxm_m2_m1_6points(i,4)*hprimewgll_xx(4,j) + &
+                                      C1_mxm_m2_m1_6points(i,5)*hprimewgll_xx(5,j) + &
+                                      C1_mxm_m2_m1_6points(i,6)*hprimewgll_xx(6,j)
+            E2_mxm_m2_m1_6points(i,j) = C2_mxm_m2_m1_6points(i,1)*hprimewgll_xx(1,j) + &
+                                      C2_mxm_m2_m1_6points(i,2)*hprimewgll_xx(2,j) + &
+                                      C2_mxm_m2_m1_6points(i,3)*hprimewgll_xx(3,j) + &
+                                      C2_mxm_m2_m1_6points(i,4)*hprimewgll_xx(4,j) + &
+                                      C2_mxm_m2_m1_6points(i,5)*hprimewgll_xx(5,j) + &
+                                      C2_mxm_m2_m1_6points(i,6)*hprimewgll_xx(6,j)
+            E3_mxm_m2_m1_6points(i,j) = C3_mxm_m2_m1_6points(i,1)*hprimewgll_xx(1,j) + &
+                                      C3_mxm_m2_m1_6points(i,2)*hprimewgll_xx(2,j) + &
+                                      C3_mxm_m2_m1_6points(i,3)*hprimewgll_xx(3,j) + &
+                                      C3_mxm_m2_m1_6points(i,4)*hprimewgll_xx(4,j) + &
+                                      C3_mxm_m2_m1_6points(i,5)*hprimewgll_xx(5,j) + &
+                                      C3_mxm_m2_m1_6points(i,6)*hprimewgll_xx(6,j)
+          enddo
+        enddo
+
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+
+              fac1 = wgllwgll_yz(j,k)
+              fac2 = wgllwgll_xz(i,k)
+              fac3 = wgllwgll_xy(i,j)
+
+              ! sum contributions from each element to the global mesh using indirect addressing
+              iglob = ibool(i,j,k,ispec)
+              accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - &
+                                fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k)
+              accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - &
+                                fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k)
+              accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - &
+                                fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k)
+
+              !  update memory variables based upon the Runge-Kutta scheme
+              if(ATTENUATION) then
+
+                 ! use Runge-Kutta scheme to march in time
+                 do i_sls = 1,N_SLS
+
+                    factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
+
+                    alphaval_loc = alphaval(i_sls)
+                    betaval_loc = betaval(i_sls)
+                    gammaval_loc = gammaval(i_sls)
+
+                    ! term in xx
+                    Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
+                    R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in yy
+                    Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
+                    R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in zz not computed since zero trace
+                    ! term in xy
+                    Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
+                    R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in xz
+                    Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
+                    R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in yz
+                    Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
+                    R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+
+                 enddo   ! end of loop on memory variables
+
+              endif  !  end attenuation
+
+            enddo
+          enddo
+        enddo
+
+        ! save deviatoric strain for Runge-Kutta scheme
+        if ( COMPUTE_AND_STORE_STRAIN ) then
+          epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+          epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
+          epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
+          epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
+          epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
+        endif
+
+  enddo  ! spectral element loop
+
+end subroutine compute_forces_elastic_Dev_6p
+
+!
+!=====================================================================
+!
+
+subroutine compute_forces_elastic_Dev_7p( iphase ,NSPEC_AB,NGLOB_AB, &
+                                    displ,accel, &
+                                    xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                                    hprime_xx,hprime_xxT, &
+                                    hprimewgll_xx,hprimewgll_xxT, &
+                                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                                    kappastore,mustore,jacobian,ibool, &
+                                    ATTENUATION, &
+                                    one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,&
+                                    NSPEC_ATTENUATION_AB, &
+                                    R_xx,R_yy,R_xy,R_xz,R_yz, &
+                                    epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+                                    epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
+                                    ANISOTROPY,NSPEC_ANISO, &
+                                    c11store,c12store,c13store,c14store,c15store,c16store,&
+                                    c22store,c23store,c24store,c25store,c26store,c33store,&
+                                    c34store,c35store,c36store,c44store,c45store,c46store,&
+                                    c55store,c56store,c66store, &
+                                    SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
+                                    NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
+                                    is_moho_top,is_moho_bot, &
+                                    dsdx_top,dsdx_bot, &
+                                    ispec2D_moho_top,ispec2D_moho_bot, &
+                                    num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
+                                    phase_ispec_inner_elastic)
+
+
+! computes elastic tensor term
+
+  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, &
+                      N_SLS,SAVE_MOHO_MESH, &
+                      ONE_THIRD,FOUR_THIRDS,m1,m2
+  implicit none
+
+  integer :: NSPEC_AB,NGLOB_AB
+
+! displacement and acceleration
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
+
+! arrays with mesh parameters per slice
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        kappastore,mustore,jacobian
+
+! array with derivatives of Lagrange polynomials and precalculated products
+  real(kind=CUSTOM_REAL), dimension(NGLLX,7) :: hprime_xx,hprimewgll_xxT
+  real(kind=CUSTOM_REAL), dimension(7,NGLLX) :: hprime_xxT,hprimewgll_xx
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
+
+! memory variables and standard linear solids for attenuation
+  logical :: ATTENUATION
+  logical :: COMPUTE_AND_STORE_STRAIN
+  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
+  integer :: NSPEC_ATTENUATION_AB
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
+  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
+      R_xx,R_yy,R_xy,R_xz,R_yz
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
+       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
+  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
+
+! anisotropy
+  logical :: ANISOTROPY
+  integer :: NSPEC_ANISO
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
+            c11store,c12store,c13store,c14store,c15store,c16store, &
+            c22store,c23store,c24store,c25store,c26store,c33store, &
+            c34store,c35store,c36store,c44store,c45store,c46store, &
+            c55store,c56store,c66store
+
+  integer :: iphase
+  integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
+  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
+
+! adjoint simulations
+  integer :: SIMULATION_TYPE
+  integer :: NSPEC_BOUN,NSPEC2D_MOHO
+
+  ! moho kernel
+  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
+    dsdx_top,dsdx_bot
+  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
+  integer :: ispec2D_moho_top, ispec2D_moho_bot
+
+! local parameters
+  real(kind=CUSTOM_REAL), dimension(7,7,7) :: dummyx_loc,dummyy_loc,dummyz_loc, &
+    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
+  real(kind=CUSTOM_REAL), dimension(7,7,7) :: &
+    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
+
+  ! manually inline the calls to the Deville et al. (2002) routines
+  real(kind=CUSTOM_REAL), dimension(7,49) :: B1_m1_m2_7points,B2_m1_m2_7points,B3_m1_m2_7points
+  real(kind=CUSTOM_REAL), dimension(7,49) :: C1_m1_m2_7points,C2_m1_m2_7points,C3_m1_m2_7points
+  real(kind=CUSTOM_REAL), dimension(7,49) :: E1_m1_m2_7points,E2_m1_m2_7points,E3_m1_m2_7points
+
+  equivalence(dummyx_loc,B1_m1_m2_7points)
+  equivalence(dummyy_loc,B2_m1_m2_7points)
+  equivalence(dummyz_loc,B3_m1_m2_7points)
+  equivalence(tempx1,C1_m1_m2_7points)
+  equivalence(tempy1,C2_m1_m2_7points)
+  equivalence(tempz1,C3_m1_m2_7points)
+  equivalence(newtempx1,E1_m1_m2_7points)
+  equivalence(newtempy1,E2_m1_m2_7points)
+  equivalence(newtempz1,E3_m1_m2_7points)
+
+  real(kind=CUSTOM_REAL), dimension(49,7) :: &
+    A1_mxm_m2_m1_7points,A2_mxm_m2_m1_7points,A3_mxm_m2_m1_7points
+  real(kind=CUSTOM_REAL), dimension(49,7) :: &
+    C1_mxm_m2_m1_7points,C2_mxm_m2_m1_7points,C3_mxm_m2_m1_7points
+  real(kind=CUSTOM_REAL), dimension(49,7) :: &
+    E1_mxm_m2_m1_7points,E2_mxm_m2_m1_7points,E3_mxm_m2_m1_7points
+
+  equivalence(dummyx_loc,A1_mxm_m2_m1_7points)
+  equivalence(dummyy_loc,A2_mxm_m2_m1_7points)
+  equivalence(dummyz_loc,A3_mxm_m2_m1_7points)
+  equivalence(tempx3,C1_mxm_m2_m1_7points)
+  equivalence(tempy3,C2_mxm_m2_m1_7points)
+  equivalence(tempz3,C3_mxm_m2_m1_7points)
+  equivalence(newtempx3,E1_mxm_m2_m1_7points)
+  equivalence(newtempy3,E2_mxm_m2_m1_7points)
+  equivalence(newtempz3,E3_mxm_m2_m1_7points)
+
+  ! local attenuation parameters
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
+       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
+  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
+  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
+  real(kind=CUSTOM_REAL) Sn,Snp1
+  real(kind=CUSTOM_REAL) templ
+
+  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
+
+  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
+  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
+
+  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
+
+  real(kind=CUSTOM_REAL) fac1,fac2,fac3
+
+  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
+  real(kind=CUSTOM_REAL) kappal
+
+  ! local anisotropy parameters
+  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
+                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
+
+  integer i_SLS,imodulo_N_SLS
+  integer ispec,iglob,ispec_p,num_elements
+  integer i,j,k
+
+  imodulo_N_SLS = mod(N_SLS,3)
+
+  ! choses inner/outer elements
+  if( iphase == 1 ) then
+    num_elements = nspec_outer_elastic
+  else
+    num_elements = nspec_inner_elastic
+  endif
+
+  do ispec_p = 1,num_elements
+
+        ! returns element id from stored element list
+        ispec = phase_ispec_inner_elastic(ispec_p,iphase)
+
+        ! adjoint simulations: moho kernel
+        if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then
+          if (is_moho_top(ispec)) then
+            ispec2D_moho_top = ispec2D_moho_top + 1
+          else if (is_moho_bot(ispec)) then
+            ispec2D_moho_bot = ispec2D_moho_bot + 1
+          endif
+        endif ! adjoint
+
+        ! stores displacment values in local array
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+                iglob = ibool(i,j,k,ispec)
+                dummyx_loc(i,j,k) = displ(1,iglob)
+                dummyy_loc(i,j,k) = displ(2,iglob)
+                dummyz_loc(i,j,k) = displ(3,iglob)
+            enddo
+          enddo
+        enddo
+
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+        ! call mxm_m1_m2_7points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
+        do j=1,m2
+          do i=1,m1
+            C1_m1_m2_7points(i,j) = hprime_xx(i,1)*B1_m1_m2_7points(1,j) + &
+                                  hprime_xx(i,2)*B1_m1_m2_7points(2,j) + &
+                                  hprime_xx(i,3)*B1_m1_m2_7points(3,j) + &
+                                  hprime_xx(i,4)*B1_m1_m2_7points(4,j) + &
+                                  hprime_xx(i,5)*B1_m1_m2_7points(5,j) + &
+                                  hprime_xx(i,6)*B1_m1_m2_7points(6,j) + &
+                                  hprime_xx(i,7)*B1_m1_m2_7points(7,j)
+            C2_m1_m2_7points(i,j) = hprime_xx(i,1)*B2_m1_m2_7points(1,j) + &
+                                  hprime_xx(i,2)*B2_m1_m2_7points(2,j) + &
+                                  hprime_xx(i,3)*B2_m1_m2_7points(3,j) + &
+                                  hprime_xx(i,4)*B2_m1_m2_7points(4,j) + &
+                                  hprime_xx(i,5)*B2_m1_m2_7points(5,j) + &
+                                  hprime_xx(i,6)*B2_m1_m2_7points(6,j) + &
+                                  hprime_xx(i,7)*B2_m1_m2_7points(7,j)
+            C3_m1_m2_7points(i,j) = hprime_xx(i,1)*B3_m1_m2_7points(1,j) + &
+                                  hprime_xx(i,2)*B3_m1_m2_7points(2,j) + &
+                                  hprime_xx(i,3)*B3_m1_m2_7points(3,j) + &
+                                  hprime_xx(i,4)*B3_m1_m2_7points(4,j) + &
+                                  hprime_xx(i,5)*B3_m1_m2_7points(5,j) + &
+                                  hprime_xx(i,6)*B3_m1_m2_7points(6,j) + &
+                                  hprime_xx(i,7)*B3_m1_m2_7points(7,j)
+          enddo
+        enddo
+
+        !   call mxm_m1_m1_7points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
+        !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
+        do j=1,m1
+          do i=1,m1
+            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+            do k = 1,NGLLX
+              tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyx_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyx_loc(i,6,k)*hprime_xxT(6,j) + &
+                            dummyx_loc(i,7,k)*hprime_xxT(7,j)
+              tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyy_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyy_loc(i,6,k)*hprime_xxT(6,j) + &
+                            dummyy_loc(i,7,k)*hprime_xxT(7,j)
+              tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyz_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyz_loc(i,6,k)*hprime_xxT(6,j) + &
+                            dummyz_loc(i,7,k)*hprime_xxT(7,j)
+            enddo
+          enddo
+        enddo
+
+        ! call mxm_m2_m1_7points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
+        do j=1,m1
+          do i=1,m2
+            C1_mxm_m2_m1_7points(i,j) = A1_mxm_m2_m1_7points(i,1)*hprime_xxT(1,j) + &
+                                      A1_mxm_m2_m1_7points(i,2)*hprime_xxT(2,j) + &
+                                      A1_mxm_m2_m1_7points(i,3)*hprime_xxT(3,j) + &
+                                      A1_mxm_m2_m1_7points(i,4)*hprime_xxT(4,j) + &
+                                      A1_mxm_m2_m1_7points(i,5)*hprime_xxT(5,j) + &
+                                      A1_mxm_m2_m1_7points(i,6)*hprime_xxT(6,j) + &
+                                      A1_mxm_m2_m1_7points(i,7)*hprime_xxT(7,j)
+            C2_mxm_m2_m1_7points(i,j) = A2_mxm_m2_m1_7points(i,1)*hprime_xxT(1,j) + &
+                                      A2_mxm_m2_m1_7points(i,2)*hprime_xxT(2,j) + &
+                                      A2_mxm_m2_m1_7points(i,3)*hprime_xxT(3,j) + &
+                                      A2_mxm_m2_m1_7points(i,4)*hprime_xxT(4,j) + &
+                                      A2_mxm_m2_m1_7points(i,5)*hprime_xxT(5,j) + &
+                                      A2_mxm_m2_m1_7points(i,6)*hprime_xxT(6,j) + &
+                                      A2_mxm_m2_m1_7points(i,7)*hprime_xxT(7,j)
+            C3_mxm_m2_m1_7points(i,j) = A3_mxm_m2_m1_7points(i,1)*hprime_xxT(1,j) + &
+                                      A3_mxm_m2_m1_7points(i,2)*hprime_xxT(2,j) + &
+                                      A3_mxm_m2_m1_7points(i,3)*hprime_xxT(3,j) + &
+                                      A3_mxm_m2_m1_7points(i,4)*hprime_xxT(4,j) + &
+                                      A3_mxm_m2_m1_7points(i,5)*hprime_xxT(5,j) + &
+                                      A3_mxm_m2_m1_7points(i,6)*hprime_xxT(6,j) + &
+                                      A3_mxm_m2_m1_7points(i,7)*hprime_xxT(7,j)
+          enddo
+        enddo
+
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+              ! get derivatives of ux, uy and uz with respect to x, y and z
+              xixl = xix(i,j,k,ispec)
+              xiyl = xiy(i,j,k,ispec)
+              xizl = xiz(i,j,k,ispec)
+              etaxl = etax(i,j,k,ispec)
+              etayl = etay(i,j,k,ispec)
+              etazl = etaz(i,j,k,ispec)
+              gammaxl = gammax(i,j,k,ispec)
+              gammayl = gammay(i,j,k,ispec)
+              gammazl = gammaz(i,j,k,ispec)
+              jacobianl = jacobian(i,j,k,ispec)
+
+              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
+              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
+              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
+
+              duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
+              duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
+              duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
+
+              duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
+              duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
+              duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
+
+              ! save strain on the Moho boundary
+              if (SAVE_MOHO_MESH ) then
+                if (is_moho_top(ispec)) then
+                  dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
+                  dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
+                  dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
+                  dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
+                  dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
+                  dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
+                  dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
+                  dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
+                  dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
+                else if (is_moho_bot(ispec)) then
+                  dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
+                  dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
+                  dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
+                  dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
+                  dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
+                  dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
+                  dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
+                  dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
+                  dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
+                endif
+              endif
+
+              ! precompute some sums to save CPU time
+              duxdxl_plus_duydyl = duxdxl + duydyl
+              duxdxl_plus_duzdzl = duxdxl + duzdzl
+              duydyl_plus_duzdzl = duydyl + duzdzl
+              duxdyl_plus_duydxl = duxdyl + duydxl
+              duzdxl_plus_duxdzl = duzdxl + duxdzl
+              duzdyl_plus_duydzl = duzdyl + duydzl
+
+              ! computes deviatoric strain attenuation and/or for kernel calculations
+              if (COMPUTE_AND_STORE_STRAIN) then
+                templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
+                if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
+                epsilondev_xx_loc(i,j,k) = duxdxl - templ
+                epsilondev_yy_loc(i,j,k) = duydyl - templ
+                epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
+                epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
+                epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
+              endif
+
+              kappal = kappastore(i,j,k,ispec)
+              mul = mustore(i,j,k,ispec)
+
+              ! attenuation
+              if(ATTENUATION) then
+                ! use unrelaxed parameters if attenuation
+                mul  = mul * one_minus_sum_beta(i,j,k,ispec)
+              endif
+
+  ! full anisotropic case, stress calculations
+              if(ANISOTROPY) then
+                c11 = c11store(i,j,k,ispec)
+                c12 = c12store(i,j,k,ispec)
+                c13 = c13store(i,j,k,ispec)
+                c14 = c14store(i,j,k,ispec)
+                c15 = c15store(i,j,k,ispec)
+                c16 = c16store(i,j,k,ispec)
+                c22 = c22store(i,j,k,ispec)
+                c23 = c23store(i,j,k,ispec)
+                c24 = c24store(i,j,k,ispec)
+                c25 = c25store(i,j,k,ispec)
+                c26 = c26store(i,j,k,ispec)
+                c33 = c33store(i,j,k,ispec)
+                c34 = c34store(i,j,k,ispec)
+                c35 = c35store(i,j,k,ispec)
+                c36 = c36store(i,j,k,ispec)
+                c44 = c44store(i,j,k,ispec)
+                c45 = c45store(i,j,k,ispec)
+                c46 = c46store(i,j,k,ispec)
+                c55 = c55store(i,j,k,ispec)
+                c56 = c56store(i,j,k,ispec)
+                c66 = c66store(i,j,k,ispec)
+
+                sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
+                          c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
+                sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
+                          c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
+                sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
+                          c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
+                sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
+                          c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
+                sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
+                          c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
+                sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
+                          c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
+
+              else
+
+  ! isotropic case
+                lambdalplus2mul = kappal + FOUR_THIRDS * mul
+                lambdal = lambdalplus2mul - 2.*mul
+
+                ! compute stress sigma
+                sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
+                sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
+                sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
+
+                sigma_xy = mul*duxdyl_plus_duydxl
+                sigma_xz = mul*duzdxl_plus_duxdzl
+                sigma_yz = mul*duzdyl_plus_duydzl
+
+              endif ! ANISOTROPY
+
+              ! subtract memory variables if attenuation
+              if(ATTENUATION) then
+! way 1
+!                do i_sls = 1,N_SLS
+!                  R_xx_val = R_xx(i,j,k,ispec,i_sls)
+!                  R_yy_val = R_yy(i,j,k,ispec,i_sls)
+!                  sigma_xx = sigma_xx - R_xx_val
+!                  sigma_yy = sigma_yy - R_yy_val
+!                  sigma_zz = sigma_zz + R_xx_val + R_yy_val
+!                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+!                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+!                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+!                enddo
+
+! way 2
+! note: this should help compilers to pipeline the code and make better use of the cache;
+!          depending on compilers, it can further decrease the computation time by ~ 30%.
+!          by default, N_SLS = 3, therefore we take steps of 3
+              if(imodulo_N_SLS >= 1) then
+                do i_sls = 1,imodulo_N_SLS
+                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                  sigma_xx = sigma_xx - R_xx_val1
+                  sigma_yy = sigma_yy - R_yy_val1
+                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+                enddo
+              endif
+
+              if(N_SLS >= imodulo_N_SLS+1) then
+                do i_sls = imodulo_N_SLS+1,N_SLS,3
+                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                  sigma_xx = sigma_xx - R_xx_val1
+                  sigma_yy = sigma_yy - R_yy_val1
+                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+
+                  R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1)
+                  R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1)
+                  sigma_xx = sigma_xx - R_xx_val2
+                  sigma_yy = sigma_yy - R_yy_val2
+                  sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1)
+
+                  R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2)
+                  R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2)
+                  sigma_xx = sigma_xx - R_xx_val3
+                  sigma_yy = sigma_yy - R_yy_val3
+                  sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2)
+                enddo
+              endif
+
+
+              endif
+
+            ! define symmetric components of sigma
+            sigma_yx = sigma_xy
+            sigma_zx = sigma_xz
+            sigma_zy = sigma_yz
+
+            ! form dot product with test vector, non-symmetric form (which is useful in the case of PML)
+            tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
+            tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
+            tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
+
+            tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
+            tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
+            tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
+
+            tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
+            tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
+            tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
+
+            enddo
+          enddo
+        enddo
+
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+        ! call mxm_m1_m2_7points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
+        do j=1,m2
+          do i=1,m1
+            E1_m1_m2_7points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_7points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C1_m1_m2_7points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C1_m1_m2_7points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C1_m1_m2_7points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C1_m1_m2_7points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C1_m1_m2_7points(6,j) + &
+                                  hprimewgll_xxT(i,7)*C1_m1_m2_7points(7,j)
+            E2_m1_m2_7points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_7points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C2_m1_m2_7points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C2_m1_m2_7points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C2_m1_m2_7points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C2_m1_m2_7points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C2_m1_m2_7points(6,j) + &
+                                  hprimewgll_xxT(i,7)*C2_m1_m2_7points(7,j)
+            E3_m1_m2_7points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_7points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C3_m1_m2_7points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C3_m1_m2_7points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C3_m1_m2_7points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C3_m1_m2_7points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C3_m1_m2_7points(6,j) + &
+                                  hprimewgll_xxT(i,7)*C3_m1_m2_7points(7,j)
+          enddo
+        enddo
+
+        !   call mxm_m1_m1_7points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
+        !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
+        do i=1,m1
+          do j=1,m1
+            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+            do k = 1,NGLLX
+              newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempx2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempx2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempx2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempx2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempx2(i,6,k)*hprimewgll_xx(6,j) + &
+                               tempx2(i,7,k)*hprimewgll_xx(7,j)
+              newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempy2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempy2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempy2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempy2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempy2(i,6,k)*hprimewgll_xx(6,j) + &
+                               tempy2(i,7,k)*hprimewgll_xx(7,j)
+              newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempz2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempz2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempz2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempz2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempz2(i,6,k)*hprimewgll_xx(6,j) + &
+                               tempz2(i,7,k)*hprimewgll_xx(7,j)
+            enddo
+          enddo
+        enddo
+
+        ! call mxm_m2_m1_7points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
+        do j=1,m1
+          do i=1,m2
+            E1_mxm_m2_m1_7points(i,j) = C1_mxm_m2_m1_7points(i,1)*hprimewgll_xx(1,j) + &
+                                      C1_mxm_m2_m1_7points(i,2)*hprimewgll_xx(2,j) + &
+                                      C1_mxm_m2_m1_7points(i,3)*hprimewgll_xx(3,j) + &
+                                      C1_mxm_m2_m1_7points(i,4)*hprimewgll_xx(4,j) + &
+                                      C1_mxm_m2_m1_7points(i,5)*hprimewgll_xx(5,j) + &
+                                      C1_mxm_m2_m1_7points(i,6)*hprimewgll_xx(6,j) + &
+                                      C1_mxm_m2_m1_7points(i,7)*hprimewgll_xx(7,j)
+            E2_mxm_m2_m1_7points(i,j) = C2_mxm_m2_m1_7points(i,1)*hprimewgll_xx(1,j) + &
+                                      C2_mxm_m2_m1_7points(i,2)*hprimewgll_xx(2,j) + &
+                                      C2_mxm_m2_m1_7points(i,3)*hprimewgll_xx(3,j) + &
+                                      C2_mxm_m2_m1_7points(i,4)*hprimewgll_xx(4,j) + &
+                                      C2_mxm_m2_m1_7points(i,5)*hprimewgll_xx(5,j) + &
+                                      C2_mxm_m2_m1_7points(i,6)*hprimewgll_xx(6,j) + &
+                                      C2_mxm_m2_m1_7points(i,7)*hprimewgll_xx(7,j)
+            E3_mxm_m2_m1_7points(i,j) = C3_mxm_m2_m1_7points(i,1)*hprimewgll_xx(1,j) + &
+                                      C3_mxm_m2_m1_7points(i,2)*hprimewgll_xx(2,j) + &
+                                      C3_mxm_m2_m1_7points(i,3)*hprimewgll_xx(3,j) + &
+                                      C3_mxm_m2_m1_7points(i,4)*hprimewgll_xx(4,j) + &
+                                      C3_mxm_m2_m1_7points(i,5)*hprimewgll_xx(5,j) + &
+                                      C3_mxm_m2_m1_7points(i,6)*hprimewgll_xx(6,j) + &
+                                      C3_mxm_m2_m1_7points(i,7)*hprimewgll_xx(7,j)
+          enddo
+        enddo
+
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+
+              fac1 = wgllwgll_yz(j,k)
+              fac2 = wgllwgll_xz(i,k)
+              fac3 = wgllwgll_xy(i,j)
+
+              ! sum contributions from each element to the global mesh using indirect addressing
+              iglob = ibool(i,j,k,ispec)
+              accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - &
+                                fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k)
+              accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - &
+                                fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k)
+              accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - &
+                                fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k)
+
+              !  update memory variables based upon the Runge-Kutta scheme
+              if(ATTENUATION) then
+
+                 ! use Runge-Kutta scheme to march in time
+                 do i_sls = 1,N_SLS
+
+                    factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
+
+                    alphaval_loc = alphaval(i_sls)
+                    betaval_loc = betaval(i_sls)
+                    gammaval_loc = gammaval(i_sls)
+
+                    ! term in xx
+                    Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
+                    R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in yy
+                    Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
+                    R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in zz not computed since zero trace
+                    ! term in xy
+                    Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
+                    R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in xz
+                    Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
+                    R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in yz
+                    Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
+                    R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+
+                 enddo   ! end of loop on memory variables
+
+              endif  !  end attenuation
+
+            enddo
+          enddo
+        enddo
+
+        ! save deviatoric strain for Runge-Kutta scheme
+        if ( COMPUTE_AND_STORE_STRAIN ) then
+          epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+          epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
+          epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
+          epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
+          epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
+        endif
+
+  enddo  ! spectral element loop
+
+end subroutine compute_forces_elastic_Dev_7p
+
+!
+!=====================================================================
+!
+
+subroutine compute_forces_elastic_Dev_8p( iphase ,NSPEC_AB,NGLOB_AB, &
+                                    displ,accel, &
+                                    xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                                    hprime_xx,hprime_xxT, &
+                                    hprimewgll_xx,hprimewgll_xxT, &
+                                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                                    kappastore,mustore,jacobian,ibool, &
+                                    ATTENUATION, &
+                                    one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,&
+                                    NSPEC_ATTENUATION_AB, &
+                                    R_xx,R_yy,R_xy,R_xz,R_yz, &
+                                    epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+                                    epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
+                                    ANISOTROPY,NSPEC_ANISO, &
+                                    c11store,c12store,c13store,c14store,c15store,c16store,&
+                                    c22store,c23store,c24store,c25store,c26store,c33store,&
+                                    c34store,c35store,c36store,c44store,c45store,c46store,&
+                                    c55store,c56store,c66store, &
+                                    SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
+                                    NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
+                                    is_moho_top,is_moho_bot, &
+                                    dsdx_top,dsdx_bot, &
+                                    ispec2D_moho_top,ispec2D_moho_bot, &
+                                    num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
+                                    phase_ispec_inner_elastic)
+
+
+! computes elastic tensor term
+
+  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, &
+                      N_SLS,SAVE_MOHO_MESH, &
+                      ONE_THIRD,FOUR_THIRDS,m1,m2
+  implicit none
+
+  integer :: NSPEC_AB,NGLOB_AB
+
+! displacement and acceleration
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
+
+! arrays with mesh parameters per slice
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        kappastore,mustore,jacobian
+
+! array with derivatives of Lagrange polynomials and precalculated products
+  real(kind=CUSTOM_REAL), dimension(NGLLX,8) :: hprime_xx,hprimewgll_xxT
+  real(kind=CUSTOM_REAL), dimension(8,NGLLX) :: hprime_xxT,hprimewgll_xx
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
+
+! memory variables and standard linear solids for attenuation
+  logical :: ATTENUATION
+  logical :: COMPUTE_AND_STORE_STRAIN
+  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
+  integer :: NSPEC_ATTENUATION_AB
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
+  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
+      R_xx,R_yy,R_xy,R_xz,R_yz
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
+       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
+  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
+
+! anisotropy
+  logical :: ANISOTROPY
+  integer :: NSPEC_ANISO
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
+            c11store,c12store,c13store,c14store,c15store,c16store, &
+            c22store,c23store,c24store,c25store,c26store,c33store, &
+            c34store,c35store,c36store,c44store,c45store,c46store, &
+            c55store,c56store,c66store
+
+  integer :: iphase
+  integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
+  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
+
+! adjoint simulations
+  integer :: SIMULATION_TYPE
+  integer :: NSPEC_BOUN,NSPEC2D_MOHO
+
+  ! moho kernel
+  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
+    dsdx_top,dsdx_bot
+  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
+  integer :: ispec2D_moho_top, ispec2D_moho_bot
+
+! local parameters
+  real(kind=CUSTOM_REAL), dimension(8,8,8) :: dummyx_loc,dummyy_loc,dummyz_loc, &
+    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
+  real(kind=CUSTOM_REAL), dimension(8,8,8) :: &
+    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
+
+  ! manually inline the calls to the Deville et al. (2002) routines
+  real(kind=CUSTOM_REAL), dimension(8,64) :: B1_m1_m2_8points,B2_m1_m2_8points,B3_m1_m2_8points
+  real(kind=CUSTOM_REAL), dimension(8,64) :: C1_m1_m2_8points,C2_m1_m2_8points,C3_m1_m2_8points
+  real(kind=CUSTOM_REAL), dimension(8,64) :: E1_m1_m2_8points,E2_m1_m2_8points,E3_m1_m2_8points
+
+  equivalence(dummyx_loc,B1_m1_m2_8points)
+  equivalence(dummyy_loc,B2_m1_m2_8points)
+  equivalence(dummyz_loc,B3_m1_m2_8points)
+  equivalence(tempx1,C1_m1_m2_8points)
+  equivalence(tempy1,C2_m1_m2_8points)
+  equivalence(tempz1,C3_m1_m2_8points)
+  equivalence(newtempx1,E1_m1_m2_8points)
+  equivalence(newtempy1,E2_m1_m2_8points)
+  equivalence(newtempz1,E3_m1_m2_8points)
+
+  real(kind=CUSTOM_REAL), dimension(64,8) :: &
+    A1_mxm_m2_m1_8points,A2_mxm_m2_m1_8points,A3_mxm_m2_m1_8points
+  real(kind=CUSTOM_REAL), dimension(64,8) :: &
+    C1_mxm_m2_m1_8points,C2_mxm_m2_m1_8points,C3_mxm_m2_m1_8points
+  real(kind=CUSTOM_REAL), dimension(64,8) :: &
+    E1_mxm_m2_m1_8points,E2_mxm_m2_m1_8points,E3_mxm_m2_m1_8points
+
+  equivalence(dummyx_loc,A1_mxm_m2_m1_8points)
+  equivalence(dummyy_loc,A2_mxm_m2_m1_8points)
+  equivalence(dummyz_loc,A3_mxm_m2_m1_8points)
+  equivalence(tempx3,C1_mxm_m2_m1_8points)
+  equivalence(tempy3,C2_mxm_m2_m1_8points)
+  equivalence(tempz3,C3_mxm_m2_m1_8points)
+  equivalence(newtempx3,E1_mxm_m2_m1_8points)
+  equivalence(newtempy3,E2_mxm_m2_m1_8points)
+  equivalence(newtempz3,E3_mxm_m2_m1_8points)
+
+  ! local attenuation parameters
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
+       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
+  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
+  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
+  real(kind=CUSTOM_REAL) Sn,Snp1
+  real(kind=CUSTOM_REAL) templ
+
+  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
+
+  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
+  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
+
+  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
+
+  real(kind=CUSTOM_REAL) fac1,fac2,fac3
+
+  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
+  real(kind=CUSTOM_REAL) kappal
+
+  ! local anisotropy parameters
+  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
+                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
+
+  integer i_SLS,imodulo_N_SLS
+  integer ispec,iglob,ispec_p,num_elements
+  integer i,j,k
+
+  imodulo_N_SLS = mod(N_SLS,3)
+
+  ! choses inner/outer elements
+  if( iphase == 1 ) then
+    num_elements = nspec_outer_elastic
+  else
+    num_elements = nspec_inner_elastic
+  endif
+
+  do ispec_p = 1,num_elements
+
+        ! returns element id from stored element list
+        ispec = phase_ispec_inner_elastic(ispec_p,iphase)
+
+        ! adjoint simulations: moho kernel
+        if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then
+          if (is_moho_top(ispec)) then
+            ispec2D_moho_top = ispec2D_moho_top + 1
+          else if (is_moho_bot(ispec)) then
+            ispec2D_moho_bot = ispec2D_moho_bot + 1
+          endif
+        endif ! adjoint
+
+        ! stores displacment values in local array
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+                iglob = ibool(i,j,k,ispec)
+                dummyx_loc(i,j,k) = displ(1,iglob)
+                dummyy_loc(i,j,k) = displ(2,iglob)
+                dummyz_loc(i,j,k) = displ(3,iglob)
+            enddo
+          enddo
+        enddo
+
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+        ! call mxm_m1_m2_8points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
+        do j=1,m2
+          do i=1,m1
+            C1_m1_m2_8points(i,j) = hprime_xx(i,1)*B1_m1_m2_8points(1,j) + &
+                                  hprime_xx(i,2)*B1_m1_m2_8points(2,j) + &
+                                  hprime_xx(i,3)*B1_m1_m2_8points(3,j) + &
+                                  hprime_xx(i,4)*B1_m1_m2_8points(4,j) + &
+                                  hprime_xx(i,5)*B1_m1_m2_8points(5,j) + &
+                                  hprime_xx(i,6)*B1_m1_m2_8points(6,j) + &
+                                  hprime_xx(i,7)*B1_m1_m2_8points(7,j) + &
+                                  hprime_xx(i,8)*B1_m1_m2_8points(8,j)
+            C2_m1_m2_8points(i,j) = hprime_xx(i,1)*B2_m1_m2_8points(1,j) + &
+                                  hprime_xx(i,2)*B2_m1_m2_8points(2,j) + &
+                                  hprime_xx(i,3)*B2_m1_m2_8points(3,j) + &
+                                  hprime_xx(i,4)*B2_m1_m2_8points(4,j) + &
+                                  hprime_xx(i,5)*B2_m1_m2_8points(5,j) + &
+                                  hprime_xx(i,6)*B2_m1_m2_8points(6,j) + &
+                                  hprime_xx(i,7)*B2_m1_m2_8points(7,j) + &
+                                  hprime_xx(i,8)*B2_m1_m2_8points(8,j)
+            C3_m1_m2_8points(i,j) = hprime_xx(i,1)*B3_m1_m2_8points(1,j) + &
+                                  hprime_xx(i,2)*B3_m1_m2_8points(2,j) + &
+                                  hprime_xx(i,3)*B3_m1_m2_8points(3,j) + &
+                                  hprime_xx(i,4)*B3_m1_m2_8points(4,j) + &
+                                  hprime_xx(i,5)*B3_m1_m2_8points(5,j) + &
+                                  hprime_xx(i,6)*B3_m1_m2_8points(6,j) + &
+                                  hprime_xx(i,7)*B3_m1_m2_8points(7,j) + &
+                                  hprime_xx(i,8)*B3_m1_m2_8points(8,j)
+          enddo
+        enddo
+
+        !   call mxm_m1_m1_8points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
+        !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
+        do j=1,m1
+          do i=1,m1
+            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+            do k = 1,NGLLX
+              tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyx_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyx_loc(i,6,k)*hprime_xxT(6,j) + &
+                            dummyx_loc(i,7,k)*hprime_xxT(7,j) + &
+                            dummyx_loc(i,8,k)*hprime_xxT(8,j)
+              tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyy_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyy_loc(i,6,k)*hprime_xxT(6,j) + &
+                            dummyy_loc(i,7,k)*hprime_xxT(7,j) + &
+                            dummyy_loc(i,8,k)*hprime_xxT(8,j)
+              tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyz_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyz_loc(i,6,k)*hprime_xxT(6,j) + &
+                            dummyz_loc(i,7,k)*hprime_xxT(7,j) + &
+                            dummyz_loc(i,8,k)*hprime_xxT(8,j)
+            enddo
+          enddo
+        enddo
+
+        ! call mxm_m2_m1_8points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
+        do j=1,m1
+          do i=1,m2
+            C1_mxm_m2_m1_8points(i,j) = A1_mxm_m2_m1_8points(i,1)*hprime_xxT(1,j) + &
+                                      A1_mxm_m2_m1_8points(i,2)*hprime_xxT(2,j) + &
+                                      A1_mxm_m2_m1_8points(i,3)*hprime_xxT(3,j) + &
+                                      A1_mxm_m2_m1_8points(i,4)*hprime_xxT(4,j) + &
+                                      A1_mxm_m2_m1_8points(i,5)*hprime_xxT(5,j) + &
+                                      A1_mxm_m2_m1_8points(i,6)*hprime_xxT(6,j) + &
+                                      A1_mxm_m2_m1_8points(i,7)*hprime_xxT(7,j) + &
+                                      A1_mxm_m2_m1_8points(i,8)*hprime_xxT(8,j)
+            C2_mxm_m2_m1_8points(i,j) = A2_mxm_m2_m1_8points(i,1)*hprime_xxT(1,j) + &
+                                      A2_mxm_m2_m1_8points(i,2)*hprime_xxT(2,j) + &
+                                      A2_mxm_m2_m1_8points(i,3)*hprime_xxT(3,j) + &
+                                      A2_mxm_m2_m1_8points(i,4)*hprime_xxT(4,j) + &
+                                      A2_mxm_m2_m1_8points(i,5)*hprime_xxT(5,j) + &
+                                      A2_mxm_m2_m1_8points(i,6)*hprime_xxT(6,j) + &
+                                      A2_mxm_m2_m1_8points(i,7)*hprime_xxT(7,j) + &
+                                      A2_mxm_m2_m1_8points(i,8)*hprime_xxT(8,j)
+            C3_mxm_m2_m1_8points(i,j) = A3_mxm_m2_m1_8points(i,1)*hprime_xxT(1,j) + &
+                                      A3_mxm_m2_m1_8points(i,2)*hprime_xxT(2,j) + &
+                                      A3_mxm_m2_m1_8points(i,3)*hprime_xxT(3,j) + &
+                                      A3_mxm_m2_m1_8points(i,4)*hprime_xxT(4,j) + &
+                                      A3_mxm_m2_m1_8points(i,5)*hprime_xxT(5,j) + &
+                                      A3_mxm_m2_m1_8points(i,6)*hprime_xxT(6,j) + &
+                                      A3_mxm_m2_m1_8points(i,7)*hprime_xxT(7,j) + &
+                                      A3_mxm_m2_m1_8points(i,8)*hprime_xxT(8,j)
+          enddo
+        enddo
+
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+              ! get derivatives of ux, uy and uz with respect to x, y and z
+              xixl = xix(i,j,k,ispec)
+              xiyl = xiy(i,j,k,ispec)
+              xizl = xiz(i,j,k,ispec)
+              etaxl = etax(i,j,k,ispec)
+              etayl = etay(i,j,k,ispec)
+              etazl = etaz(i,j,k,ispec)
+              gammaxl = gammax(i,j,k,ispec)
+              gammayl = gammay(i,j,k,ispec)
+              gammazl = gammaz(i,j,k,ispec)
+              jacobianl = jacobian(i,j,k,ispec)
+
+              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
+              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
+              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
+
+              duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
+              duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
+              duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
+
+              duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
+              duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
+              duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
+
+              ! save strain on the Moho boundary
+              if (SAVE_MOHO_MESH ) then
+                if (is_moho_top(ispec)) then
+                  dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
+                  dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
+                  dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
+                  dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
+                  dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
+                  dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
+                  dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
+                  dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
+                  dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
+                else if (is_moho_bot(ispec)) then
+                  dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
+                  dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
+                  dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
+                  dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
+                  dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
+                  dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
+                  dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
+                  dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
+                  dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
+                endif
+              endif
+
+              ! precompute some sums to save CPU time
+              duxdxl_plus_duydyl = duxdxl + duydyl
+              duxdxl_plus_duzdzl = duxdxl + duzdzl
+              duydyl_plus_duzdzl = duydyl + duzdzl
+              duxdyl_plus_duydxl = duxdyl + duydxl
+              duzdxl_plus_duxdzl = duzdxl + duxdzl
+              duzdyl_plus_duydzl = duzdyl + duydzl
+
+              ! computes deviatoric strain attenuation and/or for kernel calculations
+              if (COMPUTE_AND_STORE_STRAIN) then
+                templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
+                if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
+                epsilondev_xx_loc(i,j,k) = duxdxl - templ
+                epsilondev_yy_loc(i,j,k) = duydyl - templ
+                epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
+                epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
+                epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
+              endif
+
+              kappal = kappastore(i,j,k,ispec)
+              mul = mustore(i,j,k,ispec)
+
+              ! attenuation
+              if(ATTENUATION) then
+                ! use unrelaxed parameters if attenuation
+                mul  = mul * one_minus_sum_beta(i,j,k,ispec)
+              endif
+
+  ! full anisotropic case, stress calculations
+              if(ANISOTROPY) then
+                c11 = c11store(i,j,k,ispec)
+                c12 = c12store(i,j,k,ispec)
+                c13 = c13store(i,j,k,ispec)
+                c14 = c14store(i,j,k,ispec)
+                c15 = c15store(i,j,k,ispec)
+                c16 = c16store(i,j,k,ispec)
+                c22 = c22store(i,j,k,ispec)
+                c23 = c23store(i,j,k,ispec)
+                c24 = c24store(i,j,k,ispec)
+                c25 = c25store(i,j,k,ispec)
+                c26 = c26store(i,j,k,ispec)
+                c33 = c33store(i,j,k,ispec)
+                c34 = c34store(i,j,k,ispec)
+                c35 = c35store(i,j,k,ispec)
+                c36 = c36store(i,j,k,ispec)
+                c44 = c44store(i,j,k,ispec)
+                c45 = c45store(i,j,k,ispec)
+                c46 = c46store(i,j,k,ispec)
+                c55 = c55store(i,j,k,ispec)
+                c56 = c56store(i,j,k,ispec)
+                c66 = c66store(i,j,k,ispec)
+
+                sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
+                          c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
+                sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
+                          c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
+                sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
+                          c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
+                sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
+                          c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
+                sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
+                          c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
+                sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
+                          c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
+
+              else
+
+  ! isotropic case
+                lambdalplus2mul = kappal + FOUR_THIRDS * mul
+                lambdal = lambdalplus2mul - 2.*mul
+
+                ! compute stress sigma
+                sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
+                sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
+                sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
+
+                sigma_xy = mul*duxdyl_plus_duydxl
+                sigma_xz = mul*duzdxl_plus_duxdzl
+                sigma_yz = mul*duzdyl_plus_duydzl
+
+              endif ! ANISOTROPY
+
+              ! subtract memory variables if attenuation
+              if(ATTENUATION) then
+! way 1
+!                do i_sls = 1,N_SLS
+!                  R_xx_val = R_xx(i,j,k,ispec,i_sls)
+!                  R_yy_val = R_yy(i,j,k,ispec,i_sls)
+!                  sigma_xx = sigma_xx - R_xx_val
+!                  sigma_yy = sigma_yy - R_yy_val
+!                  sigma_zz = sigma_zz + R_xx_val + R_yy_val
+!                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+!                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+!                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+!                enddo
+
+! way 2
+! note: this should help compilers to pipeline the code and make better use of the cache;
+!          depending on compilers, it can further decrease the computation time by ~ 30%.
+!          by default, N_SLS = 3, therefore we take steps of 3
+              if(imodulo_N_SLS >= 1) then
+                do i_sls = 1,imodulo_N_SLS
+                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                  sigma_xx = sigma_xx - R_xx_val1
+                  sigma_yy = sigma_yy - R_yy_val1
+                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+                enddo
+              endif
+
+              if(N_SLS >= imodulo_N_SLS+1) then
+                do i_sls = imodulo_N_SLS+1,N_SLS,3
+                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                  sigma_xx = sigma_xx - R_xx_val1
+                  sigma_yy = sigma_yy - R_yy_val1
+                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+
+                  R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1)
+                  R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1)
+                  sigma_xx = sigma_xx - R_xx_val2
+                  sigma_yy = sigma_yy - R_yy_val2
+                  sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1)
+
+                  R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2)
+                  R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2)
+                  sigma_xx = sigma_xx - R_xx_val3
+                  sigma_yy = sigma_yy - R_yy_val3
+                  sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2)
+                enddo
+              endif
+
+
+              endif
+
+            ! define symmetric components of sigma
+            sigma_yx = sigma_xy
+            sigma_zx = sigma_xz
+            sigma_zy = sigma_yz
+
+            ! form dot product with test vector, non-symmetric form (which is useful in the case of PML)
+            tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
+            tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
+            tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
+
+            tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
+            tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
+            tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
+
+            tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
+            tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
+            tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
+
+            enddo
+          enddo
+        enddo
+
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+        ! call mxm_m1_m2_8points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
+        do j=1,m2
+          do i=1,m1
+            E1_m1_m2_8points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_8points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C1_m1_m2_8points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C1_m1_m2_8points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C1_m1_m2_8points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C1_m1_m2_8points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C1_m1_m2_8points(6,j) + &
+                                  hprimewgll_xxT(i,7)*C1_m1_m2_8points(7,j) + &
+                                  hprimewgll_xxT(i,8)*C1_m1_m2_8points(8,j)
+            E2_m1_m2_8points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_8points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C2_m1_m2_8points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C2_m1_m2_8points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C2_m1_m2_8points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C2_m1_m2_8points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C2_m1_m2_8points(6,j) + &
+                                  hprimewgll_xxT(i,7)*C2_m1_m2_8points(7,j) + &
+                                  hprimewgll_xxT(i,8)*C2_m1_m2_8points(8,j)
+            E3_m1_m2_8points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_8points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C3_m1_m2_8points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C3_m1_m2_8points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C3_m1_m2_8points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C3_m1_m2_8points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C3_m1_m2_8points(6,j) + &
+                                  hprimewgll_xxT(i,7)*C3_m1_m2_8points(7,j) + &
+                                  hprimewgll_xxT(i,8)*C3_m1_m2_8points(8,j)
+          enddo
+        enddo
+
+        !   call mxm_m1_m1_8points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
+        !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
+        do i=1,m1
+          do j=1,m1
+            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+            do k = 1,NGLLX
+              newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempx2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempx2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempx2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempx2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempx2(i,6,k)*hprimewgll_xx(6,j) + &
+                               tempx2(i,7,k)*hprimewgll_xx(7,j) + &
+                               tempx2(i,8,k)*hprimewgll_xx(8,j)
+              newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempy2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempy2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempy2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempy2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempy2(i,6,k)*hprimewgll_xx(6,j) + &
+                               tempy2(i,7,k)*hprimewgll_xx(7,j) + &
+                               tempy2(i,8,k)*hprimewgll_xx(8,j)
+              newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempz2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempz2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempz2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempz2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempz2(i,6,k)*hprimewgll_xx(6,j) + &
+                               tempz2(i,7,k)*hprimewgll_xx(7,j) + &
+                               tempz2(i,8,k)*hprimewgll_xx(8,j)
+            enddo
+          enddo
+        enddo
+
+        ! call mxm_m2_m1_8points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
+        do j=1,m1
+          do i=1,m2
+            E1_mxm_m2_m1_8points(i,j) = C1_mxm_m2_m1_8points(i,1)*hprimewgll_xx(1,j) + &
+                                      C1_mxm_m2_m1_8points(i,2)*hprimewgll_xx(2,j) + &
+                                      C1_mxm_m2_m1_8points(i,3)*hprimewgll_xx(3,j) + &
+                                      C1_mxm_m2_m1_8points(i,4)*hprimewgll_xx(4,j) + &
+                                      C1_mxm_m2_m1_8points(i,5)*hprimewgll_xx(5,j) + &
+                                      C1_mxm_m2_m1_8points(i,6)*hprimewgll_xx(6,j) + &
+                                      C1_mxm_m2_m1_8points(i,7)*hprimewgll_xx(7,j) + &
+                                      C1_mxm_m2_m1_8points(i,8)*hprimewgll_xx(8,j)
+            E2_mxm_m2_m1_8points(i,j) = C2_mxm_m2_m1_8points(i,1)*hprimewgll_xx(1,j) + &
+                                      C2_mxm_m2_m1_8points(i,2)*hprimewgll_xx(2,j) + &
+                                      C2_mxm_m2_m1_8points(i,3)*hprimewgll_xx(3,j) + &
+                                      C2_mxm_m2_m1_8points(i,4)*hprimewgll_xx(4,j) + &
+                                      C2_mxm_m2_m1_8points(i,5)*hprimewgll_xx(5,j) + &
+                                      C2_mxm_m2_m1_8points(i,6)*hprimewgll_xx(6,j) + &
+                                      C2_mxm_m2_m1_8points(i,7)*hprimewgll_xx(7,j) + &
+                                      C2_mxm_m2_m1_8points(i,8)*hprimewgll_xx(8,j)
+            E3_mxm_m2_m1_8points(i,j) = C3_mxm_m2_m1_8points(i,1)*hprimewgll_xx(1,j) + &
+                                      C3_mxm_m2_m1_8points(i,2)*hprimewgll_xx(2,j) + &
+                                      C3_mxm_m2_m1_8points(i,3)*hprimewgll_xx(3,j) + &
+                                      C3_mxm_m2_m1_8points(i,4)*hprimewgll_xx(4,j) + &
+                                      C3_mxm_m2_m1_8points(i,5)*hprimewgll_xx(5,j) + &
+                                      C3_mxm_m2_m1_8points(i,6)*hprimewgll_xx(6,j) + &
+                                      C3_mxm_m2_m1_8points(i,7)*hprimewgll_xx(7,j) + &
+                                      C3_mxm_m2_m1_8points(i,8)*hprimewgll_xx(8,j)
+          enddo
+        enddo
+
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+
+              fac1 = wgllwgll_yz(j,k)
+              fac2 = wgllwgll_xz(i,k)
+              fac3 = wgllwgll_xy(i,j)
+
+              ! sum contributions from each element to the global mesh using indirect addressing
+              iglob = ibool(i,j,k,ispec)
+              accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - &
+                                fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k)
+              accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - &
+                                fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k)
+              accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - &
+                                fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k)
+
+              !  update memory variables based upon the Runge-Kutta scheme
+              if(ATTENUATION) then
+
+                 ! use Runge-Kutta scheme to march in time
+                 do i_sls = 1,N_SLS
+
+                    factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
+
+                    alphaval_loc = alphaval(i_sls)
+                    betaval_loc = betaval(i_sls)
+                    gammaval_loc = gammaval(i_sls)
+
+                    ! term in xx
+                    Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
+                    R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in yy
+                    Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
+                    R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in zz not computed since zero trace
+                    ! term in xy
+                    Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
+                    R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in xz
+                    Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
+                    R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in yz
+                    Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
+                    R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+
+                 enddo   ! end of loop on memory variables
+
+              endif  !  end attenuation
+
+            enddo
+          enddo
+        enddo
+
+        ! save deviatoric strain for Runge-Kutta scheme
+        if ( COMPUTE_AND_STORE_STRAIN ) then
+          epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+          epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
+          epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
+          epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
+          epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
+        endif
+
+  enddo  ! spectral element loop
+
+end subroutine compute_forces_elastic_Dev_8p
+
+!
+!=====================================================================
+!
+
+subroutine compute_forces_elastic_Dev_9p( iphase ,NSPEC_AB,NGLOB_AB, &
+                                    displ,accel, &
+                                    xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                                    hprime_xx,hprime_xxT, &
+                                    hprimewgll_xx,hprimewgll_xxT, &
+                                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                                    kappastore,mustore,jacobian,ibool, &
+                                    ATTENUATION, &
+                                    one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,&
+                                    NSPEC_ATTENUATION_AB, &
+                                    R_xx,R_yy,R_xy,R_xz,R_yz, &
+                                    epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+                                    epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
+                                    ANISOTROPY,NSPEC_ANISO, &
+                                    c11store,c12store,c13store,c14store,c15store,c16store,&
+                                    c22store,c23store,c24store,c25store,c26store,c33store,&
+                                    c34store,c35store,c36store,c44store,c45store,c46store,&
+                                    c55store,c56store,c66store, &
+                                    SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
+                                    NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
+                                    is_moho_top,is_moho_bot, &
+                                    dsdx_top,dsdx_bot, &
+                                    ispec2D_moho_top,ispec2D_moho_bot, &
+                                    num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
+                                    phase_ispec_inner_elastic)
+
+
+! computes elastic tensor term
+
+  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, &
+                      N_SLS,SAVE_MOHO_MESH, &
+                      ONE_THIRD,FOUR_THIRDS,m1,m2
+  implicit none
+
+  integer :: NSPEC_AB,NGLOB_AB
+
+! displacement and acceleration
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
+
+! arrays with mesh parameters per slice
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        kappastore,mustore,jacobian
+
+! array with derivatives of Lagrange polynomials and precalculated products
+  real(kind=CUSTOM_REAL), dimension(NGLLX,9) :: hprime_xx,hprimewgll_xxT
+  real(kind=CUSTOM_REAL), dimension(9,NGLLX) :: hprime_xxT,hprimewgll_xx
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
+
+! memory variables and standard linear solids for attenuation
+  logical :: ATTENUATION
+  logical :: COMPUTE_AND_STORE_STRAIN
+  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
+  integer :: NSPEC_ATTENUATION_AB
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
+  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
+      R_xx,R_yy,R_xy,R_xz,R_yz
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
+       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
+  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
+
+! anisotropy
+  logical :: ANISOTROPY
+  integer :: NSPEC_ANISO
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
+            c11store,c12store,c13store,c14store,c15store,c16store, &
+            c22store,c23store,c24store,c25store,c26store,c33store, &
+            c34store,c35store,c36store,c44store,c45store,c46store, &
+            c55store,c56store,c66store
+
+  integer :: iphase
+  integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
+  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
+
+! adjoint simulations
+  integer :: SIMULATION_TYPE
+  integer :: NSPEC_BOUN,NSPEC2D_MOHO
+
+  ! moho kernel
+  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
+    dsdx_top,dsdx_bot
+  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
+  integer :: ispec2D_moho_top, ispec2D_moho_bot
+
+! local parameters
+  real(kind=CUSTOM_REAL), dimension(9,9,9) :: dummyx_loc,dummyy_loc,dummyz_loc, &
+    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
+  real(kind=CUSTOM_REAL), dimension(9,9,9) :: &
+    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
+
+  ! manually inline the calls to the Deville et al. (2002) routines
+  real(kind=CUSTOM_REAL), dimension(9,81) :: B1_m1_m2_9points,B2_m1_m2_9points,B3_m1_m2_9points
+  real(kind=CUSTOM_REAL), dimension(9,81) :: C1_m1_m2_9points,C2_m1_m2_9points,C3_m1_m2_9points
+  real(kind=CUSTOM_REAL), dimension(9,81) :: E1_m1_m2_9points,E2_m1_m2_9points,E3_m1_m2_9points
+
+  equivalence(dummyx_loc,B1_m1_m2_9points)
+  equivalence(dummyy_loc,B2_m1_m2_9points)
+  equivalence(dummyz_loc,B3_m1_m2_9points)
+  equivalence(tempx1,C1_m1_m2_9points)
+  equivalence(tempy1,C2_m1_m2_9points)
+  equivalence(tempz1,C3_m1_m2_9points)
+  equivalence(newtempx1,E1_m1_m2_9points)
+  equivalence(newtempy1,E2_m1_m2_9points)
+  equivalence(newtempz1,E3_m1_m2_9points)
+
+  real(kind=CUSTOM_REAL), dimension(81,9) :: &
+    A1_mxm_m2_m1_9points,A2_mxm_m2_m1_9points,A3_mxm_m2_m1_9points
+  real(kind=CUSTOM_REAL), dimension(81,9) :: &
+    C1_mxm_m2_m1_9points,C2_mxm_m2_m1_9points,C3_mxm_m2_m1_9points
+  real(kind=CUSTOM_REAL), dimension(81,9) :: &
+    E1_mxm_m2_m1_9points,E2_mxm_m2_m1_9points,E3_mxm_m2_m1_9points
+
+  equivalence(dummyx_loc,A1_mxm_m2_m1_9points)
+  equivalence(dummyy_loc,A2_mxm_m2_m1_9points)
+  equivalence(dummyz_loc,A3_mxm_m2_m1_9points)
+  equivalence(tempx3,C1_mxm_m2_m1_9points)
+  equivalence(tempy3,C2_mxm_m2_m1_9points)
+  equivalence(tempz3,C3_mxm_m2_m1_9points)
+  equivalence(newtempx3,E1_mxm_m2_m1_9points)
+  equivalence(newtempy3,E2_mxm_m2_m1_9points)
+  equivalence(newtempz3,E3_mxm_m2_m1_9points)
+
+  ! local attenuation parameters
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
+       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
+  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
+  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
+  real(kind=CUSTOM_REAL) Sn,Snp1
+  real(kind=CUSTOM_REAL) templ
+
+  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
+
+  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
+  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
+
+  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
+
+  real(kind=CUSTOM_REAL) fac1,fac2,fac3
+
+  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
+  real(kind=CUSTOM_REAL) kappal
+
+  ! local anisotropy parameters
+  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
+                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
+
+  integer i_SLS,imodulo_N_SLS
+  integer ispec,iglob,ispec_p,num_elements
+  integer i,j,k
+
+  imodulo_N_SLS = mod(N_SLS,3)
+
+  ! choses inner/outer elements
+  if( iphase == 1 ) then
+    num_elements = nspec_outer_elastic
+  else
+    num_elements = nspec_inner_elastic
+  endif
+
+  do ispec_p = 1,num_elements
+
+        ! returns element id from stored element list
+        ispec = phase_ispec_inner_elastic(ispec_p,iphase)
+
+        ! adjoint simulations: moho kernel
+        if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then
+          if (is_moho_top(ispec)) then
+            ispec2D_moho_top = ispec2D_moho_top + 1
+          else if (is_moho_bot(ispec)) then
+            ispec2D_moho_bot = ispec2D_moho_bot + 1
+          endif
+        endif ! adjoint
+
+        ! stores displacment values in local array
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+                iglob = ibool(i,j,k,ispec)
+                dummyx_loc(i,j,k) = displ(1,iglob)
+                dummyy_loc(i,j,k) = displ(2,iglob)
+                dummyz_loc(i,j,k) = displ(3,iglob)
+            enddo
+          enddo
+        enddo
+
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+        ! call mxm_m1_m2_9points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
+        do j=1,m2
+          do i=1,m1
+            C1_m1_m2_9points(i,j) = hprime_xx(i,1)*B1_m1_m2_9points(1,j) + &
+                                  hprime_xx(i,2)*B1_m1_m2_9points(2,j) + &
+                                  hprime_xx(i,3)*B1_m1_m2_9points(3,j) + &
+                                  hprime_xx(i,4)*B1_m1_m2_9points(4,j) + &
+                                  hprime_xx(i,5)*B1_m1_m2_9points(5,j) + &
+                                  hprime_xx(i,6)*B1_m1_m2_9points(6,j) + &
+                                  hprime_xx(i,7)*B1_m1_m2_9points(7,j) + &
+                                  hprime_xx(i,8)*B1_m1_m2_9points(8,j) + &
+                                  hprime_xx(i,9)*B1_m1_m2_9points(9,j)
+            C2_m1_m2_9points(i,j) = hprime_xx(i,1)*B2_m1_m2_9points(1,j) + &
+                                  hprime_xx(i,2)*B2_m1_m2_9points(2,j) + &
+                                  hprime_xx(i,3)*B2_m1_m2_9points(3,j) + &
+                                  hprime_xx(i,4)*B2_m1_m2_9points(4,j) + &
+                                  hprime_xx(i,5)*B2_m1_m2_9points(5,j) + &
+                                  hprime_xx(i,6)*B2_m1_m2_9points(6,j) + &
+                                  hprime_xx(i,7)*B2_m1_m2_9points(7,j) + &
+                                  hprime_xx(i,8)*B2_m1_m2_9points(8,j) + &
+                                  hprime_xx(i,9)*B2_m1_m2_9points(9,j)
+            C3_m1_m2_9points(i,j) = hprime_xx(i,1)*B3_m1_m2_9points(1,j) + &
+                                  hprime_xx(i,2)*B3_m1_m2_9points(2,j) + &
+                                  hprime_xx(i,3)*B3_m1_m2_9points(3,j) + &
+                                  hprime_xx(i,4)*B3_m1_m2_9points(4,j) + &
+                                  hprime_xx(i,5)*B3_m1_m2_9points(5,j) + &
+                                  hprime_xx(i,6)*B3_m1_m2_9points(6,j) + &
+                                  hprime_xx(i,7)*B3_m1_m2_9points(7,j) + &
+                                  hprime_xx(i,8)*B3_m1_m2_9points(8,j) + &
+                                  hprime_xx(i,9)*B3_m1_m2_9points(9,j)
+          enddo
+        enddo
+
+        !   call mxm_m1_m1_9points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
+        !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
+        do j=1,m1
+          do i=1,m1
+            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+            do k = 1,NGLLX
+              tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyx_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyx_loc(i,6,k)*hprime_xxT(6,j) + &
+                            dummyx_loc(i,7,k)*hprime_xxT(7,j) + &
+                            dummyx_loc(i,8,k)*hprime_xxT(8,j) + &
+                            dummyx_loc(i,9,k)*hprime_xxT(9,j)
+              tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyy_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyy_loc(i,6,k)*hprime_xxT(6,j) + &
+                            dummyy_loc(i,7,k)*hprime_xxT(7,j) + &
+                            dummyy_loc(i,8,k)*hprime_xxT(8,j) + &
+                            dummyy_loc(i,9,k)*hprime_xxT(9,j)
+              tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyz_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyz_loc(i,6,k)*hprime_xxT(6,j) + &
+                            dummyz_loc(i,7,k)*hprime_xxT(7,j) + &
+                            dummyz_loc(i,8,k)*hprime_xxT(8,j) + &
+                            dummyz_loc(i,9,k)*hprime_xxT(9,j)
+            enddo
+          enddo
+        enddo
+
+        ! call mxm_m2_m1_9points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
+        do j=1,m1
+          do i=1,m2
+            C1_mxm_m2_m1_9points(i,j) = A1_mxm_m2_m1_9points(i,1)*hprime_xxT(1,j) + &
+                                      A1_mxm_m2_m1_9points(i,2)*hprime_xxT(2,j) + &
+                                      A1_mxm_m2_m1_9points(i,3)*hprime_xxT(3,j) + &
+                                      A1_mxm_m2_m1_9points(i,4)*hprime_xxT(4,j) + &
+                                      A1_mxm_m2_m1_9points(i,5)*hprime_xxT(5,j) + &
+                                      A1_mxm_m2_m1_9points(i,6)*hprime_xxT(6,j) + &
+                                      A1_mxm_m2_m1_9points(i,7)*hprime_xxT(7,j) + &
+                                      A1_mxm_m2_m1_9points(i,8)*hprime_xxT(8,j) + &
+                                      A1_mxm_m2_m1_9points(i,9)*hprime_xxT(9,j)
+            C2_mxm_m2_m1_9points(i,j) = A2_mxm_m2_m1_9points(i,1)*hprime_xxT(1,j) + &
+                                      A2_mxm_m2_m1_9points(i,2)*hprime_xxT(2,j) + &
+                                      A2_mxm_m2_m1_9points(i,3)*hprime_xxT(3,j) + &
+                                      A2_mxm_m2_m1_9points(i,4)*hprime_xxT(4,j) + &
+                                      A2_mxm_m2_m1_9points(i,5)*hprime_xxT(5,j) + &
+                                      A2_mxm_m2_m1_9points(i,6)*hprime_xxT(6,j) + &
+                                      A2_mxm_m2_m1_9points(i,7)*hprime_xxT(7,j) + &
+                                      A2_mxm_m2_m1_9points(i,8)*hprime_xxT(8,j) + &
+                                      A2_mxm_m2_m1_9points(i,9)*hprime_xxT(9,j)
+            C3_mxm_m2_m1_9points(i,j) = A3_mxm_m2_m1_9points(i,1)*hprime_xxT(1,j) + &
+                                      A3_mxm_m2_m1_9points(i,2)*hprime_xxT(2,j) + &
+                                      A3_mxm_m2_m1_9points(i,3)*hprime_xxT(3,j) + &
+                                      A3_mxm_m2_m1_9points(i,4)*hprime_xxT(4,j) + &
+                                      A3_mxm_m2_m1_9points(i,5)*hprime_xxT(5,j) + &
+                                      A3_mxm_m2_m1_9points(i,6)*hprime_xxT(6,j) + &
+                                      A3_mxm_m2_m1_9points(i,7)*hprime_xxT(7,j) + &
+                                      A3_mxm_m2_m1_9points(i,8)*hprime_xxT(8,j) + &
+                                      A3_mxm_m2_m1_9points(i,9)*hprime_xxT(9,j)
+          enddo
+        enddo
+
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+              ! get derivatives of ux, uy and uz with respect to x, y and z
+              xixl = xix(i,j,k,ispec)
+              xiyl = xiy(i,j,k,ispec)
+              xizl = xiz(i,j,k,ispec)
+              etaxl = etax(i,j,k,ispec)
+              etayl = etay(i,j,k,ispec)
+              etazl = etaz(i,j,k,ispec)
+              gammaxl = gammax(i,j,k,ispec)
+              gammayl = gammay(i,j,k,ispec)
+              gammazl = gammaz(i,j,k,ispec)
+              jacobianl = jacobian(i,j,k,ispec)
+
+              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
+              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
+              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
+
+              duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
+              duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
+              duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
+
+              duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
+              duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
+              duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
+
+              ! save strain on the Moho boundary
+              if (SAVE_MOHO_MESH ) then
+                if (is_moho_top(ispec)) then
+                  dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
+                  dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
+                  dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
+                  dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
+                  dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
+                  dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
+                  dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
+                  dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
+                  dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
+                else if (is_moho_bot(ispec)) then
+                  dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
+                  dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
+                  dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
+                  dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
+                  dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
+                  dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
+                  dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
+                  dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
+                  dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
+                endif
+              endif
+
+              ! precompute some sums to save CPU time
+              duxdxl_plus_duydyl = duxdxl + duydyl
+              duxdxl_plus_duzdzl = duxdxl + duzdzl
+              duydyl_plus_duzdzl = duydyl + duzdzl
+              duxdyl_plus_duydxl = duxdyl + duydxl
+              duzdxl_plus_duxdzl = duzdxl + duxdzl
+              duzdyl_plus_duydzl = duzdyl + duydzl
+
+              ! computes deviatoric strain attenuation and/or for kernel calculations
+              if (COMPUTE_AND_STORE_STRAIN) then
+                templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
+                if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
+                epsilondev_xx_loc(i,j,k) = duxdxl - templ
+                epsilondev_yy_loc(i,j,k) = duydyl - templ
+                epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
+                epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
+                epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
+              endif
+
+              kappal = kappastore(i,j,k,ispec)
+              mul = mustore(i,j,k,ispec)
+
+              ! attenuation
+              if(ATTENUATION) then
+                ! use unrelaxed parameters if attenuation
+                mul  = mul * one_minus_sum_beta(i,j,k,ispec)
+              endif
+
+  ! full anisotropic case, stress calculations
+              if(ANISOTROPY) then
+                c11 = c11store(i,j,k,ispec)
+                c12 = c12store(i,j,k,ispec)
+                c13 = c13store(i,j,k,ispec)
+                c14 = c14store(i,j,k,ispec)
+                c15 = c15store(i,j,k,ispec)
+                c16 = c16store(i,j,k,ispec)
+                c22 = c22store(i,j,k,ispec)
+                c23 = c23store(i,j,k,ispec)
+                c24 = c24store(i,j,k,ispec)
+                c25 = c25store(i,j,k,ispec)
+                c26 = c26store(i,j,k,ispec)
+                c33 = c33store(i,j,k,ispec)
+                c34 = c34store(i,j,k,ispec)
+                c35 = c35store(i,j,k,ispec)
+                c36 = c36store(i,j,k,ispec)
+                c44 = c44store(i,j,k,ispec)
+                c45 = c45store(i,j,k,ispec)
+                c46 = c46store(i,j,k,ispec)
+                c55 = c55store(i,j,k,ispec)
+                c56 = c56store(i,j,k,ispec)
+                c66 = c66store(i,j,k,ispec)
+
+                sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
+                          c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
+                sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
+                          c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
+                sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
+                          c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
+                sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
+                          c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
+                sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
+                          c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
+                sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
+                          c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
+
+              else
+
+  ! isotropic case
+                lambdalplus2mul = kappal + FOUR_THIRDS * mul
+                lambdal = lambdalplus2mul - 2.*mul
+
+                ! compute stress sigma
+                sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
+                sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
+                sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
+
+                sigma_xy = mul*duxdyl_plus_duydxl
+                sigma_xz = mul*duzdxl_plus_duxdzl
+                sigma_yz = mul*duzdyl_plus_duydzl
+
+              endif ! ANISOTROPY
+
+              ! subtract memory variables if attenuation
+              if(ATTENUATION) then
+! way 1
+!                do i_sls = 1,N_SLS
+!                  R_xx_val = R_xx(i,j,k,ispec,i_sls)
+!                  R_yy_val = R_yy(i,j,k,ispec,i_sls)
+!                  sigma_xx = sigma_xx - R_xx_val
+!                  sigma_yy = sigma_yy - R_yy_val
+!                  sigma_zz = sigma_zz + R_xx_val + R_yy_val
+!                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+!                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+!                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+!                enddo
+
+! way 2
+! note: this should help compilers to pipeline the code and make better use of the cache;
+!          depending on compilers, it can further decrease the computation time by ~ 30%.
+!          by default, N_SLS = 3, therefore we take steps of 3
+              if(imodulo_N_SLS >= 1) then
+                do i_sls = 1,imodulo_N_SLS
+                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                  sigma_xx = sigma_xx - R_xx_val1
+                  sigma_yy = sigma_yy - R_yy_val1
+                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+                enddo
+              endif
+
+              if(N_SLS >= imodulo_N_SLS+1) then
+                do i_sls = imodulo_N_SLS+1,N_SLS,3
+                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                  sigma_xx = sigma_xx - R_xx_val1
+                  sigma_yy = sigma_yy - R_yy_val1
+                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+
+                  R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1)
+                  R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1)
+                  sigma_xx = sigma_xx - R_xx_val2
+                  sigma_yy = sigma_yy - R_yy_val2
+                  sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1)
+
+                  R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2)
+                  R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2)
+                  sigma_xx = sigma_xx - R_xx_val3
+                  sigma_yy = sigma_yy - R_yy_val3
+                  sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2)
+                enddo
+              endif
+
+
+              endif
+
+            ! define symmetric components of sigma
+            sigma_yx = sigma_xy
+            sigma_zx = sigma_xz
+            sigma_zy = sigma_yz
+
+            ! form dot product with test vector, non-symmetric form (which is useful in the case of PML)
+            tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
+            tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
+            tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
+
+            tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
+            tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
+            tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
+
+            tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
+            tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
+            tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
+
+            enddo
+          enddo
+        enddo
+
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+        ! call mxm_m1_m2_9points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
+        do j=1,m2
+          do i=1,m1
+            E1_m1_m2_9points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_9points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C1_m1_m2_9points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C1_m1_m2_9points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C1_m1_m2_9points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C1_m1_m2_9points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C1_m1_m2_9points(6,j) + &
+                                  hprimewgll_xxT(i,7)*C1_m1_m2_9points(7,j) + &
+                                  hprimewgll_xxT(i,8)*C1_m1_m2_9points(8,j) + &
+                                  hprimewgll_xxT(i,9)*C1_m1_m2_9points(9,j)
+            E2_m1_m2_9points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_9points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C2_m1_m2_9points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C2_m1_m2_9points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C2_m1_m2_9points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C2_m1_m2_9points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C2_m1_m2_9points(6,j) + &
+                                  hprimewgll_xxT(i,7)*C2_m1_m2_9points(7,j) + &
+                                  hprimewgll_xxT(i,8)*C2_m1_m2_9points(8,j) + &
+                                  hprimewgll_xxT(i,9)*C2_m1_m2_9points(9,j)
+            E3_m1_m2_9points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_9points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C3_m1_m2_9points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C3_m1_m2_9points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C3_m1_m2_9points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C3_m1_m2_9points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C3_m1_m2_9points(6,j) + &
+                                  hprimewgll_xxT(i,7)*C3_m1_m2_9points(7,j) + &
+                                  hprimewgll_xxT(i,8)*C3_m1_m2_9points(8,j) + &
+                                  hprimewgll_xxT(i,9)*C3_m1_m2_9points(9,j)
+          enddo
+        enddo
+
+        !   call mxm_m1_m1_9points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
+        !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
+        do i=1,m1
+          do j=1,m1
+            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+            do k = 1,NGLLX
+              newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempx2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempx2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempx2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempx2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempx2(i,6,k)*hprimewgll_xx(6,j) + &
+                               tempx2(i,7,k)*hprimewgll_xx(7,j) + &
+                               tempx2(i,8,k)*hprimewgll_xx(8,j) + &
+                               tempx2(i,9,k)*hprimewgll_xx(9,j)
+              newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempy2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempy2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempy2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempy2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempy2(i,6,k)*hprimewgll_xx(6,j) + &
+                               tempy2(i,7,k)*hprimewgll_xx(7,j) + &
+                               tempy2(i,8,k)*hprimewgll_xx(8,j) + &
+                               tempy2(i,9,k)*hprimewgll_xx(9,j)
+              newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempz2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempz2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempz2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempz2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempz2(i,6,k)*hprimewgll_xx(6,j) + &
+                               tempz2(i,7,k)*hprimewgll_xx(7,j) + &
+                               tempz2(i,8,k)*hprimewgll_xx(8,j) + &
+                               tempz2(i,9,k)*hprimewgll_xx(9,j)
+            enddo
+          enddo
+        enddo
+
+        ! call mxm_m2_m1_9points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
+        do j=1,m1
+          do i=1,m2
+            E1_mxm_m2_m1_9points(i,j) = C1_mxm_m2_m1_9points(i,1)*hprimewgll_xx(1,j) + &
+                                      C1_mxm_m2_m1_9points(i,2)*hprimewgll_xx(2,j) + &
+                                      C1_mxm_m2_m1_9points(i,3)*hprimewgll_xx(3,j) + &
+                                      C1_mxm_m2_m1_9points(i,4)*hprimewgll_xx(4,j) + &
+                                      C1_mxm_m2_m1_9points(i,5)*hprimewgll_xx(5,j) + &
+                                      C1_mxm_m2_m1_9points(i,6)*hprimewgll_xx(6,j) + &
+                                      C1_mxm_m2_m1_9points(i,7)*hprimewgll_xx(7,j) + &
+                                      C1_mxm_m2_m1_9points(i,8)*hprimewgll_xx(8,j) + &
+                                      C1_mxm_m2_m1_9points(i,9)*hprimewgll_xx(9,j)
+            E2_mxm_m2_m1_9points(i,j) = C2_mxm_m2_m1_9points(i,1)*hprimewgll_xx(1,j) + &
+                                      C2_mxm_m2_m1_9points(i,2)*hprimewgll_xx(2,j) + &
+                                      C2_mxm_m2_m1_9points(i,3)*hprimewgll_xx(3,j) + &
+                                      C2_mxm_m2_m1_9points(i,4)*hprimewgll_xx(4,j) + &
+                                      C2_mxm_m2_m1_9points(i,5)*hprimewgll_xx(5,j) + &
+                                      C2_mxm_m2_m1_9points(i,6)*hprimewgll_xx(6,j) + &
+                                      C2_mxm_m2_m1_9points(i,7)*hprimewgll_xx(7,j) + &
+                                      C2_mxm_m2_m1_9points(i,8)*hprimewgll_xx(8,j) + &
+                                      C2_mxm_m2_m1_9points(i,9)*hprimewgll_xx(9,j)
+            E3_mxm_m2_m1_9points(i,j) = C3_mxm_m2_m1_9points(i,1)*hprimewgll_xx(1,j) + &
+                                      C3_mxm_m2_m1_9points(i,2)*hprimewgll_xx(2,j) + &
+                                      C3_mxm_m2_m1_9points(i,3)*hprimewgll_xx(3,j) + &
+                                      C3_mxm_m2_m1_9points(i,4)*hprimewgll_xx(4,j) + &
+                                      C3_mxm_m2_m1_9points(i,5)*hprimewgll_xx(5,j) + &
+                                      C3_mxm_m2_m1_9points(i,6)*hprimewgll_xx(6,j) + &
+                                      C3_mxm_m2_m1_9points(i,7)*hprimewgll_xx(7,j) + &
+                                      C3_mxm_m2_m1_9points(i,8)*hprimewgll_xx(8,j) + &
+                                      C3_mxm_m2_m1_9points(i,9)*hprimewgll_xx(9,j)
+          enddo
+        enddo
+
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+
+              fac1 = wgllwgll_yz(j,k)
+              fac2 = wgllwgll_xz(i,k)
+              fac3 = wgllwgll_xy(i,j)
+
+              ! sum contributions from each element to the global mesh using indirect addressing
+              iglob = ibool(i,j,k,ispec)
+              accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - &
+                                fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k)
+              accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - &
+                                fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k)
+              accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - &
+                                fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k)
+
+              !  update memory variables based upon the Runge-Kutta scheme
+              if(ATTENUATION) then
+
+                 ! use Runge-Kutta scheme to march in time
+                 do i_sls = 1,N_SLS
+
+                    factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
+
+                    alphaval_loc = alphaval(i_sls)
+                    betaval_loc = betaval(i_sls)
+                    gammaval_loc = gammaval(i_sls)
+
+                    ! term in xx
+                    Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
+                    R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in yy
+                    Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
+                    R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in zz not computed since zero trace
+                    ! term in xy
+                    Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
+                    R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in xz
+                    Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
+                    R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in yz
+                    Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
+                    R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+
+                 enddo   ! end of loop on memory variables
+
+              endif  !  end attenuation
+
+            enddo
+          enddo
+        enddo
+
+        ! save deviatoric strain for Runge-Kutta scheme
+        if ( COMPUTE_AND_STORE_STRAIN ) then
+          epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+          epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
+          epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
+          epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
+          epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
+        endif
+
+  enddo  ! spectral element loop
+
+end subroutine compute_forces_elastic_Dev_9p
+
+!
+!=====================================================================
+!
+
+subroutine compute_forces_elastic_Dev_10p( iphase ,NSPEC_AB,NGLOB_AB, &
+                                    displ,accel, &
+                                    xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                                    hprime_xx,hprime_xxT, &
+                                    hprimewgll_xx,hprimewgll_xxT, &
+                                    wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                                    kappastore,mustore,jacobian,ibool, &
+                                    ATTENUATION, &
+                                    one_minus_sum_beta,factor_common,alphaval,betaval,gammaval,&
+                                    NSPEC_ATTENUATION_AB, &
+                                    R_xx,R_yy,R_xy,R_xz,R_yz, &
+                                    epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+                                    epsilondev_xz,epsilondev_yz,epsilon_trace_over_3, &
+                                    ANISOTROPY,NSPEC_ANISO, &
+                                    c11store,c12store,c13store,c14store,c15store,c16store,&
+                                    c22store,c23store,c24store,c25store,c26store,c33store,&
+                                    c34store,c35store,c36store,c44store,c45store,c46store,&
+                                    c55store,c56store,c66store, &
+                                    SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,NSPEC_STRAIN_ONLY, &
+                                    NSPEC_BOUN,NSPEC2D_MOHO,NSPEC_ADJOINT, &
+                                    is_moho_top,is_moho_bot, &
+                                    dsdx_top,dsdx_bot, &
+                                    ispec2D_moho_top,ispec2D_moho_bot, &
+                                    num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic,&
+                                    phase_ispec_inner_elastic)
+
+
+! computes elastic tensor term
+
+  use constants,only: CUSTOM_REAL,NGLLX,NGLLY,NGLLZ,NDIM, &
+                      N_SLS,SAVE_MOHO_MESH, &
+                      ONE_THIRD,FOUR_THIRDS,m1,m2
+  implicit none
+
+  integer :: NSPEC_AB,NGLOB_AB
+
+! displacement and acceleration
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: displ,accel
+
+! arrays with mesh parameters per slice
+  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: ibool
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_AB) :: &
+        kappastore,mustore,jacobian
+
+! array with derivatives of Lagrange polynomials and precalculated products
+  real(kind=CUSTOM_REAL), dimension(NGLLX,10) :: hprime_xx,hprimewgll_xxT
+  real(kind=CUSTOM_REAL), dimension(10,NGLLX) :: hprime_xxT,hprimewgll_xx
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
+  real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
+
+! memory variables and standard linear solids for attenuation
+  logical :: ATTENUATION
+  logical :: COMPUTE_AND_STORE_STRAIN
+  integer :: NSPEC_STRAIN_ONLY, NSPEC_ADJOINT
+  integer :: NSPEC_ATTENUATION_AB
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: one_minus_sum_beta
+  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB) :: factor_common
+  real(kind=CUSTOM_REAL), dimension(N_SLS) :: alphaval,betaval,gammaval
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS) :: &
+      R_xx,R_yy,R_xy,R_xz,R_yz
+
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_STRAIN_ONLY) :: &
+       epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz
+  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT) :: epsilon_trace_over_3
+
+! anisotropy
+  logical :: ANISOTROPY
+  integer :: NSPEC_ANISO
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ANISO) :: &
+            c11store,c12store,c13store,c14store,c15store,c16store, &
+            c22store,c23store,c24store,c25store,c26store,c33store, &
+            c34store,c35store,c36store,c44store,c45store,c46store, &
+            c55store,c56store,c66store
+
+  integer :: iphase
+  integer :: num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic
+  integer, dimension(num_phase_ispec_elastic,2) :: phase_ispec_inner_elastic
+
+! adjoint simulations
+  integer :: SIMULATION_TYPE
+  integer :: NSPEC_BOUN,NSPEC2D_MOHO
+
+  ! moho kernel
+  real(kind=CUSTOM_REAL),dimension(NDIM,NDIM,NGLLX,NGLLY,NGLLZ,NSPEC2D_MOHO):: &
+    dsdx_top,dsdx_bot
+  logical,dimension(NSPEC_BOUN) :: is_moho_top,is_moho_bot
+  integer :: ispec2D_moho_top, ispec2D_moho_bot
+
+! local parameters
+  real(kind=CUSTOM_REAL), dimension(10,10,10) :: dummyx_loc,dummyy_loc,dummyz_loc, &
+    newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
+  real(kind=CUSTOM_REAL), dimension(10,10,10) :: &
+    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
+
+  ! manually inline the calls to the Deville et al. (2002) routines
+  real(kind=CUSTOM_REAL), dimension(10,100) :: B1_m1_m2_10points,B2_m1_m2_10points,B3_m1_m2_10points
+  real(kind=CUSTOM_REAL), dimension(10,100) :: C1_m1_m2_10points,C2_m1_m2_10points,C3_m1_m2_10points
+  real(kind=CUSTOM_REAL), dimension(10,100) :: E1_m1_m2_10points,E2_m1_m2_10points,E3_m1_m2_10points
+
+  equivalence(dummyx_loc,B1_m1_m2_10points)
+  equivalence(dummyy_loc,B2_m1_m2_10points)
+  equivalence(dummyz_loc,B3_m1_m2_10points)
+  equivalence(tempx1,C1_m1_m2_10points)
+  equivalence(tempy1,C2_m1_m2_10points)
+  equivalence(tempz1,C3_m1_m2_10points)
+  equivalence(newtempx1,E1_m1_m2_10points)
+  equivalence(newtempy1,E2_m1_m2_10points)
+  equivalence(newtempz1,E3_m1_m2_10points)
+
+  real(kind=CUSTOM_REAL), dimension(100,10) :: &
+    A1_mxm_m2_m1_10points,A2_mxm_m2_m1_10points,A3_mxm_m2_m1_10points
+  real(kind=CUSTOM_REAL), dimension(100,10) :: &
+    C1_mxm_m2_m1_10points,C2_mxm_m2_m1_10points,C3_mxm_m2_m1_10points
+  real(kind=CUSTOM_REAL), dimension(100,10) :: &
+    E1_mxm_m2_m1_10points,E2_mxm_m2_m1_10points,E3_mxm_m2_m1_10points
+
+  equivalence(dummyx_loc,A1_mxm_m2_m1_10points)
+  equivalence(dummyy_loc,A2_mxm_m2_m1_10points)
+  equivalence(dummyz_loc,A3_mxm_m2_m1_10points)
+  equivalence(tempx3,C1_mxm_m2_m1_10points)
+  equivalence(tempy3,C2_mxm_m2_m1_10points)
+  equivalence(tempz3,C3_mxm_m2_m1_10points)
+  equivalence(newtempx3,E1_mxm_m2_m1_10points)
+  equivalence(newtempy3,E2_mxm_m2_m1_10points)
+  equivalence(newtempz3,E3_mxm_m2_m1_10points)
+
+  ! local attenuation parameters
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: epsilondev_xx_loc, &
+       epsilondev_yy_loc, epsilondev_xy_loc, epsilondev_xz_loc, epsilondev_yz_loc
+  real(kind=CUSTOM_REAL) R_xx_val1,R_yy_val1,R_xx_val2,R_yy_val2,R_xx_val3,R_yy_val3
+  real(kind=CUSTOM_REAL) factor_loc,alphaval_loc,betaval_loc,gammaval_loc
+  real(kind=CUSTOM_REAL) Sn,Snp1
+  real(kind=CUSTOM_REAL) templ
+
+  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
+
+  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
+  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
+
+  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
+
+  real(kind=CUSTOM_REAL) fac1,fac2,fac3
+
+  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
+  real(kind=CUSTOM_REAL) kappal
+
+  ! local anisotropy parameters
+  real(kind=CUSTOM_REAL) c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,&
+                        c33,c34,c35,c36,c44,c45,c46,c55,c56,c66
+
+  integer i_SLS,imodulo_N_SLS
+  integer ispec,iglob,ispec_p,num_elements
+  integer i,j,k
+
+  imodulo_N_SLS = mod(N_SLS,3)
+
+  ! choses inner/outer elements
+  if( iphase == 1 ) then
+    num_elements = nspec_outer_elastic
+  else
+    num_elements = nspec_inner_elastic
+  endif
+
+  do ispec_p = 1,num_elements
+
+        ! returns element id from stored element list
+        ispec = phase_ispec_inner_elastic(ispec_p,iphase)
+
+        ! adjoint simulations: moho kernel
+        if( SIMULATION_TYPE == 3 .and. SAVE_MOHO_MESH ) then
+          if (is_moho_top(ispec)) then
+            ispec2D_moho_top = ispec2D_moho_top + 1
+          else if (is_moho_bot(ispec)) then
+            ispec2D_moho_bot = ispec2D_moho_bot + 1
+          endif
+        endif ! adjoint
+
+        ! stores displacment values in local array
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+                iglob = ibool(i,j,k,ispec)
+                dummyx_loc(i,j,k) = displ(1,iglob)
+                dummyy_loc(i,j,k) = displ(2,iglob)
+                dummyz_loc(i,j,k) = displ(3,iglob)
+            enddo
+          enddo
+        enddo
+
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+        ! call mxm_m1_m2_10points(hprime_xx,dummyx_loc,dummyy_loc,dummyz_loc,tempx1,tempy1,tempz1)
+        do j=1,m2
+          do i=1,m1
+            C1_m1_m2_10points(i,j) = hprime_xx(i,1)*B1_m1_m2_10points(1,j) + &
+                                  hprime_xx(i,2)*B1_m1_m2_10points(2,j) + &
+                                  hprime_xx(i,3)*B1_m1_m2_10points(3,j) + &
+                                  hprime_xx(i,4)*B1_m1_m2_10points(4,j) + &
+                                  hprime_xx(i,5)*B1_m1_m2_10points(5,j) + &
+                                  hprime_xx(i,6)*B1_m1_m2_10points(6,j) + &
+                                  hprime_xx(i,7)*B1_m1_m2_10points(7,j) + &
+                                  hprime_xx(i,8)*B1_m1_m2_10points(8,j) + &
+                                  hprime_xx(i,9)*B1_m1_m2_10points(9,j) + &
+                                  hprime_xx(i,10)*B1_m1_m2_10points(10,j)
+            C2_m1_m2_10points(i,j) = hprime_xx(i,1)*B2_m1_m2_10points(1,j) + &
+                                  hprime_xx(i,2)*B2_m1_m2_10points(2,j) + &
+                                  hprime_xx(i,3)*B2_m1_m2_10points(3,j) + &
+                                  hprime_xx(i,4)*B2_m1_m2_10points(4,j) + &
+                                  hprime_xx(i,5)*B2_m1_m2_10points(5,j) + &
+                                  hprime_xx(i,6)*B2_m1_m2_10points(6,j) + &
+                                  hprime_xx(i,7)*B2_m1_m2_10points(7,j) + &
+                                  hprime_xx(i,8)*B2_m1_m2_10points(8,j) + &
+                                  hprime_xx(i,9)*B2_m1_m2_10points(9,j) + &
+                                  hprime_xx(i,10)*B2_m1_m2_10points(10,j)
+            C3_m1_m2_10points(i,j) = hprime_xx(i,1)*B3_m1_m2_10points(1,j) + &
+                                  hprime_xx(i,2)*B3_m1_m2_10points(2,j) + &
+                                  hprime_xx(i,3)*B3_m1_m2_10points(3,j) + &
+                                  hprime_xx(i,4)*B3_m1_m2_10points(4,j) + &
+                                  hprime_xx(i,5)*B3_m1_m2_10points(5,j) + &
+                                  hprime_xx(i,6)*B3_m1_m2_10points(6,j) + &
+                                  hprime_xx(i,7)*B3_m1_m2_10points(7,j) + &
+                                  hprime_xx(i,8)*B3_m1_m2_10points(8,j) + &
+                                  hprime_xx(i,9)*B3_m1_m2_10points(9,j) + &
+                                  hprime_xx(i,10)*B3_m1_m2_10points(10,j)
+          enddo
+        enddo
+
+        !   call mxm_m1_m1_10points(dummyx_loc(1,1,k),dummyy_loc(1,1,k),dummyz_loc(1,1,k), &
+        !          hprime_xxT,tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k))
+        do j=1,m1
+          do i=1,m1
+            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+            do k = 1,NGLLX
+              tempx2(i,j,k) = dummyx_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyx_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyx_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyx_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyx_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyx_loc(i,6,k)*hprime_xxT(6,j) + &
+                            dummyx_loc(i,7,k)*hprime_xxT(7,j) + &
+                            dummyx_loc(i,8,k)*hprime_xxT(8,j) + &
+                            dummyx_loc(i,9,k)*hprime_xxT(9,j) + &
+                            dummyx_loc(i,10,k)*hprime_xxT(10,j)
+              tempy2(i,j,k) = dummyy_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyy_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyy_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyy_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyy_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyy_loc(i,6,k)*hprime_xxT(6,j) + &
+                            dummyy_loc(i,7,k)*hprime_xxT(7,j) + &
+                            dummyy_loc(i,8,k)*hprime_xxT(8,j) + &
+                            dummyy_loc(i,9,k)*hprime_xxT(9,j) + &
+                            dummyy_loc(i,10,k)*hprime_xxT(10,j)
+              tempz2(i,j,k) = dummyz_loc(i,1,k)*hprime_xxT(1,j) + &
+                            dummyz_loc(i,2,k)*hprime_xxT(2,j) + &
+                            dummyz_loc(i,3,k)*hprime_xxT(3,j) + &
+                            dummyz_loc(i,4,k)*hprime_xxT(4,j) + &
+                            dummyz_loc(i,5,k)*hprime_xxT(5,j) + &
+                            dummyz_loc(i,6,k)*hprime_xxT(6,j) + &
+                            dummyz_loc(i,7,k)*hprime_xxT(7,j) + &
+                            dummyz_loc(i,8,k)*hprime_xxT(8,j) + &
+                            dummyz_loc(i,9,k)*hprime_xxT(9,j) + &
+                            dummyz_loc(i,10,k)*hprime_xxT(10,j)
+            enddo
+          enddo
+        enddo
+
+        ! call mxm_m2_m1_10points(dummyx_loc,dummyy_loc,dummyz_loc,tempx3,tempy3,tempz3)
+        do j=1,m1
+          do i=1,m2
+            C1_mxm_m2_m1_10points(i,j) = A1_mxm_m2_m1_10points(i,1)*hprime_xxT(1,j) + &
+                                      A1_mxm_m2_m1_10points(i,2)*hprime_xxT(2,j) + &
+                                      A1_mxm_m2_m1_10points(i,3)*hprime_xxT(3,j) + &
+                                      A1_mxm_m2_m1_10points(i,4)*hprime_xxT(4,j) + &
+                                      A1_mxm_m2_m1_10points(i,5)*hprime_xxT(5,j) + &
+                                      A1_mxm_m2_m1_10points(i,6)*hprime_xxT(6,j) + &
+                                      A1_mxm_m2_m1_10points(i,7)*hprime_xxT(7,j) + &
+                                      A1_mxm_m2_m1_10points(i,8)*hprime_xxT(8,j) + &
+                                      A1_mxm_m2_m1_10points(i,9)*hprime_xxT(9,j) + &
+                                      A1_mxm_m2_m1_10points(i,10)*hprime_xxT(10,j)
+            C2_mxm_m2_m1_10points(i,j) = A2_mxm_m2_m1_10points(i,1)*hprime_xxT(1,j) + &
+                                      A2_mxm_m2_m1_10points(i,2)*hprime_xxT(2,j) + &
+                                      A2_mxm_m2_m1_10points(i,3)*hprime_xxT(3,j) + &
+                                      A2_mxm_m2_m1_10points(i,4)*hprime_xxT(4,j) + &
+                                      A2_mxm_m2_m1_10points(i,5)*hprime_xxT(5,j) + &
+                                      A2_mxm_m2_m1_10points(i,6)*hprime_xxT(6,j) + &
+                                      A2_mxm_m2_m1_10points(i,7)*hprime_xxT(7,j) + &
+                                      A2_mxm_m2_m1_10points(i,8)*hprime_xxT(8,j) + &
+                                      A2_mxm_m2_m1_10points(i,9)*hprime_xxT(9,j) + &
+                                      A2_mxm_m2_m1_10points(i,10)*hprime_xxT(10,j)
+            C3_mxm_m2_m1_10points(i,j) = A3_mxm_m2_m1_10points(i,1)*hprime_xxT(1,j) + &
+                                      A3_mxm_m2_m1_10points(i,2)*hprime_xxT(2,j) + &
+                                      A3_mxm_m2_m1_10points(i,3)*hprime_xxT(3,j) + &
+                                      A3_mxm_m2_m1_10points(i,4)*hprime_xxT(4,j) + &
+                                      A3_mxm_m2_m1_10points(i,5)*hprime_xxT(5,j) + &
+                                      A3_mxm_m2_m1_10points(i,6)*hprime_xxT(6,j) + &
+                                      A3_mxm_m2_m1_10points(i,7)*hprime_xxT(7,j) + &
+                                      A3_mxm_m2_m1_10points(i,8)*hprime_xxT(8,j) + &
+                                      A3_mxm_m2_m1_10points(i,9)*hprime_xxT(9,j) + &
+                                      A3_mxm_m2_m1_10points(i,10)*hprime_xxT(10,j)
+          enddo
+        enddo
+
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+              ! get derivatives of ux, uy and uz with respect to x, y and z
+              xixl = xix(i,j,k,ispec)
+              xiyl = xiy(i,j,k,ispec)
+              xizl = xiz(i,j,k,ispec)
+              etaxl = etax(i,j,k,ispec)
+              etayl = etay(i,j,k,ispec)
+              etazl = etaz(i,j,k,ispec)
+              gammaxl = gammax(i,j,k,ispec)
+              gammayl = gammay(i,j,k,ispec)
+              gammazl = gammaz(i,j,k,ispec)
+              jacobianl = jacobian(i,j,k,ispec)
+
+              duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
+              duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
+              duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
+
+              duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
+              duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
+              duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
+
+              duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
+              duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
+              duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
+
+              ! save strain on the Moho boundary
+              if (SAVE_MOHO_MESH ) then
+                if (is_moho_top(ispec)) then
+                  dsdx_top(1,1,i,j,k,ispec2D_moho_top) = duxdxl
+                  dsdx_top(1,2,i,j,k,ispec2D_moho_top) = duxdyl
+                  dsdx_top(1,3,i,j,k,ispec2D_moho_top) = duxdzl
+                  dsdx_top(2,1,i,j,k,ispec2D_moho_top) = duydxl
+                  dsdx_top(2,2,i,j,k,ispec2D_moho_top) = duydyl
+                  dsdx_top(2,3,i,j,k,ispec2D_moho_top) = duydzl
+                  dsdx_top(3,1,i,j,k,ispec2D_moho_top) = duzdxl
+                  dsdx_top(3,2,i,j,k,ispec2D_moho_top) = duzdyl
+                  dsdx_top(3,3,i,j,k,ispec2D_moho_top) = duzdzl
+                else if (is_moho_bot(ispec)) then
+                  dsdx_bot(1,1,i,j,k,ispec2D_moho_bot) = duxdxl
+                  dsdx_bot(1,2,i,j,k,ispec2D_moho_bot) = duxdyl
+                  dsdx_bot(1,3,i,j,k,ispec2D_moho_bot) = duxdzl
+                  dsdx_bot(2,1,i,j,k,ispec2D_moho_bot) = duydxl
+                  dsdx_bot(2,2,i,j,k,ispec2D_moho_bot) = duydyl
+                  dsdx_bot(2,3,i,j,k,ispec2D_moho_bot) = duydzl
+                  dsdx_bot(3,1,i,j,k,ispec2D_moho_bot) = duzdxl
+                  dsdx_bot(3,2,i,j,k,ispec2D_moho_bot) = duzdyl
+                  dsdx_bot(3,3,i,j,k,ispec2D_moho_bot) = duzdzl
+                endif
+              endif
+
+              ! precompute some sums to save CPU time
+              duxdxl_plus_duydyl = duxdxl + duydyl
+              duxdxl_plus_duzdzl = duxdxl + duzdzl
+              duydyl_plus_duzdzl = duydyl + duzdzl
+              duxdyl_plus_duydxl = duxdyl + duydxl
+              duzdxl_plus_duxdzl = duzdxl + duxdzl
+              duzdyl_plus_duydzl = duzdyl + duydzl
+
+              ! computes deviatoric strain attenuation and/or for kernel calculations
+              if (COMPUTE_AND_STORE_STRAIN) then
+                templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
+                if( SIMULATION_TYPE == 3 ) epsilon_trace_over_3(i,j,k,ispec) = templ
+                epsilondev_xx_loc(i,j,k) = duxdxl - templ
+                epsilondev_yy_loc(i,j,k) = duydyl - templ
+                epsilondev_xy_loc(i,j,k) = 0.5 * duxdyl_plus_duydxl
+                epsilondev_xz_loc(i,j,k) = 0.5 * duzdxl_plus_duxdzl
+                epsilondev_yz_loc(i,j,k) = 0.5 * duzdyl_plus_duydzl
+              endif
+
+              kappal = kappastore(i,j,k,ispec)
+              mul = mustore(i,j,k,ispec)
+
+              ! attenuation
+              if(ATTENUATION) then
+                ! use unrelaxed parameters if attenuation
+                mul  = mul * one_minus_sum_beta(i,j,k,ispec)
+              endif
+
+  ! full anisotropic case, stress calculations
+              if(ANISOTROPY) then
+                c11 = c11store(i,j,k,ispec)
+                c12 = c12store(i,j,k,ispec)
+                c13 = c13store(i,j,k,ispec)
+                c14 = c14store(i,j,k,ispec)
+                c15 = c15store(i,j,k,ispec)
+                c16 = c16store(i,j,k,ispec)
+                c22 = c22store(i,j,k,ispec)
+                c23 = c23store(i,j,k,ispec)
+                c24 = c24store(i,j,k,ispec)
+                c25 = c25store(i,j,k,ispec)
+                c26 = c26store(i,j,k,ispec)
+                c33 = c33store(i,j,k,ispec)
+                c34 = c34store(i,j,k,ispec)
+                c35 = c35store(i,j,k,ispec)
+                c36 = c36store(i,j,k,ispec)
+                c44 = c44store(i,j,k,ispec)
+                c45 = c45store(i,j,k,ispec)
+                c46 = c46store(i,j,k,ispec)
+                c55 = c55store(i,j,k,ispec)
+                c56 = c56store(i,j,k,ispec)
+                c66 = c66store(i,j,k,ispec)
+
+                sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
+                          c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
+                sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
+                          c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
+                sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
+                          c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
+                sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
+                          c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
+                sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
+                          c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
+                sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
+                          c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
+
+              else
+
+  ! isotropic case
+                lambdalplus2mul = kappal + FOUR_THIRDS * mul
+                lambdal = lambdalplus2mul - 2.*mul
+
+                ! compute stress sigma
+                sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
+                sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
+                sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
+
+                sigma_xy = mul*duxdyl_plus_duydxl
+                sigma_xz = mul*duzdxl_plus_duxdzl
+                sigma_yz = mul*duzdyl_plus_duydzl
+
+              endif ! ANISOTROPY
+
+              ! subtract memory variables if attenuation
+              if(ATTENUATION) then
+! way 1
+!                do i_sls = 1,N_SLS
+!                  R_xx_val = R_xx(i,j,k,ispec,i_sls)
+!                  R_yy_val = R_yy(i,j,k,ispec,i_sls)
+!                  sigma_xx = sigma_xx - R_xx_val
+!                  sigma_yy = sigma_yy - R_yy_val
+!                  sigma_zz = sigma_zz + R_xx_val + R_yy_val
+!                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+!                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+!                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+!                enddo
+
+! way 2
+! note: this should help compilers to pipeline the code and make better use of the cache;
+!          depending on compilers, it can further decrease the computation time by ~ 30%.
+!          by default, N_SLS = 3, therefore we take steps of 3
+              if(imodulo_N_SLS >= 1) then
+                do i_sls = 1,imodulo_N_SLS
+                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                  sigma_xx = sigma_xx - R_xx_val1
+                  sigma_yy = sigma_yy - R_yy_val1
+                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+                enddo
+              endif
+
+              if(N_SLS >= imodulo_N_SLS+1) then
+                do i_sls = imodulo_N_SLS+1,N_SLS,3
+                  R_xx_val1 = R_xx(i,j,k,ispec,i_sls)
+                  R_yy_val1 = R_yy(i,j,k,ispec,i_sls)
+                  sigma_xx = sigma_xx - R_xx_val1
+                  sigma_yy = sigma_yy - R_yy_val1
+                  sigma_zz = sigma_zz + R_xx_val1 + R_yy_val1
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls)
+
+                  R_xx_val2 = R_xx(i,j,k,ispec,i_sls+1)
+                  R_yy_val2 = R_yy(i,j,k,ispec,i_sls+1)
+                  sigma_xx = sigma_xx - R_xx_val2
+                  sigma_yy = sigma_yy - R_yy_val2
+                  sigma_zz = sigma_zz + R_xx_val2 + R_yy_val2
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+1)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+1)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+1)
+
+                  R_xx_val3 = R_xx(i,j,k,ispec,i_sls+2)
+                  R_yy_val3 = R_yy(i,j,k,ispec,i_sls+2)
+                  sigma_xx = sigma_xx - R_xx_val3
+                  sigma_yy = sigma_yy - R_yy_val3
+                  sigma_zz = sigma_zz + R_xx_val3 + R_yy_val3
+                  sigma_xy = sigma_xy - R_xy(i,j,k,ispec,i_sls+2)
+                  sigma_xz = sigma_xz - R_xz(i,j,k,ispec,i_sls+2)
+                  sigma_yz = sigma_yz - R_yz(i,j,k,ispec,i_sls+2)
+                enddo
+              endif
+
+
+              endif
+
+            ! define symmetric components of sigma
+            sigma_yx = sigma_xy
+            sigma_zx = sigma_xz
+            sigma_zy = sigma_yz
+
+            ! form dot product with test vector, non-symmetric form (which is useful in the case of PML)
+            tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
+            tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
+            tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
+
+            tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
+            tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
+            tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
+
+            tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
+            tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
+            tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
+
+            enddo
+          enddo
+        enddo
+
+    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
+    ! for incompressible fluid flow, Cambridge University Press (2002),
+    ! pages 386 and 389 and Figure 8.3.1
+        ! call mxm_m1_m2_10points(hprimewgll_xxT,tempx1,tempy1,tempz1,newtempx1,newtempy1,newtempz1)
+        do j=1,m2
+          do i=1,m1
+            E1_m1_m2_10points(i,j) = hprimewgll_xxT(i,1)*C1_m1_m2_10points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C1_m1_m2_10points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C1_m1_m2_10points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C1_m1_m2_10points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C1_m1_m2_10points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C1_m1_m2_10points(6,j) + &
+                                  hprimewgll_xxT(i,7)*C1_m1_m2_10points(7,j) + &
+                                  hprimewgll_xxT(i,8)*C1_m1_m2_10points(8,j) + &
+                                  hprimewgll_xxT(i,9)*C1_m1_m2_10points(9,j) + &
+                                  hprimewgll_xxT(i,10)*C1_m1_m2_10points(10,j)
+            E2_m1_m2_10points(i,j) = hprimewgll_xxT(i,1)*C2_m1_m2_10points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C2_m1_m2_10points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C2_m1_m2_10points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C2_m1_m2_10points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C2_m1_m2_10points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C2_m1_m2_10points(6,j) + &
+                                  hprimewgll_xxT(i,7)*C2_m1_m2_10points(7,j) + &
+                                  hprimewgll_xxT(i,8)*C2_m1_m2_10points(8,j) + &
+                                  hprimewgll_xxT(i,9)*C2_m1_m2_10points(9,j) + &
+                                  hprimewgll_xxT(i,10)*C2_m1_m2_10points(10,j)
+            E3_m1_m2_10points(i,j) = hprimewgll_xxT(i,1)*C3_m1_m2_10points(1,j) + &
+                                  hprimewgll_xxT(i,2)*C3_m1_m2_10points(2,j) + &
+                                  hprimewgll_xxT(i,3)*C3_m1_m2_10points(3,j) + &
+                                  hprimewgll_xxT(i,4)*C3_m1_m2_10points(4,j) + &
+                                  hprimewgll_xxT(i,5)*C3_m1_m2_10points(5,j) + &
+                                  hprimewgll_xxT(i,6)*C3_m1_m2_10points(6,j) + &
+                                  hprimewgll_xxT(i,7)*C3_m1_m2_10points(7,j) + &
+                                  hprimewgll_xxT(i,8)*C3_m1_m2_10points(8,j) + &
+                                  hprimewgll_xxT(i,9)*C3_m1_m2_10points(9,j) + &
+                                  hprimewgll_xxT(i,10)*C3_m1_m2_10points(10,j)
+          enddo
+        enddo
+
+        !   call mxm_m1_m1_10points(tempx2(1,1,k),tempy2(1,1,k),tempz2(1,1,k), &
+        !         hprimewgll_xx,newtempx2(1,1,k),newtempy2(1,1,k),newtempz2(1,1,k))
+        do i=1,m1
+          do j=1,m1
+            ! for efficiency it is better to leave this loop on k inside, it leads to slightly faster code
+            do k = 1,NGLLX
+              newtempx2(i,j,k) = tempx2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempx2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempx2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempx2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempx2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempx2(i,6,k)*hprimewgll_xx(6,j) + &
+                               tempx2(i,7,k)*hprimewgll_xx(7,j) + &
+                               tempx2(i,8,k)*hprimewgll_xx(8,j) + &
+                               tempx2(i,9,k)*hprimewgll_xx(9,j) + &
+                               tempx2(i,10,k)*hprimewgll_xx(10,j)
+              newtempy2(i,j,k) = tempy2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempy2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempy2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempy2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempy2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempy2(i,6,k)*hprimewgll_xx(6,j) + &
+                               tempy2(i,7,k)*hprimewgll_xx(7,j) + &
+                               tempy2(i,8,k)*hprimewgll_xx(8,j) + &
+                               tempy2(i,9,k)*hprimewgll_xx(9,j) + &
+                               tempy2(i,10,k)*hprimewgll_xx(10,j)
+              newtempz2(i,j,k) = tempz2(i,1,k)*hprimewgll_xx(1,j) + &
+                               tempz2(i,2,k)*hprimewgll_xx(2,j) + &
+                               tempz2(i,3,k)*hprimewgll_xx(3,j) + &
+                               tempz2(i,4,k)*hprimewgll_xx(4,j) + &
+                               tempz2(i,5,k)*hprimewgll_xx(5,j) + &
+                               tempz2(i,6,k)*hprimewgll_xx(6,j) + &
+                               tempz2(i,7,k)*hprimewgll_xx(7,j) + &
+                               tempz2(i,8,k)*hprimewgll_xx(8,j) + &
+                               tempz2(i,9,k)*hprimewgll_xx(9,j) + &
+                               tempz2(i,10,k)*hprimewgll_xx(10,j)
+            enddo
+          enddo
+        enddo
+
+        ! call mxm_m2_m1_10points(tempx3,tempy3,tempz3,hprimewgll_xx,newtempx3,newtempy3,newtempz3)
+        do j=1,m1
+          do i=1,m2
+            E1_mxm_m2_m1_10points(i,j) = C1_mxm_m2_m1_10points(i,1)*hprimewgll_xx(1,j) + &
+                                      C1_mxm_m2_m1_10points(i,2)*hprimewgll_xx(2,j) + &
+                                      C1_mxm_m2_m1_10points(i,3)*hprimewgll_xx(3,j) + &
+                                      C1_mxm_m2_m1_10points(i,4)*hprimewgll_xx(4,j) + &
+                                      C1_mxm_m2_m1_10points(i,5)*hprimewgll_xx(5,j) + &
+                                      C1_mxm_m2_m1_10points(i,6)*hprimewgll_xx(6,j) + &
+                                      C1_mxm_m2_m1_10points(i,7)*hprimewgll_xx(7,j) + &
+                                      C1_mxm_m2_m1_10points(i,8)*hprimewgll_xx(8,j) + &
+                                      C1_mxm_m2_m1_10points(i,9)*hprimewgll_xx(9,j) + &
+                                      C1_mxm_m2_m1_10points(i,10)*hprimewgll_xx(10,j)
+            E2_mxm_m2_m1_10points(i,j) = C2_mxm_m2_m1_10points(i,1)*hprimewgll_xx(1,j) + &
+                                      C2_mxm_m2_m1_10points(i,2)*hprimewgll_xx(2,j) + &
+                                      C2_mxm_m2_m1_10points(i,3)*hprimewgll_xx(3,j) + &
+                                      C2_mxm_m2_m1_10points(i,4)*hprimewgll_xx(4,j) + &
+                                      C2_mxm_m2_m1_10points(i,5)*hprimewgll_xx(5,j) + &
+                                      C2_mxm_m2_m1_10points(i,6)*hprimewgll_xx(6,j) + &
+                                      C2_mxm_m2_m1_10points(i,7)*hprimewgll_xx(7,j) + &
+                                      C2_mxm_m2_m1_10points(i,8)*hprimewgll_xx(8,j) + &
+                                      C2_mxm_m2_m1_10points(i,9)*hprimewgll_xx(9,j) + &
+                                      C2_mxm_m2_m1_10points(i,10)*hprimewgll_xx(10,j)
+            E3_mxm_m2_m1_10points(i,j) = C3_mxm_m2_m1_10points(i,1)*hprimewgll_xx(1,j) + &
+                                      C3_mxm_m2_m1_10points(i,2)*hprimewgll_xx(2,j) + &
+                                      C3_mxm_m2_m1_10points(i,3)*hprimewgll_xx(3,j) + &
+                                      C3_mxm_m2_m1_10points(i,4)*hprimewgll_xx(4,j) + &
+                                      C3_mxm_m2_m1_10points(i,5)*hprimewgll_xx(5,j) + &
+                                      C3_mxm_m2_m1_10points(i,6)*hprimewgll_xx(6,j) + &
+                                      C3_mxm_m2_m1_10points(i,7)*hprimewgll_xx(7,j) + &
+                                      C3_mxm_m2_m1_10points(i,8)*hprimewgll_xx(8,j) + &
+                                      C3_mxm_m2_m1_10points(i,9)*hprimewgll_xx(9,j) + &
+                                      C3_mxm_m2_m1_10points(i,10)*hprimewgll_xx(10,j)
+          enddo
+        enddo
+
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+
+              fac1 = wgllwgll_yz(j,k)
+              fac2 = wgllwgll_xz(i,k)
+              fac3 = wgllwgll_xy(i,j)
+
+              ! sum contributions from each element to the global mesh using indirect addressing
+              iglob = ibool(i,j,k,ispec)
+              accel(1,iglob) = accel(1,iglob) - fac1*newtempx1(i,j,k) - &
+                                fac2*newtempx2(i,j,k) - fac3*newtempx3(i,j,k)
+              accel(2,iglob) = accel(2,iglob) - fac1*newtempy1(i,j,k) - &
+                                fac2*newtempy2(i,j,k) - fac3*newtempy3(i,j,k)
+              accel(3,iglob) = accel(3,iglob) - fac1*newtempz1(i,j,k) - &
+                                fac2*newtempz2(i,j,k) - fac3*newtempz3(i,j,k)
+
+              !  update memory variables based upon the Runge-Kutta scheme
+              if(ATTENUATION) then
+
+                 ! use Runge-Kutta scheme to march in time
+                 do i_sls = 1,N_SLS
+
+                    factor_loc = mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
+
+                    alphaval_loc = alphaval(i_sls)
+                    betaval_loc = betaval(i_sls)
+                    gammaval_loc = gammaval(i_sls)
+
+                    ! term in xx
+                    Sn   = factor_loc * epsilondev_xx(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xx_loc(i,j,k)
+                    R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in yy
+                    Sn   = factor_loc * epsilondev_yy(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_yy_loc(i,j,k)
+                    R_yy(i,j,k,ispec,i_sls) = alphaval_loc * R_yy(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in zz not computed since zero trace
+                    ! term in xy
+                    Sn   = factor_loc * epsilondev_xy(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xy_loc(i,j,k)
+                    R_xy(i,j,k,ispec,i_sls) = alphaval_loc * R_xy(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in xz
+                    Sn   = factor_loc * epsilondev_xz(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_xz_loc(i,j,k)
+                    R_xz(i,j,k,ispec,i_sls) = alphaval_loc * R_xz(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+                    ! term in yz
+                    Sn   = factor_loc * epsilondev_yz(i,j,k,ispec)
+                    Snp1   = factor_loc * epsilondev_yz_loc(i,j,k)
+                    R_yz(i,j,k,ispec,i_sls) = alphaval_loc * R_yz(i,j,k,ispec,i_sls) + &
+                                      betaval_loc * Sn + gammaval_loc * Snp1
+
+                 enddo   ! end of loop on memory variables
+
+              endif  !  end attenuation
+
+            enddo
+          enddo
+        enddo
+
+        ! save deviatoric strain for Runge-Kutta scheme
+        if ( COMPUTE_AND_STORE_STRAIN ) then
+          epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+          epsilondev_yy(:,:,:,ispec) = epsilondev_yy_loc(:,:,:)
+          epsilondev_xy(:,:,:,ispec) = epsilondev_xy_loc(:,:,:)
+          epsilondev_xz(:,:,:,ispec) = epsilondev_xz_loc(:,:,:)
+          epsilondev_yz(:,:,:,ispec) = epsilondev_yz_loc(:,:,:)
+        endif
+
+  enddo  ! spectral element loop
+
+end subroutine compute_forces_elastic_Dev_10p

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/initialize_simulation.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/initialize_simulation.f90	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/initialize_simulation.f90	2012-06-19 22:23:01 UTC (rev 20392)
@@ -114,6 +114,8 @@
     write(IMAIN,'(a)',advance='yes') '  tomo'
     case( IMODEL_USER_EXTERNAL )
     write(IMAIN,'(a)',advance='yes') '  external'
+    case( IMODEL_IPATI )
+    write(IMAIN,'(a)',advance='yes') '  ipati'
     end select
 
     write(IMAIN,*)
@@ -191,6 +193,9 @@
   ! initializes adjoint simulations
   call initialize_simulation_adjoint()
 
+  ! initializes GPU cards
+  if( GPU_MODE ) call initialize_GPU()
+
   end subroutine initialize_simulation
 
 !
@@ -239,23 +244,10 @@
       stop 'Deville et al. (2002) routines can only be used if NGLLX = NGLLY = NGLLZ is in [5-10]'
   endif
 
-  ! check for GPU runs
-  if( GPU_MODE ) then
-    if( NGLLX /= 5 .or. NGLLY /= 5 .or. NGLLZ /= 5 ) &
-      stop 'GPU mode can only be used if NGLLX == NGLLY == NGLLZ == 5'
-    if( CUSTOM_REAL /= 4 ) &
-      stop 'GPU mode runs only with CUSTOM_REAL == 4'
-    if( SAVE_MOHO_MESH ) &
-      stop 'GPU mode does not support SAVE_MOHO_MESH yet'
-    if( ATTENUATION ) then
-      if( N_SLS /= 3 ) &
-        stop 'GPU mode does not support N_SLS /= 3 yet'
-    endif
-  endif
+  ! gravity only on GPU supported
   if( .not. GPU_MODE .and. GRAVITY ) &
     stop 'GRAVITY only supported in GPU mode'
 
-
   ! absorbing surfaces
   if( ABSORBING_CONDITIONS ) then
     ! for arbitrary orientation of elements, which face belongs to xmin,xmax,etc... -
@@ -301,6 +293,7 @@
   endif
 
   end subroutine initialize_simulation_check
+
 !
 !-------------------------------------------------------------------------------------------------
 !
@@ -341,3 +334,55 @@
   endif
 
   end subroutine initialize_simulation_adjoint
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine initialize_GPU()
+
+! initialization for GPU cards
+
+  use specfem_par
+  use specfem_par_elastic
+  use specfem_par_acoustic
+  use specfem_par_poroelastic
+  implicit none
+  integer :: ncuda_devices,ncuda_devices_min,ncuda_devices_max
+
+  ! GPU_MODE now defined in Par_file
+  if(myrank == 0 ) then
+    write(IMAIN,*)
+    write(IMAIN,*) "GPU_MODE Active."
+  endif
+
+  ! check for GPU runs
+  if( NGLLX /= 5 .or. NGLLY /= 5 .or. NGLLZ /= 5 ) &
+    stop 'GPU mode can only be used if NGLLX == NGLLY == NGLLZ == 5'
+  if( CUSTOM_REAL /= 4 ) &
+    stop 'GPU mode runs only with CUSTOM_REAL == 4'
+  if( SAVE_MOHO_MESH ) &
+    stop 'GPU mode does not support SAVE_MOHO_MESH yet'
+  if( ATTENUATION ) then
+    if( N_SLS /= 3 ) &
+      stop 'GPU mode does not support N_SLS /= 3 yet'
+  endif
+  if( POROELASTIC_SIMULATION ) then
+    stop 'poroelastic simulations on GPU not supported yet'
+  endif
+
+  ! initializes GPU and outputs info to files for all processes
+  call prepare_cuda_device(myrank,ncuda_devices)
+
+  ! collects min/max of local devices found for statistics
+  call sync_all()
+  call min_all_i(ncuda_devices,ncuda_devices_min)
+  call max_all_i(ncuda_devices,ncuda_devices_max)
+
+  if( myrank == 0 ) then
+    write(IMAIN,*)"  GPU number of devices per node: min =",ncuda_devices_min
+    write(IMAIN,*)"                                  max =",ncuda_devices_max
+    write(IMAIN,*)
+  endif
+
+  end subroutine initialize_GPU

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90	2012-06-19 22:23:01 UTC (rev 20392)
@@ -187,6 +187,12 @@
   ! norm of the backward displacement
   real(kind=CUSTOM_REAL) b_Usolidnorm, b_Usolidnorm_all
 
+  ! initializes
+  Usolidnorm_all = 0.0_CUSTOM_REAL
+  Usolidnormp_all = 0.0_CUSTOM_REAL
+  Usolidnorms_all = 0.0_CUSTOM_REAL
+  Usolidnormw_all = 0.0_CUSTOM_REAL
+
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 !chris: Rewrite to get norm for each material when coupled simulations
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@@ -199,6 +205,15 @@
     else
       Usolidnorm = maxval(sqrt(displ(1,:)**2 + displ(2,:)**2 + displ(3,:)**2))
     endif
+
+    ! check stability of the code, exit if unstable
+    ! negative values can occur with some compilers when the unstable value is greater
+    ! than the greatest possible floating-point number of the machine
+    if(Usolidnorm > STABILITY_THRESHOLD .or. Usolidnorm < 0) &
+      call exit_MPI(myrank,'forward simulation became unstable and blew up')
+
+    ! compute the maximum of the maxima for all the slices using an MPI reduction
+    call max_all_cr(Usolidnorm,Usolidnorm_all)
   endif
 
   if( ACOUSTIC_SIMULATION ) then
@@ -208,6 +223,9 @@
     else
       Usolidnormp = maxval(abs(potential_dot_dot_acoustic(:)))
     endif
+
+    ! compute the maximum of the maxima for all the slices using an MPI reduction
+    call max_all_cr(Usolidnormp,Usolidnormp_all)
   endif
 
   if( POROELASTIC_SIMULATION ) then
@@ -215,18 +233,12 @@
                              displs_poroelastic(3,:)**2))
     Usolidnormw = maxval(sqrt(displw_poroelastic(1,:)**2 + displw_poroelastic(2,:)**2 + &
                              displw_poroelastic(3,:)**2))
+
+    ! compute the maximum of the maxima for all the slices using an MPI reduction
+    call max_all_cr(Usolidnorms,Usolidnorms_all)
+    call max_all_cr(Usolidnormw,Usolidnormw_all)
   endif
-  ! check stability of the code, exit if unstable
-  ! negative values can occur with some compilers when the unstable value is greater
-  ! than the greatest possible floating-point number of the machine
-  if(Usolidnorm > STABILITY_THRESHOLD .or. Usolidnorm < 0) &
-    call exit_MPI(myrank,'forward simulation became unstable and blew up')
 
-  ! compute the maximum of the maxima for all the slices using an MPI reduction
-  call max_all_cr(Usolidnorm,Usolidnorm_all)
-  call max_all_cr(Usolidnormp,Usolidnormp_all)
-  call max_all_cr(Usolidnorms,Usolidnorms_all)
-  call max_all_cr(Usolidnormw,Usolidnormw_all)
 
   ! adjoint simulations
   if( SIMULATION_TYPE == 3 ) then
@@ -353,14 +365,14 @@
     ! check stability of the code, exit if unstable
     ! negative values can occur with some compilers when the unstable value is greater
     ! than the greatest possible floating-point number of the machine
-    if(Usolidnorm_all > STABILITY_THRESHOLD .or. Usolidnorm_all < 0 &
-     .or. Usolidnormp_all > STABILITY_THRESHOLD .or. Usolidnormp_all < 0 &
-     .or. Usolidnorms_all > STABILITY_THRESHOLD .or. Usolidnorms_all < 0 &
-     .or. Usolidnormw_all > STABILITY_THRESHOLD .or. Usolidnormw_all < 0) &
+    if(Usolidnorm_all > STABILITY_THRESHOLD .or. Usolidnorm_all < 0.0 &
+     .or. Usolidnormp_all > STABILITY_THRESHOLD .or. Usolidnormp_all < 0.0 &
+     .or. Usolidnorms_all > STABILITY_THRESHOLD .or. Usolidnorms_all < 0.0 &
+     .or. Usolidnormw_all > STABILITY_THRESHOLD .or. Usolidnormw_all < 0.0) &
         call exit_MPI(myrank,'forward simulation became unstable and blew up')
     ! adjoint simulations
     if(SIMULATION_TYPE == 3 .and. (b_Usolidnorm_all > STABILITY_THRESHOLD &
-      .or. b_Usolidnorm_all < 0)) &
+      .or. b_Usolidnorm_all < 0.0)) &
         call exit_MPI(myrank,'backward simulation became unstable and blew up')
 
   endif ! myrank

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/locate_receivers.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/locate_receivers.f90	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/locate_receivers.f90	2012-06-19 22:23:01 UTC (rev 20392)
@@ -499,6 +499,7 @@
       ! end of loop on all the spectral elements in current slice
       enddo
     else
+      ! SeismicUnix format
       ispec_selected_rec(irec) = 0
       ix_initial_guess(irec) = 0
       iy_initial_guess(irec) = 0

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.F90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.F90	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.F90	2012-06-19 22:23:01 UTC (rev 20392)
@@ -874,23 +874,14 @@
 
   implicit none
   real :: free_mb,used_mb,total_mb
-  integer :: ncuda_devices,ncuda_devices_min,ncuda_devices_max
 
   ! GPU_MODE now defined in Par_file
   if(myrank == 0 ) then
     write(IMAIN,*)
-    write(IMAIN,*) "GPU_MODE Active. Preparing Fields and Constants on Device."
+    write(IMAIN,*) "GPU Preparing Fields and Constants on Device."
     write(IMAIN,*)
   endif
 
-  ! initializes GPU and outputs info to files for all processes
-  call prepare_cuda_device(myrank,NPROC,ncuda_devices)
-
-  ! collects min/max of local devices found for statistics
-  call sync_all()
-  call min_all_i(ncuda_devices,ncuda_devices_min)
-  call max_all_i(ncuda_devices,ncuda_devices_max)
-
   ! prepares general fields on GPU
   call prepare_constants_device(Mesh_pointer, &
                                   NGLLX, NSPEC_AB, NGLOB_AB, &
@@ -985,6 +976,11 @@
 
   endif
 
+  ! prepares fields on GPU for poroelastic simulations
+  if( POROELASTIC_SIMULATION ) then
+    stop 'todo poroelastic simulations on GPU'
+  endif
+
   ! prepares needed receiver array for adjoint runs
   if( SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3 ) &
     call prepare_sim2_or_3_const_device(Mesh_pointer, &
@@ -1038,11 +1034,8 @@
 
   ! outputs usage for main process
   if( myrank == 0 ) then
-    write(IMAIN,*)"  GPU number of devices per node: min =",ncuda_devices_min
-    write(IMAIN,*)"                                  max =",ncuda_devices_max
-    write(IMAIN,*)
-
     call get_free_device_memory(free_mb,used_mb,total_mb)
+    write(IMAIN,*)
     write(IMAIN,*)"  GPU usage: free  =",free_mb," MB",nint(free_mb/total_mb*100.0),"%"
     write(IMAIN,*)"             used  =",used_mb," MB",nint(used_mb/total_mb*100.0),"%"
     write(IMAIN,*)"             total =",total_mb," MB",nint(total_mb/total_mb*100.0),"%"

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/write_seismograms.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/write_seismograms.f90	2012-06-19 22:21:52 UTC (rev 20391)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/write_seismograms.f90	2012-06-19 22:23:01 UTC (rev 20392)
@@ -70,21 +70,16 @@
       ! this transfers fields only in elements with stations for efficiency
       if( ELASTIC_SIMULATION ) then
          if(USE_CUDA_SEISMOGRAMS) then
-            call transfer_seismograms_el_from_device(&
-                nrec_local,&
-                Mesh_pointer, &
-                SIMULATION_TYPE,&
-                seismograms_d,&
-                seismograms_v,&
-                seismograms_a,&
-                it)
+            call transfer_seismograms_el_from_d(nrec_local,Mesh_pointer, &
+                                               SIMULATION_TYPE,&
+                                               seismograms_d,seismograms_v,seismograms_a,&
+                                               it)
          else
-            call transfer_station_el_from_device( &
-                    displ,veloc,accel, &
-                    b_displ,b_veloc,b_accel, &
-                    Mesh_pointer,number_receiver_global, &
-                    ispec_selected_rec,ispec_selected_source, &
-                    ibool,SIMULATION_TYPE)
+            call transfer_station_el_from_device(displ,veloc,accel, &
+                                                b_displ,b_veloc,b_accel, &
+                                                Mesh_pointer,number_receiver_global, &
+                                                ispec_selected_rec,ispec_selected_source, &
+                                                ibool,SIMULATION_TYPE)
          endif
         ! alternative: transfers whole fields
         !  call transfer_fields_el_from_device(NDIM*NGLOB_AB,displ,veloc, accel, Mesh_pointer)



More information about the CIG-COMMITS mailing list