[cig-commits] r22971 - in seismo/3D/SPECFEM3D_GLOBE/trunk: . setup src/cuda src/meshfem3D src/shared src/specfem3D

Wed Oct 23 07:27:26 PDT 2013

Author: danielpeter
Date: 2013-10-23 07:27:26 -0700 (Wed, 23 Oct 2013)
New Revision: 22971

Modified:
   seismo/3D/SPECFEM3D_GLOBE/trunk/configure
   seismo/3D/SPECFEM3D_GLOBE/trunk/setup/config.fh.in
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/prepare_mesh_constants_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/model_ppm.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/model_s40rts.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/adios_helpers_definitions.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/adios_method_stubs.c
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/broadcast_computed_parameters.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/intgrl.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/param_reader.c
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element_att_memory.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element_strain.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_crust_mantle_Dev.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_crust_mantle_noDev.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_inner_core_Dev.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_inner_core_noDev.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_outer_core_Dev.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/setup_GLL_points.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/setup_sources_receivers.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/specfem3D_par.F90
Log:
updates vectorization routines to reduce code redundancy

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/configure
===================================================================

--- seismo/3D/SPECFEM3D_GLOBE/trunk/configure	2013-10-23 13:03:00 UTC (rev 22970)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/configure	2013-10-23 14:27:26 UTC (rev 22971)
@@ -1381,7 +1381,7 @@
                           solver in double precision [default=no]
   --enable-vtk            build VTK enabled version [default=no]
   --enable-vectorization  build FORCE_VECTORIZATION enabled version
-                          [default=yes]
+                          [default=no]
 
 Optional Packages:
   --with-PACKAGE[=ARG]    use PACKAGE [ARG=yes]
@@ -2781,8 +2781,6 @@
 if test "${enable_vectorization+set}" = set; then :
   enableval=$enable_vectorization; want_vec="$enableval"
 else
-#  want_vec=yes
-## DK DK changed the default to no_vectorization
   want_vec=no
 fi
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/setup/config.fh.in
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/setup/config.fh.in	2013-10-23 13:03:00 UTC (rev 22970)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/setup/config.fh.in	2013-10-23 14:27:26 UTC (rev 22971)
@@ -1,3 +1,9 @@
+!-----------------------------------------------------------------------
+!
+! ADIOS
+!
+!-----------------------------------------------------------------------
+
 ! for ADIOS header informations
 
 ! configuration flags to be saved in the adios output file.
@@ -17,3 +23,32 @@
 #else
 #define STRINGIFY_VAR(a) "a", a
 #endif
+
+!-----------------------------------------------------------------------
+!
+! Force vectorization
+!
+!-----------------------------------------------------------------------
+
+! macros for vectorization
+
+! switches indexing between: array( i,j,k .. ) <-> array( ijk,1,1 .. )
+#ifdef FORCE_VECTORIZATION
+#  define INDEX_IJK  ijk,1,1
+#else
+#  define INDEX_IJK  i,j,k
+#endif
+
+! switches do-loops between: do k=1,NGLLZ; do j=1,NGLLY; do i=1,NGLLX  <-> do ijk=1,NGLLCUBE
+#ifdef FORCE_VECTORIZATION
+#  define DO_LOOP_IJK  do ijk=1,NGLLCUBE
+#else
+#  define DO_LOOP_IJK  do k=1,NGLLZ; do j=1,NGLLY; do i=1,NGLLX
+#endif
+
+! switches enddo-loops between: enddo; enddo; enddo ! NGLLZ,NGLLY,NGLLX <-> enddo ! NGLLCUBE
+#ifdef FORCE_VECTORIZATION
+#  define ENDDO_LOOP_IJK  enddo ! NGLLCUBE
+#else
+#  define ENDDO_LOOP_IJK  enddo; enddo; enddo ! NGLLZ,NGLLY,NGLLX
+#endif

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/prepare_mesh_constants_cuda.cu	2013-10-23 13:03:00 UTC (rev 22970)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/prepare_mesh_constants_cuda.cu	2013-10-23 14:27:26 UTC (rev 22971)
@@ -1863,7 +1863,7 @@
         print_CUDA_error_if_any(cudaGetTextureReference(&d_b_accel_ic_tex_ref_ptr, "d_b_accel_ic_tex"), 6023);
         print_CUDA_error_if_any(cudaBindTexture(0, d_b_accel_ic_tex_ref_ptr, mp->d_b_accel_inner_core,
                                                 &channelDesc, sizeof(realw)*size), 6023);
-  
+
       }
     #else
       cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2013-10-23 13:03:00 UTC (rev 22970)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2013-10-23 14:27:26 UTC (rev 22971)
@@ -1,4 +1,4 @@
-/* 
+/*
 !=====================================================================
 !
 !          S p e c f e m 3 D  G l o b e  V e r s i o n  6 . 0
@@ -34,8 +34,8 @@
 
 typedef float realw;
 
- 
 
+
 //
 // src/cuda/assemble_MPI_scalar_cuda.cu
 //
@@ -43,12 +43,12 @@
 void FC_FUNC_(transfer_boun_pot_from_device,
               TRANSFER_BOUN_POT_FROM_DEVICE)(long* Mesh_pointer_f,
                                              realw* send_potential_dot_dot_buffer,
-                                             int* FORWARD_OR_ADJOINT){} 
+                                             int* FORWARD_OR_ADJOINT){}
 
 void FC_FUNC_(transfer_asmbl_pot_to_device,
               TRANSFER_ASMBL_POT_TO_DEVICE)(long* Mesh_pointer,
                                             realw* buffer_recv_scalar,
-                                            int* FORWARD_OR_ADJOINT) {} 
+                                            int* FORWARD_OR_ADJOINT) {}
 
 
 //
@@ -59,13 +59,13 @@
               TRANSFER_BOUN_ACCEL_FROM_DEVICE)(long* Mesh_pointer_f,
                                                realw* send_accel_buffer,
                                                int* IREGION,
-                                               int* FORWARD_OR_ADJOINT){} 
+                                               int* FORWARD_OR_ADJOINT){}
 
 void FC_FUNC_(transfer_asmbl_accel_to_device,
               TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer,
                                               realw* buffer_recv_vector,
                                               int* IREGION,
-                                              int* FORWARD_OR_ADJOINT) {} 
+                                              int* FORWARD_OR_ADJOINT) {}
 
 
 //
@@ -73,58 +73,58 @@
 //
 
 void FC_FUNC_(pause_for_debug,
-              PAUSE_FOR_DEBUG)() {} 
+              PAUSE_FOR_DEBUG)() {}
 
 void FC_FUNC_(output_free_device_memory,
-              OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {} 
+              OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {}
 
 void FC_FUNC_(get_free_device_memory,
-              get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {} 
+              get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {}
 
 void FC_FUNC_(check_norm_acoustic_from_device,
               CHECK_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
                                                   long* Mesh_pointer_f,
-                                                  int* FORWARD_OR_ADJOINT) {} 
+                                                  int* FORWARD_OR_ADJOINT) {}
 
 void FC_FUNC_(check_norm_elastic_from_device,
               CHECK_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
                                               long* Mesh_pointer_f,
-                                              int* FORWARD_OR_ADJOINT) {} 
+                                              int* FORWARD_OR_ADJOINT) {}
 
 void FC_FUNC_(check_norm_strain_from_device,
               CHECK_NORM_STRAIN_FROM_DEVICE)(realw* strain_norm,
                                              realw* strain_norm2,
-                                             long* Mesh_pointer_f) {} 
+                                             long* Mesh_pointer_f) {}
 
  void FC_FUNC_(check_max_norm_displ_gpu,
- CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {} 
+ CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {}
 
  void FC_FUNC_(check_max_norm_vector,
- CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {} 
+ CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {}
 
  void FC_FUNC_(check_max_norm_displ,
- CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {} 
+ CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {}
 
  void FC_FUNC_(check_max_norm_b_displ_gpu,
- CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {} 
+ CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {}
 
  void FC_FUNC_(check_max_norm_b_accel_gpu,
- CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {} 
+ CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {}
 
  void FC_FUNC_(check_max_norm_b_veloc_gpu,
- CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {} 
+ CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {}
 
  void FC_FUNC_(check_max_norm_b_displ,
- CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {} 
+ CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {}
 
  void FC_FUNC_(check_max_norm_b_accel,
- CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {} 
+ CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {}
 
  void FC_FUNC_(check_error_vectors,
- CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {} 
+ CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {}
 
  void FC_FUNC_(get_max_accel,
- GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {} 
+ GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {}
 
 
 //
@@ -134,12 +134,12 @@
 void FC_FUNC_(compute_add_sources_cuda,
               COMPUTE_ADD_SOURCES_CUDA)(long* Mesh_pointer_f,
                                         int* NSOURCESf,
-                                        double* h_stf_pre_compute) {} 
+                                        double* h_stf_pre_compute) {}
 
 void FC_FUNC_(compute_add_sources_backward_cuda,
               COMPUTE_ADD_SOURCES_BACKWARD_CUDA)(long* Mesh_pointer_f,
                                                  int* NSOURCESf,
-                                                 double* h_stf_pre_compute) {} 
+                                                 double* h_stf_pre_compute) {}
 
 void FC_FUNC_(compute_add_sources_adjoint_cuda,
               COMPUTE_ADD_SOURCES_ADJOINT_CUDA)(long* Mesh_pointer,
@@ -147,7 +147,7 @@
                                                 realw* h_adj_sourcearrays,
                                                 int* h_islice_selected_rec,
                                                 int* h_ispec_selected_rec,
-                                                int* time_index) {} 
+                                                int* time_index) {}
 
 
 //
@@ -156,23 +156,23 @@
 
 void FC_FUNC_(compute_coupling_fluid_cmb_cuda,
               COMPUTE_COUPLING_FLUID_CMB_CUDA)(long* Mesh_pointer_f,
-                                               int* FORWARD_OR_ADJOINT) {} 
+                                               int* FORWARD_OR_ADJOINT) {}
 
 void FC_FUNC_(compute_coupling_fluid_icb_cuda,
               COMPUTE_COUPLING_FLUID_ICB_CUDA)(long* Mesh_pointer_f,
-                                               int* FORWARD_OR_ADJOINT) {} 
+                                               int* FORWARD_OR_ADJOINT) {}
 
 void FC_FUNC_(compute_coupling_cmb_fluid_cuda,
               COMPUTE_COUPLING_CMB_FLUID_CUDA)(long* Mesh_pointer_f,
-                                               int* FORWARD_OR_ADJOINT) {} 
+                                               int* FORWARD_OR_ADJOINT) {}
 
 void FC_FUNC_(compute_coupling_icb_fluid_cuda,
               COMPUTE_COUPLING_ICB_FLUID_CUDA)(long* Mesh_pointer_f,
-                                               int* FORWARD_OR_ADJOINT) {} 
+                                               int* FORWARD_OR_ADJOINT) {}
 
 void FC_FUNC_(compute_coupling_ocean_cuda,
               COMPUTE_COUPLING_OCEAN_CUDA)(long* Mesh_pointer_f,
-                                           int* FORWARD_OR_ADJOINT) {} 
+                                           int* FORWARD_OR_ADJOINT) {}
 
 
 //
@@ -182,7 +182,7 @@
 void FC_FUNC_(compute_forces_crust_mantle_cuda,
               COMPUTE_FORCES_CRUST_MANTLE_CUDA)(long* Mesh_pointer_f,
                                                 int* iphase,
-                                                int* FORWARD_OR_ADJOINT_f) {} 
+                                                int* FORWARD_OR_ADJOINT_f) {}
 
 
 //
@@ -192,7 +192,7 @@
 void FC_FUNC_(compute_forces_inner_core_cuda,
               COMPUTE_FORCES_INNER_CORE_CUDA)(long* Mesh_pointer_f,
                                               int* iphase,
-                                              int* FORWARD_OR_ADJOINT_f) {} 
+                                              int* FORWARD_OR_ADJOINT_f) {}
 
 
 //
@@ -203,7 +203,7 @@
               COMPUTE_FORCES_OUTER_CORE_CUDA)(long* Mesh_pointer_f,
                                               int* iphase,
                                               realw* time_f,
-                                              int* FORWARD_OR_ADJOINT_f) {} 
+                                              int* FORWARD_OR_ADJOINT_f) {}
 
 
 //
@@ -211,22 +211,22 @@
 //
 
 void FC_FUNC_(compute_kernels_cm_cuda,
-              COMPUTE_KERNELS_CM_CUDA)(long* Mesh_pointer,realw* deltat_f) {} 
+              COMPUTE_KERNELS_CM_CUDA)(long* Mesh_pointer,realw* deltat_f) {}
 
 void FC_FUNC_(compute_kernels_ic_cuda,
-              COMPUTE_KERNELS_IC_CUDA)(long* Mesh_pointer,realw* deltat_f) {} 
+              COMPUTE_KERNELS_IC_CUDA)(long* Mesh_pointer,realw* deltat_f) {}
 
 void FC_FUNC_(compute_kernels_oc_cuda,
-              COMPUTE_KERNELS_OC_CUDA)(long* Mesh_pointer,realw* deltat_f) {} 
+              COMPUTE_KERNELS_OC_CUDA)(long* Mesh_pointer,realw* deltat_f) {}
 
 void FC_FUNC_(compute_kernels_strgth_noise_cu,
               COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer,
                                                realw* h_noise_surface_movie,
-                                               realw* deltat_f) {} 
+                                               realw* deltat_f) {}
 
 void FC_FUNC_(compute_kernels_hess_cuda,
               COMPUTE_KERNELS_HESS_CUDA)(long* Mesh_pointer,
-                                         realw* deltat_f) {} 
+                                         realw* deltat_f) {}
 
 
 //
@@ -236,16 +236,16 @@
 void FC_FUNC_(compute_stacey_acoustic_cuda,
               COMPUTE_STACEY_ACOUSTIC_CUDA)(long* Mesh_pointer_f,
                                             realw* absorb_potential,
-                                            int* itype) {} 
+                                            int* itype) {}
 
 void FC_FUNC_(compute_stacey_acoustic_backward_cuda,
               COMPUTE_STACEY_ACOUSTIC_BACKWARD_CUDA)(long* Mesh_pointer_f,
                                                      realw* absorb_potential,
-                                                     int* itype) {} 
+                                                     int* itype) {}
 
 void FC_FUNC_(compute_stacey_acoustic_undoatt_cuda,
               COMPUTE_STACEY_ACOUSTIC_UNDOATT_CUDA)(long* Mesh_pointer_f,
-                                                    int* itype) {} 
+                                                    int* itype) {}
 
 
 //
@@ -255,16 +255,16 @@
 void FC_FUNC_(compute_stacey_elastic_cuda,
               COMPUTE_STACEY_ELASTIC_CUDA)(long* Mesh_pointer_f,
                                            realw* absorb_field,
-                                           int* itype) {} 
+                                           int* itype) {}
 
 void FC_FUNC_(compute_stacey_elastic_backward_cuda,
               COMPUTE_STACEY_ELASTIC_BACKWARD_CUDA)(long* Mesh_pointer_f,
                                                     realw* absorb_field,
-                                                    int* itype) {} 
+                                                    int* itype) {}
 
 void FC_FUNC_(compute_stacey_elastic_undoatt_cuda,
               COMPUTE_STACEY_ELASTIC_UNDOATT_CUDA)(long* Mesh_pointer_f,
-                                                   int* itype) {} 
+                                                   int* itype) {}
 
 
 //
@@ -272,39 +272,39 @@
 //
 
 void FC_FUNC_(initialize_cuda_device,
-              INITIALIZE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) { 
+              INITIALIZE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) {
  fprintf(stderr,"ERROR: GPU_MODE enabled without GPU/CUDA Support. To enable GPU support, reconfigure with --with-cuda flag.\n");
  exit(1);
-} 
+}
 
 
 //
 // src/cuda/noise_tomography_cuda.cu
 //
 
-void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){} 
+void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){}
 
-void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id) {} 
+void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id) {}
 
-void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) {} 
+void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) {}
 
-void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val) {} 
+void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val) {}
 
-void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) {} 
+void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) {}
 
 void FC_FUNC_(noise_transfer_surface_to_host,
               NOISE_TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer_f,
-                                              realw* h_noise_surface_movie) {} 
+                                              realw* h_noise_surface_movie) {}
 
 void FC_FUNC_(noise_add_source_master_rec_cu,
               NOISE_ADD_SOURCE_MASTER_REC_CU)(long* Mesh_pointer_f,
                                               int* it_f,
                                               int* irec_master_noise_f,
-                                              int* islice_selected_rec) {} 
+                                              int* islice_selected_rec) {}
 
 void FC_FUNC_(noise_add_surface_movie_cuda,
               NOISE_ADD_SURFACE_MOVIE_CUDA)(long* Mesh_pointer_f,
-                                            realw* h_noise_surface_movie) {} 
+                                            realw* h_noise_surface_movie) {}
 
 
 //
@@ -339,7 +339,7 @@
                                         int* SAVE_BOUNDARY_MESH_f,
                                         int* USE_MESH_COLORING_GPU_f,
                                         int* ANISOTROPIC_KL_f,int* APPROXIMATE_HESS_KL_f,
-                                        realw* deltat_f,realw* b_deltat_f) {} 
+                                        realw* deltat_f,realw* b_deltat_f) {}
 
 void FC_FUNC_(prepare_fields_rotation_device,
               PREPARE_FIELDS_ROTATION_DEVICE)(long* Mesh_pointer_f,
@@ -349,7 +349,7 @@
                                               realw* b_two_omega_earth_f,
                                               realw* b_A_array_rotation,
                                               realw* b_B_array_rotation,
-                                              int* NSPEC_OUTER_CORE_ROTATION) {} 
+                                              int* NSPEC_OUTER_CORE_ROTATION) {}
 
 void FC_FUNC_(prepare_fields_gravity_device,
               PREPARE_FIELDS_gravity_DEVICE)(long* Mesh_pointer_f,
@@ -363,7 +363,7 @@
                                              realw* minus_g_icb,
                                              realw* minus_g_cmb,
                                              double* RHO_BOTTOM_OC,
-                                             double* RHO_TOP_OC) {} 
+                                             double* RHO_TOP_OC) {}
 
 void FC_FUNC_(prepare_fields_attenuat_device,
               PREPARE_FIELDS_ATTENUAT_DEVICE)(long* Mesh_pointer_f,
@@ -392,7 +392,7 @@
                                                  realw* factor_common_inner_core,
                                                  realw* one_minus_sum_beta_inner_core,
                                                  realw* alphaval,realw* betaval,realw* gammaval,
-                                                 realw* b_alphaval,realw* b_betaval,realw* b_gammaval) {} 
+                                                 realw* b_alphaval,realw* b_betaval,realw* b_gammaval) {}
 
 void FC_FUNC_(prepare_fields_strain_device,
               PREPARE_FIELDS_STRAIN_DEVICE)(long* Mesh_pointer_f,
@@ -419,7 +419,7 @@
                                             realw* b_epsilondev_xz_inner_core,
                                             realw* b_epsilondev_yz_inner_core,
                                             realw* eps_trace_over_3_inner_core,
-                                            realw* b_eps_trace_over_3_inner_core) {} 
+                                            realw* b_eps_trace_over_3_inner_core) {}
 
 void FC_FUNC_(prepare_fields_absorb_device,
               PREPARE_FIELDS_ABSORB_DEVICE)(long* Mesh_pointer_f,
@@ -448,7 +448,7 @@
                                             int* ibelm_ymin_outer_core,int* ibelm_ymax_outer_core,
                                             realw* jacobian2D_xmin_outer_core, realw* jacobian2D_xmax_outer_core,
                                             realw* jacobian2D_ymin_outer_core, realw* jacobian2D_ymax_outer_core,
-                                            realw* vp_outer_core) {} 
+                                            realw* vp_outer_core) {}
 
 void FC_FUNC_(prepare_mpi_buffers_device,
               PREPARE_MPI_BUFFERS_DEVICE)(long* Mesh_pointer_f,
@@ -463,7 +463,7 @@
                                           int* num_interfaces_outer_core,
                                           int* max_nibool_interfaces_oc,
                                           int* nibool_interfaces_outer_core,
-                                          int* ibool_interfaces_outer_core){} 
+                                          int* ibool_interfaces_outer_core){}
 
 void FC_FUNC_(prepare_fields_noise_device,
               PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f,
@@ -475,14 +475,14 @@
                                            realw* normal_y_noise,
                                            realw* normal_z_noise,
                                            realw* mask_noise,
-                                           realw* jacobian2D_top_crust_mantle) {} 
+                                           realw* jacobian2D_top_crust_mantle) {}
 
 void FC_FUNC_(prepare_oceans_device,
               PREPARE_OCEANS_DEVICE)(long* Mesh_pointer_f,
                                      int* npoin_oceans,
                                      int* h_iglob_ocean_load,
                                      realw* h_rmass_ocean_load_selected,
-                                     realw* h_normal_ocean_load) {} 
+                                     realw* h_normal_ocean_load) {}
 
 void FC_FUNC_(prepare_crust_mantle_device,
              PREPARE_CRUST_MANTLE_DEVICE)(long* Mesh_pointer_f,
@@ -514,7 +514,7 @@
                                           int* NCHUNKS_VAL,
                                           int* num_colors_outer,
                                           int* num_colors_inner,
-                                          int* num_elem_colors) {} 
+                                          int* num_elem_colors) {}
 
 void FC_FUNC_(prepare_outer_core_device,
               PREPARE_OUTER_CORE_DEVICE)(long* Mesh_pointer_f,
@@ -539,7 +539,7 @@
                                          int* h_ibelm_bottom_outer_core,
                                          int* num_colors_outer,
                                          int* num_colors_inner,
-                                         int* num_elem_colors) {} 
+                                         int* num_elem_colors) {}
 
 void FC_FUNC_(prepare_inner_core_device,
               PREPARE_INNER_CORE_DEVICE)(long* Mesh_pointer_f,
@@ -562,11 +562,11 @@
                                          int* h_ibelm_top_inner_core,
                                          int* num_colors_outer,
                                          int* num_colors_inner,
-                                         int* num_elem_colors) {} 
+                                         int* num_elem_colors) {}
 
 void FC_FUNC_(prepare_cleanup_device,
               PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f,
-              int* NCHUNKS_VAL) {} 
+              int* NCHUNKS_VAL) {}
 
 
 //
@@ -574,88 +574,88 @@
 //
 
 void FC_FUNC_(transfer_fields_cm_to_device,
-              TRANSFER_FIELDS_CM_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
+              TRANSFER_FIELDS_CM_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_fields_ic_to_device,
-              TRANSFER_FIELDS_IC_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
+              TRANSFER_FIELDS_IC_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_fields_oc_to_device,
-              TRANSFER_FIELDS_OC_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
+              TRANSFER_FIELDS_OC_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_b_fields_cm_to_device,
               TRANSFER_FIELDS_B_CM_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
-                                              long* Mesh_pointer_f) {} 
+                                              long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_b_fields_ic_to_device,
               TRANSFER_FIELDS_B_IC_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
-                                              long* Mesh_pointer_f) {} 
+                                              long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_b_fields_oc_to_device,
               TRANSFER_FIELDS_B_OC_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
-                                              long* Mesh_pointer_f) {} 
+                                              long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_fields_cm_from_device,
-              TRANSFER_FIELDS_CM_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
+              TRANSFER_FIELDS_CM_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_fields_ic_from_device,
-              TRANSFER_FIELDS_IC_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
+              TRANSFER_FIELDS_IC_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_fields_oc_from_device,
-              TRANSFER_FIELDS_OC_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
+              TRANSFER_FIELDS_OC_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_b_fields_cm_from_device,
               TRANSFER_B_FIELDS_CM_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
-                                                long* Mesh_pointer_f) {} 
+                                                long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_b_fields_ic_from_device,
               TRANSFER_B_FIELDS_IC_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
-                                                long* Mesh_pointer_f) {} 
+                                                long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_b_fields_oc_from_device,
               TRANSFER_B_FIELDS_OC_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
-                                                long* Mesh_pointer_f) {} 
+                                                long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_displ_cm_from_device,
-              TRANSFER_DISPL_CM_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {} 
+              TRANSFER_DISPL_CM_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_b_displ_cm_from_device,
-              TRANSFER_B_DISPL_CM_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {} 
+              TRANSFER_B_DISPL_CM_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_displ_ic_from_device,
-              TRANSFER_DISPL_IC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {} 
+              TRANSFER_DISPL_IC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_b_displ_ic_from_device,
-              TRANSFER_B_DISPL_IC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {} 
+              TRANSFER_B_DISPL_IC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_displ_oc_from_device,
-              TRANSFER_DISPL_OC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {} 
+              TRANSFER_DISPL_OC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_b_displ_oc_from_device,
-              TRANSFER_B_DISPL_OC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {} 
+              TRANSFER_B_DISPL_OC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_veloc_cm_from_device,
-              TRANSFER_VELOC_CM_FROM_DEVICE)(int* size, realw* veloc, long* Mesh_pointer_f) {} 
+              TRANSFER_VELOC_CM_FROM_DEVICE)(int* size, realw* veloc, long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_veloc_ic_from_device,
-              TRANSFER_VELOC_IC_FROM_DEVICE)(int* size, realw* veloc, long* Mesh_pointer_f) {} 
+              TRANSFER_VELOC_IC_FROM_DEVICE)(int* size, realw* veloc, long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_veloc_oc_from_device,
-              TRANSFER_VELOC_OC_FROM_DEVICE)(int* size, realw* veloc, long* Mesh_pointer_f) {} 
+              TRANSFER_VELOC_OC_FROM_DEVICE)(int* size, realw* veloc, long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_accel_cm_to_device,
-              TRANSFER_ACCEL_CM_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {} 
+              TRANSFER_ACCEL_CM_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_accel_cm_from_device,
-              TRANSFER_ACCEL_CM_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {} 
+              TRANSFER_ACCEL_CM_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_b_accel_cm_from_device,
-              TRANSFER_B_ACCEL_CM_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {} 
+              TRANSFER_B_ACCEL_CM_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_accel_ic_from_device,
-              TRANSFER_ACCEL_IC_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {} 
+              TRANSFER_ACCEL_IC_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_accel_oc_from_device,
-              TRANSFER_ACCEL_OC_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {} 
+              TRANSFER_ACCEL_OC_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
 
 void FC_FUNC_(transfer_strain_cm_from_device,
               TRANSFER_STRAIN_CM_FROM_DEVICE)(long* Mesh_pointer,
@@ -664,7 +664,7 @@
                                                   realw* epsilondev_yy,
                                                   realw* epsilondev_xy,
                                                   realw* epsilondev_xz,
-                                                  realw* epsilondev_yz) {} 
+                                                  realw* epsilondev_yz) {}
 
 void FC_FUNC_(transfer_b_strain_cm_to_device,
               TRANSFER_B_STRAIN_CM_TO_DEVICE)(long* Mesh_pointer,
@@ -672,7 +672,7 @@
                                               realw* epsilondev_yy,
                                               realw* epsilondev_xy,
                                               realw* epsilondev_xz,
-                                              realw* epsilondev_yz) {} 
+                                              realw* epsilondev_yz) {}
 
 void FC_FUNC_(transfer_strain_ic_from_device,
               TRANSFER_STRAIN_IC_FROM_DEVICE)(long* Mesh_pointer,
@@ -681,7 +681,7 @@
                                               realw* epsilondev_yy,
                                               realw* epsilondev_xy,
                                               realw* epsilondev_xz,
-                                              realw* epsilondev_yz) {} 
+                                              realw* epsilondev_yz) {}
 
 void FC_FUNC_(transfer_b_strain_ic_to_device,
               TRANSFER_B_STRAIN_IC_TO_DEVICE)(long* Mesh_pointer,
@@ -689,7 +689,7 @@
                                               realw* epsilondev_yy,
                                               realw* epsilondev_xy,
                                               realw* epsilondev_xz,
-                                              realw* epsilondev_yz) {} 
+                                              realw* epsilondev_yz) {}
 
 void FC_FUNC_(transfer_b_rmemory_cm_to_device,
               TRANSFER_B_RMEMORY_CM_TO_DEVICE)(long* Mesh_pointer,
@@ -697,7 +697,7 @@
                                                realw* b_R_yy,
                                                realw* b_R_xy,
                                                realw* b_R_xz,
-                                               realw* b_R_yz) {} 
+                                               realw* b_R_yz) {}
 
 void FC_FUNC_(transfer_b_rmemory_ic_to_device,
               TRANSFER_B_RMEMORY_IC_TO_DEVICE)(long* Mesh_pointer,
@@ -705,17 +705,17 @@
                                                realw* b_R_yy,
                                                realw* b_R_xy,
                                                realw* b_R_xz,
-                                               realw* b_R_yz) {} 
+                                               realw* b_R_yz) {}
 
 void FC_FUNC_(transfer_rotation_from_device,
               TRANSFER_ROTATION_FROM_DEVICE)(long* Mesh_pointer,
                                              realw* A_array_rotation,
-                                             realw* B_array_rotation) {} 
+                                             realw* B_array_rotation) {}
 
 void FC_FUNC_(transfer_b_rotation_to_device,
               TRANSFER_B_ROTATION_TO_DEVICE)(long* Mesh_pointer,
                                               realw* A_array_rotation,
-                                              realw* B_array_rotation) {} 
+                                              realw* B_array_rotation) {}
 
 void FC_FUNC_(transfer_kernels_cm_to_host,
               TRANSFER_KERNELS_CM_TO_HOST)(long* Mesh_pointer,
@@ -723,30 +723,30 @@
                                            realw* h_alpha_kl,
                                            realw* h_beta_kl,
                                            realw* h_cijkl_kl,
-                                           int* NSPEC) {} 
+                                           int* NSPEC) {}
 
 void FC_FUNC_(transfer_kernels_ic_to_host,
               TRANSFER_KERNELS_IC_TO_HOST)(long* Mesh_pointer,
                                            realw* h_rho_kl,
                                            realw* h_alpha_kl,
                                            realw* h_beta_kl,
-                                           int* NSPEC) {} 
+                                           int* NSPEC) {}
 
 void FC_FUNC_(transfer_kernels_oc_to_host,
               TRANSFER_KERNELS_OC_TO_HOST)(long* Mesh_pointer,
                                            realw* h_rho_kl,
                                            realw* h_alpha_kl,
-                                           int* NSPEC) {} 
+                                           int* NSPEC) {}
 
 void FC_FUNC_(transfer_kernels_noise_to_host,
               TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer,
                                               realw* h_Sigma_kl,
-                                              int* NSPEC) {} 
+                                              int* NSPEC) {}
 
 void FC_FUNC_(transfer_kernels_hess_cm_tohost,
               TRANSFER_KERNELS_HESS_CM_TOHOST)(long* Mesh_pointer,
                                               realw* h_hess_kl,
-                                              int* NSPEC) {} 
+                                              int* NSPEC) {}
 
 
 //
@@ -758,39 +758,39 @@
                                           realw* deltat_f,
                                           realw* deltatsqover2_f,
                                           realw* deltatover2_f,
-                                          int* FORWARD_OR_ADJOINT) {} 
+                                          int* FORWARD_OR_ADJOINT) {}
 
 void FC_FUNC_(update_displacement_cm_cuda,
               UPDATE_DISPLACMENT_CM_CUDA)(long* Mesh_pointer_f,
                                           realw* deltat_f,
                                           realw* deltatsqover2_f,
                                           realw* deltatover2_f,
-                                          int* FORWARD_OR_ADJOINT) {} 
+                                          int* FORWARD_OR_ADJOINT) {}
 
 void FC_FUNC_(update_displacement_oc_cuda,
               UPDATE_DISPLACEMENT_OC_cuda)(long* Mesh_pointer_f,
                                            realw* deltat_f,
                                            realw* deltatsqover2_f,
                                            realw* deltatover2_f,
-                                           int* FORWARD_OR_ADJOINT) {} 
+                                           int* FORWARD_OR_ADJOINT) {}
 
 void FC_FUNC_(multiply_accel_elastic_cuda,
               MULTIPLY_ACCEL_ELASTIC_CUDA)(long* Mesh_pointer,
-                                           int* FORWARD_OR_ADJOINT) {} 
+                                           int* FORWARD_OR_ADJOINT) {}
 
 void FC_FUNC_(update_veloc_elastic_cuda,
               UPDATE_VELOC_ELASTIC_CUDA)(long* Mesh_pointer,
                                          realw* deltatover2_f,
-                                         int* FORWARD_OR_ADJOINT) {} 
+                                         int* FORWARD_OR_ADJOINT) {}
 
 void FC_FUNC_(multiply_accel_acoustic_cuda,
               MULTIPLY_ACCEL_ACOUSTIC_CUDA)(long* Mesh_pointer,
-                                            int* FORWARD_OR_ADJOINT) {} 
+                                            int* FORWARD_OR_ADJOINT) {}
 
 void FC_FUNC_(update_veloc_acoustic_cuda,
               UPDATE_VELOC_ACOUSTIC_CUDA)(long* Mesh_pointer,
                                           realw* deltatover2_f,
-                                          int* FORWARD_OR_ADJOINT) {} 
+                                          int* FORWARD_OR_ADJOINT) {}
 
 
 //
@@ -810,5 +810,5 @@
                                                int* number_receiver_global,
                                                int* ispec_selected_rec,
                                                int* ispec_selected_source,
-                                               int* ibool) {} 
+                                               int* ibool) {}
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/model_ppm.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/model_ppm.f90	2013-10-23 13:03:00 UTC (rev 22970)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/model_ppm.f90	2013-10-23 14:27:26 UTC (rev 22971)
@@ -110,7 +110,7 @@
   call bcast_all_singlei(PPM_num_v)
   call bcast_all_singlei(PPM_num_latperlon)
   call bcast_all_singlei(PPM_num_lonperdepth)
-  
+
   if( myrank /= 0 ) then
     allocate(PPM_lat(PPM_num_v),PPM_lon(PPM_num_v),PPM_depth(PPM_num_v),PPM_dvs(PPM_num_v))
   endif
@@ -119,7 +119,7 @@
   call bcast_all_dp(PPM_lat(1:PPM_num_v),PPM_num_v)
   call bcast_all_dp(PPM_lon(1:PPM_num_v),PPM_num_v)
   call bcast_all_dp(PPM_depth(1:PPM_num_v),PPM_num_v)
-  
+
   call bcast_all_singledp(PPM_maxlat)
   call bcast_all_singledp(PPM_minlat)
   call bcast_all_singledp(PPM_maxlon)

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/model_s40rts.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/model_s40rts.f90	2013-10-23 13:03:00 UTC (rev 22970)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/model_s40rts.f90	2013-10-23 14:27:26 UTC (rev 22971)
@@ -119,12 +119,12 @@
   call get_value_string(P12, 'model.P12', 'DATA/s20rts/P12.dat')    !model P12 is in s20rts data directory
 
   ! S40RTS degree 40 S model from Ritsema
-  open(unit=10,file=S40RTS,status='old',action='read',iostat=ier)
+  open(unit=10,file=trim(S40RTS),status='old',action='read',iostat=ier)
   if ( ier /= 0 ) then
     write(IMAIN,*) 'error opening "', trim(S40RTS), '": ', ier
     call flush_IMAIN()
     ! stop
-    call exit_MPI(0, 'error model s40rts')
+    call exit_MPI(0, 'error: opening model s40rts data file DATA/s40rts/S40RTS.dat failed, please check...')
   endif
 
   do k=0,NK_20
@@ -135,10 +135,10 @@
   close(10)
 
   ! P12 degree 12 P model from Ritsema
-  open(unit=10,file=P12,status='old',action='read',iostat=ier)
+  open(unit=10,file=trim(P12),status='old',action='read',iostat=ier)
   if ( ier /= 0 ) then
     write(IMAIN,*) 'error opening "', trim(P12), '": ', ier
-    call exit_MPI(0, 'error model s40rts')
+    call exit_MPI(0, 'error : opening model s40rts data file DATA/s20rts/P12.dat failed, please check...')
   endif
   do k=0,NK_20
     do l=0,12

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/adios_helpers_definitions.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/adios_helpers_definitions.f90	2013-10-23 13:03:00 UTC (rev 22970)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/adios_helpers_definitions.f90	2013-10-23 14:27:26 UTC (rev 22971)
@@ -1268,7 +1268,7 @@
   full_name = trim(path) // trim(array_name)
 
   call define_adios_global_1d_logical_generic(adios_group, group_size_inc, trim(full_name), local_dim)
-  
+
 end subroutine define_adios_global_1d_logical_5d
 
 end module adios_helpers_definitions_mod

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/adios_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/adios_method_stubs.c	2013-10-23 13:03:00 UTC (rev 22970)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/adios_method_stubs.c	2013-10-23 14:27:26 UTC (rev 22971)
@@ -1,4 +1,4 @@
-/* 
+/*
 !=====================================================================
 !
 !          S p e c f e m 3 D  G l o b e  V e r s i o n  6 . 0

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/broadcast_computed_parameters.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/broadcast_computed_parameters.f90	2013-10-23 13:03:00 UTC (rev 22970)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/broadcast_computed_parameters.f90	2013-10-23 14:27:26 UTC (rev 22971)
@@ -93,7 +93,7 @@
             USE_LDDRK,INCREASE_CFL_FOR_LDDRK, &
             ANISOTROPIC_KL,SAVE_TRANSVERSE_KL_ONLY,APPROXIMATE_HESS_KL, &
             USE_FULL_TISO_MANTLE,SAVE_SOURCE_MASK, &
-            EXACT_MASS_MATRIX_FOR_ROTATION,ATTENUATION_1D_WITH_3D_STORAGE, &            
+            EXACT_MASS_MATRIX_FOR_ROTATION,ATTENUATION_1D_WITH_3D_STORAGE, &
             GPU_MODE, &
             ADIOS_ENABLED,ADIOS_FOR_FORWARD_ARRAYS, &
             ADIOS_FOR_MPI_ARRAYS,ADIOS_FOR_ARRAYS_SOLVER, &

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/intgrl.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/intgrl.f90	2013-10-23 13:03:00 UTC (rev 22970)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/intgrl.f90	2013-10-23 14:27:26 UTC (rev 22971)
@@ -33,29 +33,28 @@
   implicit none
 
 ! Argument variables
-  integer ner,nir
-  double precision f(640),r(640),s1(640),s2(640)
-  double precision s3(640),sum
+  integer :: ner,nir
+  double precision :: f(640),r(640),s1(640),s2(640)
+  double precision :: s3(640),sum
 
 ! Local variables
   double precision, parameter :: third = 1.0d0/3.0d0
   double precision, parameter :: fifth = 1.0d0/5.0d0
   double precision, parameter :: sixth = 1.0d0/6.0d0
 
-  double precision rji,yprime(640)
-  double precision s1l,s2l,s3l
+  double precision :: rji,yprime(640)
+  double precision :: s1l,s2l,s3l
 
-  integer i,j,n,kdis(28)
-  integer ndis,nir1
+  integer :: i,j,n,kdis(28)
+  integer :: ndis,nir1
 
-
-
   data kdis/163,323,336,517,530,540,565,590,609,619,626,633,16*0/
 
   ndis = 12
   n = 640
 
   call deriv(f,yprime,n,r,ndis,kdis,s1,s2,s3)
+
   nir1 = nir + 1
   sum = 0.0d0
   do i=nir1,ner
@@ -79,17 +78,17 @@
   implicit none
 
 ! Argument variables
-  integer kdis(28),n,ndis
-  double precision r(n),s1(n),s2(n),s3(n)
-  double precision y(n),yprime(n)
+  integer :: kdis(28),n,ndis
+  double precision :: r(n),s1(n),s2(n),s3(n)
+  double precision :: y(n),yprime(n)
 
 ! Local variables
-  integer i,j,j1,j2
-  integer k,nd,ndp
-  double precision a0,b0,b1
-  double precision f(3,1000),h,h2,h2a
-  double precision h2b,h3a,ha,s13
-  double precision s21,s32,yy(3)
+  integer :: i,j,j1,j2
+  integer :: k,nd,ndp
+  double precision :: a0,b0,b1
+  double precision :: f(3,1000),h,h2,h2a
+  double precision :: h2b,h3a,ha,s13
+  double precision :: s21,s32,yy(3)
 
   yy(1) = 0.d0
   yy(2) = 0.d0

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/param_reader.c
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/param_reader.c	2013-10-23 13:03:00 UTC (rev 22970)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/param_reader.c	2013-10-23 14:27:26 UTC (rev 22971)
@@ -1,4 +1,4 @@
-/* 
+/*
 !=====================================================================
 !
 !          S p e c f e m 3 D  G l o b e  V e r s i o n  6 . 0

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element.F90	2013-10-23 13:03:00 UTC (rev 22970)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element.F90	2013-10-23 14:27:26 UTC (rev 22971)
@@ -25,17 +25,18 @@
 !
 !=====================================================================
 
+! we switch between vectorized and non-vectorized version by using pre-processor flag FORCE_VECTORIZATION
+! and macros INDEX_IJK, DO_LOOP_IJK, ENDDO_LOOP_IJK defined in config.fh
+#include "config.fh"
 
+
+
 !--------------------------------------------------------------------------------------------
 !
 ! isotropic element
 !
 !--------------------------------------------------------------------------------------------
 
-#ifndef FORCE_VECTORIZATION
-
-  ! default, without vectorization
-
   subroutine compute_element_iso(ispec, &
                                  minus_gravity_table,density_table,minus_deriv_gravity_table, &
                                  xstore,ystore,zstore, &
@@ -48,7 +49,7 @@
                                  one_minus_sum_beta,vnspec, &
                                  tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, &
                                  dummyx_loc,dummyy_loc,dummyz_loc, &
-                                 epsilondev_loc,rho_s_H,is_backward_field)
+                                 epsilondev_loc,rho_s_H)
 
   use constants_solver
   use specfem_par,only: COMPUTE_AND_STORE_STRAIN
@@ -98,8 +99,6 @@
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: rho_s_H
   real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
 
-  logical :: is_backward_field
-
 ! local parameters
   real(kind=CUSTOM_REAL) one_minus_sum_beta_use
   real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
@@ -122,404 +121,50 @@
   double precision factor,sx_l,sy_l,sz_l,gxl,gyl,gzl
   double precision Hxxl,Hyyl,Hzzl,Hxyl,Hxzl,Hyzl
 
-  integer :: i,j,k
   integer :: int_radius
   integer :: iglob
 
-  logical :: dummyl
-
-!daniel debug
-  ! dummy to avoid compiler warning
-  dummyl = is_backward_field
-
-  ! isotropic element
-
-  do k=1,NGLLZ
-    do j=1,NGLLY
-      do i=1,NGLLX
-        ! get derivatives of ux, uy and uz with respect to x, y and z
-        xixl = xix(i,j,k,ispec)
-        xiyl = xiy(i,j,k,ispec)
-        xizl = xiz(i,j,k,ispec)
-        etaxl = etax(i,j,k,ispec)
-        etayl = etay(i,j,k,ispec)
-        etazl = etaz(i,j,k,ispec)
-        gammaxl = gammax(i,j,k,ispec)
-        gammayl = gammay(i,j,k,ispec)
-        gammazl = gammaz(i,j,k,ispec)
-
-        ! compute the jacobian
-        jacobianl = 1.0_CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
-                                     - xiyl*(etaxl*gammazl-etazl*gammaxl) &
-                                     + xizl*(etaxl*gammayl-etayl*gammaxl))
-
-        duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
-        duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
-        duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
-
-        duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
-        duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
-        duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
-
-        duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
-        duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
-        duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
-
-        ! precompute some sums to save CPU time
-        duxdxl_plus_duydyl = duxdxl + duydyl
-        duxdxl_plus_duzdzl = duxdxl + duzdzl
-        duydyl_plus_duzdzl = duydyl + duzdzl
-        duxdyl_plus_duydxl = duxdyl + duydxl
-        duzdxl_plus_duxdzl = duzdxl + duxdzl
-        duzdyl_plus_duydzl = duzdyl + duydzl
-
-!ZN beware, here the expression differs from the strain used in memory variable equation (6) in D. Komatitsch and J. Tromp 1999,
-!ZN here Brian Savage uses the engineering strain which are epsilon = 1/2*(grad U + (grad U)^T),
-!ZN where U is the displacement vector and grad the gradient operator, i.e. there is a 1/2 factor difference between the two.
-!ZN Both expressions are fine, but we need to keep in mind that if we put the 1/2 factor here there we need to remove it
-!ZN from the expression in which we use the strain later in the code.
-
-        ! compute deviatoric strain
-        if (COMPUTE_AND_STORE_STRAIN) then
-          templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
-          if(NSPEC_CRUST_MANTLE_STRAIN_ONLY == 1) then
-            if( ispec == 1) then
-              epsilon_trace_over_3(i,j,k,1) = templ
-            endif
-          else
-            epsilon_trace_over_3(i,j,k,ispec) = templ
-          endif
-          epsilondev_loc(1,i,j,k) = duxdxl - templ
-          epsilondev_loc(2,i,j,k) = duydyl - templ
-          epsilondev_loc(3,i,j,k) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
-          epsilondev_loc(4,i,j,k) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
-          epsilondev_loc(5,i,j,k) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
-        endif
-
-        !
-        ! compute  isotropic  elements
-        !
-
-        ! layer with no transverse isotropy, use kappav and muv
-        kappal = kappavstore(i,j,k,ispec)
-        mul = muvstore(i,j,k,ispec)
-
-        ! use unrelaxed parameters if attenuation
-        if(ATTENUATION_VAL ) then
-          ! precompute terms for attenuation if needed
-          if( ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL ) then
-            one_minus_sum_beta_use = one_minus_sum_beta(i,j,k,ispec)
-          else
-            one_minus_sum_beta_use = one_minus_sum_beta(1,1,1,ispec)
-          endif
-          mul = mul * one_minus_sum_beta_use
-        endif
-
-        lambdalplus2mul = kappal + FOUR_THIRDS * mul
-        lambdal = lambdalplus2mul - 2.0_CUSTOM_REAL*mul
-
-        ! compute stress sigma
-        sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
-        sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
-        sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
-
-        sigma_xy = mul*duxdyl_plus_duydxl
-        sigma_xz = mul*duzdxl_plus_duxdzl
-        sigma_yz = mul*duzdyl_plus_duydzl
-
-        ! subtract memory variables if attenuation
-        if(ATTENUATION_VAL .and. .not. PARTIAL_PHYS_DISPERSION_ONLY_VAL ) then
-
-!daniel debug: att - debug update
-!          call compute_element_att_mem_up_cm(ispec,i,j,k, &
-!                                          R_xx(1,i,j,k,ispec), &
-!                                          R_yy(1,i,j,k,ispec), &
-!                                          R_xy(1,i,j,k,ispec), &
-!                                          R_xz(1,i,j,k,ispec), &
-!                                          R_yz(1,i,j,k,ispec), &
-!                                          epsilondev_loc(:,i,j,k),muvstore(i,j,k,ispec),is_backward_field)
-
-          ! note: function inlining is generally done by fortran compilers;
-          !       compilers decide based on performance heuristics
-          ! note: fortran passes pointers to array location, thus R_memory(1,1,...) should be fine
-          call compute_element_att_stress(R_xx(1,i,j,k,ispec), &
-                                          R_yy(1,i,j,k,ispec), &
-                                          R_xy(1,i,j,k,ispec), &
-                                          R_xz(1,i,j,k,ispec), &
-                                          R_yz(1,i,j,k,ispec), &
-                                          sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz)
-
-        endif ! ATTENUATION_VAL
-
-        ! define symmetric components of sigma for gravity
-        sigma_yx = sigma_xy
-        sigma_zx = sigma_xz
-        sigma_zy = sigma_yz
-
-        ! compute non-symmetric terms for gravity
-        if(GRAVITY_VAL) then
-            ! use mesh coordinates to get theta and phi
-            ! x y and z contain r theta and phi
-            iglob = ibool(i,j,k,ispec)
-
-            dtheta = dble(ystore(iglob))
-            dphi = dble(zstore(iglob))
-
-            cos_theta = dcos(dtheta)
-            sin_theta = dsin(dtheta)
-            cos_phi = dcos(dphi)
-            sin_phi = dsin(dphi)
-
-            cos_theta_sq = cos_theta*cos_theta
-            sin_theta_sq = sin_theta*sin_theta
-            cos_phi_sq = cos_phi*cos_phi
-            sin_phi_sq = sin_phi*sin_phi
-
-            ! get g, rho and dg/dr=dg
-            ! spherical components of the gravitational acceleration
-            ! for efficiency replace with lookup table every 100 m in radial direction
-            radius = dble(xstore(iglob))
-
-            int_radius = nint(10.d0 * radius * R_EARTH_KM )
-            minus_g = minus_gravity_table(int_radius)
-            minus_dg = minus_deriv_gravity_table(int_radius)
-            rho = density_table(int_radius)
-
-            ! Cartesian components of the gravitational acceleration
-            gxl = minus_g*sin_theta*cos_phi
-            gyl = minus_g*sin_theta*sin_phi
-            gzl = minus_g*cos_theta
-
-            ! Cartesian components of gradient of gravitational acceleration
-            ! obtained from spherical components
-            minus_g_over_radius = minus_g / radius
-            minus_dg_plus_g_over_radius = minus_dg - minus_g_over_radius
-
-            Hxxl = minus_g_over_radius*(cos_phi_sq*cos_theta_sq + sin_phi_sq) + cos_phi_sq*minus_dg*sin_theta_sq
-            Hyyl = minus_g_over_radius*(cos_phi_sq + cos_theta_sq*sin_phi_sq) + minus_dg*sin_phi_sq*sin_theta_sq
-            Hzzl = cos_theta_sq*minus_dg + minus_g_over_radius*sin_theta_sq
-            Hxyl = cos_phi*minus_dg_plus_g_over_radius*sin_phi*sin_theta_sq
-            Hxzl = cos_phi*cos_theta*minus_dg_plus_g_over_radius*sin_theta
-            Hyzl = cos_theta*minus_dg_plus_g_over_radius*sin_phi*sin_theta
-
-            ! distinguish between single and double precision for reals
-            if(CUSTOM_REAL == SIZE_REAL) then
-
-              ! get displacement and multiply by density to compute G tensor
-              sx_l = rho * dble(dummyx_loc(i,j,k)) ! dble(displ_crust_mantle(1,iglob))
-              sy_l = rho * dble(dummyy_loc(i,j,k)) ! dble(displ_crust_mantle(2,iglob))
-              sz_l = rho * dble(dummyz_loc(i,j,k)) ! dble(displ_crust_mantle(3,iglob))
-
-              ! compute G tensor from s . g and add to sigma (not symmetric)
-              sigma_xx = sigma_xx + sngl(sy_l*gyl + sz_l*gzl)
-              sigma_yy = sigma_yy + sngl(sx_l*gxl + sz_l*gzl)
-              sigma_zz = sigma_zz + sngl(sx_l*gxl + sy_l*gyl)
-
-              sigma_xy = sigma_xy - sngl(sx_l * gyl)
-              sigma_yx = sigma_yx - sngl(sy_l * gxl)
-
-              sigma_xz = sigma_xz - sngl(sx_l * gzl)
-              sigma_zx = sigma_zx - sngl(sz_l * gxl)
-
-              sigma_yz = sigma_yz - sngl(sy_l * gzl)
-              sigma_zy = sigma_zy - sngl(sz_l * gyl)
-
-              ! precompute vector
-              factor = dble(jacobianl) * wgll_cube(i,j,k)
-              rho_s_H(1,i,j,k) = sngl(factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl))
-              rho_s_H(2,i,j,k) = sngl(factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl))
-              rho_s_H(3,i,j,k) = sngl(factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl))
-
-            else
-
-              ! get displacement and multiply by density to compute G tensor
-              sx_l = rho * dummyx_loc(i,j,k) ! displ_crust_mantle(1,iglob)
-              sy_l = rho * dummyy_loc(i,j,k) ! displ_crust_mantle(2,iglob)
-              sz_l = rho * dummyz_loc(i,j,k) ! displ_crust_mantle(3,iglob)
-
-              ! compute G tensor from s . g and add to sigma (not symmetric)
-              sigma_xx = sigma_xx + sy_l*gyl + sz_l*gzl
-              sigma_yy = sigma_yy + sx_l*gxl + sz_l*gzl
-              sigma_zz = sigma_zz + sx_l*gxl + sy_l*gyl
-
-              sigma_xy = sigma_xy - sx_l * gyl
-              sigma_yx = sigma_yx - sy_l * gxl
-
-              sigma_xz = sigma_xz - sx_l * gzl
-              sigma_zx = sigma_zx - sz_l * gxl
-
-              sigma_yz = sigma_yz - sy_l * gzl
-              sigma_zy = sigma_zy - sz_l * gyl
-
-              ! precompute vector
-              factor = jacobianl * wgll_cube(i,j,k)
-              rho_s_H(1,i,j,k) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
-              rho_s_H(2,i,j,k) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
-              rho_s_H(3,i,j,k) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
-
-            endif
-
-        endif  ! end of section with gravity terms
-
-        ! form dot product with test vector, non-symmetric form
-        tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
-        tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
-        tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
-
-        tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
-        tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
-        tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
-
-        tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
-        tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
-        tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
-
-      enddo ! NGLLX
-    enddo ! NGLLY
-  enddo ! NGLLZ
-
-  end subroutine compute_element_iso
-
-#else
-
-! FORCE_VECTORIZATION
-! vectorized routine
-
-  subroutine compute_element_iso(ispec, &
-                                 minus_gravity_table,density_table,minus_deriv_gravity_table, &
-                                 xstore,ystore,zstore, &
-                                 xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                                 wgll_cube, &
-                                 kappavstore,muvstore, &
-                                 ibool, &
-                                 R_xx,R_yy,R_xy,R_xz,R_yz, &
-                                 epsilon_trace_over_3, &
-                                 one_minus_sum_beta,vnspec, &
-                                 tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, &
-                                 dummyx_loc,dummyy_loc,dummyz_loc, &
-                                 epsilondev_loc,rho_s_H,is_backward_field)
-
-  use constants_solver
-  use specfem_par,only: COMPUTE_AND_STORE_STRAIN
-
-  implicit none
-
-
-  ! element id
-  integer :: ispec
-
-  ! arrays with mesh parameters per slice
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: ibool
-
-  ! x y and z contain r theta and phi
-  real(kind=CUSTOM_REAL), dimension(NGLOB_CRUST_MANTLE) :: xstore,ystore,zstore
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: &
-        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
-
-  ! array with derivatives of Lagrange polynomials and precalculated products
-  double precision, dimension(NGLLX,NGLLY,NGLLZ) :: wgll_cube
-
-  ! store anisotropic properties only where needed to save memory
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ISO_MANTLE) :: &
-        kappavstore,muvstore
-
-  ! variable sized array variables
-  integer :: vnspec
-
-  ! attenuation
-  ! memory variables for attenuation
-  ! memory variables R_ij are stored at the local rather than global level
-  ! to allow for optimization of cache access by compiler
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUATION) :: R_xx,R_yy,R_xy,R_xz,R_yz
-
-  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: epsilon_trace_over_3
-
-  real(kind=CUSTOM_REAL), dimension(ATT1_VAL,ATT2_VAL,ATT3_VAL,vnspec) :: one_minus_sum_beta
-
-  ! gravity
-  double precision, dimension(NRAD_GRAVITY) :: minus_gravity_table,density_table,minus_deriv_gravity_table
-
-  ! element info
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
-    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc
-
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: rho_s_H
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
-
-  logical :: is_backward_field
-
-! local parameters
-  real(kind=CUSTOM_REAL) one_minus_sum_beta_use
-  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
-  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
-  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
-  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
-  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz
-  real(kind=CUSTOM_REAL) sigma_yx,sigma_zx,sigma_zy
-
-  real(kind=CUSTOM_REAL) templ
-  real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
-  real(kind=CUSTOM_REAL) kappal
-
-  ! for gravity
-  double precision dphi,dtheta
-  double precision radius,rho,minus_g,minus_dg
-  double precision minus_g_over_radius,minus_dg_plus_g_over_radius
-  double precision cos_theta,sin_theta,cos_phi,sin_phi
-  double precision cos_theta_sq,sin_theta_sq,cos_phi_sq,sin_phi_sq
-  double precision factor,sx_l,sy_l,sz_l,gxl,gyl,gzl
-  double precision Hxxl,Hyyl,Hzzl,Hxyl,Hxzl,Hyzl
-
-  integer :: int_radius
-  integer :: iglob
-
-  logical :: dummyl
-
-  ! force vectorization
-  integer :: ijk
-  real(kind=CUSTOM_REAL) R_xx_val,R_yy_val
-
+#ifdef FORCE_VECTORIZATION
 ! in this vectorized version we have to assume that N_SLS == 3 in order to be able to unroll and thus suppress
 ! an inner loop that would otherwise prevent vectorization; this is safe in practice in all cases because N_SLS == 3
-! in all known applications, and in the main program we check that N_SLS == 3 if FORCE_VECTORIZATION is used and we stop otherwise
+! in all known applications, and in the main program we check that N_SLS == 3 if FORCE_VECTORIZATION is used and we stop
+  integer :: ijk
+  real(kind=CUSTOM_REAL) :: R_xx_val,R_yy_val
+#else
+  integer :: i,j,k
+#endif
 
-!daniel debug
-  ! dummy to avoid compiler warning
-  dummyl = is_backward_field
-
   ! isotropic element
 
-  do ijk=1,NGLLCUBE
+  DO_LOOP_IJK
 
     ! get derivatives of ux, uy and uz with respect to x, y and z
-    xixl = xix(ijk,1,1,ispec)
-    xiyl = xiy(ijk,1,1,ispec)
-    xizl = xiz(ijk,1,1,ispec)
-    etaxl = etax(ijk,1,1,ispec)
-    etayl = etay(ijk,1,1,ispec)
-    etazl = etaz(ijk,1,1,ispec)
-    gammaxl = gammax(ijk,1,1,ispec)
-    gammayl = gammay(ijk,1,1,ispec)
-    gammazl = gammaz(ijk,1,1,ispec)
+    xixl = xix(INDEX_IJK,ispec)
+    xiyl = xiy(INDEX_IJK,ispec)
+    xizl = xiz(INDEX_IJK,ispec)
+    etaxl = etax(INDEX_IJK,ispec)
+    etayl = etay(INDEX_IJK,ispec)
+    etazl = etaz(INDEX_IJK,ispec)
+    gammaxl = gammax(INDEX_IJK,ispec)
+    gammayl = gammay(INDEX_IJK,ispec)
+    gammazl = gammaz(INDEX_IJK,ispec)
 
     ! compute the jacobian
     jacobianl = 1.0_CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
                                  - xiyl*(etaxl*gammazl-etazl*gammaxl) &
                                  + xizl*(etaxl*gammayl-etayl*gammaxl))
 
-    duxdxl = xixl*tempx1(ijk,1,1) + etaxl*tempx2(ijk,1,1) + gammaxl*tempx3(ijk,1,1)
-    duxdyl = xiyl*tempx1(ijk,1,1) + etayl*tempx2(ijk,1,1) + gammayl*tempx3(ijk,1,1)
-    duxdzl = xizl*tempx1(ijk,1,1) + etazl*tempx2(ijk,1,1) + gammazl*tempx3(ijk,1,1)
+    duxdxl = xixl*tempx1(INDEX_IJK) + etaxl*tempx2(INDEX_IJK) + gammaxl*tempx3(INDEX_IJK)
+    duxdyl = xiyl*tempx1(INDEX_IJK) + etayl*tempx2(INDEX_IJK) + gammayl*tempx3(INDEX_IJK)
+    duxdzl = xizl*tempx1(INDEX_IJK) + etazl*tempx2(INDEX_IJK) + gammazl*tempx3(INDEX_IJK)
 
-    duydxl = xixl*tempy1(ijk,1,1) + etaxl*tempy2(ijk,1,1) + gammaxl*tempy3(ijk,1,1)
-    duydyl = xiyl*tempy1(ijk,1,1) + etayl*tempy2(ijk,1,1) + gammayl*tempy3(ijk,1,1)
-    duydzl = xizl*tempy1(ijk,1,1) + etazl*tempy2(ijk,1,1) + gammazl*tempy3(ijk,1,1)
+    duydxl = xixl*tempy1(INDEX_IJK) + etaxl*tempy2(INDEX_IJK) + gammaxl*tempy3(INDEX_IJK)
+    duydyl = xiyl*tempy1(INDEX_IJK) + etayl*tempy2(INDEX_IJK) + gammayl*tempy3(INDEX_IJK)
+    duydzl = xizl*tempy1(INDEX_IJK) + etazl*tempy2(INDEX_IJK) + gammazl*tempy3(INDEX_IJK)
 
-    duzdxl = xixl*tempz1(ijk,1,1) + etaxl*tempz2(ijk,1,1) + gammaxl*tempz3(ijk,1,1)
-    duzdyl = xiyl*tempz1(ijk,1,1) + etayl*tempz2(ijk,1,1) + gammayl*tempz3(ijk,1,1)
-    duzdzl = xizl*tempz1(ijk,1,1) + etazl*tempz2(ijk,1,1) + gammazl*tempz3(ijk,1,1)
+    duzdxl = xixl*tempz1(INDEX_IJK) + etaxl*tempz2(INDEX_IJK) + gammaxl*tempz3(INDEX_IJK)
+    duzdyl = xiyl*tempz1(INDEX_IJK) + etayl*tempz2(INDEX_IJK) + gammayl*tempz3(INDEX_IJK)
+    duzdzl = xizl*tempz1(INDEX_IJK) + etazl*tempz2(INDEX_IJK) + gammazl*tempz3(INDEX_IJK)
 
     ! precompute some sums to save CPU time
     duxdxl_plus_duydyl = duxdxl + duydyl
@@ -538,18 +183,18 @@
     ! compute deviatoric strain
     if (COMPUTE_AND_STORE_STRAIN) then
       templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
-      if( NSPEC_CRUST_MANTLE_STRAIN_ONLY == 1 ) then
+      if(NSPEC_CRUST_MANTLE_STRAIN_ONLY == 1) then
         if( ispec == 1) then
-          epsilon_trace_over_3(ijk,1,1,1) = templ
+          epsilon_trace_over_3(INDEX_IJK,1) = templ
         endif
       else
-        epsilon_trace_over_3(ijk,1,1,ispec) = templ
+        epsilon_trace_over_3(INDEX_IJK,ispec) = templ
       endif
-      epsilondev_loc(1,ijk,1,1) = duxdxl - templ
-      epsilondev_loc(2,ijk,1,1) = duydyl - templ
-      epsilondev_loc(3,ijk,1,1) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
-      epsilondev_loc(4,ijk,1,1) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
-      epsilondev_loc(5,ijk,1,1) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
+      epsilondev_loc(1,INDEX_IJK) = duxdxl - templ
+      epsilondev_loc(2,INDEX_IJK) = duydyl - templ
+      epsilondev_loc(3,INDEX_IJK) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
+      epsilondev_loc(4,INDEX_IJK) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
+      epsilondev_loc(5,INDEX_IJK) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
     endif
 
     !
@@ -557,14 +202,14 @@
     !
 
     ! layer with no transverse isotropy, use kappav and muv
-    kappal = kappavstore(ijk,1,1,ispec)
-    mul = muvstore(ijk,1,1,ispec)
+    kappal = kappavstore(INDEX_IJK,ispec)
+    mul = muvstore(INDEX_IJK,ispec)
 
     ! use unrelaxed parameters if attenuation
-    if(ATTENUATION_VAL) then
+    if(ATTENUATION_VAL ) then
       ! precompute terms for attenuation if needed
       if( ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL ) then
-        one_minus_sum_beta_use = one_minus_sum_beta(ijk,1,1,ispec)
+        one_minus_sum_beta_use = one_minus_sum_beta(INDEX_IJK,ispec)
       else
         one_minus_sum_beta_use = one_minus_sum_beta(1,1,1,ispec)
       endif
@@ -584,35 +229,61 @@
     sigma_yz = mul*duzdyl_plus_duydzl
 
     ! subtract memory variables if attenuation
-    if(ATTENUATION_VAL .and. .not. PARTIAL_PHYS_DISPERSION_ONLY_VAL) then
-  ! do NOT put this is a subroutine, otherwise the call to the subroutine prevents compilers from vectorizing the outer loop
-  ! here we assume that N_SLS == 3 in order to be able to unroll and suppress the loop in order to vectorize the outer loop
-      R_xx_val = R_xx(1,ijk,1,1,ispec)
-      R_yy_val = R_yy(1,ijk,1,1,ispec)
+    if(ATTENUATION_VAL .and. .not. PARTIAL_PHYS_DISPERSION_ONLY_VAL ) then
+
+#ifdef FORCE_VECTORIZATION
+      ! do NOT put this is a subroutine, otherwise the call to the subroutine prevents compilers
+      !from vectorizing the outer loop
+
+      ! here we assume that N_SLS == 3 in order to be able to unroll and suppress the loop
+      ! in order to vectorize the outer loop
+      R_xx_val = R_xx(1,INDEX_IJK,ispec)
+      R_yy_val = R_yy(1,INDEX_IJK,ispec)
       sigma_xx = sigma_xx - R_xx_val
       sigma_yy = sigma_yy - R_yy_val
       sigma_zz = sigma_zz + R_xx_val + R_yy_val
-      sigma_xy = sigma_xy - R_xy(1,ijk,1,1,ispec)
-      sigma_xz = sigma_xz - R_xz(1,ijk,1,1,ispec)
-      sigma_yz = sigma_yz - R_yz(1,ijk,1,1,ispec)
+      sigma_xy = sigma_xy - R_xy(1,INDEX_IJK,ispec)
+      sigma_xz = sigma_xz - R_xz(1,INDEX_IJK,ispec)
+      sigma_yz = sigma_yz - R_yz(1,INDEX_IJK,ispec)
 
-      R_xx_val = R_xx(2,ijk,1,1,ispec)
-      R_yy_val = R_yy(2,ijk,1,1,ispec)
+      R_xx_val = R_xx(2,INDEX_IJK,ispec)
+      R_yy_val = R_yy(2,INDEX_IJK,ispec)
       sigma_xx = sigma_xx - R_xx_val
       sigma_yy = sigma_yy - R_yy_val
       sigma_zz = sigma_zz + R_xx_val + R_yy_val
-      sigma_xy = sigma_xy - R_xy(2,ijk,1,1,ispec)
-      sigma_xz = sigma_xz - R_xz(2,ijk,1,1,ispec)
-      sigma_yz = sigma_yz - R_yz(2,ijk,1,1,ispec)
+      sigma_xy = sigma_xy - R_xy(2,INDEX_IJK,ispec)
+      sigma_xz = sigma_xz - R_xz(2,INDEX_IJK,ispec)
+      sigma_yz = sigma_yz - R_yz(2,INDEX_IJK,ispec)
 
-      R_xx_val = R_xx(3,ijk,1,1,ispec)
-      R_yy_val = R_yy(3,ijk,1,1,ispec)
+      R_xx_val = R_xx(3,INDEX_IJK,ispec)
+      R_yy_val = R_yy(3,INDEX_IJK,ispec)
       sigma_xx = sigma_xx - R_xx_val
       sigma_yy = sigma_yy - R_yy_val
       sigma_zz = sigma_zz + R_xx_val + R_yy_val
-      sigma_xy = sigma_xy - R_xy(3,ijk,1,1,ispec)
-      sigma_xz = sigma_xz - R_xz(3,ijk,1,1,ispec)
-      sigma_yz = sigma_yz - R_yz(3,ijk,1,1,ispec)
+      sigma_xy = sigma_xy - R_xy(3,INDEX_IJK,ispec)
+      sigma_xz = sigma_xz - R_xz(3,INDEX_IJK,ispec)
+      sigma_yz = sigma_yz - R_yz(3,INDEX_IJK,ispec)
+#else
+!daniel debug: att - debug update
+!          call compute_element_att_mem_up_cm(ispec,INDEX_IJK, &
+!                                          R_xx(1,INDEX_IJK,ispec), &
+!                                          R_yy(1,INDEX_IJK,ispec), &
+!                                          R_xy(1,INDEX_IJK,ispec), &
+!                                          R_xz(1,INDEX_IJK,ispec), &
+!                                          R_yz(1,INDEX_IJK,ispec), &
+!                                          epsilondev_loc(:,INDEX_IJK),muvstore(INDEX_IJK,ispec),is_backward_field)
+
+      ! note: function inlining is generally done by fortran compilers;
+      !       compilers decide based on performance heuristics
+      ! note: fortran passes pointers to array location, thus R_memory(1,1,...) should be fine
+      call compute_element_att_stress(R_xx(1,INDEX_IJK,ispec), &
+                                      R_yy(1,INDEX_IJK,ispec), &
+                                      R_xy(1,INDEX_IJK,ispec), &
+                                      R_xz(1,INDEX_IJK,ispec), &
+                                      R_yz(1,INDEX_IJK,ispec), &
+                                      sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz)
+#endif
+
     endif ! ATTENUATION_VAL
 
     ! define symmetric components of sigma for gravity
@@ -624,7 +295,7 @@
     if(GRAVITY_VAL) then
         ! use mesh coordinates to get theta and phi
         ! x y and z contain r theta and phi
-        iglob = ibool(ijk,1,1,ispec)
+        iglob = ibool(INDEX_IJK,ispec)
 
         dtheta = dble(ystore(iglob))
         dphi = dble(zstore(iglob))
@@ -666,602 +337,89 @@
         Hxzl = cos_phi*cos_theta*minus_dg_plus_g_over_radius*sin_theta
         Hyzl = cos_theta*minus_dg_plus_g_over_radius*sin_phi*sin_theta
 
-        ! get displacement and multiply by density to compute G tensor
-        sx_l = rho * dummyx_loc(ijk,1,1)
-        sy_l = rho * dummyy_loc(ijk,1,1)
-        sz_l = rho * dummyz_loc(ijk,1,1)
+        ! distinguish between single and double precision for reals
+        if(CUSTOM_REAL == SIZE_REAL) then
 
-        ! compute G tensor from s . g and add to sigma (not symmetric)
-        sigma_xx = sigma_xx + sy_l*gyl + sz_l*gzl
-        sigma_yy = sigma_yy + sx_l*gxl + sz_l*gzl
-        sigma_zz = sigma_zz + sx_l*gxl + sy_l*gyl
+          ! get displacement and multiply by density to compute G tensor
+          sx_l = rho * dble(dummyx_loc(INDEX_IJK)) ! dble(displ_crust_mantle(1,iglob))
+          sy_l = rho * dble(dummyy_loc(INDEX_IJK)) ! dble(displ_crust_mantle(2,iglob))
+          sz_l = rho * dble(dummyz_loc(INDEX_IJK)) ! dble(displ_crust_mantle(3,iglob))
 
-        sigma_xy = sigma_xy - sx_l * gyl
-        sigma_yx = sigma_yx - sy_l * gxl
+          ! compute G tensor from s . g and add to sigma (not symmetric)
+          sigma_xx = sigma_xx + sngl(sy_l*gyl + sz_l*gzl)
+          sigma_yy = sigma_yy + sngl(sx_l*gxl + sz_l*gzl)
+          sigma_zz = sigma_zz + sngl(sx_l*gxl + sy_l*gyl)
 
-        sigma_xz = sigma_xz - sx_l * gzl
-        sigma_zx = sigma_zx - sz_l * gxl
+          sigma_xy = sigma_xy - sngl(sx_l * gyl)
+          sigma_yx = sigma_yx - sngl(sy_l * gxl)
 
-        sigma_yz = sigma_yz - sy_l * gzl
-        sigma_zy = sigma_zy - sz_l * gyl
+          sigma_xz = sigma_xz - sngl(sx_l * gzl)
+          sigma_zx = sigma_zx - sngl(sz_l * gxl)
 
-        ! precompute vector
-        factor = jacobianl * wgll_cube(ijk,1,1)
-        rho_s_H(1,ijk,1,1) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
-        rho_s_H(2,ijk,1,1) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
-        rho_s_H(3,ijk,1,1) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
-    endif  ! end of section with gravity terms
+          sigma_yz = sigma_yz - sngl(sy_l * gzl)
+          sigma_zy = sigma_zy - sngl(sz_l * gyl)
 
-    ! form dot product with test vector, non-symmetric form
-    tempx1(ijk,1,1) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
-    tempy1(ijk,1,1) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
-    tempz1(ijk,1,1) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
+          ! precompute vector
+          factor = dble(jacobianl) * wgll_cube(INDEX_IJK)
+          rho_s_H(1,INDEX_IJK) = sngl(factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl))
+          rho_s_H(2,INDEX_IJK) = sngl(factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl))
+          rho_s_H(3,INDEX_IJK) = sngl(factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl))
 
-    tempx2(ijk,1,1) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
-    tempy2(ijk,1,1) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
-    tempz2(ijk,1,1) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
+        else
 
-    tempx3(ijk,1,1) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
-    tempy3(ijk,1,1) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
-    tempz3(ijk,1,1) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
-  enddo
+          ! get displacement and multiply by density to compute G tensor
+          sx_l = rho * dummyx_loc(INDEX_IJK) ! displ_crust_mantle(1,iglob)
+          sy_l = rho * dummyy_loc(INDEX_IJK) ! displ_crust_mantle(2,iglob)
+          sz_l = rho * dummyz_loc(INDEX_IJK) ! displ_crust_mantle(3,iglob)
 
-  end subroutine compute_element_iso
+          ! compute G tensor from s . g and add to sigma (not symmetric)
+          sigma_xx = sigma_xx + sy_l*gyl + sz_l*gzl
+          sigma_yy = sigma_yy + sx_l*gxl + sz_l*gzl
+          sigma_zz = sigma_zz + sx_l*gxl + sy_l*gyl
 
-! end of FORCE_VECTORIZATION
-#endif
+          sigma_xy = sigma_xy - sx_l * gyl
+          sigma_yx = sigma_yx - sy_l * gxl
 
-!--------------------------------------------------------------------------------------------
-!
-! transversely isotropic element
-!
-!--------------------------------------------------------------------------------------------
+          sigma_xz = sigma_xz - sx_l * gzl
+          sigma_zx = sigma_zx - sz_l * gxl
 
-#ifndef FORCE_VECTORIZATION
+          sigma_yz = sigma_yz - sy_l * gzl
+          sigma_zy = sigma_zy - sz_l * gyl
 
-  ! default, without vectorization
+          ! precompute vector
+          factor = jacobianl * wgll_cube(INDEX_IJK)
+          rho_s_H(1,INDEX_IJK) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
+          rho_s_H(2,INDEX_IJK) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
+          rho_s_H(3,INDEX_IJK) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
 
-  subroutine compute_element_tiso(ispec, &
-                                  minus_gravity_table,density_table,minus_deriv_gravity_table, &
-                                  xstore,ystore,zstore, &
-                                  xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                                  wgll_cube, &
-                                  kappavstore,kappahstore,muvstore,muhstore,eta_anisostore, &
-                                  ibool, &
-                                  R_xx,R_yy,R_xy,R_xz,R_yz, &
-                                  epsilon_trace_over_3, &
-                                  one_minus_sum_beta,vnspec, &
-                                  tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, &
-                                  dummyx_loc,dummyy_loc,dummyz_loc, &
-                                  epsilondev_loc,rho_s_H,is_backward_field)
-
-! this routine is optimized for NGLLX = NGLLY = NGLLZ = 5 using the Deville et al. (2002) inlined matrix-matrix products
-
-  use constants_solver
-  use specfem_par,only: COMPUTE_AND_STORE_STRAIN
-
-  implicit none
-
-  ! element id
-  integer :: ispec
-
-  ! arrays with mesh parameters per slice
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: ibool
-
-  ! x y and z contain r theta and phi
-  real(kind=CUSTOM_REAL), dimension(NGLOB_CRUST_MANTLE) :: xstore,ystore,zstore
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: &
-        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
-
-  ! array with derivatives of Lagrange polynomials and precalculated products
-  double precision, dimension(NGLLX,NGLLY,NGLLZ) :: wgll_cube
-
-  ! store anisotropic properties only where needed to save memory
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_TISO_MANTLE) :: &
-        kappahstore,muhstore,eta_anisostore
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ISO_MANTLE) :: &
-        kappavstore,muvstore
-
-  ! attenuation
-  ! memory variables for attenuation
-  ! memory variables R_ij are stored at the local rather than global level
-  ! to allow for optimization of cache access by compiler
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUATION) :: R_xx,R_yy,R_xy,R_xz,R_yz
-
-  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: epsilon_trace_over_3
-
-  ! variable sized array variables
-  integer :: vnspec
-
-  ! [alpha,beta,gamma]val reduced to N_SLS  to N_SLS*NUM_NODES
-  real(kind=CUSTOM_REAL), dimension(ATT1_VAL,ATT2_VAL,ATT3_VAL,vnspec) :: one_minus_sum_beta
-
-  ! gravity
-  double precision, dimension(NRAD_GRAVITY) :: minus_gravity_table,density_table,minus_deriv_gravity_table
-
-  ! element info
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
-    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc
-
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: rho_s_H
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
-
-  logical :: is_backward_field
-
-! local parameters
-  real(kind=CUSTOM_REAL) one_minus_sum_beta_use
-  ! the 21 coefficients for an anisotropic medium in reduced notation
-  real(kind=CUSTOM_REAL) c11,c22,c33,c44,c55,c66,c12,c13,c23,c14,c24,c34,c15,c25,c35,c45,c16,c26,c36,c46,c56
-
-  real(kind=CUSTOM_REAL) rhovphsq,sinphifour,cosphisq,sinphisq,costhetasq,rhovsvsq,sinthetasq, &
-        cosphifour,costhetafour,rhovpvsq,sinthetafour,rhovshsq,cosfourphi, &
-        costwotheta,cosfourtheta,sintwophisq,costheta,sinphi,sintheta,cosphi, &
-        sintwotheta,costwophi,sintwophi,costwothetasq,costwophisq,phi,theta
-
-  real(kind=CUSTOM_REAL) two_rhovsvsq,two_rhovshsq ! two_rhovpvsq,two_rhovphsq
-  real(kind=CUSTOM_REAL) four_rhovsvsq,four_rhovshsq ! four_rhovpvsq,four_rhovphsq
-
-  real(kind=CUSTOM_REAL) twoetaminone,etaminone,eta_aniso
-  real(kind=CUSTOM_REAL) two_eta_aniso,four_eta_aniso,six_eta_aniso
-  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
-  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
-  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
-  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
-  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz
-
-  real(kind=CUSTOM_REAL) templ
-  real(kind=CUSTOM_REAL) templ1,templ1_cos,templ2,templ2_cos,templ3,templ3_two,templ3_cos
-  real(kind=CUSTOM_REAL) kappavl,kappahl,muvl,muhl
-
-  ! for gravity
-  double precision dphi,dtheta
-  double precision radius,rho,minus_g,minus_dg
-  double precision minus_g_over_radius,minus_dg_plus_g_over_radius
-  double precision cos_theta,sin_theta,cos_phi,sin_phi
-  double precision cos_theta_sq,sin_theta_sq,cos_phi_sq,sin_phi_sq
-  double precision factor,sx_l,sy_l,sz_l,gxl,gyl,gzl
-  double precision Hxxl,Hyyl,Hzzl,Hxyl,Hxzl,Hyzl
-  real(kind=CUSTOM_REAL) sigma_yx,sigma_zx,sigma_zy
-
-  integer :: i,j,k
-  integer :: int_radius
-  integer :: iglob
-
-  logical :: dummyl
-
-!daniel debug
-  ! dummy to avoid compiler warning
-  dummyl = is_backward_field
-
-  ! transverse isotropic element
-
-  do k=1,NGLLZ
-    do j=1,NGLLY
-      do i=1,NGLLX
-        ! get derivatives of ux, uy and uz with respect to x, y and z
-        xixl = xix(i,j,k,ispec)
-        xiyl = xiy(i,j,k,ispec)
-        xizl = xiz(i,j,k,ispec)
-        etaxl = etax(i,j,k,ispec)
-        etayl = etay(i,j,k,ispec)
-        etazl = etaz(i,j,k,ispec)
-        gammaxl = gammax(i,j,k,ispec)
-        gammayl = gammay(i,j,k,ispec)
-        gammazl = gammaz(i,j,k,ispec)
-
-        ! compute the jacobian
-        jacobianl = 1.0_CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
-                      - xiyl*(etaxl*gammazl-etazl*gammaxl) &
-                      + xizl*(etaxl*gammayl-etayl*gammaxl))
-
-        duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
-        duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
-        duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
-
-        duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
-        duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
-        duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
-
-        duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
-        duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
-        duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
-
-        ! precompute some sums to save CPU time
-        duxdxl_plus_duydyl = duxdxl + duydyl
-        duxdxl_plus_duzdzl = duxdxl + duzdzl
-        duydyl_plus_duzdzl = duydyl + duzdzl
-        duxdyl_plus_duydxl = duxdyl + duydxl
-        duzdxl_plus_duxdzl = duzdxl + duxdzl
-        duzdyl_plus_duydzl = duzdyl + duydzl
-
-!ZN beware, here the expression differs from the strain used in memory variable equation (6) in D. Komatitsch and J. Tromp 1999,
-!ZN here Brian Savage uses the engineering strain which are epsilon = 1/2*(grad U + (grad U)^T),
-!ZN where U is the displacement vector and grad the gradient operator, i.e. there is a 1/2 factor difference between the two.
-!ZN Both expressions are fine, but we need to keep in mind that if we put the 1/2 factor here there we need to remove it
-!ZN from the expression in which we use the strain later in the code.
-
-        ! compute deviatoric strain
-        if (COMPUTE_AND_STORE_STRAIN) then
-          templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
-          if(NSPEC_CRUST_MANTLE_STRAIN_ONLY == 1) then
-            if( ispec == 1) then
-              epsilon_trace_over_3(i,j,k,1) = templ
-            endif
-          else
-            epsilon_trace_over_3(i,j,k,ispec) = templ
-          endif
-          epsilondev_loc(1,i,j,k) = duxdxl - templ
-          epsilondev_loc(2,i,j,k) = duydyl - templ
-          epsilondev_loc(3,i,j,k) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
-          epsilondev_loc(4,i,j,k) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
-          epsilondev_loc(5,i,j,k) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
         endif
 
-        !
-        ! compute either isotropic or anisotropic elements
-        !
+    endif  ! end of section with gravity terms
 
-! note: the mesh is built such that anisotropic elements are created first in anisotropic layers,
-!           thus they are listed first ( see in create_regions_mesh.f90: perm_layer() ordering )
-!           this is therefore still in bounds of 1:NSPECMAX_TISO_MANTLE even if NSPECMAX_TISO is less than NSPEC
+    ! form dot product with test vector, non-symmetric form
+    tempx1(INDEX_IJK) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
+    tempy1(INDEX_IJK) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
+    tempz1(INDEX_IJK) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
 
-        ! use kappa and mu from transversely isotropic model
-        kappavl = kappavstore(i,j,k,ispec)
-        muvl = muvstore(i,j,k,ispec)
+    tempx2(INDEX_IJK) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
+    tempy2(INDEX_IJK) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
+    tempz2(INDEX_IJK) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
 
-        kappahl = kappahstore(i,j,k,ispec)
-        muhl = muhstore(i,j,k,ispec)
+    tempx3(INDEX_IJK) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
+    tempy3(INDEX_IJK) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
+    tempz3(INDEX_IJK) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
 
-        ! use unrelaxed parameters if attenuation
-        ! eta does not need to be shifted since it is a ratio
-        if( ATTENUATION_VAL ) then
-          ! precompute terms for attenuation if needed
-          if( ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL ) then
-            one_minus_sum_beta_use = one_minus_sum_beta(i,j,k,ispec)
-          else
-            one_minus_sum_beta_use = one_minus_sum_beta(1,1,1,ispec)
-          endif
-          muvl = muvl * one_minus_sum_beta_use
-          muhl = muhl * one_minus_sum_beta_use
-        endif
+  ENDDO_LOOP_IJK
 
-        rhovpvsq = kappavl + FOUR_THIRDS * muvl  !!! that is C
-        rhovphsq = kappahl + FOUR_THIRDS * muhl  !!! that is A
+  end subroutine compute_element_iso
 
-        rhovsvsq = muvl  !!! that is L
-        rhovshsq = muhl  !!! that is N
 
-        eta_aniso = eta_anisostore(i,j,k,ispec)  !!! that is  F / (A - 2 L)
+!--------------------------------------------------------------------------------------------
+!
+! transversely isotropic element
+!
+!--------------------------------------------------------------------------------------------
 
-        ! use mesh coordinates to get theta and phi
-        ! ystore and zstore contain theta and phi
-        iglob = ibool(i,j,k,ispec)
-
-        theta = ystore(iglob)
-        phi = zstore(iglob)
-
-         ! precompute some products to reduce the CPU time
-
-        costheta = cos(theta)
-        sintheta = sin(theta)
-        cosphi = cos(phi)
-        sinphi = sin(phi)
-
-        costhetasq = costheta * costheta
-        sinthetasq = sintheta * sintheta
-        cosphisq = cosphi * cosphi
-        sinphisq = sinphi * sinphi
-
-        costhetafour = costhetasq * costhetasq
-        sinthetafour = sinthetasq * sinthetasq
-        cosphifour = cosphisq * cosphisq
-        sinphifour = sinphisq * sinphisq
-
-        costwotheta = cos(2.0_CUSTOM_REAL*theta)
-        sintwotheta = sin(2.0_CUSTOM_REAL*theta)
-        costwophi = cos(2.0_CUSTOM_REAL*phi)
-        sintwophi = sin(2.0_CUSTOM_REAL*phi)
-
-        cosfourtheta = cos(4.0_CUSTOM_REAL*theta)
-        cosfourphi = cos(4.0_CUSTOM_REAL*phi)
-
-        costwothetasq = costwotheta * costwotheta
-
-        costwophisq = costwophi * costwophi
-        sintwophisq = sintwophi * sintwophi
-
-        etaminone = eta_aniso - 1.0_CUSTOM_REAL
-        twoetaminone = 2.0_CUSTOM_REAL * eta_aniso - 1.0_CUSTOM_REAL
-
-        ! precompute some products to reduce the CPU time
-        two_eta_aniso = 2.0_CUSTOM_REAL*eta_aniso
-        four_eta_aniso = 4.0_CUSTOM_REAL*eta_aniso
-        six_eta_aniso = 6.0_CUSTOM_REAL*eta_aniso
-
-        two_rhovsvsq = 2.0_CUSTOM_REAL*rhovsvsq
-        two_rhovshsq = 2.0_CUSTOM_REAL*rhovshsq
-        four_rhovsvsq = 4.0_CUSTOM_REAL*rhovsvsq
-        four_rhovshsq = 4.0_CUSTOM_REAL*rhovshsq
-
-        ! pre-compute temporary values
-        templ1 = four_rhovsvsq - rhovpvsq + twoetaminone*rhovphsq - four_eta_aniso*rhovsvsq
-        templ1_cos = rhovphsq - rhovpvsq + costwotheta*templ1
-        templ2 = four_rhovsvsq - rhovpvsq - rhovphsq + two_eta_aniso*rhovphsq - four_eta_aniso*rhovsvsq
-        templ2_cos = rhovpvsq - rhovphsq + costwotheta*templ2
-        templ3 = rhovphsq + rhovpvsq - two_eta_aniso*rhovphsq + four_eta_aniso*rhovsvsq
-        templ3_two = templ3 - two_rhovshsq - two_rhovsvsq
-        templ3_cos = templ3_two + costwotheta*templ2
-
-        ! reordering operations to facilitate compilation, avoiding divisions, using locality for temporary values
-        c11 = rhovphsq*sinphifour &
-              + 2.0_CUSTOM_REAL*cosphisq*sinphisq* &
-              ( rhovphsq*costhetasq + sinthetasq*(eta_aniso*rhovphsq + two_rhovsvsq - two_eta_aniso*rhovsvsq) ) &
-              + cosphifour*(rhovphsq*costhetafour &
-                + 2.0_CUSTOM_REAL*costhetasq*sinthetasq*(eta_aniso*rhovphsq + two_rhovsvsq - two_eta_aniso*rhovsvsq) &
-                + rhovpvsq*sinthetafour)
-
-        c12 = 0.25_CUSTOM_REAL*costhetasq*(rhovphsq - two_rhovshsq)*(3.0_CUSTOM_REAL + cosfourphi) &
-              - four_rhovshsq*cosphisq*costhetasq*sinphisq &
-              + 0.03125_CUSTOM_REAL*rhovphsq*sintwophisq*(11.0_CUSTOM_REAL + cosfourtheta + 4.0*costwotheta) &
-              + eta_aniso*sinthetasq*(rhovphsq - two_rhovsvsq) &
-                         *(cosphifour + sinphifour + 2.0_CUSTOM_REAL*cosphisq*costhetasq*sinphisq) &
-              + rhovpvsq*cosphisq*sinphisq*sinthetafour &
-              - rhovsvsq*sintwophisq*sinthetafour
-
-        c13 = 0.125_CUSTOM_REAL*cosphisq*(rhovphsq + six_eta_aniso*rhovphsq + rhovpvsq - four_rhovsvsq &
-                    - 12.0_CUSTOM_REAL*eta_aniso*rhovsvsq + cosfourtheta*templ1) &
-              + sinphisq*(eta_aniso*costhetasq*(rhovphsq - two_rhovsvsq) + sinthetasq*(rhovphsq - two_rhovshsq))
-
-        ! uses temporary templ1 from c13
-        c15 = cosphi*costheta*sintheta* &
-              ( 0.5_CUSTOM_REAL*cosphisq* (rhovpvsq - rhovphsq + costwotheta*templ1) &
-                + etaminone*sinphisq*(rhovphsq - two_rhovsvsq))
-
-        c14 = costheta*sinphi*sintheta* &
-              ( 0.5_CUSTOM_REAL*cosphisq*(templ2_cos + four_rhovshsq - four_rhovsvsq) &
-                + sinphisq*(etaminone*rhovphsq + 2.0_CUSTOM_REAL*(rhovshsq - eta_aniso*rhovsvsq)) )
-
-        ! uses temporary templ2_cos from c14
-        c16 = 0.5_CUSTOM_REAL*cosphi*sinphi*sinthetasq* &
-              ( cosphisq*templ2_cos &
-                + 2.0_CUSTOM_REAL*etaminone*sinphisq*(rhovphsq - two_rhovsvsq) )
-
-        c22 = rhovphsq*cosphifour + 2.0_CUSTOM_REAL*cosphisq*sinphisq* &
-              (rhovphsq*costhetasq + sinthetasq*(eta_aniso*rhovphsq + two_rhovsvsq - two_eta_aniso*rhovsvsq)) &
-              + sinphifour* &
-              (rhovphsq*costhetafour + 2.0_CUSTOM_REAL*costhetasq*sinthetasq*(eta_aniso*rhovphsq  &
-                    + two_rhovsvsq - two_eta_aniso*rhovsvsq) + rhovpvsq*sinthetafour)
-
-        ! uses temporary templ1 from c13
-        c23 = 0.125_CUSTOM_REAL*sinphisq*(rhovphsq + six_eta_aniso*rhovphsq &
-                + rhovpvsq - four_rhovsvsq - 12.0_CUSTOM_REAL*eta_aniso*rhovsvsq + cosfourtheta*templ1) &
-              + cosphisq*(eta_aniso*costhetasq*(rhovphsq - two_rhovsvsq) + sinthetasq*(rhovphsq - two_rhovshsq))
-
-        ! uses temporary templ1 from c13
-        c24 = costheta*sinphi*sintheta* &
-              ( etaminone*cosphisq*(rhovphsq - two_rhovsvsq) &
-                + 0.5_CUSTOM_REAL*sinphisq*(rhovpvsq - rhovphsq + costwotheta*templ1) )
-
-        ! uses temporary templ2_cos from c14
-        c25 = cosphi*costheta*sintheta* &
-              ( cosphisq*(etaminone*rhovphsq + 2.0_CUSTOM_REAL*(rhovshsq - eta_aniso*rhovsvsq)) &
-                + 0.5_CUSTOM_REAL*sinphisq*(templ2_cos + four_rhovshsq - four_rhovsvsq) )
-
-        ! uses temporary templ2_cos from c14
-        c26 = 0.5_CUSTOM_REAL*cosphi*sinphi*sinthetasq* &
-              ( 2.0_CUSTOM_REAL*etaminone*cosphisq*(rhovphsq - two_rhovsvsq) &
-                + sinphisq*templ2_cos )
-
-        c33 = rhovpvsq*costhetafour &
-              + 2.0_CUSTOM_REAL*costhetasq*sinthetasq*(two_rhovsvsq + eta_aniso*(rhovphsq - two_rhovsvsq)) &
-              + rhovphsq*sinthetafour
-
-        ! uses temporary templ1_cos from c13
-        c34 = - 0.25_CUSTOM_REAL*sinphi*sintwotheta*templ1_cos
-
-        ! uses temporary templ1_cos from c34
-        c35 = - 0.25_CUSTOM_REAL*cosphi*sintwotheta*templ1_cos
-
-        ! uses temporary templ1_cos from c34
-        c36 = - 0.25_CUSTOM_REAL*sintwophi*sinthetasq*(templ1_cos - four_rhovshsq + four_rhovsvsq)
-
-        c44 = cosphisq*(rhovsvsq*costhetasq + rhovshsq*sinthetasq) &
-              + sinphisq*(rhovsvsq*costwothetasq + costhetasq*sinthetasq*templ3)
-
-        ! uses temporary templ3 from c44
-        c46 = - cosphi*costheta*sintheta* &
-                ( cosphisq*(rhovshsq - rhovsvsq) - 0.5_CUSTOM_REAL*sinphisq*templ3_cos  )
-
-        ! uses templ3 from c46
-        c45 = 0.25_CUSTOM_REAL*sintwophi*sinthetasq* &
-              (templ3_two + costwotheta*(rhovphsq + rhovpvsq - two_eta_aniso*rhovphsq + 4.0_CUSTOM_REAL*etaminone*rhovsvsq))
-
-        c55 = sinphisq*(rhovsvsq*costhetasq + rhovshsq*sinthetasq) &
-              + cosphisq*(rhovsvsq*costwothetasq &
-                  + costhetasq*sinthetasq*(rhovphsq - two_eta_aniso*rhovphsq + rhovpvsq + four_eta_aniso*rhovsvsq) )
-
-        ! uses temporary templ3_cos from c46
-        c56 = costheta*sinphi*sintheta* &
-              ( 0.5_CUSTOM_REAL*cosphisq*templ3_cos + sinphisq*(rhovsvsq - rhovshsq) )
-
-        c66 = rhovshsq*costwophisq*costhetasq &
-              - 2.0_CUSTOM_REAL*cosphisq*costhetasq*sinphisq*(rhovphsq - two_rhovshsq) &
-              + 0.03125_CUSTOM_REAL*rhovphsq*sintwophisq*(11.0_CUSTOM_REAL + 4.0_CUSTOM_REAL*costwotheta + cosfourtheta) &
-              - 0.125_CUSTOM_REAL*rhovsvsq*sinthetasq* &
-              ( -6.0_CUSTOM_REAL - 2.0_CUSTOM_REAL*costwotheta - 2.0_CUSTOM_REAL*cosfourphi &
-                        + cos(4.0_CUSTOM_REAL*phi - 2.0_CUSTOM_REAL*theta) &
-                        + cos(2.0_CUSTOM_REAL*(2.0_CUSTOM_REAL*phi + theta)) ) &
-              + rhovpvsq*cosphisq*sinphisq*sinthetafour &
-              - 0.5_CUSTOM_REAL*eta_aniso*sintwophisq*sinthetafour*(rhovphsq - two_rhovsvsq)
-
-        ! general expression of stress tensor for full Cijkl with 21 coefficients
-        sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
-                 c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
-
-        sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
-                 c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
-
-        sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
-                 c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
-
-        sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
-                 c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
-
-        sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
-                 c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
-
-        sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
-                 c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
-
-        ! subtract memory variables if attenuation
-        if(ATTENUATION_VAL .and. .not. PARTIAL_PHYS_DISPERSION_ONLY_VAL ) then
-
-          ! note: function inlining is generally done by fortran compilers;
-          !       compilers decide based on performance heuristics
-          ! note: fortran passes pointers to array location, thus R_memory(1,1,...) is fine
-          call compute_element_att_stress(R_xx(1,i,j,k,ispec), &
-                                          R_yy(1,i,j,k,ispec), &
-                                          R_xy(1,i,j,k,ispec), &
-                                          R_xz(1,i,j,k,ispec), &
-                                          R_yz(1,i,j,k,ispec), &
-                                          sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz)
-
-        endif ! ATTENUATION_VAL
-
-        ! define symmetric components of sigma for gravity
-        sigma_yx = sigma_xy
-        sigma_zx = sigma_xz
-        sigma_zy = sigma_yz
-
-        ! compute non-symmetric terms for gravity
-        if(GRAVITY_VAL) then
-            ! use mesh coordinates to get theta and phi
-            ! x y and z contain r theta and phi
-            iglob = ibool(i,j,k,ispec)
-
-            dtheta = dble(ystore(iglob))
-            dphi = dble(zstore(iglob))
-            radius = dble(xstore(iglob))
-
-            cos_theta = dcos(dtheta)
-            sin_theta = dsin(dtheta)
-            cos_phi = dcos(dphi)
-            sin_phi = dsin(dphi)
-
-            cos_theta_sq = cos_theta*cos_theta
-            sin_theta_sq = sin_theta*sin_theta
-            cos_phi_sq = cos_phi*cos_phi
-            sin_phi_sq = sin_phi*sin_phi
-
-            ! get g, rho and dg/dr=dg
-            ! spherical components of the gravitational acceleration
-            ! for efficiency replace with lookup table every 100 m in radial direction
-            int_radius = nint(10.d0 * radius * R_EARTH_KM )
-            minus_g = minus_gravity_table(int_radius)
-            minus_dg = minus_deriv_gravity_table(int_radius)
-            rho = density_table(int_radius)
-
-            ! Cartesian components of the gravitational acceleration
-            gxl = minus_g*sin_theta*cos_phi
-            gyl = minus_g*sin_theta*sin_phi
-            gzl = minus_g*cos_theta
-
-            ! Cartesian components of gradient of gravitational acceleration
-            ! obtained from spherical components
-            minus_g_over_radius = minus_g / radius
-            minus_dg_plus_g_over_radius = minus_dg - minus_g_over_radius
-
-            Hxxl = minus_g_over_radius*(cos_phi_sq*cos_theta_sq + sin_phi_sq) + cos_phi_sq*minus_dg*sin_theta_sq
-            Hyyl = minus_g_over_radius*(cos_phi_sq + cos_theta_sq*sin_phi_sq) + minus_dg*sin_phi_sq*sin_theta_sq
-            Hzzl = cos_theta_sq*minus_dg + minus_g_over_radius*sin_theta_sq
-            Hxyl = cos_phi*minus_dg_plus_g_over_radius*sin_phi*sin_theta_sq
-            Hxzl = cos_phi*cos_theta*minus_dg_plus_g_over_radius*sin_theta
-            Hyzl = cos_theta*minus_dg_plus_g_over_radius*sin_phi*sin_theta
-
-            ! distinguish between single and double precision for reals
-            if(CUSTOM_REAL == SIZE_REAL) then
-
-              ! get displacement and multiply by density to compute G tensor
-              sx_l = rho * dble(dummyx_loc(i,j,k))
-              sy_l = rho * dble(dummyy_loc(i,j,k))
-              sz_l = rho * dble(dummyz_loc(i,j,k))
-
-              ! compute G tensor from s . g and add to sigma (not symmetric)
-              sigma_xx = sigma_xx + sngl(sy_l*gyl + sz_l*gzl)
-              sigma_yy = sigma_yy + sngl(sx_l*gxl + sz_l*gzl)
-              sigma_zz = sigma_zz + sngl(sx_l*gxl + sy_l*gyl)
-
-              sigma_xy = sigma_xy - sngl(sx_l * gyl)
-              sigma_yx = sigma_yx - sngl(sy_l * gxl)
-
-              sigma_xz = sigma_xz - sngl(sx_l * gzl)
-              sigma_zx = sigma_zx - sngl(sz_l * gxl)
-
-              sigma_yz = sigma_yz - sngl(sy_l * gzl)
-              sigma_zy = sigma_zy - sngl(sz_l * gyl)
-
-              ! precompute vector
-              factor = dble(jacobianl) * wgll_cube(i,j,k)
-              rho_s_H(1,i,j,k) = sngl(factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl))
-              rho_s_H(2,i,j,k) = sngl(factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl))
-              rho_s_H(3,i,j,k) = sngl(factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl))
-
-            else
-
-              ! get displacement and multiply by density to compute G tensor
-              sx_l = rho * dummyx_loc(i,j,k)
-              sy_l = rho * dummyy_loc(i,j,k)
-              sz_l = rho * dummyz_loc(i,j,k)
-
-              ! compute G tensor from s . g and add to sigma (not symmetric)
-              sigma_xx = sigma_xx + sy_l*gyl + sz_l*gzl
-              sigma_yy = sigma_yy + sx_l*gxl + sz_l*gzl
-              sigma_zz = sigma_zz + sx_l*gxl + sy_l*gyl
-
-              sigma_xy = sigma_xy - sx_l * gyl
-              sigma_yx = sigma_yx - sy_l * gxl
-
-              sigma_xz = sigma_xz - sx_l * gzl
-              sigma_zx = sigma_zx - sz_l * gxl
-
-              sigma_yz = sigma_yz - sy_l * gzl
-              sigma_zy = sigma_zy - sz_l * gyl
-
-              ! precompute vector
-              factor = jacobianl * wgll_cube(i,j,k)
-              rho_s_H(1,i,j,k) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
-              rho_s_H(2,i,j,k) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
-              rho_s_H(3,i,j,k) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
-
-            endif
-
-        endif  ! end of section with gravity terms
-
-        ! form dot product with test vector, non-symmetric form
-        tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
-        tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
-        tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
-
-        tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
-        tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
-        tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
-
-        tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
-        tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
-        tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
-
-      enddo ! NGLLX
-    enddo ! NGLLY
-  enddo ! NGLLZ
-
-  end subroutine compute_element_tiso
-
-#else
-
-  ! FORCE_VECTORIZATION
-
   subroutine compute_element_tiso(ispec, &
                                   minus_gravity_table,density_table,minus_deriv_gravity_table, &
                                   xstore,ystore,zstore, &
@@ -1274,7 +432,7 @@
                                   one_minus_sum_beta,vnspec, &
                                   tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, &
                                   dummyx_loc,dummyy_loc,dummyz_loc, &
-                                  epsilondev_loc,rho_s_H,is_backward_field)
+                                  epsilondev_loc,rho_s_H)
 
 ! this routine is optimized for NGLLX = NGLLY = NGLLZ = 5 using the Deville et al. (2002) inlined matrix-matrix products
 
@@ -1329,8 +487,6 @@
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: rho_s_H
   real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
 
-  logical :: is_backward_field
-
 ! local parameters
   real(kind=CUSTOM_REAL) one_minus_sum_beta_use
   ! the 21 coefficients for an anisotropic medium in reduced notation
@@ -1369,48 +525,47 @@
   integer :: int_radius
   integer :: iglob
 
-  logical :: dummyl
-
-  ! force vectorization
-  integer :: ijk
-  real(kind=CUSTOM_REAL) R_xx_val,R_yy_val
-
+#ifdef FORCE_VECTORIZATION
 ! in this vectorized version we have to assume that N_SLS == 3 in order to be able to unroll and thus suppress
 ! an inner loop that would otherwise prevent vectorization; this is safe in practice in all cases because N_SLS == 3
 ! in all known applications, and in the main program we check that N_SLS == 3 if FORCE_VECTORIZATION is used and we stop
+  integer :: ijk
+  real(kind=CUSTOM_REAL) :: R_xx_val,R_yy_val
+#else
+  integer :: i,j,k
+#endif
 
-!daniel debug
-  ! dummy to avoid compiler warning
-  dummyl = is_backward_field
+  ! transverse isotropic element
 
-  do ijk=1,NGLLCUBE
+  DO_LOOP_IJK
+
     ! get derivatives of ux, uy and uz with respect to x, y and z
-    xixl = xix(ijk,1,1,ispec)
-    xiyl = xiy(ijk,1,1,ispec)
-    xizl = xiz(ijk,1,1,ispec)
-    etaxl = etax(ijk,1,1,ispec)
-    etayl = etay(ijk,1,1,ispec)
-    etazl = etaz(ijk,1,1,ispec)
-    gammaxl = gammax(ijk,1,1,ispec)
-    gammayl = gammay(ijk,1,1,ispec)
-    gammazl = gammaz(ijk,1,1,ispec)
+    xixl = xix(INDEX_IJK,ispec)
+    xiyl = xiy(INDEX_IJK,ispec)
+    xizl = xiz(INDEX_IJK,ispec)
+    etaxl = etax(INDEX_IJK,ispec)
+    etayl = etay(INDEX_IJK,ispec)
+    etazl = etaz(INDEX_IJK,ispec)
+    gammaxl = gammax(INDEX_IJK,ispec)
+    gammayl = gammay(INDEX_IJK,ispec)
+    gammazl = gammaz(INDEX_IJK,ispec)
 
     ! compute the jacobian
     jacobianl = 1.0_CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
                   - xiyl*(etaxl*gammazl-etazl*gammaxl) &
                   + xizl*(etaxl*gammayl-etayl*gammaxl))
 
-    duxdxl = xixl*tempx1(ijk,1,1) + etaxl*tempx2(ijk,1,1) + gammaxl*tempx3(ijk,1,1)
-    duxdyl = xiyl*tempx1(ijk,1,1) + etayl*tempx2(ijk,1,1) + gammayl*tempx3(ijk,1,1)
-    duxdzl = xizl*tempx1(ijk,1,1) + etazl*tempx2(ijk,1,1) + gammazl*tempx3(ijk,1,1)
+    duxdxl = xixl*tempx1(INDEX_IJK) + etaxl*tempx2(INDEX_IJK) + gammaxl*tempx3(INDEX_IJK)
+    duxdyl = xiyl*tempx1(INDEX_IJK) + etayl*tempx2(INDEX_IJK) + gammayl*tempx3(INDEX_IJK)
+    duxdzl = xizl*tempx1(INDEX_IJK) + etazl*tempx2(INDEX_IJK) + gammazl*tempx3(INDEX_IJK)
 
-    duydxl = xixl*tempy1(ijk,1,1) + etaxl*tempy2(ijk,1,1) + gammaxl*tempy3(ijk,1,1)
-    duydyl = xiyl*tempy1(ijk,1,1) + etayl*tempy2(ijk,1,1) + gammayl*tempy3(ijk,1,1)
-    duydzl = xizl*tempy1(ijk,1,1) + etazl*tempy2(ijk,1,1) + gammazl*tempy3(ijk,1,1)
+    duydxl = xixl*tempy1(INDEX_IJK) + etaxl*tempy2(INDEX_IJK) + gammaxl*tempy3(INDEX_IJK)
+    duydyl = xiyl*tempy1(INDEX_IJK) + etayl*tempy2(INDEX_IJK) + gammayl*tempy3(INDEX_IJK)
+    duydzl = xizl*tempy1(INDEX_IJK) + etazl*tempy2(INDEX_IJK) + gammazl*tempy3(INDEX_IJK)
 
-    duzdxl = xixl*tempz1(ijk,1,1) + etaxl*tempz2(ijk,1,1) + gammaxl*tempz3(ijk,1,1)
-    duzdyl = xiyl*tempz1(ijk,1,1) + etayl*tempz2(ijk,1,1) + gammayl*tempz3(ijk,1,1)
-    duzdzl = xizl*tempz1(ijk,1,1) + etazl*tempz2(ijk,1,1) + gammazl*tempz3(ijk,1,1)
+    duzdxl = xixl*tempz1(INDEX_IJK) + etaxl*tempz2(INDEX_IJK) + gammaxl*tempz3(INDEX_IJK)
+    duzdyl = xiyl*tempz1(INDEX_IJK) + etayl*tempz2(INDEX_IJK) + gammayl*tempz3(INDEX_IJK)
+    duzdzl = xizl*tempz1(INDEX_IJK) + etazl*tempz2(INDEX_IJK) + gammazl*tempz3(INDEX_IJK)
 
     ! precompute some sums to save CPU time
     duxdxl_plus_duydyl = duxdxl + duydyl
@@ -1431,16 +586,16 @@
       templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
       if(NSPEC_CRUST_MANTLE_STRAIN_ONLY == 1) then
         if( ispec == 1) then
-          epsilon_trace_over_3(ijk,1,1,1) = templ
+          epsilon_trace_over_3(INDEX_IJK,1) = templ
         endif
       else
-        epsilon_trace_over_3(ijk,1,1,ispec) = templ
+        epsilon_trace_over_3(INDEX_IJK,ispec) = templ
       endif
-      epsilondev_loc(1,ijk,1,1) = duxdxl - templ
-      epsilondev_loc(2,ijk,1,1) = duydyl - templ
-      epsilondev_loc(3,ijk,1,1) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
-      epsilondev_loc(4,ijk,1,1) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
-      epsilondev_loc(5,ijk,1,1) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
+      epsilondev_loc(1,INDEX_IJK) = duxdxl - templ
+      epsilondev_loc(2,INDEX_IJK) = duydyl - templ
+      epsilondev_loc(3,INDEX_IJK) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
+      epsilondev_loc(4,INDEX_IJK) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
+      epsilondev_loc(5,INDEX_IJK) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
     endif
 
     !
@@ -1452,18 +607,18 @@
 !           this is therefore still in bounds of 1:NSPECMAX_TISO_MANTLE even if NSPECMAX_TISO is less than NSPEC
 
     ! use kappa and mu from transversely isotropic model
-    kappavl = kappavstore(ijk,1,1,ispec)
-    muvl = muvstore(ijk,1,1,ispec)
+    kappavl = kappavstore(INDEX_IJK,ispec)
+    muvl = muvstore(INDEX_IJK,ispec)
 
-    kappahl = kappahstore(ijk,1,1,ispec)
-    muhl = muhstore(ijk,1,1,ispec)
+    kappahl = kappahstore(INDEX_IJK,ispec)
+    muhl = muhstore(INDEX_IJK,ispec)
 
     ! use unrelaxed parameters if attenuation
     ! eta does not need to be shifted since it is a ratio
-    if(ATTENUATION_VAL) then
+    if( ATTENUATION_VAL ) then
       ! precompute terms for attenuation if needed
       if( ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL ) then
-        one_minus_sum_beta_use = one_minus_sum_beta(ijk,1,1,ispec)
+        one_minus_sum_beta_use = one_minus_sum_beta(INDEX_IJK,ispec)
       else
         one_minus_sum_beta_use = one_minus_sum_beta(1,1,1,ispec)
       endif
@@ -1477,11 +632,11 @@
     rhovsvsq = muvl  !!! that is L
     rhovshsq = muhl  !!! that is N
 
-    eta_aniso = eta_anisostore(ijk,1,1,ispec)  !!! that is  F / (A - 2 L)
+    eta_aniso = eta_anisostore(INDEX_IJK,ispec)  !!! that is  F / (A - 2 L)
 
     ! use mesh coordinates to get theta and phi
     ! ystore and zstore contain theta and phi
-    iglob = ibool(ijk,1,1,ispec)
+    iglob = ibool(INDEX_IJK,ispec)
 
     theta = ystore(iglob)
     phi = zstore(iglob)
@@ -1660,35 +815,51 @@
              c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
 
     ! subtract memory variables if attenuation
-    if(ATTENUATION_VAL .and. .not. PARTIAL_PHYS_DISPERSION_ONLY_VAL) then
-! do NOT put this is a subroutine, otherwise the call to the subroutine prevents compilers from vectorizing the outer loop
-! here we assume that N_SLS == 3 in order to be able to unroll and suppress the loop in order to vectorize the outer loop
-      R_xx_val = R_xx(1,ijk,1,1,ispec)
-      R_yy_val = R_yy(1,ijk,1,1,ispec)
+    if(ATTENUATION_VAL .and. .not. PARTIAL_PHYS_DISPERSION_ONLY_VAL ) then
+#ifdef FORCE_VECTORIZATION
+      ! do NOT put this is a subroutine, otherwise the call to the subroutine prevents compilers
+      ! from vectorizing the outer loop
+
+      ! here we assume that N_SLS == 3 in order to be able to unroll and suppress the loop
+      ! in order to vectorize the outer loop
+      R_xx_val = R_xx(1,INDEX_IJK,ispec)
+      R_yy_val = R_yy(1,INDEX_IJK,ispec)
       sigma_xx = sigma_xx - R_xx_val
       sigma_yy = sigma_yy - R_yy_val
       sigma_zz = sigma_zz + R_xx_val + R_yy_val
-      sigma_xy = sigma_xy - R_xy(1,ijk,1,1,ispec)
-      sigma_xz = sigma_xz - R_xz(1,ijk,1,1,ispec)
-      sigma_yz = sigma_yz - R_yz(1,ijk,1,1,ispec)
+      sigma_xy = sigma_xy - R_xy(1,INDEX_IJK,ispec)
+      sigma_xz = sigma_xz - R_xz(1,INDEX_IJK,ispec)
+      sigma_yz = sigma_yz - R_yz(1,INDEX_IJK,ispec)
 
-      R_xx_val = R_xx(2,ijk,1,1,ispec)
-      R_yy_val = R_yy(2,ijk,1,1,ispec)
+      R_xx_val = R_xx(2,INDEX_IJK,ispec)
+      R_yy_val = R_yy(2,INDEX_IJK,ispec)
       sigma_xx = sigma_xx - R_xx_val
       sigma_yy = sigma_yy - R_yy_val
       sigma_zz = sigma_zz + R_xx_val + R_yy_val
-      sigma_xy = sigma_xy - R_xy(2,ijk,1,1,ispec)
-      sigma_xz = sigma_xz - R_xz(2,ijk,1,1,ispec)
-      sigma_yz = sigma_yz - R_yz(2,ijk,1,1,ispec)
+      sigma_xy = sigma_xy - R_xy(2,INDEX_IJK,ispec)
+      sigma_xz = sigma_xz - R_xz(2,INDEX_IJK,ispec)
+      sigma_yz = sigma_yz - R_yz(2,INDEX_IJK,ispec)
 
       R_xx_val = R_xx(3,ijk,1,1,ispec)
       R_yy_val = R_yy(3,ijk,1,1,ispec)
       sigma_xx = sigma_xx - R_xx_val
       sigma_yy = sigma_yy - R_yy_val
       sigma_zz = sigma_zz + R_xx_val + R_yy_val
-      sigma_xy = sigma_xy - R_xy(3,ijk,1,1,ispec)
-      sigma_xz = sigma_xz - R_xz(3,ijk,1,1,ispec)
-      sigma_yz = sigma_yz - R_yz(3,ijk,1,1,ispec)
+      sigma_xy = sigma_xy - R_xy(3,INDEX_IJK,ispec)
+      sigma_xz = sigma_xz - R_xz(3,INDEX_IJK,ispec)
+      sigma_yz = sigma_yz - R_yz(3,INDEX_IJK,ispec)
+#else
+      ! note: function inlining is generally done by fortran compilers;
+      !       compilers decide based on performance heuristics
+      ! note: fortran passes pointers to array location, thus R_memory(1,1,...) is fine
+      call compute_element_att_stress(R_xx(1,INDEX_IJK,ispec), &
+                                      R_yy(1,INDEX_IJK,ispec), &
+                                      R_xy(1,INDEX_IJK,ispec), &
+                                      R_xz(1,INDEX_IJK,ispec), &
+                                      R_yz(1,INDEX_IJK,ispec), &
+                                      sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz)
+#endif
+
     endif ! ATTENUATION_VAL
 
     ! define symmetric components of sigma for gravity
@@ -1700,7 +871,7 @@
     if(GRAVITY_VAL) then
         ! use mesh coordinates to get theta and phi
         ! x y and z contain r theta and phi
-        iglob = ibool(ijk,1,1,ispec)
+        iglob = ibool(INDEX_IJK,ispec)
 
         dtheta = dble(ystore(iglob))
         dphi = dble(zstore(iglob))
@@ -1741,440 +912,89 @@
         Hxzl = cos_phi*cos_theta*minus_dg_plus_g_over_radius*sin_theta
         Hyzl = cos_theta*minus_dg_plus_g_over_radius*sin_phi*sin_theta
 
-        ! get displacement and multiply by density to compute G tensor
-        sx_l = rho * dummyx_loc(ijk,1,1)
-        sy_l = rho * dummyy_loc(ijk,1,1)
-        sz_l = rho * dummyz_loc(ijk,1,1)
+        ! distinguish between single and double precision for reals
+        if(CUSTOM_REAL == SIZE_REAL) then
 
-        ! compute G tensor from s . g and add to sigma (not symmetric)
-        sigma_xx = sigma_xx + sy_l*gyl + sz_l*gzl
-        sigma_yy = sigma_yy + sx_l*gxl + sz_l*gzl
-        sigma_zz = sigma_zz + sx_l*gxl + sy_l*gyl
+          ! get displacement and multiply by density to compute G tensor
+          sx_l = rho * dble(dummyx_loc(INDEX_IJK))
+          sy_l = rho * dble(dummyy_loc(INDEX_IJK))
+          sz_l = rho * dble(dummyz_loc(INDEX_IJK))
 
-        sigma_xy = sigma_xy - sx_l * gyl
-        sigma_yx = sigma_yx - sy_l * gxl
+          ! compute G tensor from s . g and add to sigma (not symmetric)
+          sigma_xx = sigma_xx + sngl(sy_l*gyl + sz_l*gzl)
+          sigma_yy = sigma_yy + sngl(sx_l*gxl + sz_l*gzl)
+          sigma_zz = sigma_zz + sngl(sx_l*gxl + sy_l*gyl)
 
-        sigma_xz = sigma_xz - sx_l * gzl
-        sigma_zx = sigma_zx - sz_l * gxl
+          sigma_xy = sigma_xy - sngl(sx_l * gyl)
+          sigma_yx = sigma_yx - sngl(sy_l * gxl)
 
-        sigma_yz = sigma_yz - sy_l * gzl
-        sigma_zy = sigma_zy - sz_l * gyl
+          sigma_xz = sigma_xz - sngl(sx_l * gzl)
+          sigma_zx = sigma_zx - sngl(sz_l * gxl)
 
-        ! precompute vector
-        factor = jacobianl * wgll_cube(ijk,1,1)
-        rho_s_H(1,ijk,1,1) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
-        rho_s_H(2,ijk,1,1) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
-        rho_s_H(3,ijk,1,1) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
-    endif  ! end of section with gravity terms
+          sigma_yz = sigma_yz - sngl(sy_l * gzl)
+          sigma_zy = sigma_zy - sngl(sz_l * gyl)
 
-    ! form dot product with test vector, non-symmetric form
-    tempx1(ijk,1,1) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
-    tempy1(ijk,1,1) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
-    tempz1(ijk,1,1) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
+          ! precompute vector
+          factor = dble(jacobianl) * wgll_cube(INDEX_IJK)
+          rho_s_H(1,INDEX_IJK) = sngl(factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl))
+          rho_s_H(2,INDEX_IJK) = sngl(factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl))
+          rho_s_H(3,INDEX_IJK) = sngl(factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl))
 
-    tempx2(ijk,1,1) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
-    tempy2(ijk,1,1) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
-    tempz2(ijk,1,1) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
+        else
 
-    tempx3(ijk,1,1) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
-    tempy3(ijk,1,1) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
-    tempz3(ijk,1,1) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
-  enddo
+          ! get displacement and multiply by density to compute G tensor
+          sx_l = rho * dummyx_loc(INDEX_IJK)
+          sy_l = rho * dummyy_loc(INDEX_IJK)
+          sz_l = rho * dummyz_loc(INDEX_IJK)
 
-  end subroutine compute_element_tiso
+          ! compute G tensor from s . g and add to sigma (not symmetric)
+          sigma_xx = sigma_xx + sy_l*gyl + sz_l*gzl
+          sigma_yy = sigma_yy + sx_l*gxl + sz_l*gzl
+          sigma_zz = sigma_zz + sx_l*gxl + sy_l*gyl
 
-! end of FORCE_VECTORIZATION
-#endif
+          sigma_xy = sigma_xy - sx_l * gyl
+          sigma_yx = sigma_yx - sy_l * gxl
 
+          sigma_xz = sigma_xz - sx_l * gzl
+          sigma_zx = sigma_zx - sz_l * gxl
 
-!--------------------------------------------------------------------------------------------
-!
-! anisotropic element
-!
-!--------------------------------------------------------------------------------------------
+          sigma_yz = sigma_yz - sy_l * gzl
+          sigma_zy = sigma_zy - sz_l * gyl
 
-#ifndef FORCE_VECTORIZATION
+          ! precompute vector
+          factor = jacobianl * wgll_cube(INDEX_IJK)
+          rho_s_H(1,INDEX_IJK) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
+          rho_s_H(2,INDEX_IJK) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
+          rho_s_H(3,INDEX_IJK) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
 
-  ! default, without vectorization
-
-  subroutine compute_element_aniso(ispec, &
-                                   minus_gravity_table,density_table,minus_deriv_gravity_table, &
-                                   xstore,ystore,zstore, &
-                                   xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-                                   wgll_cube, &
-                                   c11store,c12store,c13store,c14store,c15store,c16store,c22store, &
-                                   c23store,c24store,c25store,c26store,c33store,c34store,c35store, &
-                                   c36store,c44store,c45store,c46store,c55store,c56store,c66store, &
-                                   ibool, &
-                                   R_xx,R_yy,R_xy,R_xz,R_yz, &
-                                   epsilon_trace_over_3, &
-                                   one_minus_sum_beta,vnspec, &
-                                   tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, &
-                                   dummyx_loc,dummyy_loc,dummyz_loc, &
-                                   epsilondev_loc,rho_s_H,is_backward_field)
-
-  use constants_solver
-  use specfem_par,only: COMPUTE_AND_STORE_STRAIN
-
-  implicit none
-
-  ! element id
-  integer :: ispec
-
-  ! arrays with mesh parameters per slice
-  integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: ibool
-
-  ! x y and z contain r theta and phi
-  real(kind=CUSTOM_REAL), dimension(NGLOB_CRUST_MANTLE) :: xstore,ystore,zstore
-
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: &
-        xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz
-
-  ! array with derivatives of Lagrange polynomials and precalculated products
-  double precision, dimension(NGLLX,NGLLY,NGLLZ) :: wgll_cube
-
-  ! arrays for full anisotropy only when needed
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPECMAX_ANISO_MANTLE) :: &
-        c11store,c12store,c13store,c14store,c15store,c16store,c22store, &
-        c23store,c24store,c25store,c26store,c33store,c34store,c35store, &
-        c36store,c44store,c45store,c46store,c55store,c56store,c66store
-
-  ! attenuation
-  ! memory variables for attenuation
-  ! memory variables R_ij are stored at the local rather than global level
-  ! to allow for optimization of cache access by compiler
-  real(kind=CUSTOM_REAL), dimension(N_SLS,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ATTENUATION) :: R_xx,R_yy,R_xy,R_xz,R_yz
-
-  real(kind=CUSTOM_REAL),dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: epsilon_trace_over_3
-
-  ! variable sized array variables
-  integer :: vnspec
-
-  ! [alpha,beta,gamma]val reduced to N_SLS  to N_SLS*NUM_NODES
-  real(kind=CUSTOM_REAL), dimension(ATT1_VAL,ATT2_VAL,ATT3_VAL,vnspec) :: one_minus_sum_beta
-
-  ! gravity
-  double precision, dimension(NRAD_GRAVITY) :: minus_gravity_table,density_table,minus_deriv_gravity_table
-
-  ! element info
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
-    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc
-
-  real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: rho_s_H
-  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
-
-  logical :: is_backward_field
-
-! local parameters
-  ! real(kind=CUSTOM_REAL) one_minus_sum_beta_use
-  real(kind=CUSTOM_REAL) minus_sum_beta,mul
-  ! the 21 coefficients for an anisotropic medium in reduced notation
-  real(kind=CUSTOM_REAL) c11,c22,c33,c44,c55,c66,c12,c13,c23,c14,c24,c34,c15,c25,c35,c45,c16,c26,c36,c46,c56
-
-  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
-  real(kind=CUSTOM_REAL) duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
-  real(kind=CUSTOM_REAL) duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
-  real(kind=CUSTOM_REAL) duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
-  real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz
-
-  real(kind=CUSTOM_REAL) templ
-
-  ! for gravity
-  double precision dphi,dtheta
-  double precision radius,rho,minus_g,minus_dg
-  double precision minus_g_over_radius,minus_dg_plus_g_over_radius
-  double precision cos_theta,sin_theta,cos_phi,sin_phi
-  double precision cos_theta_sq,sin_theta_sq,cos_phi_sq,sin_phi_sq
-  double precision factor,sx_l,sy_l,sz_l,gxl,gyl,gzl
-  double precision Hxxl,Hyyl,Hzzl,Hxyl,Hxzl,Hyzl
-  real(kind=CUSTOM_REAL) sigma_yx,sigma_zx,sigma_zy
-
-  integer :: i,j,k
-  integer :: int_radius
-  integer :: iglob
-
-  logical :: dummyl
-
-!daniel debug
-  ! dummy to avoid compiler warning
-  dummyl = is_backward_field
-
-  !  anisotropic elements
-
-  do k=1,NGLLZ
-    do j=1,NGLLY
-      do i=1,NGLLX
-        ! get derivatives of ux, uy and uz with respect to x, y and z
-        xixl = xix(i,j,k,ispec)
-        xiyl = xiy(i,j,k,ispec)
-        xizl = xiz(i,j,k,ispec)
-        etaxl = etax(i,j,k,ispec)
-        etayl = etay(i,j,k,ispec)
-        etazl = etaz(i,j,k,ispec)
-        gammaxl = gammax(i,j,k,ispec)
-        gammayl = gammay(i,j,k,ispec)
-        gammazl = gammaz(i,j,k,ispec)
-
-        ! compute the jacobian
-        jacobianl = 1.0_CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
-                      - xiyl*(etaxl*gammazl-etazl*gammaxl) &
-                      + xizl*(etaxl*gammayl-etayl*gammaxl))
-
-        duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
-        duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
-        duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
-
-        duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
-        duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
-        duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
-
-        duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
-        duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
-        duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
-
-        ! precompute some sums to save CPU time
-        duxdxl_plus_duydyl = duxdxl + duydyl
-        duxdxl_plus_duzdzl = duxdxl + duzdzl
-        duydyl_plus_duzdzl = duydyl + duzdzl
-        duxdyl_plus_duydxl = duxdyl + duydxl
-        duzdxl_plus_duxdzl = duzdxl + duxdzl
-        duzdyl_plus_duydzl = duzdyl + duydzl
-
-!ZN beware, here the expression differs from the strain used in memory variable equation (6) in D. Komatitsch and J. Tromp 1999,
-!ZN here Brian Savage uses the engineering strain which are epsilon = 1/2*(grad U + (grad U)^T),
-!ZN where U is the displacement vector and grad the gradient operator, i.e. there is a 1/2 factor difference between the two.
-!ZN Both expressions are fine, but we need to keep in mind that if we put the 1/2 factor here there we need to remove it
-!ZN from the expression in which we use the strain later in the code.
-
-        ! compute deviatoric strain
-        if (COMPUTE_AND_STORE_STRAIN) then
-          templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
-          if(NSPEC_CRUST_MANTLE_STRAIN_ONLY == 1) then
-            if( ispec == 1) then
-              epsilon_trace_over_3(i,j,k,1) = templ
-            endif
-          else
-            epsilon_trace_over_3(i,j,k,ispec) = templ
-          endif
-          epsilondev_loc(1,i,j,k) = duxdxl - templ
-          epsilondev_loc(2,i,j,k) = duydyl - templ
-          epsilondev_loc(3,i,j,k) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
-          epsilondev_loc(4,i,j,k) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
-          epsilondev_loc(5,i,j,k) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
         endif
 
-        !
-        ! compute anisotropic elements
-        !
+    endif  ! end of section with gravity terms
 
-        c11 = c11store(i,j,k,ispec)
-        c12 = c12store(i,j,k,ispec)
-        c13 = c13store(i,j,k,ispec)
-        c14 = c14store(i,j,k,ispec)
-        c15 = c15store(i,j,k,ispec)
-        c16 = c16store(i,j,k,ispec)
-        c22 = c22store(i,j,k,ispec)
-        c23 = c23store(i,j,k,ispec)
-        c24 = c24store(i,j,k,ispec)
-        c25 = c25store(i,j,k,ispec)
-        c26 = c26store(i,j,k,ispec)
-        c33 = c33store(i,j,k,ispec)
-        c34 = c34store(i,j,k,ispec)
-        c35 = c35store(i,j,k,ispec)
-        c36 = c36store(i,j,k,ispec)
-        c44 = c44store(i,j,k,ispec)
-        c45 = c45store(i,j,k,ispec)
-        c46 = c46store(i,j,k,ispec)
-        c55 = c55store(i,j,k,ispec)
-        c56 = c56store(i,j,k,ispec)
-        c66 = c66store(i,j,k,ispec)
+    ! form dot product with test vector, non-symmetric form
+    tempx1(INDEX_IJK) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
+    tempy1(INDEX_IJK) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
+    tempz1(INDEX_IJK) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
 
-        if( ATTENUATION_VAL ) then
-          ! precompute terms for attenuation if needed
-          if( ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL ) then
-            minus_sum_beta =  one_minus_sum_beta(i,j,k,ispec) - 1.0_CUSTOM_REAL
-          else
-            minus_sum_beta =  one_minus_sum_beta(1,1,1,ispec) - 1.0_CUSTOM_REAL
-          endif
-          mul = c44 * minus_sum_beta
-          c11 = c11 + FOUR_THIRDS * mul ! * minus_sum_beta * mul
-          c12 = c12 - TWO_THIRDS * mul
-          c13 = c13 - TWO_THIRDS * mul
-          c22 = c22 + FOUR_THIRDS * mul
-          c23 = c23 - TWO_THIRDS * mul
-          c33 = c33 + FOUR_THIRDS * mul
-          c44 = c44 + mul
-          c55 = c55 + mul
-          c66 = c66 + mul
-        endif
+    tempx2(INDEX_IJK) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
+    tempy2(INDEX_IJK) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
+    tempz2(INDEX_IJK) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
 
-        sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl + &
-                 c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl
+    tempx3(INDEX_IJK) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
+    tempy3(INDEX_IJK) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
+    tempz3(INDEX_IJK) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
 
-        sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl + &
-                 c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl
+  ENDDO_LOOP_IJK
 
-        sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl + &
-                 c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl
+  end subroutine compute_element_tiso
 
-        sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl + &
-                 c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl
 
-        sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl + &
-                 c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl
+!--------------------------------------------------------------------------------------------
+!
+! anisotropic element
+!
+!--------------------------------------------------------------------------------------------
 
-        sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl + &
-                 c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
-
-        ! subtract memory variables if attenuation
-        if(ATTENUATION_VAL .and. .not. PARTIAL_PHYS_DISPERSION_ONLY_VAL ) then
-          ! note: function inlining is generally done by fortran compilers;
-          !       compilers decide based on performance heuristics
-
-          ! note: Fortran passes pointers to array location, thus R_memory(1,1,...) is fine
-          call compute_element_att_stress(R_xx(1,i,j,k,ispec), &
-                                          R_yy(1,i,j,k,ispec), &
-                                          R_xy(1,i,j,k,ispec), &
-                                          R_xz(1,i,j,k,ispec), &
-                                          R_yz(1,i,j,k,ispec), &
-                                          sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz)
-
-        endif ! ATTENUATION_VAL
-
-        ! define symmetric components of sigma for gravity
-        sigma_yx = sigma_xy
-        sigma_zx = sigma_xz
-        sigma_zy = sigma_yz
-
-        ! compute non-symmetric terms for gravity
-        if(GRAVITY_VAL) then
-            ! use mesh coordinates to get theta and phi
-            ! x y and z contain r theta and phi
-            iglob = ibool(i,j,k,ispec)
-
-            dtheta = dble(ystore(iglob))
-            dphi = dble(zstore(iglob))
-            radius = dble(xstore(iglob))
-
-            cos_theta = dcos(dtheta)
-            sin_theta = dsin(dtheta)
-            cos_phi = dcos(dphi)
-            sin_phi = dsin(dphi)
-
-            cos_theta_sq = cos_theta*cos_theta
-            sin_theta_sq = sin_theta*sin_theta
-            cos_phi_sq = cos_phi*cos_phi
-            sin_phi_sq = sin_phi*sin_phi
-
-            ! get g, rho and dg/dr=dg
-            ! spherical components of the gravitational acceleration
-            ! for efficiency replace with lookup table every 100 m in radial direction
-            int_radius = nint(10.d0 * radius * R_EARTH_KM )
-            minus_g = minus_gravity_table(int_radius)
-            minus_dg = minus_deriv_gravity_table(int_radius)
-            rho = density_table(int_radius)
-
-            ! Cartesian components of the gravitational acceleration
-            gxl = minus_g*sin_theta*cos_phi
-            gyl = minus_g*sin_theta*sin_phi
-            gzl = minus_g*cos_theta
-
-            ! Cartesian components of gradient of gravitational acceleration
-            ! obtained from spherical components
-            minus_g_over_radius = minus_g / radius
-            minus_dg_plus_g_over_radius = minus_dg - minus_g_over_radius
-
-            Hxxl = minus_g_over_radius*(cos_phi_sq*cos_theta_sq + sin_phi_sq) + cos_phi_sq*minus_dg*sin_theta_sq
-            Hyyl = minus_g_over_radius*(cos_phi_sq + cos_theta_sq*sin_phi_sq) + minus_dg*sin_phi_sq*sin_theta_sq
-            Hzzl = cos_theta_sq*minus_dg + minus_g_over_radius*sin_theta_sq
-            Hxyl = cos_phi*minus_dg_plus_g_over_radius*sin_phi*sin_theta_sq
-            Hxzl = cos_phi*cos_theta*minus_dg_plus_g_over_radius*sin_theta
-            Hyzl = cos_theta*minus_dg_plus_g_over_radius*sin_phi*sin_theta
-
-            ! distinguish between single and double precision for reals
-            if(CUSTOM_REAL == SIZE_REAL) then
-
-              ! get displacement and multiply by density to compute G tensor
-              sx_l = rho * dble(dummyx_loc(i,j,k)) ! dble(displ_crust_mantle(1,iglob))
-              sy_l = rho * dble(dummyy_loc(i,j,k)) ! dble(displ_crust_mantle(2,iglob))
-              sz_l = rho * dble(dummyz_loc(i,j,k)) !  dble(displ_crust_mantle(3,iglob))
-
-              ! compute G tensor from s . g and add to sigma (not symmetric)
-              sigma_xx = sigma_xx + sngl(sy_l*gyl + sz_l*gzl)
-              sigma_yy = sigma_yy + sngl(sx_l*gxl + sz_l*gzl)
-              sigma_zz = sigma_zz + sngl(sx_l*gxl + sy_l*gyl)
-
-              sigma_xy = sigma_xy - sngl(sx_l * gyl)
-              sigma_yx = sigma_yx - sngl(sy_l * gxl)
-
-              sigma_xz = sigma_xz - sngl(sx_l * gzl)
-              sigma_zx = sigma_zx - sngl(sz_l * gxl)
-
-              sigma_yz = sigma_yz - sngl(sy_l * gzl)
-              sigma_zy = sigma_zy - sngl(sz_l * gyl)
-
-              ! precompute vector
-              factor = dble(jacobianl) * wgll_cube(i,j,k)
-              rho_s_H(1,i,j,k) = sngl(factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl))
-              rho_s_H(2,i,j,k) = sngl(factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl))
-              rho_s_H(3,i,j,k) = sngl(factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl))
-
-            else
-
-              ! get displacement and multiply by density to compute G tensor
-              sx_l = rho * dummyx_loc(i,j,k)  ! displ_crust_mantle(1,iglob)
-              sy_l = rho * dummyy_loc(i,j,k)  !  displ_crust_mantle(2,iglob)
-              sz_l = rho * dummyz_loc(i,j,k)  ! displ_crust_mantle(3,iglob)
-
-              ! compute G tensor from s . g and add to sigma (not symmetric)
-              sigma_xx = sigma_xx + sy_l*gyl + sz_l*gzl
-              sigma_yy = sigma_yy + sx_l*gxl + sz_l*gzl
-              sigma_zz = sigma_zz + sx_l*gxl + sy_l*gyl
-
-              sigma_xy = sigma_xy - sx_l * gyl
-              sigma_yx = sigma_yx - sy_l * gxl
-
-              sigma_xz = sigma_xz - sx_l * gzl
-              sigma_zx = sigma_zx - sz_l * gxl
-
-              sigma_yz = sigma_yz - sy_l * gzl
-              sigma_zy = sigma_zy - sz_l * gyl
-
-              ! precompute vector
-              factor = jacobianl * wgll_cube(i,j,k)
-              rho_s_H(1,i,j,k) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
-              rho_s_H(2,i,j,k) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
-              rho_s_H(3,i,j,k) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
-
-            endif
-
-        endif  ! end of section with gravity terms
-
-        ! form dot product with test vector, non-symmetric form
-        tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
-        tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
-        tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
-
-        tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
-        tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
-        tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
-
-        tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
-        tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
-        tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
-
-      enddo ! NGLLX
-    enddo ! NGLLY
-  enddo ! NGLLZ
-
-  end subroutine compute_element_aniso
-
-
-#else
-
-  ! FORCE_VECTORIZATION
-
   subroutine compute_element_aniso(ispec, &
                                    minus_gravity_table,density_table,minus_deriv_gravity_table, &
                                    xstore,ystore,zstore, &
@@ -2189,7 +1009,7 @@
                                    one_minus_sum_beta,vnspec, &
                                    tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, &
                                    dummyx_loc,dummyy_loc,dummyz_loc, &
-                                   epsilondev_loc,rho_s_H,is_backward_field)
+                                   epsilondev_loc,rho_s_H)
 
   use constants_solver
   use specfem_par,only: COMPUTE_AND_STORE_STRAIN
@@ -2242,10 +1062,7 @@
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: rho_s_H
   real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
 
-  logical :: is_backward_field
-
-! local parameters
-  ! real(kind=CUSTOM_REAL) one_minus_sum_beta_use
+  ! local parameters
   real(kind=CUSTOM_REAL) minus_sum_beta,mul
   ! the 21 coefficients for an anisotropic medium in reduced notation
   real(kind=CUSTOM_REAL) c11,c22,c33,c44,c55,c66,c12,c13,c23,c14,c24,c34,c15,c25,c35,c45,c16,c26,c36,c46,c56
@@ -2271,50 +1088,47 @@
   integer :: int_radius
   integer :: iglob
 
-  logical :: dummyl
-
-  ! force vectorization
-  integer :: ijk
-  real(kind=CUSTOM_REAL) R_xx_val,R_yy_val
-
+#ifdef FORCE_VECTORIZATION
 ! in this vectorized version we have to assume that N_SLS == 3 in order to be able to unroll and thus suppress
 ! an inner loop that would otherwise prevent vectorization; this is safe in practice in all cases because N_SLS == 3
-! in all known applications, and in the main program we check that N_SLS == 3 if FORCE_VECTORIZATION is used and we stop 
+! in all known applications, and in the main program we check that N_SLS == 3 if FORCE_VECTORIZATION is used and we stop
+  integer :: ijk
+  real(kind=CUSTOM_REAL) :: R_xx_val,R_yy_val
+#else
+  integer :: i,j,k
+#endif
 
-!daniel debug
-  ! dummy to avoid compiler warning
-  dummyl = is_backward_field
+  !  anisotropic elements
 
-  ! anisotropic elements
+  DO_LOOP_IJK
 
-  do ijk=1,NGLLCUBE
     ! get derivatives of ux, uy and uz with respect to x, y and z
-    xixl = xix(ijk,1,1,ispec)
-    xiyl = xiy(ijk,1,1,ispec)
-    xizl = xiz(ijk,1,1,ispec)
-    etaxl = etax(ijk,1,1,ispec)
-    etayl = etay(ijk,1,1,ispec)
-    etazl = etaz(ijk,1,1,ispec)
-    gammaxl = gammax(ijk,1,1,ispec)
-    gammayl = gammay(ijk,1,1,ispec)
-    gammazl = gammaz(ijk,1,1,ispec)
+    xixl = xix(INDEX_IJK,ispec)
+    xiyl = xiy(INDEX_IJK,ispec)
+    xizl = xiz(INDEX_IJK,ispec)
+    etaxl = etax(INDEX_IJK,ispec)
+    etayl = etay(INDEX_IJK,ispec)
+    etazl = etaz(INDEX_IJK,ispec)
+    gammaxl = gammax(INDEX_IJK,ispec)
+    gammayl = gammay(INDEX_IJK,ispec)
+    gammazl = gammaz(INDEX_IJK,ispec)
 
     ! compute the jacobian
     jacobianl = 1.0_CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
                   - xiyl*(etaxl*gammazl-etazl*gammaxl) &
                   + xizl*(etaxl*gammayl-etayl*gammaxl))
 
-    duxdxl = xixl*tempx1(ijk,1,1) + etaxl*tempx2(ijk,1,1) + gammaxl*tempx3(ijk,1,1)
-    duxdyl = xiyl*tempx1(ijk,1,1) + etayl*tempx2(ijk,1,1) + gammayl*tempx3(ijk,1,1)
-    duxdzl = xizl*tempx1(ijk,1,1) + etazl*tempx2(ijk,1,1) + gammazl*tempx3(ijk,1,1)
+    duxdxl = xixl*tempx1(INDEX_IJK) + etaxl*tempx2(INDEX_IJK) + gammaxl*tempx3(INDEX_IJK)
+    duxdyl = xiyl*tempx1(INDEX_IJK) + etayl*tempx2(INDEX_IJK) + gammayl*tempx3(INDEX_IJK)
+    duxdzl = xizl*tempx1(INDEX_IJK) + etazl*tempx2(INDEX_IJK) + gammazl*tempx3(INDEX_IJK)
 
-    duydxl = xixl*tempy1(ijk,1,1) + etaxl*tempy2(ijk,1,1) + gammaxl*tempy3(ijk,1,1)
-    duydyl = xiyl*tempy1(ijk,1,1) + etayl*tempy2(ijk,1,1) + gammayl*tempy3(ijk,1,1)
-    duydzl = xizl*tempy1(ijk,1,1) + etazl*tempy2(ijk,1,1) + gammazl*tempy3(ijk,1,1)
+    duydxl = xixl*tempy1(INDEX_IJK) + etaxl*tempy2(INDEX_IJK) + gammaxl*tempy3(INDEX_IJK)
+    duydyl = xiyl*tempy1(INDEX_IJK) + etayl*tempy2(INDEX_IJK) + gammayl*tempy3(INDEX_IJK)
+    duydzl = xizl*tempy1(INDEX_IJK) + etazl*tempy2(INDEX_IJK) + gammazl*tempy3(INDEX_IJK)
 
-    duzdxl = xixl*tempz1(ijk,1,1) + etaxl*tempz2(ijk,1,1) + gammaxl*tempz3(ijk,1,1)
-    duzdyl = xiyl*tempz1(ijk,1,1) + etayl*tempz2(ijk,1,1) + gammayl*tempz3(ijk,1,1)
-    duzdzl = xizl*tempz1(ijk,1,1) + etazl*tempz2(ijk,1,1) + gammazl*tempz3(ijk,1,1)
+    duzdxl = xixl*tempz1(INDEX_IJK) + etaxl*tempz2(INDEX_IJK) + gammaxl*tempz3(INDEX_IJK)
+    duzdyl = xiyl*tempz1(INDEX_IJK) + etayl*tempz2(INDEX_IJK) + gammayl*tempz3(INDEX_IJK)
+    duzdzl = xizl*tempz1(INDEX_IJK) + etazl*tempz2(INDEX_IJK) + gammazl*tempz3(INDEX_IJK)
 
     ! precompute some sums to save CPU time
     duxdxl_plus_duydyl = duxdxl + duydyl
@@ -2335,48 +1149,48 @@
       templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
       if(NSPEC_CRUST_MANTLE_STRAIN_ONLY == 1) then
         if( ispec == 1) then
-          epsilon_trace_over_3(ijk,1,1,1) = templ
+          epsilon_trace_over_3(INDEX_IJK,1) = templ
         endif
       else
-        epsilon_trace_over_3(ijk,1,1,ispec) = templ
+        epsilon_trace_over_3(INDEX_IJK,ispec) = templ
       endif
-      epsilondev_loc(1,ijk,1,1) = duxdxl - templ
-      epsilondev_loc(2,ijk,1,1) = duydyl - templ
-      epsilondev_loc(3,ijk,1,1) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
-      epsilondev_loc(4,ijk,1,1) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
-      epsilondev_loc(5,ijk,1,1) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
+      epsilondev_loc(1,INDEX_IJK) = duxdxl - templ
+      epsilondev_loc(2,INDEX_IJK) = duydyl - templ
+      epsilondev_loc(3,INDEX_IJK) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
+      epsilondev_loc(4,INDEX_IJK) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
+      epsilondev_loc(5,INDEX_IJK) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
     endif
 
     !
     ! compute anisotropic elements
     !
 
-    c11 = c11store(ijk,1,1,ispec)
-    c12 = c12store(ijk,1,1,ispec)
-    c13 = c13store(ijk,1,1,ispec)
-    c14 = c14store(ijk,1,1,ispec)
-    c15 = c15store(ijk,1,1,ispec)
-    c16 = c16store(ijk,1,1,ispec)
-    c22 = c22store(ijk,1,1,ispec)
-    c23 = c23store(ijk,1,1,ispec)
-    c24 = c24store(ijk,1,1,ispec)
-    c25 = c25store(ijk,1,1,ispec)
-    c26 = c26store(ijk,1,1,ispec)
-    c33 = c33store(ijk,1,1,ispec)
-    c34 = c34store(ijk,1,1,ispec)
-    c35 = c35store(ijk,1,1,ispec)
-    c36 = c36store(ijk,1,1,ispec)
-    c44 = c44store(ijk,1,1,ispec)
-    c45 = c45store(ijk,1,1,ispec)
-    c46 = c46store(ijk,1,1,ispec)
-    c55 = c55store(ijk,1,1,ispec)
-    c56 = c56store(ijk,1,1,ispec)
-    c66 = c66store(ijk,1,1,ispec)
+    c11 = c11store(INDEX_IJK,ispec)
+    c12 = c12store(INDEX_IJK,ispec)
+    c13 = c13store(INDEX_IJK,ispec)
+    c14 = c14store(INDEX_IJK,ispec)
+    c15 = c15store(INDEX_IJK,ispec)
+    c16 = c16store(INDEX_IJK,ispec)
+    c22 = c22store(INDEX_IJK,ispec)
+    c23 = c23store(INDEX_IJK,ispec)
+    c24 = c24store(INDEX_IJK,ispec)
+    c25 = c25store(INDEX_IJK,ispec)
+    c26 = c26store(INDEX_IJK,ispec)
+    c33 = c33store(INDEX_IJK,ispec)
+    c34 = c34store(INDEX_IJK,ispec)
+    c35 = c35store(INDEX_IJK,ispec)
+    c36 = c36store(INDEX_IJK,ispec)
+    c44 = c44store(INDEX_IJK,ispec)
+    c45 = c45store(INDEX_IJK,ispec)
+    c46 = c46store(INDEX_IJK,ispec)
+    c55 = c55store(INDEX_IJK,ispec)
+    c56 = c56store(INDEX_IJK,ispec)
+    c66 = c66store(INDEX_IJK,ispec)
 
-    if(ATTENUATION_VAL) then
+    if( ATTENUATION_VAL ) then
       ! precompute terms for attenuation if needed
       if( ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL ) then
-        minus_sum_beta =  one_minus_sum_beta(ijk,1,1,ispec) - 1.0_CUSTOM_REAL
+        minus_sum_beta =  one_minus_sum_beta(INDEX_IJK,ispec) - 1.0_CUSTOM_REAL
       else
         minus_sum_beta =  one_minus_sum_beta(1,1,1,ispec) - 1.0_CUSTOM_REAL
       endif
@@ -2411,35 +1225,51 @@
              c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl
 
     ! subtract memory variables if attenuation
-    if(ATTENUATION_VAL .and. .not. PARTIAL_PHYS_DISPERSION_ONLY_VAL) then
-! do NOT put this is a subroutine, otherwise the call to the subroutine prevents compilers from vectorizing the outer loop
-! here we assume that N_SLS == 3 in order to be able to unroll and suppress the loop in order to vectorize the outer loop
-      R_xx_val = R_xx(1,ijk,1,1,ispec)
-      R_yy_val = R_yy(1,ijk,1,1,ispec)
+    if(ATTENUATION_VAL .and. .not. PARTIAL_PHYS_DISPERSION_ONLY_VAL ) then
+#ifdef FORCE_VECTORIZATION
+      ! do NOT put this is a subroutine, otherwise the call to the subroutine prevents compilers
+      ! from vectorizing the outer loop
+
+      ! here we assume that N_SLS == 3 in order to be able to unroll and suppress the loop
+      ! in order to vectorize the outer loop
+      R_xx_val = R_xx(1,INDEX_IJK,ispec)
+      R_yy_val = R_yy(1,INDEX_IJK,ispec)
       sigma_xx = sigma_xx - R_xx_val
       sigma_yy = sigma_yy - R_yy_val
       sigma_zz = sigma_zz + R_xx_val + R_yy_val
-      sigma_xy = sigma_xy - R_xy(1,ijk,1,1,ispec)
-      sigma_xz = sigma_xz - R_xz(1,ijk,1,1,ispec)
-      sigma_yz = sigma_yz - R_yz(1,ijk,1,1,ispec)
+      sigma_xy = sigma_xy - R_xy(1,INDEX_IJK,ispec)
+      sigma_xz = sigma_xz - R_xz(1,INDEX_IJK,ispec)
+      sigma_yz = sigma_yz - R_yz(1,INDEX_IJK,ispec)
 
-      R_xx_val = R_xx(2,ijk,1,1,ispec)
-      R_yy_val = R_yy(2,ijk,1,1,ispec)
+      R_xx_val = R_xx(2,INDEX_IJK,ispec)
+      R_yy_val = R_yy(2,INDEX_IJK,ispec)
       sigma_xx = sigma_xx - R_xx_val
       sigma_yy = sigma_yy - R_yy_val
       sigma_zz = sigma_zz + R_xx_val + R_yy_val
-      sigma_xy = sigma_xy - R_xy(2,ijk,1,1,ispec)
-      sigma_xz = sigma_xz - R_xz(2,ijk,1,1,ispec)
-      sigma_yz = sigma_yz - R_yz(2,ijk,1,1,ispec)
+      sigma_xy = sigma_xy - R_xy(2,INDEX_IJK,ispec)
+      sigma_xz = sigma_xz - R_xz(2,INDEX_IJK,ispec)
+      sigma_yz = sigma_yz - R_yz(2,INDEX_IJK,ispec)
 
-      R_xx_val = R_xx(3,ijk,1,1,ispec)
-      R_yy_val = R_yy(3,ijk,1,1,ispec)
+      R_xx_val = R_xx(3,INDEX_IJK,ispec)
+      R_yy_val = R_yy(3,INDEX_IJK,ispec)
       sigma_xx = sigma_xx - R_xx_val
       sigma_yy = sigma_yy - R_yy_val
       sigma_zz = sigma_zz + R_xx_val + R_yy_val
-      sigma_xy = sigma_xy - R_xy(3,ijk,1,1,ispec)
-      sigma_xz = sigma_xz - R_xz(3,ijk,1,1,ispec)
-      sigma_yz = sigma_yz - R_yz(3,ijk,1,1,ispec)
+      sigma_xy = sigma_xy - R_xy(3,INDEX_IJK,ispec)
+      sigma_xz = sigma_xz - R_xz(3,INDEX_IJK,ispec)
+      sigma_yz = sigma_yz - R_yz(3,INDEX_IJK,ispec)
+#else
+      ! note: function inlining is generally done by fortran compilers;
+      !       compilers decide based on performance heuristics
+
+      ! note: Fortran passes pointers to array location, thus R_memory(1,1,...) is fine
+      call compute_element_att_stress(R_xx(1,INDEX_IJK,ispec), &
+                                      R_yy(1,INDEX_IJK,ispec), &
+                                      R_xy(1,INDEX_IJK,ispec), &
+                                      R_xz(1,INDEX_IJK,ispec), &
+                                      R_yz(1,INDEX_IJK,ispec), &
+                                      sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz)
+#endif
     endif ! ATTENUATION_VAL
 
     ! define symmetric components of sigma for gravity
@@ -2451,7 +1281,7 @@
     if(GRAVITY_VAL) then
         ! use mesh coordinates to get theta and phi
         ! x y and z contain r theta and phi
-        iglob = ibool(ijk,1,1,ispec)
+        iglob = ibool(INDEX_IJK,ispec)
 
         dtheta = dble(ystore(iglob))
         dphi = dble(zstore(iglob))
@@ -2492,50 +1322,82 @@
         Hxzl = cos_phi*cos_theta*minus_dg_plus_g_over_radius*sin_theta
         Hyzl = cos_theta*minus_dg_plus_g_over_radius*sin_phi*sin_theta
 
-        ! get displacement and multiply by density to compute G tensor
-        sx_l = rho * dummyx_loc(ijk,1,1)
-        sy_l = rho * dummyy_loc(ijk,1,1)
-        sz_l = rho * dummyz_loc(ijk,1,1)
+        ! distinguish between single and double precision for reals
+        if(CUSTOM_REAL == SIZE_REAL) then
 
-        ! compute G tensor from s . g and add to sigma (not symmetric)
-        sigma_xx = sigma_xx + sy_l*gyl + sz_l*gzl
-        sigma_yy = sigma_yy + sx_l*gxl + sz_l*gzl
-        sigma_zz = sigma_zz + sx_l*gxl + sy_l*gyl
+          ! get displacement and multiply by density to compute G tensor
+          sx_l = rho * dble(dummyx_loc(INDEX_IJK)) ! dble(displ_crust_mantle(1,iglob))
+          sy_l = rho * dble(dummyy_loc(INDEX_IJK)) ! dble(displ_crust_mantle(2,iglob))
+          sz_l = rho * dble(dummyz_loc(INDEX_IJK)) !  dble(displ_crust_mantle(3,iglob))
 
-        sigma_xy = sigma_xy - sx_l * gyl
-        sigma_yx = sigma_yx - sy_l * gxl
+          ! compute G tensor from s . g and add to sigma (not symmetric)
+          sigma_xx = sigma_xx + sngl(sy_l*gyl + sz_l*gzl)
+          sigma_yy = sigma_yy + sngl(sx_l*gxl + sz_l*gzl)
+          sigma_zz = sigma_zz + sngl(sx_l*gxl + sy_l*gyl)
 
-        sigma_xz = sigma_xz - sx_l * gzl
-        sigma_zx = sigma_zx - sz_l * gxl
+          sigma_xy = sigma_xy - sngl(sx_l * gyl)
+          sigma_yx = sigma_yx - sngl(sy_l * gxl)
 
-        sigma_yz = sigma_yz - sy_l * gzl
-        sigma_zy = sigma_zy - sz_l * gyl
+          sigma_xz = sigma_xz - sngl(sx_l * gzl)
+          sigma_zx = sigma_zx - sngl(sz_l * gxl)
 
-        ! precompute vector
-        factor = jacobianl * wgll_cube(ijk,1,1)
-        rho_s_H(1,ijk,1,1) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
-        rho_s_H(2,ijk,1,1) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
-        rho_s_H(3,ijk,1,1) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
+          sigma_yz = sigma_yz - sngl(sy_l * gzl)
+          sigma_zy = sigma_zy - sngl(sz_l * gyl)
+
+          ! precompute vector
+          factor = dble(jacobianl) * wgll_cube(INDEX_IJK)
+          rho_s_H(1,INDEX_IJK) = sngl(factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl))
+          rho_s_H(2,INDEX_IJK) = sngl(factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl))
+          rho_s_H(3,INDEX_IJK) = sngl(factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl))
+
+        else
+
+          ! get displacement and multiply by density to compute G tensor
+          sx_l = rho * dummyx_loc(INDEX_IJK)  ! displ_crust_mantle(1,iglob)
+          sy_l = rho * dummyy_loc(INDEX_IJK)  !  displ_crust_mantle(2,iglob)
+          sz_l = rho * dummyz_loc(INDEX_IJK)  ! displ_crust_mantle(3,iglob)
+
+          ! compute G tensor from s . g and add to sigma (not symmetric)
+          sigma_xx = sigma_xx + sy_l*gyl + sz_l*gzl
+          sigma_yy = sigma_yy + sx_l*gxl + sz_l*gzl
+          sigma_zz = sigma_zz + sx_l*gxl + sy_l*gyl
+
+          sigma_xy = sigma_xy - sx_l * gyl
+          sigma_yx = sigma_yx - sy_l * gxl
+
+          sigma_xz = sigma_xz - sx_l * gzl
+          sigma_zx = sigma_zx - sz_l * gxl
+
+          sigma_yz = sigma_yz - sy_l * gzl
+          sigma_zy = sigma_zy - sz_l * gyl
+
+          ! precompute vector
+          factor = jacobianl * wgll_cube(INDEX_IJK)
+          rho_s_H(1,INDEX_IJK) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
+          rho_s_H(2,INDEX_IJK) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
+          rho_s_H(3,INDEX_IJK) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
+
+        endif
+
     endif  ! end of section with gravity terms
 
     ! form dot product with test vector, non-symmetric form
-    tempx1(ijk,1,1) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
-    tempy1(ijk,1,1) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
-    tempz1(ijk,1,1) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
+    tempx1(INDEX_IJK) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
+    tempy1(INDEX_IJK) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
+    tempz1(INDEX_IJK) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
 
-    tempx2(ijk,1,1) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
-    tempy2(ijk,1,1) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
-    tempz2(ijk,1,1) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
+    tempx2(INDEX_IJK) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
+    tempy2(INDEX_IJK) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
+    tempz2(INDEX_IJK) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
 
-    tempx3(ijk,1,1) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
-    tempy3(ijk,1,1) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
-    tempz3(ijk,1,1) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
-  enddo
+    tempx3(INDEX_IJK) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
+    tempy3(INDEX_IJK) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
+    tempz3(INDEX_IJK) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
 
+  ENDDO_LOOP_IJK
+
   end subroutine compute_element_aniso
 
-! end of FORCE_VECTORIZATION
-#endif
 
 
 !--------------------------------------------------------------------------------------------

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element_att_memory.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element_att_memory.F90	2013-10-23 13:03:00 UTC (rev 22970)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element_att_memory.F90	2013-10-23 14:27:26 UTC (rev 22971)
@@ -38,7 +38,7 @@
                                            alphaval,betaval,gammaval, &
                                            c44store,muvstore, &
                                            epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz, &
-                                           epsilondev_loc) !!!!!!!!!!!!!!!!! ,is_backward_field)
+                                           epsilondev_loc)
 ! crust mantle
 ! update memory variables based upon the Runge-Kutta scheme
 
@@ -83,8 +83,6 @@
 
   real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
 
-!!!!!!!!!!!!!!!  logical :: is_backward_field
-
 ! local parameters
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: factor_common_c44_muv
   integer :: i_SLS
@@ -314,7 +312,7 @@
                                            alphaval,betaval,gammaval, &
                                            muvstore, &
                                            epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz, &
-                                           epsilondev_loc) !!!!!!!!!!!!! ,is_backward_field)
+                                           epsilondev_loc)
 ! inner core
 ! update memory variables based upon the Runge-Kutta scheme
 
@@ -356,8 +354,6 @@
 
   real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
 
-!!!!!!!!!!!  logical :: is_backward_field
-
 ! local parameters
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: factor_common_use
 
@@ -562,98 +558,99 @@
 !
 ! helper functions
 !
-
-
-  subroutine compute_element_att_mem_up_cm(ispec,i,j,k, &
-                                              R_xx_loc,R_yy_loc,R_xy_loc,R_xz_loc,R_yz_loc, &
-                                              epsilondev_loc,c44_muv,is_backward_field)
-! crust mantle
-! update memory variables based upon the Runge-Kutta scheme
-
-
-!daniel: att - debug update
-  use specfem_par,only: tau_sigma_dble,deltat,b_deltat
-
-  use specfem_par_crustmantle,only: factor_common=>factor_common_crust_mantle
-
-  use constants_solver
-
-  implicit none
-
-  ! element id
-  integer :: ispec,i,j,k
-
-  ! attenuation
-  ! memory variables for attenuation
-  ! memory variables R_ij are stored at the local rather than global level
-  ! to allow for optimization of cache access by compiler
-!  real(kind=CUSTOM_REAL), dimension(5,N_SLS) :: R_memory_loc
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: R_xx_loc
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: R_yy_loc
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: R_xy_loc
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: R_xz_loc
-  real(kind=CUSTOM_REAL), dimension(N_SLS) :: R_yz_loc
-
-  real(kind=CUSTOM_REAL), dimension(5) :: epsilondev_loc
-  real(kind=CUSTOM_REAL) :: c44_muv
-
-  logical :: is_backward_field
-  double precision :: dt,kappa
-
-! local parameters
-  real(kind=CUSTOM_REAL) :: factor_common_c44_muv
-  integer :: i_SLS
-
-  if( .not. is_backward_field ) then
-    dt = dble(deltat)
-  else
-    ! backward/reconstruction: reverse time
-    dt = dble(b_deltat)
-  endif
-
-  do i_SLS = 1,N_SLS
-
-    ! runge-kutta scheme to update memory variables R(t)
-    if( .false. ) then
-! classical RK 4:       R'(t) =  - 1/tau * R(t)
 !
-! Butcher RK4:
-! 0     |
-! 1/2   | 1/2
-! 1/2   | 0    1/2
-! 1     | 0          1
-! -----------------------------------------------------------------------------
-!         1/6  1/3   1/3   1/6
-    kappa = - dt/tau_sigma_dble(i_SLS)
+!daniel debug: att - debug update
+!
+!  subroutine compute_element_att_mem_up_cm(ispec,i,j,k, &
+!                                              R_xx_loc,R_yy_loc,R_xy_loc,R_xz_loc,R_yz_loc, &
+!                                              epsilondev_loc,c44_muv,is_backward_field)
+!! crust mantle
+!! update memory variables based upon the Runge-Kutta scheme
+!
+!
+!!daniel: att - debug update
+!  use specfem_par,only: tau_sigma_dble,deltat,b_deltat
+!
+!  use specfem_par_crustmantle,only: factor_common=>factor_common_crust_mantle
+!
+!  use constants_solver
+!
+!  implicit none
+!
+!  ! element id
+!  integer :: ispec,i,j,k
+!
+!  ! attenuation
+!  ! memory variables for attenuation
+!  ! memory variables R_ij are stored at the local rather than global level
+!  ! to allow for optimization of cache access by compiler
+!!  real(kind=CUSTOM_REAL), dimension(5,N_SLS) :: R_memory_loc
+!  real(kind=CUSTOM_REAL), dimension(N_SLS) :: R_xx_loc
+!  real(kind=CUSTOM_REAL), dimension(N_SLS) :: R_yy_loc
+!  real(kind=CUSTOM_REAL), dimension(N_SLS) :: R_xy_loc
+!  real(kind=CUSTOM_REAL), dimension(N_SLS) :: R_xz_loc
+!  real(kind=CUSTOM_REAL), dimension(N_SLS) :: R_yz_loc
+!
+!  real(kind=CUSTOM_REAL), dimension(5) :: epsilondev_loc
+!  real(kind=CUSTOM_REAL) :: c44_muv
+!
+!  logical :: is_backward_field
+!  double precision :: dt,kappa
+!
+!! local parameters
+!  real(kind=CUSTOM_REAL) :: factor_common_c44_muv
+!  integer :: i_SLS
+!
+!  if( .not. is_backward_field ) then
+!    dt = dble(deltat)
+!  else
+!    ! backward/reconstruction: reverse time
+!    dt = dble(b_deltat)
+!  endif
+!
+!  do i_SLS = 1,N_SLS
+!
+!    ! runge-kutta scheme to update memory variables R(t)
+!    if( .false. ) then
+!! classical RK 4:       R'(t) =  - 1/tau * R(t)
+!!
+!! Butcher RK4:
+!! 0     |
+!! 1/2   | 1/2
+!! 1/2   | 0    1/2
+!! 1     | 0          1
+!! -----------------------------------------------------------------------------
+!!         1/6  1/3   1/3   1/6
+!    kappa = - dt/tau_sigma_dble(i_SLS)
+!
+!    R_xx_loc(i_SLS) = R_xx_loc(i_SLS) * &
+!      (1.0d0 + kappa*(1.d0 + 0.5d0*kappa*(1.d0 + 1.0d0/6.0d0*kappa*(1.d0 + 1.0d0/24.0d0*kappa))))
+!    R_yy_loc(i_SLS) = R_yy_loc(i_SLS) * &
+!      (1.0d0 + kappa*(1.d0 + 0.5d0*kappa*(1.d0 + 1.0d0/6.0d0*kappa*(1.d0 + 1.0d0/24.0d0*kappa))))
+!    R_xy_loc(i_SLS) = R_xy_loc(i_SLS) * &
+!      (1.0d0 + kappa*(1.d0 + 0.5d0*kappa*(1.d0 + 1.0d0/6.0d0*kappa*(1.d0 + 1.0d0/24.0d0*kappa))))
+!    R_xz_loc(i_SLS) = R_xz_loc(i_SLS) * &
+!      (1.0d0 + kappa*(1.d0 + 0.5d0*kappa*(1.d0 + 1.0d0/6.0d0*kappa*(1.d0 + 1.0d0/24.0d0*kappa))))
+!    R_yz_loc(i_SLS) = R_yz_loc(i_SLS) * &
+!      (1.0d0 + kappa*(1.d0 + 0.5d0*kappa*(1.d0 + 1.0d0/6.0d0*kappa*(1.d0 + 1.0d0/24.0d0*kappa))))
+!    endif
+!
+!    ! reformatted R_memory to handle large factor_common and reduced [alpha,beta,gamma]val
+!    if( ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL ) then
+!      factor_common_c44_muv = factor_common(i_SLS,i,j,k,ispec) * c44_muv
+!    else
+!      factor_common_c44_muv = factor_common(i_SLS,1,1,1,ispec) * c44_muv
+!    endif
+!
+!    ! adds contributions from current strain
+!    R_xx_loc(i_SLS) = R_xx_loc(i_SLS) + 0.5d0 * dt * dble(factor_common_c44_muv) * dble(epsilondev_loc(1))
+!    R_yy_loc(i_SLS) = R_yy_loc(i_SLS) + 0.5d0 * dt * dble(factor_common_c44_muv) * dble(epsilondev_loc(2))
+!    R_xy_loc(i_SLS) = R_xy_loc(i_SLS) + 0.5d0 * dt * dble(factor_common_c44_muv) * dble(epsilondev_loc(3))
+!    R_xz_loc(i_SLS) = R_xz_loc(i_SLS) + 0.5d0 * dt * dble(factor_common_c44_muv) * dble(epsilondev_loc(4))
+!    R_yz_loc(i_SLS) = R_yz_loc(i_SLS) + 0.5d0 * dt * dble(factor_common_c44_muv) * dble(epsilondev_loc(5))
+!
+!  enddo ! i_SLS
+!
+!  end subroutine compute_element_att_mem_up_cm
+!
 
-    R_xx_loc(i_SLS) = R_xx_loc(i_SLS) * &
-      (1.0d0 + kappa*(1.d0 + 0.5d0*kappa*(1.d0 + 1.0d0/6.0d0*kappa*(1.d0 + 1.0d0/24.0d0*kappa))))
-    R_yy_loc(i_SLS) = R_yy_loc(i_SLS) * &
-      (1.0d0 + kappa*(1.d0 + 0.5d0*kappa*(1.d0 + 1.0d0/6.0d0*kappa*(1.d0 + 1.0d0/24.0d0*kappa))))
-    R_xy_loc(i_SLS) = R_xy_loc(i_SLS) * &
-      (1.0d0 + kappa*(1.d0 + 0.5d0*kappa*(1.d0 + 1.0d0/6.0d0*kappa*(1.d0 + 1.0d0/24.0d0*kappa))))
-    R_xz_loc(i_SLS) = R_xz_loc(i_SLS) * &
-      (1.0d0 + kappa*(1.d0 + 0.5d0*kappa*(1.d0 + 1.0d0/6.0d0*kappa*(1.d0 + 1.0d0/24.0d0*kappa))))
-    R_yz_loc(i_SLS) = R_yz_loc(i_SLS) * &
-      (1.0d0 + kappa*(1.d0 + 0.5d0*kappa*(1.d0 + 1.0d0/6.0d0*kappa*(1.d0 + 1.0d0/24.0d0*kappa))))
-    endif
-
-    ! reformatted R_memory to handle large factor_common and reduced [alpha,beta,gamma]val
-    if( ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL ) then
-      factor_common_c44_muv = factor_common(i_SLS,i,j,k,ispec) * c44_muv
-    else
-      factor_common_c44_muv = factor_common(i_SLS,1,1,1,ispec) * c44_muv
-    endif
-
-    ! adds contributions from current strain
-    R_xx_loc(i_SLS) = R_xx_loc(i_SLS) + 0.5d0 * dt * dble(factor_common_c44_muv) * dble(epsilondev_loc(1))
-    R_yy_loc(i_SLS) = R_yy_loc(i_SLS) + 0.5d0 * dt * dble(factor_common_c44_muv) * dble(epsilondev_loc(2))
-    R_xy_loc(i_SLS) = R_xy_loc(i_SLS) + 0.5d0 * dt * dble(factor_common_c44_muv) * dble(epsilondev_loc(3))
-    R_xz_loc(i_SLS) = R_xz_loc(i_SLS) + 0.5d0 * dt * dble(factor_common_c44_muv) * dble(epsilondev_loc(4))
-    R_yz_loc(i_SLS) = R_yz_loc(i_SLS) + 0.5d0 * dt * dble(factor_common_c44_muv) * dble(epsilondev_loc(5))
-
-  enddo ! i_SLS
-
-  end subroutine compute_element_att_mem_up_cm
-
-

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element_strain.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element_strain.F90	2013-10-23 13:03:00 UTC (rev 22970)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_element_strain.F90	2013-10-23 14:27:26 UTC (rev 22971)
@@ -25,8 +25,12 @@
 !
 !=====================================================================
 
+! we switch between vectorized and non-vectorized version by using pre-processor flag FORCE_VECTORIZATION
+! and macros INDEX_IJK, DO_LOOP_IJK, ENDDO_LOOP_IJK defined in config.fh
+#include "config.fh"
 
 
+
   subroutine compute_element_strain_undo_att_Dev(ispec,nglob,nspec, &
                                                 displ,ibool, &
                                                 hprime_xx,hprime_xxT,&
@@ -53,7 +57,9 @@
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ),intent(out) :: eps_trace_over_3_loc
 
 !  local variable
-  integer :: i,j,k,iglob
+  integer :: iglob
+  integer :: i,j,k
+
   real(kind=CUSTOM_REAL) :: templ
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
     tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
@@ -93,26 +99,16 @@
   integer :: ijk
 #endif
 
-#ifdef FORCE_VECTORIZATION
-  do ijk=1,NGLLCUBE
-    iglob = ibool(ijk,1,1,ispec)
-    dummyx_loc(ijk,1,1) = displ(1,iglob)
-    dummyy_loc(ijk,1,1) = displ(2,iglob)
-    dummyz_loc(ijk,1,1) = displ(3,iglob)
-  enddo
-#else
-  do k=1,NGLLZ
-    do j=1,NGLLY
-      do i=1,NGLLX
-        iglob = ibool(i,j,k,ispec)
-        dummyx_loc(i,j,k) = displ(1,iglob)
-        dummyy_loc(i,j,k) = displ(2,iglob)
-        dummyz_loc(i,j,k) = displ(3,iglob)
-      enddo
-    enddo
-  enddo
-#endif
+  DO_LOOP_IJK
 
+    iglob = ibool(INDEX_IJK,ispec)
+    dummyx_loc(INDEX_IJK) = displ(1,iglob)
+    dummyy_loc(INDEX_IJK) = displ(2,iglob)
+    dummyz_loc(INDEX_IJK) = displ(3,iglob)
+
+  ENDDO_LOOP_IJK
+
+  ! deville optimizations
   do j=1,m2
     do i=1,m1
       C1_m1_m2_5points(i,j) = hprime_xx(i,1)*B1_m1_m2_5points(1,j) + &
@@ -182,35 +178,35 @@
     enddo
   enddo
 
-#ifdef FORCE_VECTORIZATION
-  do ijk=1,NGLLCUBE
+  DO_LOOP_IJK
+
     ! get derivatives of ux, uy and uz with respect to x, y and z
-    xixl = xix(ijk,1,1,ispec)
-    xiyl = xiy(ijk,1,1,ispec)
-    xizl = xiz(ijk,1,1,ispec)
-    etaxl = etax(ijk,1,1,ispec)
-    etayl = etay(ijk,1,1,ispec)
-    etazl = etaz(ijk,1,1,ispec)
-    gammaxl = gammax(ijk,1,1,ispec)
-    gammayl = gammay(ijk,1,1,ispec)
-    gammazl = gammaz(ijk,1,1,ispec)
+    xixl = xix(INDEX_IJK,ispec)
+    xiyl = xiy(INDEX_IJK,ispec)
+    xizl = xiz(INDEX_IJK,ispec)
+    etaxl = etax(INDEX_IJK,ispec)
+    etayl = etay(INDEX_IJK,ispec)
+    etazl = etaz(INDEX_IJK,ispec)
+    gammaxl = gammax(INDEX_IJK,ispec)
+    gammayl = gammay(INDEX_IJK,ispec)
+    gammazl = gammaz(INDEX_IJK,ispec)
 
     ! compute the jacobian
     jacobianl = 1._CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
                   - xiyl*(etaxl*gammazl-etazl*gammaxl) &
                   + xizl*(etaxl*gammayl-etayl*gammaxl))
 
-    duxdxl = xixl*tempx1(ijk,1,1) + etaxl*tempx2(ijk,1,1) + gammaxl*tempx3(ijk,1,1)
-    duxdyl = xiyl*tempx1(ijk,1,1) + etayl*tempx2(ijk,1,1) + gammayl*tempx3(ijk,1,1)
-    duxdzl = xizl*tempx1(ijk,1,1) + etazl*tempx2(ijk,1,1) + gammazl*tempx3(ijk,1,1)
+    duxdxl = xixl*tempx1(INDEX_IJK) + etaxl*tempx2(INDEX_IJK) + gammaxl*tempx3(INDEX_IJK)
+    duxdyl = xiyl*tempx1(INDEX_IJK) + etayl*tempx2(INDEX_IJK) + gammayl*tempx3(INDEX_IJK)
+    duxdzl = xizl*tempx1(INDEX_IJK) + etazl*tempx2(INDEX_IJK) + gammazl*tempx3(INDEX_IJK)
 
-    duydxl = xixl*tempy1(ijk,1,1) + etaxl*tempy2(ijk,1,1) + gammaxl*tempy3(ijk,1,1)
-    duydyl = xiyl*tempy1(ijk,1,1) + etayl*tempy2(ijk,1,1) + gammayl*tempy3(ijk,1,1)
-    duydzl = xizl*tempy1(ijk,1,1) + etazl*tempy2(ijk,1,1) + gammazl*tempy3(ijk,1,1)
+    duydxl = xixl*tempy1(INDEX_IJK) + etaxl*tempy2(INDEX_IJK) + gammaxl*tempy3(INDEX_IJK)
+    duydyl = xiyl*tempy1(INDEX_IJK) + etayl*tempy2(INDEX_IJK) + gammayl*tempy3(INDEX_IJK)
+    duydzl = xizl*tempy1(INDEX_IJK) + etazl*tempy2(INDEX_IJK) + gammazl*tempy3(INDEX_IJK)
 
-    duzdxl = xixl*tempz1(ijk,1,1) + etaxl*tempz2(ijk,1,1) + gammaxl*tempz3(ijk,1,1)
-    duzdyl = xiyl*tempz1(ijk,1,1) + etayl*tempz2(ijk,1,1) + gammayl*tempz3(ijk,1,1)
-    duzdzl = xizl*tempz1(ijk,1,1) + etazl*tempz2(ijk,1,1) + gammazl*tempz3(ijk,1,1)
+    duzdxl = xixl*tempz1(INDEX_IJK) + etaxl*tempz2(INDEX_IJK) + gammaxl*tempz3(INDEX_IJK)
+    duzdyl = xiyl*tempz1(INDEX_IJK) + etayl*tempz2(INDEX_IJK) + gammayl*tempz3(INDEX_IJK)
+    duzdzl = xizl*tempz1(INDEX_IJK) + etazl*tempz2(INDEX_IJK) + gammazl*tempz3(INDEX_IJK)
 
     ! precompute some sums to save CPU time
     duxdxl_plus_duydyl = duxdxl + duydyl
@@ -222,66 +218,15 @@
 
     ! strains
     templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
-    eps_trace_over_3_loc(ijk,1,1) = templ
-    epsilondev_loc(1,ijk,1,1) = duxdxl - templ
-    epsilondev_loc(2,ijk,1,1) = duydyl - templ
-    epsilondev_loc(3,ijk,1,1) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
-    epsilondev_loc(4,ijk,1,1) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
-    epsilondev_loc(5,ijk,1,1) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
-  enddo
-#else
-  do k=1,NGLLZ
-    do j=1,NGLLY
-      do i=1,NGLLX
-        ! get derivatives of ux, uy and uz with respect to x, y and z
-        xixl = xix(i,j,k,ispec)
-        xiyl = xiy(i,j,k,ispec)
-        xizl = xiz(i,j,k,ispec)
-        etaxl = etax(i,j,k,ispec)
-        etayl = etay(i,j,k,ispec)
-        etazl = etaz(i,j,k,ispec)
-        gammaxl = gammax(i,j,k,ispec)
-        gammayl = gammay(i,j,k,ispec)
-        gammazl = gammaz(i,j,k,ispec)
+    eps_trace_over_3_loc(INDEX_IJK) = templ
+    epsilondev_loc(1,INDEX_IJK) = duxdxl - templ
+    epsilondev_loc(2,INDEX_IJK) = duydyl - templ
+    epsilondev_loc(3,INDEX_IJK) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
+    epsilondev_loc(4,INDEX_IJK) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
+    epsilondev_loc(5,INDEX_IJK) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
 
-        ! compute the jacobian
-        jacobianl = 1._CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
-                      - xiyl*(etaxl*gammazl-etazl*gammaxl) &
-                      + xizl*(etaxl*gammayl-etayl*gammaxl))
+  ENDDO_LOOP_IJK
 
-        duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
-        duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
-        duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
-
-        duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
-        duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
-        duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
-
-        duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
-        duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
-        duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
-
-        ! precompute some sums to save CPU time
-        duxdxl_plus_duydyl = duxdxl + duydyl
-        duxdxl_plus_duzdzl = duxdxl + duzdzl
-        duydyl_plus_duzdzl = duydyl + duzdzl
-        duxdyl_plus_duydxl = duxdyl + duydxl
-        duzdxl_plus_duxdzl = duzdxl + duxdzl
-        duzdyl_plus_duydzl = duzdyl + duydzl
-
-        ! strains
-        templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
-        eps_trace_over_3_loc(i,j,k) = templ
-        epsilondev_loc(1,i,j,k) = duxdxl - templ
-        epsilondev_loc(2,i,j,k) = duydyl - templ
-        epsilondev_loc(3,i,j,k) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
-        epsilondev_loc(4,i,j,k) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
-        epsilondev_loc(5,i,j,k) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
-      enddo
-    enddo
-  enddo
-#endif
-
   end subroutine compute_element_strain_undo_att_Dev
 
 
@@ -470,7 +415,9 @@
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,nspec_strain_only) :: eps_trace_over_3_loc_nplus1
 
   ! local variable
-  integer :: i,j,k,iglob
+  integer :: iglob
+  integer :: i,j,k
+
   real(kind=CUSTOM_REAL) :: templ
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
     tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
@@ -503,26 +450,16 @@
   integer :: ijk
 #endif
 
-#ifdef FORCE_VECTORIZATION
-  do ijk=1,NGLLCUBE
-      iglob = ibool(ijk,1,1,ispec)
-      dummyx_loc(ijk,1,1) = displ(1,iglob) + deltat * veloc(1,iglob)
-      dummyy_loc(ijk,1,1) = displ(2,iglob) + deltat * veloc(2,iglob)
-      dummyz_loc(ijk,1,1) = displ(3,iglob) + deltat * veloc(3,iglob)
-  enddo
-#else
-  do k=1,NGLLZ
-    do j=1,NGLLY
-      do i=1,NGLLX
-          iglob = ibool(i,j,k,ispec)
-          dummyx_loc(i,j,k) = displ(1,iglob) + deltat * veloc(1,iglob)
-          dummyy_loc(i,j,k) = displ(2,iglob) + deltat * veloc(2,iglob)
-          dummyz_loc(i,j,k) = displ(3,iglob) + deltat * veloc(3,iglob)
-      enddo
-    enddo
-  enddo
-#endif
+  DO_LOOP_IJK
 
+    iglob = ibool(INDEX_IJK,ispec)
+    dummyx_loc(INDEX_IJK) = displ(1,iglob) + deltat * veloc(1,iglob)
+    dummyy_loc(INDEX_IJK) = displ(2,iglob) + deltat * veloc(2,iglob)
+    dummyz_loc(INDEX_IJK) = displ(3,iglob) + deltat * veloc(3,iglob)
+
+  ENDDO_LOOP_IJK
+
+  ! deville optimizations
   do j=1,m2
     do i=1,m1
       C1_m1_m2_5points(i,j) = hprime_xx(i,1)*B1_m1_m2_5points(1,j) + &
@@ -592,35 +529,35 @@
     enddo
   enddo
 
-#ifdef FORCE_VECTORIZATION
-  do ijk=1,NGLLCUBE
+  DO_LOOP_IJK
+
     ! get derivatives of ux, uy and uz with respect to x, y and z
-    xixl = xix(ijk,1,1,ispec)
-    xiyl = xiy(ijk,1,1,ispec)
-    xizl = xiz(ijk,1,1,ispec)
-    etaxl = etax(ijk,1,1,ispec)
-    etayl = etay(ijk,1,1,ispec)
-    etazl = etaz(ijk,1,1,ispec)
-    gammaxl = gammax(ijk,1,1,ispec)
-    gammayl = gammay(ijk,1,1,ispec)
-    gammazl = gammaz(ijk,1,1,ispec)
+    xixl = xix(INDEX_IJK,ispec)
+    xiyl = xiy(INDEX_IJK,ispec)
+    xizl = xiz(INDEX_IJK,ispec)
+    etaxl = etax(INDEX_IJK,ispec)
+    etayl = etay(INDEX_IJK,ispec)
+    etazl = etaz(INDEX_IJK,ispec)
+    gammaxl = gammax(INDEX_IJK,ispec)
+    gammayl = gammay(INDEX_IJK,ispec)
+    gammazl = gammaz(INDEX_IJK,ispec)
 
     ! compute the jacobian
     jacobianl = 1._CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
                   - xiyl*(etaxl*gammazl-etazl*gammaxl) &
                   + xizl*(etaxl*gammayl-etayl*gammaxl))
 
-    duxdxl = xixl*tempx1(ijk,1,1) + etaxl*tempx2(ijk,1,1) + gammaxl*tempx3(ijk,1,1)
-    duxdyl = xiyl*tempx1(ijk,1,1) + etayl*tempx2(ijk,1,1) + gammayl*tempx3(ijk,1,1)
-    duxdzl = xizl*tempx1(ijk,1,1) + etazl*tempx2(ijk,1,1) + gammazl*tempx3(ijk,1,1)
+    duxdxl = xixl*tempx1(INDEX_IJK) + etaxl*tempx2(INDEX_IJK) + gammaxl*tempx3(INDEX_IJK)
+    duxdyl = xiyl*tempx1(INDEX_IJK) + etayl*tempx2(INDEX_IJK) + gammayl*tempx3(INDEX_IJK)
+    duxdzl = xizl*tempx1(INDEX_IJK) + etazl*tempx2(INDEX_IJK) + gammazl*tempx3(INDEX_IJK)
 
-    duydxl = xixl*tempy1(ijk,1,1) + etaxl*tempy2(ijk,1,1) + gammaxl*tempy3(ijk,1,1)
-    duydyl = xiyl*tempy1(ijk,1,1) + etayl*tempy2(ijk,1,1) + gammayl*tempy3(ijk,1,1)
-    duydzl = xizl*tempy1(ijk,1,1) + etazl*tempy2(ijk,1,1) + gammazl*tempy3(ijk,1,1)
+    duydxl = xixl*tempy1(INDEX_IJK) + etaxl*tempy2(INDEX_IJK) + gammaxl*tempy3(INDEX_IJK)
+    duydyl = xiyl*tempy1(INDEX_IJK) + etayl*tempy2(INDEX_IJK) + gammayl*tempy3(INDEX_IJK)
+    duydzl = xizl*tempy1(INDEX_IJK) + etazl*tempy2(INDEX_IJK) + gammazl*tempy3(INDEX_IJK)
 
-    duzdxl = xixl*tempz1(ijk,1,1) + etaxl*tempz2(ijk,1,1) + gammaxl*tempz3(ijk,1,1)
-    duzdyl = xiyl*tempz1(ijk,1,1) + etayl*tempz2(ijk,1,1) + gammayl*tempz3(ijk,1,1)
-    duzdzl = xizl*tempz1(ijk,1,1) + etazl*tempz2(ijk,1,1) + gammazl*tempz3(ijk,1,1)
+    duzdxl = xixl*tempz1(INDEX_IJK) + etaxl*tempz2(INDEX_IJK) + gammaxl*tempz3(INDEX_IJK)
+    duzdyl = xiyl*tempz1(INDEX_IJK) + etayl*tempz2(INDEX_IJK) + gammayl*tempz3(INDEX_IJK)
+    duzdzl = xizl*tempz1(INDEX_IJK) + etazl*tempz2(INDEX_IJK) + gammazl*tempz3(INDEX_IJK)
 
     ! precompute some sums to save CPU time
     duxdxl_plus_duydyl = duxdxl + duydyl
@@ -633,75 +570,19 @@
     templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
     if( nspec_strain_only == 1 ) then
       if( ispec == 1 ) then
-        eps_trace_over_3_loc_nplus1(ijk,1,1,1) = templ
+        eps_trace_over_3_loc_nplus1(INDEX_IJK,1) = templ
       endif
     else
-      eps_trace_over_3_loc_nplus1(ijk,1,1,ispec) = templ
+      eps_trace_over_3_loc_nplus1(INDEX_IJK,ispec) = templ
     endif
-    epsilondev_xx_loc_nplus1(ijk,1,1,ispec) = duxdxl - templ
-    epsilondev_yy_loc_nplus1(ijk,1,1,ispec) = duydyl - templ
-    epsilondev_xy_loc_nplus1(ijk,1,1,ispec) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
-    epsilondev_xz_loc_nplus1(ijk,1,1,ispec) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
-    epsilondev_yz_loc_nplus1(ijk,1,1,ispec) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
-  enddo
-#else
-  do k=1,NGLLZ
-    do j=1,NGLLY
-      do i=1,NGLLX
-        ! get derivatives of ux, uy and uz with respect to x, y and z
-        xixl = xix(i,j,k,ispec)
-        xiyl = xiy(i,j,k,ispec)
-        xizl = xiz(i,j,k,ispec)
-        etaxl = etax(i,j,k,ispec)
-        etayl = etay(i,j,k,ispec)
-        etazl = etaz(i,j,k,ispec)
-        gammaxl = gammax(i,j,k,ispec)
-        gammayl = gammay(i,j,k,ispec)
-        gammazl = gammaz(i,j,k,ispec)
+    epsilondev_xx_loc_nplus1(INDEX_IJK,ispec) = duxdxl - templ
+    epsilondev_yy_loc_nplus1(INDEX_IJK,ispec) = duydyl - templ
+    epsilondev_xy_loc_nplus1(INDEX_IJK,ispec) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
+    epsilondev_xz_loc_nplus1(INDEX_IJK,ispec) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
+    epsilondev_yz_loc_nplus1(INDEX_IJK,ispec) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
 
-        ! compute the jacobian
-        jacobianl = 1._CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
-                      - xiyl*(etaxl*gammazl-etazl*gammaxl) &
-                      + xizl*(etaxl*gammayl-etayl*gammaxl))
+  ENDDO_LOOP_IJK
 
-        duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
-        duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
-        duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
-
-        duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
-        duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
-        duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
-
-        duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
-        duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
-        duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
-
-        ! precompute some sums to save CPU time
-        duxdxl_plus_duydyl = duxdxl + duydyl
-        duxdxl_plus_duzdzl = duxdxl + duzdzl
-        duydyl_plus_duzdzl = duydyl + duzdzl
-        duxdyl_plus_duydxl = duxdyl + duydxl
-        duzdxl_plus_duxdzl = duzdxl + duxdzl
-        duzdyl_plus_duydzl = duzdyl + duydzl
-
-        templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
-        if( nspec_strain_only == 1 ) then
-          if( ispec == 1 ) then
-            eps_trace_over_3_loc_nplus1(i,j,k,1) = templ
-          endif
-        else
-          eps_trace_over_3_loc_nplus1(i,j,k,ispec) = templ
-        endif
-        epsilondev_xx_loc_nplus1(i,j,k,ispec) = duxdxl - templ
-        epsilondev_yy_loc_nplus1(i,j,k,ispec) = duydyl - templ
-        epsilondev_xy_loc_nplus1(i,j,k,ispec) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
-        epsilondev_xz_loc_nplus1(i,j,k,ispec) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
-        epsilondev_yz_loc_nplus1(i,j,k,ispec) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
-      enddo
-    enddo
-  enddo
-#endif
-
  end subroutine compute_element_strain_att_Dev
 
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_crust_mantle_Dev.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_crust_mantle_Dev.F90	2013-10-23 13:03:00 UTC (rev 22970)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_crust_mantle_Dev.F90	2013-10-23 14:27:26 UTC (rev 22971)
@@ -25,10 +25,14 @@
 !
 !=====================================================================
 
+! we switch between vectorized and non-vectorized version by using pre-processor flag FORCE_VECTORIZATION
+! and macros INDEX_IJK, DO_LOOP_IJK, ENDDO_LOOP_IJK defined in config.fh
+#include "config.fh"
+
+
   subroutine compute_forces_crust_mantle_Dev( NSPEC,NGLOB,NSPEC_ATT, &
                                               deltat, &
                                               displ_crust_mantle, &
-!                                             veloc_crust_mantle, &
                                               accel_crust_mantle, &
                                               phase_is_inner, &
                                               R_xx,R_yy,R_xy,R_xz,R_yz, &
@@ -37,7 +41,7 @@
                                               epsilondev_xz,epsilondev_yz, &
                                               epsilon_trace_over_3, &
                                               alphaval,betaval,gammaval, &
-                                              factor_common,vnspec,is_backward_field)
+                                              factor_common,vnspec )
 
 ! this routine is optimized for NGLLX = NGLLY = NGLLZ = 5 using the Deville et al. (2002) inlined matrix-matrix products
 
@@ -71,11 +75,7 @@
     nspec_outer => nspec_outer_crust_mantle, &
     nspec_inner => nspec_inner_crust_mantle
 
-#ifdef FORCE_VECTORIZATION
   use specfem_par,only: wgllwgll_xy_3D,wgllwgll_xz_3D,wgllwgll_yz_3D
-#else
-  use specfem_par,only: wgllwgll_xy,wgllwgll_xz,wgllwgll_yz
-#endif
 
 !daniel: att - debug
 !  use specfem_par,only: it,NSTEP
@@ -113,8 +113,6 @@
   ! inner/outer element run flag
   logical :: phase_is_inner
 
-  logical :: is_backward_field
-
   ! local parameters
 
   ! Deville
@@ -162,8 +160,8 @@
   ! for gravity
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLLX,NGLLY,NGLLZ) :: rho_s_H
 
-  integer :: ispec,i,j,k,iglob
-
+  integer :: ispec,iglob
+  integer :: i,j,k
   integer :: num_elements,ispec_p
   integer :: iphase
 
@@ -190,30 +188,15 @@
 
     ! only compute element which belong to current phase (inner or outer elements)
 
-    ! subroutines adapted from Deville, Fischer and Mund, High-order methods
-    ! for incompressible fluid flow, Cambridge University Press (2002),
-    ! pages 386 and 389 and Figure 8.3.1
+    DO_LOOP_IJK
 
-#ifdef FORCE_VECTORIZATION
-    do ijk=1,NGLLCUBE
-      iglob = ibool(ijk,1,1,ispec)
-      dummyx_loc(ijk,1,1) = displ_crust_mantle(1,iglob)
-      dummyy_loc(ijk,1,1) = displ_crust_mantle(2,iglob)
-      dummyz_loc(ijk,1,1) = displ_crust_mantle(3,iglob)
-    enddo
-#else
-    do k=1,NGLLZ
-      do j=1,NGLLY
-        do i=1,NGLLX
-          iglob = ibool(i,j,k,ispec)
-          dummyx_loc(i,j,k) = displ_crust_mantle(1,iglob)
-          dummyy_loc(i,j,k) = displ_crust_mantle(2,iglob)
-          dummyz_loc(i,j,k) = displ_crust_mantle(3,iglob)
-        enddo
-      enddo
-    enddo
-#endif
+      iglob = ibool(INDEX_IJK,ispec)
+      dummyx_loc(INDEX_IJK) = displ_crust_mantle(1,iglob)
+      dummyy_loc(INDEX_IJK) = displ_crust_mantle(2,iglob)
+      dummyz_loc(INDEX_IJK) = displ_crust_mantle(3,iglob)
 
+    ENDDO_LOOP_IJK
+
     ! subroutines adapted from Deville, Fischer and Mund, High-order methods
     ! for incompressible fluid flow, Cambridge University Press (2002),
     ! pages 386 and 389 and Figure 8.3.1
@@ -292,54 +275,51 @@
     if(ANISOTROPIC_3D_MANTLE_VAL) then
        ! anisotropic element
        call compute_element_aniso(ispec, &
-            minus_gravity_table,density_table,minus_deriv_gravity_table, &
-            xstore,ystore,zstore, &
-            xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-            wgll_cube, &
-            c11store,c12store,c13store,c14store,c15store,c16store,c22store, &
-            c23store,c24store,c25store,c26store,c33store,c34store,c35store, &
-            c36store,c44store,c45store,c46store,c55store,c56store,c66store, &
-            ibool, &
-            R_xx,R_yy,R_xy,R_xz,R_yz, &
-            epsilon_trace_over_3, &
-            one_minus_sum_beta,vnspec, &
-            tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, &
-            dummyx_loc,dummyy_loc,dummyz_loc, &
-            epsilondev_loc, &
-            rho_s_H,is_backward_field)
+                                  minus_gravity_table,density_table,minus_deriv_gravity_table, &
+                                  xstore,ystore,zstore, &
+                                  xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                                  wgll_cube, &
+                                  c11store,c12store,c13store,c14store,c15store,c16store,c22store, &
+                                  c23store,c24store,c25store,c26store,c33store,c34store,c35store, &
+                                  c36store,c44store,c45store,c46store,c55store,c56store,c66store, &
+                                  ibool, &
+                                  R_xx,R_yy,R_xy,R_xz,R_yz, &
+                                  epsilon_trace_over_3, &
+                                  one_minus_sum_beta,vnspec, &
+                                  tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, &
+                                  dummyx_loc,dummyy_loc,dummyz_loc, &
+                                  epsilondev_loc,rho_s_H)
     else
        if(.not. ispec_is_tiso(ispec)) then
           ! isotropic element
           call compute_element_iso(ispec, &
-               minus_gravity_table,density_table,minus_deriv_gravity_table, &
-               xstore,ystore,zstore, &
-               xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-               wgll_cube, &
-               kappavstore,muvstore, &
-               ibool, &
-               R_xx,R_yy,R_xy,R_xz,R_yz, &
-               epsilon_trace_over_3, &
-               one_minus_sum_beta,vnspec, &
-               tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, &
-               dummyx_loc,dummyy_loc,dummyz_loc, &
-               epsilondev_loc, &
-               rho_s_H,is_backward_field)
+                                   minus_gravity_table,density_table,minus_deriv_gravity_table, &
+                                   xstore,ystore,zstore, &
+                                   xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                                   wgll_cube, &
+                                   kappavstore,muvstore, &
+                                   ibool, &
+                                   R_xx,R_yy,R_xy,R_xz,R_yz, &
+                                   epsilon_trace_over_3, &
+                                   one_minus_sum_beta,vnspec, &
+                                   tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, &
+                                   dummyx_loc,dummyy_loc,dummyz_loc, &
+                                   epsilondev_loc,rho_s_H)
        else
           ! transverse isotropic element
           call compute_element_tiso(ispec, &
-               minus_gravity_table,density_table,minus_deriv_gravity_table, &
-               xstore,ystore,zstore, &
-               xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
-               wgll_cube, &
-               kappavstore,kappahstore,muvstore,muhstore,eta_anisostore, &
-               ibool, &
-               R_xx,R_yy,R_xy,R_xz,R_yz, &
-               epsilon_trace_over_3, &
-               one_minus_sum_beta,vnspec, &
-               tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, &
-               dummyx_loc,dummyy_loc,dummyz_loc, &
-               epsilondev_loc, &
-               rho_s_H,is_backward_field)
+                                     minus_gravity_table,density_table,minus_deriv_gravity_table, &
+                                     xstore,ystore,zstore, &
+                                     xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+                                     wgll_cube, &
+                                     kappavstore,kappahstore,muvstore,muhstore,eta_anisostore, &
+                                     ibool, &
+                                     R_xx,R_yy,R_xy,R_xz,R_yz, &
+                                     epsilon_trace_over_3, &
+                                     one_minus_sum_beta,vnspec, &
+                                     tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3, &
+                                     dummyx_loc,dummyy_loc,dummyz_loc, &
+                                     epsilondev_loc,rho_s_H)
        endif ! .not. ispec_is_tiso
     endif
 
@@ -416,40 +396,42 @@
     enddo
 
     ! sum contributions
-#ifdef FORCE_VECTORIZATION
-    do ijk=1,NGLLCUBE
-      fac1 = wgllwgll_yz_3D(ijk,1,1)
-      fac2 = wgllwgll_xz_3D(ijk,1,1)
-      fac3 = wgllwgll_xy_3D(ijk,1,1)
-      sum_terms(1,ijk,1,1) = - (fac1*newtempx1(ijk,1,1) + fac2*newtempx2(ijk,1,1) + fac3*newtempx3(ijk,1,1))
-      sum_terms(2,ijk,1,1) = - (fac1*newtempy1(ijk,1,1) + fac2*newtempy2(ijk,1,1) + fac3*newtempy3(ijk,1,1))
-      sum_terms(3,ijk,1,1) = - (fac1*newtempz1(ijk,1,1) + fac2*newtempz2(ijk,1,1) + fac3*newtempz3(ijk,1,1))
-    enddo
+
+    DO_LOOP_IJK
+
+      fac1 = wgllwgll_yz_3D(INDEX_IJK)
+      fac2 = wgllwgll_xz_3D(INDEX_IJK)
+      fac3 = wgllwgll_xy_3D(INDEX_IJK)
+      sum_terms(1,INDEX_IJK) = - (fac1*newtempx1(INDEX_IJK) + fac2*newtempx2(INDEX_IJK) + fac3*newtempx3(INDEX_IJK))
+      sum_terms(2,INDEX_IJK) = - (fac1*newtempy1(INDEX_IJK) + fac2*newtempy2(INDEX_IJK) + fac3*newtempy3(INDEX_IJK))
+      sum_terms(3,INDEX_IJK) = - (fac1*newtempz1(INDEX_IJK) + fac2*newtempz2(INDEX_IJK) + fac3*newtempz3(INDEX_IJK))
+
+    ENDDO_LOOP_IJK
+
     ! adds gravity terms
     if(GRAVITY_VAL) then
+
+#ifdef FORCE_VECTORIZATION
       do ijk = 1,NDIM*NGLLCUBE
         sum_terms(ijk,1,1,1) = sum_terms(ijk,1,1,1) + rho_s_H(ijk,1,1,1)
       enddo
-    endif
 #else
-    do k=1,NGLLZ
-      do j=1,NGLLY
-        fac1 = wgllwgll_yz(j,k)
-        do i=1,NGLLX
-          fac2 = wgllwgll_xz(i,k)
-          fac3 = wgllwgll_xy(i,j)
-          ! sums contributions
-          sum_terms(1,i,j,k) = - (fac1*newtempx1(i,j,k) + fac2*newtempx2(i,j,k) + fac3*newtempx3(i,j,k))
-          sum_terms(2,i,j,k) = - (fac1*newtempy1(i,j,k) + fac2*newtempy2(i,j,k) + fac3*newtempy3(i,j,k))
-          sum_terms(3,i,j,k) = - (fac1*newtempz1(i,j,k) + fac2*newtempz2(i,j,k) + fac3*newtempz3(i,j,k))
-          ! adds gravity terms
-          if(GRAVITY_VAL) sum_terms(:,i,j,k) = sum_terms(:,i,j,k) + rho_s_H(:,i,j,k)
-        enddo ! NGLLX
-      enddo ! NGLLY
-    enddo ! NGLLZ
+      do k=1,NGLLZ
+        do j=1,NGLLY
+          do i=1,NGLLX
+            sum_terms(1,INDEX_IJK) = sum_terms(1,INDEX_IJK) + rho_s_H(1,INDEX_IJK)
+            sum_terms(2,INDEX_IJK) = sum_terms(2,INDEX_IJK) + rho_s_H(2,INDEX_IJK)
+            sum_terms(3,INDEX_IJK) = sum_terms(3,INDEX_IJK) + rho_s_H(3,INDEX_IJK)
+          enddo
+        enddo
+      enddo
 #endif
 
-    ! sum contributions from each element to the global mesh and add gravity terms
+    endif
+
+
+    ! updates acceleration
+
 #ifdef FORCE_VECTORIZATION
 ! we can force vectorization using a compiler directive here because we know that there is no dependency
 ! inside a given spectral element, since all the global points of a local elements are different by definition
@@ -459,21 +441,23 @@
 !pgi$ ivdep
 !DIR$ IVDEP
     do ijk = 1,NGLLCUBE
-      iglob = ibool(ijk,1,1,ispec)
-      ! do NOT use array syntax ":" for the three statements below otherwise most compilers
-      ! will not be able to vectorize the outer loop
-      accel_crust_mantle(1,iglob) = accel_crust_mantle(1,iglob) + sum_terms(1,ijk,1,1)
-      accel_crust_mantle(2,iglob) = accel_crust_mantle(2,iglob) + sum_terms(2,ijk,1,1)
-      accel_crust_mantle(3,iglob) = accel_crust_mantle(3,iglob) + sum_terms(3,ijk,1,1)
-    enddo
 #else
     do k=1,NGLLZ
       do j=1,NGLLY
         do i=1,NGLLX
-          iglob = ibool(i,j,k,ispec)
-          accel_crust_mantle(1,iglob) = accel_crust_mantle(1,iglob) + sum_terms(1,i,j,k)
-          accel_crust_mantle(2,iglob) = accel_crust_mantle(2,iglob) + sum_terms(2,i,j,k)
-          accel_crust_mantle(3,iglob) = accel_crust_mantle(3,iglob) + sum_terms(3,i,j,k)
+#endif
+
+          iglob = ibool(INDEX_IJK,ispec)
+
+          ! do NOT use array syntax ":" for the three statements below otherwise most compilers
+          ! will not be able to vectorize the outer loop
+          accel_crust_mantle(1,iglob) = accel_crust_mantle(1,iglob) + sum_terms(1,INDEX_IJK)
+          accel_crust_mantle(2,iglob) = accel_crust_mantle(2,iglob) + sum_terms(2,INDEX_IJK)
+          accel_crust_mantle(3,iglob) = accel_crust_mantle(3,iglob) + sum_terms(3,INDEX_IJK)
+
+#ifdef FORCE_VECTORIZATION
+    enddo
+#else
         enddo
       enddo
     enddo
@@ -511,7 +495,7 @@
                                            c44store,muvstore, &
                                            epsilondev_xx,epsilondev_yy,epsilondev_xy, &
                                            epsilondev_xz,epsilondev_yz, &
-                                           epsilondev_loc) !!!!!!!!!!!!!!! ,is_backward_field)
+                                           epsilondev_loc)
       endif
     endif
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_crust_mantle_noDev.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_crust_mantle_noDev.f90	2013-10-23 13:03:00 UTC (rev 22970)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_crust_mantle_noDev.f90	2013-10-23 14:27:26 UTC (rev 22971)
@@ -28,7 +28,6 @@
   subroutine compute_forces_crust_mantle(NSPEC,NGLOB,NSPEC_ATT, &
                                         deltat, &
                                         displ_crust_mantle, &
-!                                       veloc_crust_mantle, &
                                         accel_crust_mantle, &
                                         phase_is_inner, &
                                         R_xx,R_yy,R_xy,R_xz,R_yz, &
@@ -37,7 +36,7 @@
                                         epsilondev_xz,epsilondev_yz, &
                                         epsilon_trace_over_3, &
                                         alphaval,betaval,gammaval, &
-                                        factor_common,vnspec) !!!!!!!!!!!!!!!!!!!!! ,is_backward_field)
+                                        factor_common,vnspec)
 
   use constants_solver
 
@@ -78,7 +77,6 @@
 
   ! displacement, velocity and acceleration
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB) :: displ_crust_mantle
-! real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB) :: veloc_crust_mantle
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB) :: accel_crust_mantle
 
   ! variable sized array variables
@@ -102,8 +100,6 @@
   ! inner/outer element run flag
   logical :: phase_is_inner
 
-!!!!!!!!!!!!!!!  logical :: is_backward_field
-
   ! local parameters
 
   ! for attenuation
@@ -829,7 +825,7 @@
                                            c44store,muvstore, &
                                            epsilondev_xx,epsilondev_yy,epsilondev_xy, &
                                            epsilondev_xz,epsilondev_yz, &
-                                           epsilondev_loc) !!!!!!!!!!!!!!! ,is_backward_field)
+                                           epsilondev_loc)
       endif
     endif
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_inner_core_Dev.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_inner_core_Dev.F90	2013-10-23 13:03:00 UTC (rev 22970)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_inner_core_Dev.F90	2013-10-23 14:27:26 UTC (rev 22971)
@@ -25,10 +25,15 @@
 !
 !=====================================================================
 
+! we switch between vectorized and non-vectorized version by using pre-processor flag FORCE_VECTORIZATION
+! and macros INDEX_IJK, DO_LOOP_IJK, ENDDO_LOOP_IJK defined in config.fh
+#include "config.fh"
+
+
+
   subroutine compute_forces_inner_core_Dev( NSPEC,NGLOB,NSPEC_ATT, &
                                             deltat, &
                                             displ_inner_core, &
-!                                           veloc_inner_core, &
                                             accel_inner_core, &
                                             phase_is_inner, &
                                             R_xx,R_yy,R_xy,R_xz,R_yz, &
@@ -36,8 +41,8 @@
                                             epsilondev_xx,epsilondev_yy,epsilondev_xy, &
                                             epsilondev_xz,epsilondev_yz, &
                                             epsilon_trace_over_3,&
-                                            alphaval,betaval,gammaval,factor_common, &
-                                            vnspec) !!!!!!!!!!!!!!!!! ,is_backward_field)
+                                            alphaval,betaval,gammaval, &
+                                            factor_common,vnspec)
 
 ! this routine is optimized for NGLLX = NGLLY = NGLLZ = 5 using the Deville et al. (2002) inlined matrix-matrix products
 
@@ -64,11 +69,7 @@
     nspec_outer => nspec_outer_inner_core, &
     nspec_inner => nspec_inner_inner_core
 
-#ifdef FORCE_VECTORIZATION
   use specfem_par,only: wgllwgll_xy_3D,wgllwgll_xz_3D,wgllwgll_yz_3D
-#else
-  use specfem_par,only: wgllwgll_xy,wgllwgll_xz,wgllwgll_yz
-#endif
 
   implicit none
 
@@ -79,7 +80,6 @@
 
   ! displacement and acceleration
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB) :: displ_inner_core
-! real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB) :: veloc_inner_core
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB) :: accel_inner_core
 
   ! for attenuation
@@ -103,8 +103,6 @@
   ! inner/outer element run flag
   logical :: phase_is_inner
 
-!!!!!!!!!!!!!!!  logical :: is_backward_field
-
   ! local parameters
 
   ! Deville
@@ -113,6 +111,7 @@
     tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc, &
     newtempx1,newtempx2,newtempx3,newtempy1,newtempy2,newtempy3,newtempz1,newtempz2,newtempz3
+
   real(kind=CUSTOM_REAL), dimension(NGLLX,m2) :: B1_m1_m2_5points,B2_m1_m2_5points,B3_m1_m2_5points
   real(kind=CUSTOM_REAL), dimension(m1,m2) :: C1_m1_m2_5points,C2_m1_m2_5points,C3_m1_m2_5points
   real(kind=CUSTOM_REAL), dimension(m1,m2) :: E1_m1_m2_5points,E2_m1_m2_5points,E3_m1_m2_5points
@@ -127,7 +126,6 @@
   equivalence(newtempy1,E2_m1_m2_5points)
   equivalence(newtempz1,E3_m1_m2_5points)
 
-
   real(kind=CUSTOM_REAL), dimension(m2,NGLLX) :: &
     A1_mxm_m2_m1_5points,A2_mxm_m2_m1_5points,A3_mxm_m2_m1_5points
   real(kind=CUSTOM_REAL), dimension(m2,m1) :: &
@@ -157,9 +155,10 @@
 
   real(kind=CUSTOM_REAL) sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz
 
-  real(kind=CUSTOM_REAL) fac1,fac2,fac3,templ
+  real(kind=CUSTOM_REAL) fac1,fac2,fac3
   real(kind=CUSTOM_REAL) lambdal,mul,lambdalplus2mul
   real(kind=CUSTOM_REAL) kappal
+  real(kind=CUSTOM_REAL) templ
 
   real(kind=CUSTOM_REAL) minus_sum_beta
   real(kind=CUSTOM_REAL) c11l,c33l,c12l,c13l,c44l
@@ -175,15 +174,17 @@
   real(kind=CUSTOM_REAL) sigma_yx,sigma_zx,sigma_zy
 
   integer :: int_radius
-  integer :: ispec
+  integer :: ispec,iglob
   integer :: i,j,k
-  integer :: iglob
-!  integer :: computed_elements
   integer :: num_elements,ispec_p
   integer :: iphase
 
 #ifdef FORCE_VECTORIZATION
+! in this vectorized version we have to assume that N_SLS == 3 in order to be able to unroll and thus suppress
+! an inner loop that would otherwise prevent vectorization; this is safe in practice in all cases because N_SLS == 3
+! in all known applications, and in the main program we check that N_SLS == 3 if FORCE_VECTORIZATION is used and we stop
   integer :: ijk
+  real(kind=CUSTOM_REAL) :: R_xx_val,R_yy_val
 #endif
 
 ! ****************************************************
@@ -209,21 +210,18 @@
     ! exclude fictitious elements in central cube
     if(idoubling(ispec) /= IFLAG_IN_FICTITIOUS_CUBE) then
 
+      DO_LOOP_IJK
+
+        iglob = ibool(INDEX_IJK,ispec)
+        dummyx_loc(INDEX_IJK) = displ_inner_core(1,iglob)
+        dummyy_loc(INDEX_IJK) = displ_inner_core(2,iglob)
+        dummyz_loc(INDEX_IJK) = displ_inner_core(3,iglob)
+
+      ENDDO_LOOP_IJK
+
       ! subroutines adapted from Deville, Fischer and Mund, High-order methods
       ! for incompressible fluid flow, Cambridge University Press (2002),
       ! pages 386 and 389 and Figure 8.3.1
-
-      do k=1,NGLLZ
-        do j=1,NGLLY
-          do i=1,NGLLX
-            iglob = ibool(i,j,k,ispec)
-            dummyx_loc(i,j,k) = displ_inner_core(1,iglob)
-            dummyy_loc(i,j,k) = displ_inner_core(2,iglob)
-            dummyz_loc(i,j,k) = displ_inner_core(3,iglob)
-          enddo
-        enddo
-      enddo
-
       do j=1,m2
          do i=1,m1
             C1_m1_m2_5points(i,j) = hprime_xx(i,1)*B1_m1_m2_5points(1,j) + &
@@ -293,281 +291,311 @@
          enddo
       enddo
 
-      do k=1,NGLLZ
-        do j=1,NGLLY
-          do i=1,NGLLX
+      DO_LOOP_IJK
 
-            ! get derivatives of ux, uy and uz with respect to x, y and z
-            xixl = xix(i,j,k,ispec)
-            xiyl = xiy(i,j,k,ispec)
-            xizl = xiz(i,j,k,ispec)
-            etaxl = etax(i,j,k,ispec)
-            etayl = etay(i,j,k,ispec)
-            etazl = etaz(i,j,k,ispec)
-            gammaxl = gammax(i,j,k,ispec)
-            gammayl = gammay(i,j,k,ispec)
-            gammazl = gammaz(i,j,k,ispec)
+        ! get derivatives of ux, uy and uz with respect to x, y and z
+        xixl = xix(INDEX_IJK,ispec)
+        xiyl = xiy(INDEX_IJK,ispec)
+        xizl = xiz(INDEX_IJK,ispec)
+        etaxl = etax(INDEX_IJK,ispec)
+        etayl = etay(INDEX_IJK,ispec)
+        etazl = etaz(INDEX_IJK,ispec)
+        gammaxl = gammax(INDEX_IJK,ispec)
+        gammayl = gammay(INDEX_IJK,ispec)
+        gammazl = gammaz(INDEX_IJK,ispec)
 
-            ! compute the jacobian
-            jacobianl = 1._CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
-                          - xiyl*(etaxl*gammazl-etazl*gammaxl) &
-                          + xizl*(etaxl*gammayl-etayl*gammaxl))
+        ! compute the jacobian
+        jacobianl = 1._CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
+                      - xiyl*(etaxl*gammazl-etazl*gammaxl) &
+                      + xizl*(etaxl*gammayl-etayl*gammaxl))
 
-            duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
-            duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
-            duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
+        duxdxl = xixl*tempx1(INDEX_IJK) + etaxl*tempx2(INDEX_IJK) + gammaxl*tempx3(INDEX_IJK)
+        duxdyl = xiyl*tempx1(INDEX_IJK) + etayl*tempx2(INDEX_IJK) + gammayl*tempx3(INDEX_IJK)
+        duxdzl = xizl*tempx1(INDEX_IJK) + etazl*tempx2(INDEX_IJK) + gammazl*tempx3(INDEX_IJK)
 
-            duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
-            duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
-            duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
+        duydxl = xixl*tempy1(INDEX_IJK) + etaxl*tempy2(INDEX_IJK) + gammaxl*tempy3(INDEX_IJK)
+        duydyl = xiyl*tempy1(INDEX_IJK) + etayl*tempy2(INDEX_IJK) + gammayl*tempy3(INDEX_IJK)
+        duydzl = xizl*tempy1(INDEX_IJK) + etazl*tempy2(INDEX_IJK) + gammazl*tempy3(INDEX_IJK)
 
-            duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
-            duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
-            duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
+        duzdxl = xixl*tempz1(INDEX_IJK) + etaxl*tempz2(INDEX_IJK) + gammaxl*tempz3(INDEX_IJK)
+        duzdyl = xiyl*tempz1(INDEX_IJK) + etayl*tempz2(INDEX_IJK) + gammayl*tempz3(INDEX_IJK)
+        duzdzl = xizl*tempz1(INDEX_IJK) + etazl*tempz2(INDEX_IJK) + gammazl*tempz3(INDEX_IJK)
 
-            ! precompute some sums to save CPU time
-            duxdxl_plus_duydyl = duxdxl + duydyl
-            duxdxl_plus_duzdzl = duxdxl + duzdzl
-            duydyl_plus_duzdzl = duydyl + duzdzl
-            duxdyl_plus_duydxl = duxdyl + duydxl
-            duzdxl_plus_duxdzl = duzdxl + duxdzl
-            duzdyl_plus_duydzl = duzdyl + duydzl
+        ! precompute some sums to save CPU time
+        duxdxl_plus_duydyl = duxdxl + duydyl
+        duxdxl_plus_duzdzl = duxdxl + duzdzl
+        duydyl_plus_duzdzl = duydyl + duzdzl
+        duxdyl_plus_duydxl = duxdyl + duydxl
+        duzdxl_plus_duxdzl = duzdxl + duxdzl
+        duzdyl_plus_duydzl = duzdyl + duydzl
 
-            ! compute deviatoric strain
-            if (COMPUTE_AND_STORE_STRAIN) then
-              if(NSPEC_INNER_CORE_STRAIN_ONLY == 1) then
-                if( ispec == 1) then
-                  epsilon_trace_over_3(i,j,k,1) = templ
-                endif
-              else
-                epsilon_trace_over_3(i,j,k,ispec) = templ
-              endif
-              templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
-              epsilondev_loc(1,i,j,k) = duxdxl - templ
-              epsilondev_loc(2,i,j,k) = duydyl - templ
-              epsilondev_loc(3,i,j,k) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
-              epsilondev_loc(4,i,j,k) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
-              epsilondev_loc(5,i,j,k) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
+        ! compute deviatoric strain
+        if (COMPUTE_AND_STORE_STRAIN) then
+          templ = ONE_THIRD * (duxdxl + duydyl + duzdzl)
+          if(NSPEC_INNER_CORE_STRAIN_ONLY == 1) then
+            if( ispec == 1) then
+              epsilon_trace_over_3(INDEX_IJK,1) = templ
             endif
+          else
+            epsilon_trace_over_3(INDEX_IJK,ispec) = templ
+          endif
+          epsilondev_loc(1,INDEX_IJK) = duxdxl - templ
+          epsilondev_loc(2,INDEX_IJK) = duydyl - templ
+          epsilondev_loc(3,INDEX_IJK) = 0.5_CUSTOM_REAL * duxdyl_plus_duydxl
+          epsilondev_loc(4,INDEX_IJK) = 0.5_CUSTOM_REAL * duzdxl_plus_duxdzl
+          epsilondev_loc(5,INDEX_IJK) = 0.5_CUSTOM_REAL * duzdyl_plus_duydzl
+        endif
 
-            if(ANISOTROPIC_INNER_CORE_VAL) then
-              ! elastic tensor for hexagonal symmetry in reduced notation:
-              !
-              !      c11 c12 c13  0   0        0
-              !      c12 c11 c13  0   0        0
-              !      c13 c13 c33  0   0        0
-              !       0   0   0  c44  0        0
-              !       0   0   0   0  c44       0
-              !       0   0   0   0   0  (c11-c12)/2
-              !
-              !       in terms of the A, C, L, N and F of Love (1927):
-              !
-              !       c11 = A
-              !       c12 = A-2N
-              !       c13 = F
-              !       c33 = C
-              !       c44 = L
-              c11l = c11store(i,j,k,ispec)
-              c12l = c12store(i,j,k,ispec)
-              c13l = c13store(i,j,k,ispec)
-              c33l = c33store(i,j,k,ispec)
-              c44l = c44store(i,j,k,ispec)
+        if(ANISOTROPIC_INNER_CORE_VAL) then
+          ! elastic tensor for hexagonal symmetry in reduced notation:
+          !
+          !      c11 c12 c13  0   0        0
+          !      c12 c11 c13  0   0        0
+          !      c13 c13 c33  0   0        0
+          !       0   0   0  c44  0        0
+          !       0   0   0   0  c44       0
+          !       0   0   0   0   0  (c11-c12)/2
+          !
+          !       in terms of the A, C, L, N and F of Love (1927):
+          !
+          !       c11 = A
+          !       c12 = A-2N
+          !       c13 = F
+          !       c33 = C
+          !       c44 = L
+          c11l = c11store(INDEX_IJK,ispec)
+          c12l = c12store(INDEX_IJK,ispec)
+          c13l = c13store(INDEX_IJK,ispec)
+          c33l = c33store(INDEX_IJK,ispec)
+          c44l = c44store(INDEX_IJK,ispec)
 
-              ! use unrelaxed parameters if attenuation
-              if(ATTENUATION_VAL) then
-                if( ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL ) then
-                  minus_sum_beta =  one_minus_sum_beta(i,j,k,ispec) - 1.0_CUSTOM_REAL
-                else
-                  minus_sum_beta =  one_minus_sum_beta(1,1,1,ispec) - 1.0_CUSTOM_REAL
-                endif
-                mul = muvstore(i,j,k,ispec)
-                c11l = c11l + FOUR_THIRDS * minus_sum_beta * mul
-                c12l = c12l - TWO_THIRDS * minus_sum_beta * mul
-                c13l = c13l - TWO_THIRDS * minus_sum_beta * mul
-                c33l = c33l + FOUR_THIRDS * minus_sum_beta * mul
-                c44l = c44l + minus_sum_beta * mul
-              endif
+          ! use unrelaxed parameters if attenuation
+          if(ATTENUATION_VAL) then
+            if( ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL ) then
+              minus_sum_beta =  one_minus_sum_beta(INDEX_IJK,ispec) - 1.0_CUSTOM_REAL
+            else
+              minus_sum_beta =  one_minus_sum_beta(1,1,1,ispec) - 1.0_CUSTOM_REAL
+            endif
+            mul = muvstore(INDEX_IJK,ispec)
+            c11l = c11l + FOUR_THIRDS * minus_sum_beta * mul
+            c12l = c12l - TWO_THIRDS * minus_sum_beta * mul
+            c13l = c13l - TWO_THIRDS * minus_sum_beta * mul
+            c33l = c33l + FOUR_THIRDS * minus_sum_beta * mul
+            c44l = c44l + minus_sum_beta * mul
+          endif
 
-              sigma_xx = c11l*duxdxl + c12l*duydyl + c13l*duzdzl
-              sigma_yy = c12l*duxdxl + c11l*duydyl + c13l*duzdzl
-              sigma_zz = c13l*duxdxl + c13l*duydyl + c33l*duzdzl
-              sigma_xy = 0.5_CUSTOM_REAL*(c11l-c12l)*duxdyl_plus_duydxl
-              sigma_xz = c44l*duzdxl_plus_duxdzl
-              sigma_yz = c44l*duzdyl_plus_duydzl
+          sigma_xx = c11l*duxdxl + c12l*duydyl + c13l*duzdzl
+          sigma_yy = c12l*duxdxl + c11l*duydyl + c13l*duzdzl
+          sigma_zz = c13l*duxdxl + c13l*duydyl + c33l*duzdzl
+          sigma_xy = 0.5_CUSTOM_REAL*(c11l-c12l)*duxdyl_plus_duydxl
+          sigma_xz = c44l*duzdxl_plus_duxdzl
+          sigma_yz = c44l*duzdyl_plus_duydzl
+        else
+
+          ! inner core with no anisotropy, use kappav and muv for instance
+          ! layer with no anisotropy, use kappav and muv for instance
+          kappal = kappavstore(INDEX_IJK,ispec)
+          mul = muvstore(INDEX_IJK,ispec)
+
+          ! use unrelaxed parameters if attenuation
+          if(ATTENUATION_VAL ) then
+            if( ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL ) then
+              mul = mul * one_minus_sum_beta(INDEX_IJK,ispec)
             else
+              mul = mul * one_minus_sum_beta(1,1,1,ispec)
+            endif
+          endif
 
-              ! inner core with no anisotropy, use kappav and muv for instance
-              ! layer with no anisotropy, use kappav and muv for instance
-              kappal = kappavstore(i,j,k,ispec)
-              mul = muvstore(i,j,k,ispec)
+          lambdalplus2mul = kappal + FOUR_THIRDS * mul
+          lambdal = lambdalplus2mul - 2._CUSTOM_REAL*mul
 
-              ! use unrelaxed parameters if attenuation
-              if(ATTENUATION_VAL ) then
-                if( ATTENUATION_3D_VAL .or. ATTENUATION_1D_WITH_3D_STORAGE_VAL ) then
-                  mul = mul * one_minus_sum_beta(i,j,k,ispec)
-                else
-                  mul = mul * one_minus_sum_beta(1,1,1,ispec)
-                endif
-              endif
+          ! compute stress sigma
+          sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
+          sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
+          sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
 
-              lambdalplus2mul = kappal + FOUR_THIRDS * mul
-              lambdal = lambdalplus2mul - 2._CUSTOM_REAL*mul
+          sigma_xy = mul*duxdyl_plus_duydxl
+          sigma_xz = mul*duzdxl_plus_duxdzl
+          sigma_yz = mul*duzdyl_plus_duydzl
 
-              ! compute stress sigma
-              sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
-              sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
-              sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
+        endif
 
-              sigma_xy = mul*duxdyl_plus_duydxl
-              sigma_xz = mul*duzdxl_plus_duxdzl
-              sigma_yz = mul*duzdyl_plus_duydzl
+        ! subtract memory variables if attenuation
+        if( ATTENUATION_VAL .and. .not. PARTIAL_PHYS_DISPERSION_ONLY_VAL ) then
+#ifdef FORCE_VECTORIZATION
+          ! do NOT put this is a subroutine, otherwise the call to the subroutine prevents compilers
+          !from vectorizing the outer loop
 
-            endif
+          ! here we assume that N_SLS == 3 in order to be able to unroll and suppress the loop
+          ! in order to vectorize the outer loop
+          R_xx_val = R_xx(1,INDEX_IJK,ispec)
+          R_yy_val = R_yy(1,INDEX_IJK,ispec)
+          sigma_xx = sigma_xx - R_xx_val
+          sigma_yy = sigma_yy - R_yy_val
+          sigma_zz = sigma_zz + R_xx_val + R_yy_val
+          sigma_xy = sigma_xy - R_xy(1,INDEX_IJK,ispec)
+          sigma_xz = sigma_xz - R_xz(1,INDEX_IJK,ispec)
+          sigma_yz = sigma_yz - R_yz(1,INDEX_IJK,ispec)
 
-            ! subtract memory variables if attenuation
-            if( ATTENUATION_VAL .and. .not. PARTIAL_PHYS_DISPERSION_ONLY_VAL ) then
-              ! note: Fortran passes pointers to array location, thus R_memory(1,1,...) is fine
-              call compute_element_att_stress(R_xx(1,i,j,k,ispec), &
-                                              R_yy(1,i,j,k,ispec), &
-                                              R_xy(1,i,j,k,ispec), &
-                                              R_xz(1,i,j,k,ispec), &
-                                              R_yz(1,i,j,k,ispec), &
-                                              sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz)
-            endif
+          R_xx_val = R_xx(2,INDEX_IJK,ispec)
+          R_yy_val = R_yy(2,INDEX_IJK,ispec)
+          sigma_xx = sigma_xx - R_xx_val
+          sigma_yy = sigma_yy - R_yy_val
+          sigma_zz = sigma_zz + R_xx_val + R_yy_val
+          sigma_xy = sigma_xy - R_xy(2,INDEX_IJK,ispec)
+          sigma_xz = sigma_xz - R_xz(2,INDEX_IJK,ispec)
+          sigma_yz = sigma_yz - R_yz(2,INDEX_IJK,ispec)
 
-            ! define symmetric components of sigma for gravity
-            sigma_yx = sigma_xy
-            sigma_zx = sigma_xz
-            sigma_zy = sigma_yz
+          R_xx_val = R_xx(3,INDEX_IJK,ispec)
+          R_yy_val = R_yy(3,INDEX_IJK,ispec)
+          sigma_xx = sigma_xx - R_xx_val
+          sigma_yy = sigma_yy - R_yy_val
+          sigma_zz = sigma_zz + R_xx_val + R_yy_val
+          sigma_xy = sigma_xy - R_xy(3,INDEX_IJK,ispec)
+          sigma_xz = sigma_xz - R_xz(3,INDEX_IJK,ispec)
+          sigma_yz = sigma_yz - R_yz(3,INDEX_IJK,ispec)
+#else
+          ! note: Fortran passes pointers to array location, thus R_memory(1,1,...) is fine
+          call compute_element_att_stress(R_xx(1,INDEX_IJK,ispec), &
+                                          R_yy(1,INDEX_IJK,ispec), &
+                                          R_xy(1,INDEX_IJK,ispec), &
+                                          R_xz(1,INDEX_IJK,ispec), &
+                                          R_yz(1,INDEX_IJK,ispec), &
+                                          sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz)
+#endif
+        endif
 
-            ! compute non-symmetric terms for gravity
-            if(GRAVITY_VAL) then
+        ! define symmetric components of sigma for gravity
+        sigma_yx = sigma_xy
+        sigma_zx = sigma_xz
+        sigma_zy = sigma_yz
 
-              ! use mesh coordinates to get theta and phi
-              ! x y and z contain r theta and phi
-              iglob = ibool(i,j,k,ispec)
-              radius = dble(xstore(iglob))
-              theta = dble(ystore(iglob))
-              phi = dble(zstore(iglob))
+        ! compute non-symmetric terms for gravity
+        if(GRAVITY_VAL) then
 
-              ! make sure radius is never zero even for points at center of cube
-              ! because we later divide by radius
-              if(radius < 100.d0 / R_EARTH) radius = 100.d0 / R_EARTH
+          ! use mesh coordinates to get theta and phi
+          ! x y and z contain r theta and phi
+          iglob = ibool(INDEX_IJK,ispec)
+          radius = dble(xstore(iglob))
+          theta = dble(ystore(iglob))
+          phi = dble(zstore(iglob))
 
-              cos_theta = dcos(theta)
-              sin_theta = dsin(theta)
-              cos_phi = dcos(phi)
-              sin_phi = dsin(phi)
+          ! make sure radius is never zero even for points at center of cube
+          ! because we later divide by radius
+          if(radius < 100.d0 / R_EARTH) radius = 100.d0 / R_EARTH
 
-              cos_theta_sq = cos_theta**2
-              sin_theta_sq = sin_theta**2
-              cos_phi_sq = cos_phi**2
-              sin_phi_sq = sin_phi**2
+          cos_theta = dcos(theta)
+          sin_theta = dsin(theta)
+          cos_phi = dcos(phi)
+          sin_phi = dsin(phi)
 
-              ! get g, rho and dg/dr=dg
-              ! spherical components of the gravitational acceleration
-              ! for efficiency replace with lookup table every 100 m in radial direction
-              ! make sure we never use zero for point exactly at the center of the Earth
-              int_radius = max(1,nint(radius * R_EARTH_KM * 10.d0))
-              minus_g = minus_gravity_table(int_radius)
-              minus_dg = minus_deriv_gravity_table(int_radius)
-              rho = density_table(int_radius)
+          cos_theta_sq = cos_theta**2
+          sin_theta_sq = sin_theta**2
+          cos_phi_sq = cos_phi**2
+          sin_phi_sq = sin_phi**2
 
-              ! Cartesian components of the gravitational acceleration
-              gxl = minus_g*sin_theta*cos_phi
-              gyl = minus_g*sin_theta*sin_phi
-              gzl = minus_g*cos_theta
+          ! get g, rho and dg/dr=dg
+          ! spherical components of the gravitational acceleration
+          ! for efficiency replace with lookup table every 100 m in radial direction
+          ! make sure we never use zero for point exactly at the center of the Earth
+          int_radius = max(1,nint(radius * R_EARTH_KM * 10.d0))
+          minus_g = minus_gravity_table(int_radius)
+          minus_dg = minus_deriv_gravity_table(int_radius)
+          rho = density_table(int_radius)
 
-              ! Cartesian components of gradient of gravitational acceleration
-              ! obtained from spherical components
-              minus_g_over_radius = minus_g / radius
-              minus_dg_plus_g_over_radius = minus_dg - minus_g_over_radius
+          ! Cartesian components of the gravitational acceleration
+          gxl = minus_g*sin_theta*cos_phi
+          gyl = minus_g*sin_theta*sin_phi
+          gzl = minus_g*cos_theta
 
-              Hxxl = minus_g_over_radius*(cos_phi_sq*cos_theta_sq + sin_phi_sq) + cos_phi_sq*minus_dg*sin_theta_sq
-              Hyyl = minus_g_over_radius*(cos_phi_sq + cos_theta_sq*sin_phi_sq) + minus_dg*sin_phi_sq*sin_theta_sq
-              Hzzl = cos_theta_sq*minus_dg + minus_g_over_radius*sin_theta_sq
-              Hxyl = cos_phi*minus_dg_plus_g_over_radius*sin_phi*sin_theta_sq
-              Hxzl = cos_phi*cos_theta*minus_dg_plus_g_over_radius*sin_theta
-              Hyzl = cos_theta*minus_dg_plus_g_over_radius*sin_phi*sin_theta
+          ! Cartesian components of gradient of gravitational acceleration
+          ! obtained from spherical components
+          minus_g_over_radius = minus_g / radius
+          minus_dg_plus_g_over_radius = minus_dg - minus_g_over_radius
 
-              ! for locality principle, we set iglob again, in order to have it in the cache again
-              iglob = ibool(i,j,k,ispec)
+          Hxxl = minus_g_over_radius*(cos_phi_sq*cos_theta_sq + sin_phi_sq) + cos_phi_sq*minus_dg*sin_theta_sq
+          Hyyl = minus_g_over_radius*(cos_phi_sq + cos_theta_sq*sin_phi_sq) + minus_dg*sin_phi_sq*sin_theta_sq
+          Hzzl = cos_theta_sq*minus_dg + minus_g_over_radius*sin_theta_sq
+          Hxyl = cos_phi*minus_dg_plus_g_over_radius*sin_phi*sin_theta_sq
+          Hxzl = cos_phi*cos_theta*minus_dg_plus_g_over_radius*sin_theta
+          Hyzl = cos_theta*minus_dg_plus_g_over_radius*sin_phi*sin_theta
 
-              ! distinguish between single and double precision for reals
-              if(CUSTOM_REAL == SIZE_REAL) then
-                ! get displacement and multiply by density to compute G tensor
-                sx_l = rho * dble(displ_inner_core(1,iglob))
-                sy_l = rho * dble(displ_inner_core(2,iglob))
-                sz_l = rho * dble(displ_inner_core(3,iglob))
+          ! for locality principle, we set iglob again, in order to have it in the cache again
+          iglob = ibool(INDEX_IJK,ispec)
 
-                ! compute G tensor from s . g and add to sigma (not symmetric)
-                sigma_xx = sigma_xx + sngl(sy_l*gyl + sz_l*gzl)
-                sigma_yy = sigma_yy + sngl(sx_l*gxl + sz_l*gzl)
-                sigma_zz = sigma_zz + sngl(sx_l*gxl + sy_l*gyl)
+          ! distinguish between single and double precision for reals
+          if(CUSTOM_REAL == SIZE_REAL) then
+            ! get displacement and multiply by density to compute G tensor
+            sx_l = rho * dble(displ_inner_core(1,iglob))
+            sy_l = rho * dble(displ_inner_core(2,iglob))
+            sz_l = rho * dble(displ_inner_core(3,iglob))
 
-                sigma_xy = sigma_xy - sngl(sx_l * gyl)
-                sigma_yx = sigma_yx - sngl(sy_l * gxl)
+            ! compute G tensor from s . g and add to sigma (not symmetric)
+            sigma_xx = sigma_xx + sngl(sy_l*gyl + sz_l*gzl)
+            sigma_yy = sigma_yy + sngl(sx_l*gxl + sz_l*gzl)
+            sigma_zz = sigma_zz + sngl(sx_l*gxl + sy_l*gyl)
 
-                sigma_xz = sigma_xz - sngl(sx_l * gzl)
-                sigma_zx = sigma_zx - sngl(sz_l * gxl)
+            sigma_xy = sigma_xy - sngl(sx_l * gyl)
+            sigma_yx = sigma_yx - sngl(sy_l * gxl)
 
-                sigma_yz = sigma_yz - sngl(sy_l * gzl)
-                sigma_zy = sigma_zy - sngl(sz_l * gyl)
+            sigma_xz = sigma_xz - sngl(sx_l * gzl)
+            sigma_zx = sigma_zx - sngl(sz_l * gxl)
 
-                ! precompute vector
-                factor = dble(jacobianl) * wgll_cube(i,j,k)
-                rho_s_H(1,i,j,k) = sngl(factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl))
-                rho_s_H(2,i,j,k) = sngl(factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl))
-                rho_s_H(3,i,j,k) = sngl(factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl))
+            sigma_yz = sigma_yz - sngl(sy_l * gzl)
+            sigma_zy = sigma_zy - sngl(sz_l * gyl)
 
-              else
+            ! precompute vector
+            factor = dble(jacobianl) * wgll_cube(INDEX_IJK)
+            rho_s_H(1,INDEX_IJK) = sngl(factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl))
+            rho_s_H(2,INDEX_IJK) = sngl(factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl))
+            rho_s_H(3,INDEX_IJK) = sngl(factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl))
 
-                ! get displacement and multiply by density to compute G tensor
-                sx_l = rho * displ_inner_core(1,iglob)
-                sy_l = rho * displ_inner_core(2,iglob)
-                sz_l = rho * displ_inner_core(3,iglob)
+          else
 
-                ! compute G tensor from s . g and add to sigma (not symmetric)
-                sigma_xx = sigma_xx + sy_l*gyl + sz_l*gzl
-                sigma_yy = sigma_yy + sx_l*gxl + sz_l*gzl
-                sigma_zz = sigma_zz + sx_l*gxl + sy_l*gyl
+            ! get displacement and multiply by density to compute G tensor
+            sx_l = rho * displ_inner_core(1,iglob)
+            sy_l = rho * displ_inner_core(2,iglob)
+            sz_l = rho * displ_inner_core(3,iglob)
 
-                sigma_xy = sigma_xy - sx_l * gyl
-                sigma_yx = sigma_yx - sy_l * gxl
+            ! compute G tensor from s . g and add to sigma (not symmetric)
+            sigma_xx = sigma_xx + sy_l*gyl + sz_l*gzl
+            sigma_yy = sigma_yy + sx_l*gxl + sz_l*gzl
+            sigma_zz = sigma_zz + sx_l*gxl + sy_l*gyl
 
-                sigma_xz = sigma_xz - sx_l * gzl
-                sigma_zx = sigma_zx - sz_l * gxl
+            sigma_xy = sigma_xy - sx_l * gyl
+            sigma_yx = sigma_yx - sy_l * gxl
 
-                sigma_yz = sigma_yz - sy_l * gzl
-                sigma_zy = sigma_zy - sz_l * gyl
+            sigma_xz = sigma_xz - sx_l * gzl
+            sigma_zx = sigma_zx - sz_l * gxl
 
-                ! precompute vector
-                factor = jacobianl * wgll_cube(i,j,k)
-                rho_s_H(1,i,j,k) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
-                rho_s_H(2,i,j,k) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
-                rho_s_H(3,i,j,k) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
+            sigma_yz = sigma_yz - sy_l * gzl
+            sigma_zy = sigma_zy - sz_l * gyl
 
-              endif
+            ! precompute vector
+            factor = jacobianl * wgll_cube(INDEX_IJK)
+            rho_s_H(1,INDEX_IJK) = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl)
+            rho_s_H(2,INDEX_IJK) = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl)
+            rho_s_H(3,INDEX_IJK) = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl)
 
-            endif  ! end of section with gravity terms
+          endif
 
-            ! form dot product with test vector, non-symmetric form
-            tempx1(i,j,k) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
-            tempy1(i,j,k) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
-            tempz1(i,j,k) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
+        endif  ! end of section with gravity terms
 
-            tempx2(i,j,k) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
-            tempy2(i,j,k) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
-            tempz2(i,j,k) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
+        ! form dot product with test vector, non-symmetric form
+        tempx1(INDEX_IJK) = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl) ! this goes to accel_x
+        tempy1(INDEX_IJK) = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl) ! this goes to accel_y
+        tempz1(INDEX_IJK) = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl) ! this goes to accel_z
 
-            tempx3(i,j,k) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
-            tempy3(i,j,k) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
-            tempz3(i,j,k) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
+        tempx2(INDEX_IJK) = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl) ! this goes to accel_x
+        tempy2(INDEX_IJK) = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl) ! this goes to accel_y
+        tempz2(INDEX_IJK) = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl) ! this goes to accel_z
 
-          enddo
-        enddo
-      enddo
+        tempx3(INDEX_IJK) = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl) ! this goes to accel_x
+        tempy3(INDEX_IJK) = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl) ! this goes to accel_y
+        tempz3(INDEX_IJK) = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl) ! this goes to accel_z
 
+      ENDDO_LOOP_IJK
+
       ! subroutines adapted from Deville, Fischer and Mund, High-order methods
       ! for incompressible fluid flow, Cambridge University Press (2002),
       ! pages 386 and 389 and Figure 8.3.1
@@ -640,37 +668,41 @@
         enddo
       enddo
 
+      ! sums contributions
+
+      DO_LOOP_IJK
+
+        fac1 = wgllwgll_yz_3D(INDEX_IJK)
+        fac2 = wgllwgll_xz_3D(INDEX_IJK)
+        fac3 = wgllwgll_xy_3D(INDEX_IJK)
+        sum_terms(1,INDEX_IJK) = - (fac1*newtempx1(INDEX_IJK) + fac2*newtempx2(INDEX_IJK) + fac3*newtempx3(INDEX_IJK))
+        sum_terms(2,INDEX_IJK) = - (fac1*newtempy1(INDEX_IJK) + fac2*newtempy2(INDEX_IJK) + fac3*newtempy3(INDEX_IJK))
+        sum_terms(3,INDEX_IJK) = - (fac1*newtempz1(INDEX_IJK) + fac2*newtempz2(INDEX_IJK) + fac3*newtempz3(INDEX_IJK))
+
+      ENDDO_LOOP_IJK
+
+      ! adds gravity
+      if(GRAVITY_VAL) then
+
 #ifdef FORCE_VECTORIZATION
-      do ijk=1,NGLLCUBE
-        fac1 = wgllwgll_yz_3D(ijk,1,1)
-        fac2 = wgllwgll_xz_3D(ijk,1,1)
-        fac3 = wgllwgll_xy_3D(ijk,1,1)
-        sum_terms(1,ijk,1,1) = - (fac1*newtempx1(ijk,1,1) + fac2*newtempx2(ijk,1,1) + fac3*newtempx3(ijk,1,1))
-        sum_terms(2,ijk,1,1) = - (fac1*newtempy1(ijk,1,1) + fac2*newtempy2(ijk,1,1) + fac3*newtempy3(ijk,1,1))
-        sum_terms(3,ijk,1,1) = - (fac1*newtempz1(ijk,1,1) + fac2*newtempz2(ijk,1,1) + fac3*newtempz3(ijk,1,1))
-      enddo
-      if(GRAVITY_VAL) then
         do ijk = 1,NDIM*NGLLCUBE
           sum_terms(ijk,1,1,1) = sum_terms(ijk,1,1,1) + rho_s_H(ijk,1,1,1)
         enddo
-      endif
 #else
-      do k=1,NGLLZ
-        do j=1,NGLLY
-          fac1 = wgllwgll_yz(j,k)
-          do i=1,NGLLX
-            fac2 = wgllwgll_xz(i,k)
-            fac3 = wgllwgll_xy(i,j)
-            ! sum contributions
-            sum_terms(1,i,j,k) = - (fac1*newtempx1(i,j,k) + fac2*newtempx2(i,j,k) + fac3*newtempx3(i,j,k))
-            sum_terms(2,i,j,k) = - (fac1*newtempy1(i,j,k) + fac2*newtempy2(i,j,k) + fac3*newtempy3(i,j,k))
-            sum_terms(3,i,j,k) = - (fac1*newtempz1(i,j,k) + fac2*newtempz2(i,j,k) + fac3*newtempz3(i,j,k))
-            if(GRAVITY_VAL) sum_terms(:,i,j,k) = sum_terms(:,i,j,k) + rho_s_H(:,i,j,k)
+        do k=1,NGLLZ
+          do j=1,NGLLY
+            do i=1,NGLLX
+              sum_terms(1,INDEX_IJK) = sum_terms(1,INDEX_IJK) + rho_s_H(1,INDEX_IJK)
+              sum_terms(2,INDEX_IJK) = sum_terms(2,INDEX_IJK) + rho_s_H(2,INDEX_IJK)
+              sum_terms(3,INDEX_IJK) = sum_terms(3,INDEX_IJK) + rho_s_H(3,INDEX_IJK)
+            enddo
           enddo
         enddo
-      enddo
 #endif
 
+      endif
+
+
       ! sum contributions from each element to the global mesh and add gravity terms
 #ifdef FORCE_VECTORIZATION
 ! we can force vectorization using a compiler directive here because we know that there is no dependency
@@ -681,23 +713,22 @@
 !pgi$ ivdep
 !DIR$ IVDEP
       do ijk = 1,NGLLCUBE
-        iglob = ibool(ijk,1,1,ispec)
-        ! do NOT use array syntax ":" for the three statements below
-        ! otherwise most compilers will not be able to vectorize the outer loop
-        accel_inner_core(1,iglob) = accel_inner_core(1,iglob) + sum_terms(1,ijk,1,1)
-        accel_inner_core(2,iglob) = accel_inner_core(2,iglob) + sum_terms(2,ijk,1,1)
-        accel_inner_core(3,iglob) = accel_inner_core(3,iglob) + sum_terms(3,ijk,1,1)
-      enddo
 #else
       do k=1,NGLLZ
         do j=1,NGLLY
           do i=1,NGLLX
-            iglob = ibool(i,j,k,ispec)
+#endif
+
+            iglob = ibool(INDEX_IJK,ispec)
             ! do NOT use array syntax ":" for the three statements below
             ! otherwise most compilers will not be able to vectorize the outer loop
-            accel_inner_core(1,iglob) = accel_inner_core(1,iglob) + sum_terms(1,i,j,k)
-            accel_inner_core(2,iglob) = accel_inner_core(2,iglob) + sum_terms(2,i,j,k)
-            accel_inner_core(3,iglob) = accel_inner_core(3,iglob) + sum_terms(3,i,j,k)
+            accel_inner_core(1,iglob) = accel_inner_core(1,iglob) + sum_terms(1,INDEX_IJK)
+            accel_inner_core(2,iglob) = accel_inner_core(2,iglob) + sum_terms(2,INDEX_IJK)
+            accel_inner_core(3,iglob) = accel_inner_core(3,iglob) + sum_terms(3,INDEX_IJK)
+
+#ifdef FORCE_VECTORIZATION
+      enddo
+#else
           enddo
         enddo
       enddo
@@ -735,7 +766,7 @@
                                              muvstore, &
                                              epsilondev_xx,epsilondev_yy,epsilondev_xy, &
                                              epsilondev_xz,epsilondev_yz, &
-                                             epsilondev_loc) !!!!!!!!!!!! ,is_backward_field)
+                                             epsilondev_loc)
         endif
       endif
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_inner_core_noDev.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_inner_core_noDev.f90	2013-10-23 13:03:00 UTC (rev 22970)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_inner_core_noDev.f90	2013-10-23 14:27:26 UTC (rev 22971)
@@ -28,7 +28,6 @@
   subroutine compute_forces_inner_core( NSPEC,NGLOB,NSPEC_ATT, &
                                         deltat, &
                                         displ_inner_core, &
-!                                       veloc_inner_core, &
                                         accel_inner_core, &
                                         phase_is_inner, &
                                         R_xx,R_yy,R_xy,R_xz,R_yz, &
@@ -37,7 +36,7 @@
                                         epsilondev_xz,epsilondev_yz, &
                                         epsilon_trace_over_3,&
                                         alphaval,betaval,gammaval,factor_common, &
-                                        vnspec) !!!!!!!!!!!!!!!!!!!  ,is_backward_field)
+                                        vnspec)
 
   use constants_solver
 
@@ -71,7 +70,6 @@
 
   ! displacement, velocity and acceleration
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB) :: displ_inner_core
-! real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB) :: veloc_inner_core
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB) :: accel_inner_core
 
   ! for attenuation
@@ -95,8 +93,6 @@
   ! inner/outer element run flag
   logical :: phase_is_inner
 
-!!!!!!!!!!!!!!!!  logical :: is_backward_field
-
   ! local parameters
   real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
 
@@ -576,7 +572,7 @@
                                              muvstore, &
                                              epsilondev_xx,epsilondev_yy,epsilondev_xy, &
                                              epsilondev_xz,epsilondev_yz, &
-                                             epsilondev_loc) !!!!!!!!!!!! ,is_backward_field)
+                                             epsilondev_loc)
         endif
       endif
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_outer_core_Dev.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_outer_core_Dev.F90	2013-10-23 13:03:00 UTC (rev 22970)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_outer_core_Dev.F90	2013-10-23 14:27:26 UTC (rev 22971)
@@ -25,6 +25,12 @@
 !
 !=====================================================================
 
+! we switch between vectorized and non-vectorized version by using pre-processor flag FORCE_VECTORIZATION
+! and macros INDEX_IJK, DO_LOOP_IJK, ENDDO_LOOP_IJK defined in config.fh
+#include "config.fh"
+
+
+
   subroutine compute_forces_outer_core_Dev(time,deltat,two_omega_earth, &
                                            NSPEC,NGLOB, &
                                            A_array_rotation,B_array_rotation, &
@@ -53,11 +59,7 @@
     nspec_outer => nspec_outer_outer_core, &
     nspec_inner => nspec_inner_outer_core
 
-#ifdef FORCE_VECTORIZATION
   use specfem_par,only: wgllwgll_xy_3D,wgllwgll_xz_3D,wgllwgll_yz_3D
-#else
-  use specfem_par,only: wgllwgll_xy,wgllwgll_xz,wgllwgll_yz
-#endif
 
   implicit none
 
@@ -81,9 +83,10 @@
   logical :: phase_is_inner
 
   ! local parameters
-
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: tempx1,tempx2,tempx3
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: newtempx1,newtempx2,newtempx3
+  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: sum_terms
+
   ! for gravity
   integer :: int_radius
   double precision :: radius,theta,phi,gxl,gyl,gzl
@@ -128,9 +131,6 @@
 
 #ifdef FORCE_VECTORIZATION
   integer :: ijk
-  real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: sum_terms
-#else
-  real(kind=CUSTOM_REAL) :: sum_terms
 #endif
 
 ! ****************************************************
@@ -155,13 +155,14 @@
     ispec = phase_ispec_inner(ispec_p,iphase)
 
     ! only compute element which belong to current phase (inner or outer elements)
-#ifdef FORCE_VECTORIZATION
-    do ijk=1,NGLLCUBE
-      iglob = ibool(ijk,1,1,ispec)
 
-      ! get a local copy of the potential field
-      dummyx_loc(ijk,1,1) = displfluid(iglob)
+    DO_LOOP_IJK
 
+      iglob = ibool(INDEX_IJK,ispec)
+
+      ! stores "displacement"
+      dummyx_loc(INDEX_IJK) = displfluid(iglob)
+
       ! pre-computes factors
       ! use mesh coordinates to get theta and phi
       ! x y z contain r theta phi
@@ -178,64 +179,22 @@
 
       if( .not. GRAVITY_VAL ) then
         ! grad(rho)/rho in Cartesian components
-        displ_times_grad_x_ln_rho(ijk,1,1) = dummyx_loc(ijk,1,1) &
-              * sin_theta * cos_phi * d_ln_density_dr_table(int_radius)
-        displ_times_grad_y_ln_rho(ijk,1,1) = dummyx_loc(ijk,1,1) &
-              * sin_theta * sin_phi * d_ln_density_dr_table(int_radius)
-        displ_times_grad_z_ln_rho(ijk,1,1) = dummyx_loc(ijk,1,1) &
-              * cos_theta * d_ln_density_dr_table(int_radius)
+        displ_times_grad_x_ln_rho(INDEX_IJK) = dummyx_loc(INDEX_IJK) &
+              * sngl(sin_theta * cos_phi * d_ln_density_dr_table(int_radius))
+        displ_times_grad_y_ln_rho(INDEX_IJK) = dummyx_loc(INDEX_IJK) &
+              * sngl(sin_theta * sin_phi * d_ln_density_dr_table(int_radius))
+        displ_times_grad_z_ln_rho(INDEX_IJK) = dummyx_loc(INDEX_IJK) &
+              * sngl(cos_theta * d_ln_density_dr_table(int_radius))
       else
         ! Cartesian components of the gravitational acceleration
         ! integrate and multiply by rho / Kappa
-        temp_gxl(ijk,1,1) = sin_theta*cos_phi
-        temp_gyl(ijk,1,1) = sin_theta*sin_phi
-        temp_gzl(ijk,1,1) = cos_theta
+        temp_gxl(INDEX_IJK) = sin_theta*cos_phi
+        temp_gyl(INDEX_IJK) = sin_theta*sin_phi
+        temp_gzl(INDEX_IJK) = cos_theta
       endif
-    enddo
-#else
-    do k=1,NGLLZ
-      do j=1,NGLLY
-        do i=1,NGLLX
-          iglob = ibool(i,j,k,ispec)
 
-          ! stores "displacement"
-          dummyx_loc(i,j,k) = displfluid(iglob)
+    ENDDO_LOOP_IJK
 
-          ! pre-computes factors
-          ! use mesh coordinates to get theta and phi
-          ! x y z contain r theta phi
-          radius = dble(xstore(iglob))
-          theta = dble(ystore(iglob))
-          phi = dble(zstore(iglob))
-
-          cos_theta = dcos(theta)
-          sin_theta = dsin(theta)
-          cos_phi = dcos(phi)
-          sin_phi = dsin(phi)
-
-          int_radius = nint(radius * R_EARTH_KM * 10.d0)
-
-          if( .not. GRAVITY_VAL ) then
-            ! grad(rho)/rho in Cartesian components
-            displ_times_grad_x_ln_rho(i,j,k) = dummyx_loc(i,j,k) &
-                  * sngl(sin_theta * cos_phi * d_ln_density_dr_table(int_radius))
-            displ_times_grad_y_ln_rho(i,j,k) = dummyx_loc(i,j,k) &
-                  * sngl(sin_theta * sin_phi * d_ln_density_dr_table(int_radius))
-            displ_times_grad_z_ln_rho(i,j,k) = dummyx_loc(i,j,k) &
-                  * sngl(cos_theta * d_ln_density_dr_table(int_radius))
-          else
-            ! Cartesian components of the gravitational acceleration
-            ! integrate and multiply by rho / Kappa
-            temp_gxl(i,j,k) = sin_theta*cos_phi
-            temp_gyl(i,j,k) = sin_theta*sin_phi
-            temp_gzl(i,j,k) = cos_theta
-          endif
-
-        enddo
-      enddo
-    enddo
-#endif
-
     ! subroutines adapted from Deville, Fischer and Mund, High-order methods
     ! for incompressible fluid flow, Cambridge University Press (2002),
     ! pages 386 and 389 and Figure 8.3.1
@@ -271,151 +230,146 @@
       enddo
     enddo
 
+    DO_LOOP_IJK
 
-    do k=1,NGLLZ
-      do j=1,NGLLY
-        do i=1,NGLLX
+      ! get derivatives of velocity potential with respect to x, y and z
+      xixl = xix(INDEX_IJK,ispec)
+      xiyl = xiy(INDEX_IJK,ispec)
+      xizl = xiz(INDEX_IJK,ispec)
+      etaxl = etax(INDEX_IJK,ispec)
+      etayl = etay(INDEX_IJK,ispec)
+      etazl = etaz(INDEX_IJK,ispec)
+      gammaxl = gammax(INDEX_IJK,ispec)
+      gammayl = gammay(INDEX_IJK,ispec)
+      gammazl = gammaz(INDEX_IJK,ispec)
 
-          ! get derivatives of velocity potential with respect to x, y and z
-          xixl = xix(i,j,k,ispec)
-          xiyl = xiy(i,j,k,ispec)
-          xizl = xiz(i,j,k,ispec)
-          etaxl = etax(i,j,k,ispec)
-          etayl = etay(i,j,k,ispec)
-          etazl = etaz(i,j,k,ispec)
-          gammaxl = gammax(i,j,k,ispec)
-          gammayl = gammay(i,j,k,ispec)
-          gammazl = gammaz(i,j,k,ispec)
+      ! compute the jacobian
+      jacobianl = 1._CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
+                    - xiyl*(etaxl*gammazl-etazl*gammaxl) &
+                    + xizl*(etaxl*gammayl-etayl*gammaxl))
 
-          ! compute the jacobian
-          jacobianl = 1._CUSTOM_REAL / (xixl*(etayl*gammazl-etazl*gammayl) &
-                        - xiyl*(etaxl*gammazl-etazl*gammaxl) &
-                        + xizl*(etaxl*gammayl-etayl*gammaxl))
+      dpotentialdxl = xixl*tempx1(INDEX_IJK) + etaxl*tempx2(INDEX_IJK) + gammaxl*tempx3(INDEX_IJK)
+      dpotentialdyl = xiyl*tempx1(INDEX_IJK) + etayl*tempx2(INDEX_IJK) + gammayl*tempx3(INDEX_IJK)
+      dpotentialdzl = xizl*tempx1(INDEX_IJK) + etazl*tempx2(INDEX_IJK) + gammazl*tempx3(INDEX_IJK)
 
-          dpotentialdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
-          dpotentialdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
-          dpotentialdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
+      ! compute contribution of rotation and add to gradient of potential
+      ! this term has no Z component
+      if(ROTATION_VAL) then
 
-          ! compute contribution of rotation and add to gradient of potential
-          ! this term has no Z component
-          if(ROTATION_VAL) then
+        ! store the source for the Euler scheme for A_rotation and B_rotation
+        two_omega_deltat = deltat * two_omega_earth
 
-            ! store the source for the Euler scheme for A_rotation and B_rotation
-            two_omega_deltat = deltat * two_omega_earth
+        cos_two_omega_t = cos(two_omega_earth*time)
+        sin_two_omega_t = sin(two_omega_earth*time)
 
-            cos_two_omega_t = cos(two_omega_earth*time)
-            sin_two_omega_t = sin(two_omega_earth*time)
+        ! time step deltat of Euler scheme is included in the source
+        source_euler_A(INDEX_IJK) = two_omega_deltat &
+              * (cos_two_omega_t * dpotentialdyl + sin_two_omega_t * dpotentialdxl)
+        source_euler_B(INDEX_IJK) = two_omega_deltat &
+              * (sin_two_omega_t * dpotentialdyl - cos_two_omega_t * dpotentialdxl)
 
-            ! time step deltat of Euler scheme is included in the source
-            source_euler_A(i,j,k) = two_omega_deltat &
-                  * (cos_two_omega_t * dpotentialdyl + sin_two_omega_t * dpotentialdxl)
-            source_euler_B(i,j,k) = two_omega_deltat &
-                  * (sin_two_omega_t * dpotentialdyl - cos_two_omega_t * dpotentialdxl)
+        A_rotation = A_array_rotation(INDEX_IJK,ispec)
+        B_rotation = B_array_rotation(INDEX_IJK,ispec)
 
-            A_rotation = A_array_rotation(i,j,k,ispec)
-            B_rotation = B_array_rotation(i,j,k,ispec)
+        ux_rotation =   A_rotation*cos_two_omega_t + B_rotation*sin_two_omega_t
+        uy_rotation = - A_rotation*sin_two_omega_t + B_rotation*cos_two_omega_t
 
-            ux_rotation =   A_rotation*cos_two_omega_t + B_rotation*sin_two_omega_t
-            uy_rotation = - A_rotation*sin_two_omega_t + B_rotation*cos_two_omega_t
+        dpotentialdx_with_rot = dpotentialdxl + ux_rotation
+        dpotentialdy_with_rot = dpotentialdyl + uy_rotation
 
-            dpotentialdx_with_rot = dpotentialdxl + ux_rotation
-            dpotentialdy_with_rot = dpotentialdyl + uy_rotation
+      else
 
-          else
+        dpotentialdx_with_rot = dpotentialdxl
+        dpotentialdy_with_rot = dpotentialdyl
 
-            dpotentialdx_with_rot = dpotentialdxl
-            dpotentialdy_with_rot = dpotentialdyl
+      endif  ! end of section with rotation
 
-          endif  ! end of section with rotation
+      ! add (chi/rho)grad(rho) term in no gravity case
+      if(.not. GRAVITY_VAL) then
 
-          ! add (chi/rho)grad(rho) term in no gravity case
-          if(.not. GRAVITY_VAL) then
+        ! With regards to the non-gravitating case: we cannot set N^2 = 0 *and* let g = 0.
+        ! We can *either* assume N^2 = 0 but keep gravity g, *or* we can assume that gravity
+        ! is negligible to begin with, as in our GJI 2002a, in which case N does not arise.
+        ! We get:
+        !
+        ! \ddot\chi = \rho^{-1}\kappa\bdel\cdot(\bdel\chi+\chi\bdel\ln\rho)
+        !
+        ! Then the displacement is
+        !
+        ! \bu = \bdel\chi+\chi\bdel\ln\rho = \rho^{-1}\bdel(\rho\chi)
+        !
+        ! and the pressure is
+        !
+        ! p = -\rho\ddot{\chi}
+        !
+        ! Thus in our 2002b GJI paper eqn (21) is wrong, and equation (41)
+        ! in our AGU monograph is incorrect; these equations should be replaced by
+        !
+        ! \ddot\chi = \rho^{-1}\kappa\bdel\cdot(\bdel\chi+\chi\bdel\ln\rho)
+        !
+        ! Note that the fluid potential we use in GJI 2002a differs from the one used here:
+        !
+        ! \chi_GJI2002a = \rho\partial\t\chi
+        !
+        ! such that
+        !
+        ! \bv = \partial_t\bu=\rho^{-1}\bdel\chi_GJI2002a  (GJI 2002a eqn 20)
+        !
+        ! p = - \partial_t\chi_GJI2002a (GJI 2002a eqn 19)
 
-            ! With regards to the non-gravitating case: we cannot set N^2 = 0 *and* let g = 0.
-            ! We can *either* assume N^2 = 0 but keep gravity g, *or* we can assume that gravity
-            ! is negligible to begin with, as in our GJI 2002a, in which case N does not arise.
-            ! We get:
-            !
-            ! \ddot\chi = \rho^{-1}\kappa\bdel\cdot(\bdel\chi+\chi\bdel\ln\rho)
-            !
-            ! Then the displacement is
-            !
-            ! \bu = \bdel\chi+\chi\bdel\ln\rho = \rho^{-1}\bdel(\rho\chi)
-            !
-            ! and the pressure is
-            !
-            ! p = -\rho\ddot{\chi}
-            !
-            ! Thus in our 2002b GJI paper eqn (21) is wrong, and equation (41)
-            ! in our AGU monograph is incorrect; these equations should be replaced by
-            !
-            ! \ddot\chi = \rho^{-1}\kappa\bdel\cdot(\bdel\chi+\chi\bdel\ln\rho)
-            !
-            ! Note that the fluid potential we use in GJI 2002a differs from the one used here:
-            !
-            ! \chi_GJI2002a = \rho\partial\t\chi
-            !
-            ! such that
-            !
-            ! \bv = \partial_t\bu=\rho^{-1}\bdel\chi_GJI2002a  (GJI 2002a eqn 20)
-            !
-            ! p = - \partial_t\chi_GJI2002a (GJI 2002a eqn 19)
+        ! use mesh coordinates to get theta and phi
+        ! x y z contain r theta phi
+        dpotentialdx_with_rot = dpotentialdx_with_rot + displ_times_grad_x_ln_rho(INDEX_IJK)
+        dpotentialdy_with_rot = dpotentialdy_with_rot + displ_times_grad_y_ln_rho(INDEX_IJK)
+        dpotentialdzl = dpotentialdzl + displ_times_grad_z_ln_rho(INDEX_IJK)
 
-            ! use mesh coordinates to get theta and phi
-            ! x y z contain r theta phi
-            dpotentialdx_with_rot = dpotentialdx_with_rot + displ_times_grad_x_ln_rho(i,j,k)
-            dpotentialdy_with_rot = dpotentialdy_with_rot + displ_times_grad_y_ln_rho(i,j,k)
-            dpotentialdzl = dpotentialdzl + displ_times_grad_z_ln_rho(i,j,k)
+     else
+        ! if gravity is turned on
 
-         else
-            ! if gravity is turned on
+        ! compute divergence of displacment
+        gxl = temp_gxl(INDEX_IJK)
+        gyl = temp_gyl(INDEX_IJK)
+        gzl = temp_gzl(INDEX_IJK)
 
-            ! compute divergence of displacment
-            gxl = temp_gxl(i,j,k)
-            gyl = temp_gyl(i,j,k)
-            gzl = temp_gzl(i,j,k)
+        ! distinguish between single and double precision for reals
+        if(CUSTOM_REAL == SIZE_REAL) then
+          gravity_term(INDEX_IJK) = &
+                  sngl( minus_rho_g_over_kappa_fluid(int_radius) &
+                  * dble(jacobianl) * wgll_cube(INDEX_IJK) &
+                  * (dble(dpotentialdx_with_rot) * gxl  &
+                     + dble(dpotentialdy_with_rot) * gyl &
+                     + dble(dpotentialdzl) * gzl) )
+        else
+          gravity_term(INDEX_IJK) = minus_rho_g_over_kappa_fluid(int_radius) * &
+                    jacobianl * wgll_cube(INDEX_IJK) &
+                    * (dpotentialdx_with_rot * gxl  &
+                      + dpotentialdy_with_rot * gyl &
+                      + dpotentialdzl * gzl)
+        endif
 
-            ! distinguish between single and double precision for reals
-            if(CUSTOM_REAL == SIZE_REAL) then
-              gravity_term(i,j,k) = &
-                      sngl( minus_rho_g_over_kappa_fluid(int_radius) &
-                      * dble(jacobianl) * wgll_cube(i,j,k) &
-                      * (dble(dpotentialdx_with_rot) * gxl  &
-                         + dble(dpotentialdy_with_rot) * gyl &
-                         + dble(dpotentialdzl) * gzl) )
-            else
-              gravity_term(i,j,k) = minus_rho_g_over_kappa_fluid(int_radius) * &
-                        jacobianl * wgll_cube(i,j,k) &
-                        * (dpotentialdx_with_rot * gxl  &
-                          + dpotentialdy_with_rot * gyl &
-                          + dpotentialdzl * gzl)
-            endif
+        ! divergence of displacement field with gravity on
+        ! note: these calculations are only considered for SIMULATION_TYPE == 1 .and. SAVE_FORWARD
+        !          and one has set MOVIE_VOLUME_TYPE == 4 when MOVIE_VOLUME is .true.;
+        !         in case of SIMULATION_TYPE == 3, it gets overwritten by compute_kernels_outer_core()
+        if( MOVIE_VOLUME .and. NSPEC_OUTER_CORE_3DMOVIE /= 1 ) then
+          div_displfluid(INDEX_IJK,ispec) =  &
+                    minus_rho_g_over_kappa_fluid(int_radius) &
+                    * (dpotentialdx_with_rot * gxl &
+                     + dpotentialdy_with_rot * gyl &
+                     + dpotentialdzl * gzl)
+        endif
 
-            ! divergence of displacement field with gravity on
-            ! note: these calculations are only considered for SIMULATION_TYPE == 1 .and. SAVE_FORWARD
-            !          and one has set MOVIE_VOLUME_TYPE == 4 when MOVIE_VOLUME is .true.;
-            !         in case of SIMULATION_TYPE == 3, it gets overwritten by compute_kernels_outer_core()
-            if( MOVIE_VOLUME .and. NSPEC_OUTER_CORE_3DMOVIE /= 1 ) then
-              div_displfluid(i,j,k,ispec) =  &
-                        minus_rho_g_over_kappa_fluid(int_radius) &
-                        * (dpotentialdx_with_rot * gxl &
-                         + dpotentialdy_with_rot * gyl &
-                         + dpotentialdzl * gzl)
-            endif
+      endif
 
-          endif
+      tempx1(INDEX_IJK) = jacobianl*(xixl*dpotentialdx_with_rot &
+                               + xiyl*dpotentialdy_with_rot + xizl*dpotentialdzl)
+      tempx2(INDEX_IJK) = jacobianl*(etaxl*dpotentialdx_with_rot &
+                               + etayl*dpotentialdy_with_rot + etazl*dpotentialdzl)
+      tempx3(INDEX_IJK) = jacobianl*(gammaxl*dpotentialdx_with_rot &
+                               + gammayl*dpotentialdy_with_rot + gammazl*dpotentialdzl)
 
-          tempx1(i,j,k) = jacobianl*(xixl*dpotentialdx_with_rot &
-                                   + xiyl*dpotentialdy_with_rot + xizl*dpotentialdzl)
-          tempx2(i,j,k) = jacobianl*(etaxl*dpotentialdx_with_rot &
-                                   + etayl*dpotentialdy_with_rot + etazl*dpotentialdzl)
-          tempx3(i,j,k) = jacobianl*(gammaxl*dpotentialdx_with_rot &
-                                   + gammayl*dpotentialdy_with_rot + gammazl*dpotentialdzl)
+    ENDDO_LOOP_IJK
 
-        enddo
-      enddo
-    enddo
-
     ! subroutines adapted from Deville, Fischer and Mund, High-order methods
     ! for incompressible fluid flow, Cambridge University Press (2002),
     ! pages 386 and 389 and Figure 8.3.1
@@ -452,17 +406,29 @@
     enddo
 
     ! sum contributions from each element to the global mesh and add gravity term
-#ifdef FORCE_VECTORIZATION
-    do ijk=1,NGLLCUBE
-      sum_terms(ijk,1,1) = - (wgllwgll_yz_3D(ijk,1,1)*newtempx1(ijk,1,1) &
-                            + wgllwgll_xz_3D(ijk,1,1)*newtempx2(ijk,1,1) &
-                            + wgllwgll_xy_3D(ijk,1,1)*newtempx3(ijk,1,1))
-    enddo
+
+    DO_LOOP_IJK
+
+          sum_terms(INDEX_IJK) = - ( wgllwgll_yz_3D(INDEX_IJK)*newtempx1(INDEX_IJK) &
+                                   + wgllwgll_xz_3D(INDEX_IJK)*newtempx2(INDEX_IJK) &
+                                   + wgllwgll_xy_3D(INDEX_IJK)*newtempx3(INDEX_IJK))
+
+    ENDDO_LOOP_IJK
+
+    ! adds gravity
     if(GRAVITY_VAL) then
-      do ijk = 1,NGLLCUBE
-        sum_terms(ijk,1,1) = sum_terms(ijk,1,1) + gravity_term(ijk,1,1)
-      enddo
+
+      DO_LOOP_IJK
+
+        sum_terms(INDEX_IJK) = sum_terms(INDEX_IJK) + gravity_term(INDEX_IJK)
+
+      ENDDO_LOOP_IJK
+
     endif
+
+    ! updates acceleration
+
+#ifdef FORCE_VECTORIZATION
 ! we can force vectorization using a compiler directive here because we know that there is no dependency
 ! inside a given spectral element, since all the global points of a local elements are different by definition
 ! (only common points between different elements can be the same)
@@ -471,63 +437,54 @@
 !pgi$ ivdep
 !DIR$ IVDEP
     do ijk = 1,NGLLCUBE
-      iglob = ibool(ijk,1,1,ispec)
-      accelfluid(iglob) = accelfluid(iglob) + sum_terms(ijk,1,1)
-    enddo
-    ! update rotation term with Euler scheme
-    if(ROTATION_VAL) then
-      if(USE_LDDRK) then
-        ! use the source saved above
-        do ijk = 1,NGLLCUBE
-          A_array_rotation_lddrk(ijk,1,1,ispec) = ALPHA_LDDRK(istage) * A_array_rotation_lddrk(ijk,1,1,ispec) &
-                                                  + source_euler_A(ijk,1,1)
-          A_array_rotation(ijk,1,1,ispec) = A_array_rotation(ijk,1,1,ispec) &
-                                            + BETA_LDDRK(istage) * A_array_rotation_lddrk(ijk,1,1,ispec)
-
-          B_array_rotation_lddrk(ijk,1,1,ispec) = ALPHA_LDDRK(istage) * B_array_rotation_lddrk(ijk,1,1,ispec) &
-                                                  + source_euler_B(ijk,1,1)
-          B_array_rotation(ijk,1,1,ispec) = B_array_rotation(ijk,1,1,ispec) &
-                                            + BETA_LDDRK(istage) * B_array_rotation_lddrk(ijk,1,1,ispec)
-        enddo
-      else
-        do ijk = 1,NGLLCUBE
-          A_array_rotation(ijk,1,1,ispec) = A_array_rotation(ijk,1,1,ispec) + source_euler_A(ijk,1,1)
-          B_array_rotation(ijk,1,1,ispec) = B_array_rotation(ijk,1,1,ispec) + source_euler_B(ijk,1,1)
-        enddo
-      endif
-    endif
 #else
-    ! sum contributions from each element to the global mesh and add gravity term
     do k=1,NGLLZ
       do j=1,NGLLY
         do i=1,NGLLX
-          sum_terms = - (wgllwgll_yz(j,k)*newtempx1(i,j,k) &
-                       + wgllwgll_xz(i,k)*newtempx2(i,j,k) &
-                       + wgllwgll_xy(i,j)*newtempx3(i,j,k))
+#endif
+          iglob = ibool(INDEX_IJK,ispec)
+          accelfluid(iglob) = accelfluid(iglob) + sum_terms(INDEX_IJK)
 
-          if(GRAVITY_VAL) sum_terms = sum_terms + gravity_term(i,j,k)
-
-          iglob = ibool(i,j,k,ispec)
-          accelfluid(iglob) = accelfluid(iglob) + sum_terms
+#ifdef FORCE_VECTORIZATION
+    enddo
+#else
         enddo
       enddo
     enddo
+#endif
+
     ! update rotation term with Euler scheme
+
     if(ROTATION_VAL) then
+
       if(USE_LDDRK) then
+
         ! use the source saved above
-        A_array_rotation_lddrk(:,:,:,ispec) = ALPHA_LDDRK(istage) * A_array_rotation_lddrk(:,:,:,ispec) + source_euler_A(:,:,:)
-        A_array_rotation(:,:,:,ispec) = A_array_rotation(:,:,:,ispec) + BETA_LDDRK(istage) * A_array_rotation_lddrk(:,:,:,ispec)
+        DO_LOOP_IJK
 
-        B_array_rotation_lddrk(:,:,:,ispec) = ALPHA_LDDRK(istage) * B_array_rotation_lddrk(:,:,:,ispec) + source_euler_B(:,:,:)
-        B_array_rotation(:,:,:,ispec) = B_array_rotation(:,:,:,ispec) + BETA_LDDRK(istage) * B_array_rotation_lddrk(:,:,:,ispec)
+          A_array_rotation_lddrk(INDEX_IJK,ispec) = ALPHA_LDDRK(istage) * A_array_rotation_lddrk(INDEX_IJK,ispec) &
+                                                    + source_euler_A(INDEX_IJK)
+          A_array_rotation(INDEX_IJK,ispec) = A_array_rotation(INDEX_IJK,ispec) &
+                                              + BETA_LDDRK(istage) * A_array_rotation_lddrk(INDEX_IJK,ispec)
+
+          B_array_rotation_lddrk(INDEX_IJK,ispec) = ALPHA_LDDRK(istage) * B_array_rotation_lddrk(INDEX_IJK,ispec) &
+                                                    + source_euler_B(INDEX_IJK)
+          B_array_rotation(INDEX_IJK,ispec) = B_array_rotation(INDEX_IJK,ispec) &
+                                              + BETA_LDDRK(istage) * B_array_rotation_lddrk(INDEX_IJK,ispec)
+
+        ENDDO_LOOP_IJK
+
       else
-        ! use the source saved above
-        A_array_rotation(:,:,:,ispec) = A_array_rotation(:,:,:,ispec) + source_euler_A(:,:,:)
-        B_array_rotation(:,:,:,ispec) = B_array_rotation(:,:,:,ispec) + source_euler_B(:,:,:)
+
+        DO_LOOP_IJK
+
+          A_array_rotation(INDEX_IJK,ispec) = A_array_rotation(INDEX_IJK,ispec) + source_euler_A(INDEX_IJK)
+          B_array_rotation(INDEX_IJK,ispec) = B_array_rotation(INDEX_IJK,ispec) + source_euler_B(INDEX_IJK)
+
+        ENDDO_LOOP_IJK
+
       endif
     endif
-#endif
 
   enddo   ! spectral element loop
 

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90	2013-10-23 13:03:00 UTC (rev 22970)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90	2013-10-23 14:27:26 UTC (rev 22971)
@@ -85,7 +85,6 @@
                                              NSPEC_CRUST_MANTLE_ATTENUATION, &
                                              deltat, &
                                              displ_crust_mantle, &
-!                                            veloc_crust_mantle, &
                                              accel_crust_mantle, &
                                              phase_is_inner, &
                                              R_xx_crust_mantle,R_yy_crust_mantle,R_xy_crust_mantle, &
@@ -96,13 +95,12 @@
                                              epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
                                              eps_trace_over_3_crust_mantle, &
                                              alphaval,betaval,gammaval, &
-                                             factor_common_crust_mantle,ATT4_VAL, .false. )
+                                             factor_common_crust_mantle,ATT4_VAL)
         ! inner core region
         call compute_forces_inner_core_Dev(NSPEC_INNER_CORE_STR_OR_ATT,NGLOB_INNER_CORE, &
                                            NSPEC_INNER_CORE_ATTENUATION, &
                                            deltat, &
                                            displ_inner_core, &
-!                                          veloc_inner_core, &
                                            accel_inner_core, &
                                            phase_is_inner, &
                                            R_xx_inner_core,R_yy_inner_core,R_xy_inner_core, &
@@ -113,7 +111,7 @@
                                            epsilondev_xz_inner_core,epsilondev_yz_inner_core, &
                                            eps_trace_over_3_inner_core,&
                                            alphaval,betaval,gammaval, &
-                                           factor_common_inner_core,ATT5_VAL) !!!! , .false. )
+                                           factor_common_inner_core,ATT5_VAL)
       else
         ! no Deville optimization
         ! crust/mantle region
@@ -121,7 +119,6 @@
                                          NSPEC_CRUST_MANTLE_ATTENUATION, &
                                          deltat, &
                                          displ_crust_mantle, &
-!                                        veloc_crust_mantle, &
                                          accel_crust_mantle, &
                                          phase_is_inner, &
                                          R_xx_crust_mantle,R_yy_crust_mantle,R_xy_crust_mantle, &
@@ -132,13 +129,12 @@
                                          epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
                                          eps_trace_over_3_crust_mantle, &
                                          alphaval,betaval,gammaval, &
-                                         factor_common_crust_mantle,ATT4_VAL) !!!!!!!!!!!! , .false. )
+                                         factor_common_crust_mantle,ATT4_VAL)
         ! inner core region
         call compute_forces_inner_core(NSPEC_INNER_CORE_STR_OR_ATT,NGLOB_INNER_CORE, &
                                        NSPEC_INNER_CORE_ATTENUATION, &
                                        deltat, &
                                        displ_inner_core, &
-!                                      veloc_inner_core, &
                                        accel_inner_core, &
                                        phase_is_inner, &
                                        R_xx_inner_core,R_yy_inner_core,R_xy_inner_core, &
@@ -149,7 +145,7 @@
                                        epsilondev_xz_inner_core,epsilondev_yz_inner_core, &
                                        eps_trace_over_3_inner_core,&
                                        alphaval,betaval,gammaval, &
-                                       factor_common_inner_core,ATT5_VAL) !!!!!!!!!!! , .false. )
+                                       factor_common_inner_core,ATT5_VAL)
       endif
     else
       ! on GPU
@@ -472,7 +468,6 @@
                                              NSPEC_CRUST_MANTLE_STR_AND_ATT, &
                                              b_deltat, &
                                              b_displ_crust_mantle, &
-!                                            b_veloc_crust_mantle, &
                                              b_accel_crust_mantle, &
                                              phase_is_inner, &
                                              b_R_xx_crust_mantle,b_R_yy_crust_mantle,b_R_xy_crust_mantle, &
@@ -484,13 +479,12 @@
                                              b_epsilondev_xz_crust_mantle,b_epsilondev_yz_crust_mantle, &
                                              b_eps_trace_over_3_crust_mantle, &
                                              b_alphaval,b_betaval,b_gammaval, &
-                                             factor_common_crust_mantle,ATT4_VAL, .true. )
+                                             factor_common_crust_mantle,ATT4_VAL)
         ! inner core region
         call compute_forces_inner_core_Dev(NSPEC_INNER_CORE_ADJOINT,NGLOB_INNER_CORE_ADJOINT, &
                                            NSPEC_INNER_CORE_STR_AND_ATT, &
                                            b_deltat, &
                                            b_displ_inner_core, &
-!                                          b_veloc_inner_core, &
                                            b_accel_inner_core, &
                                            phase_is_inner, &
                                            b_R_xx_inner_core,b_R_yy_inner_core,b_R_xy_inner_core, &
@@ -501,7 +495,7 @@
                                            b_epsilondev_xz_inner_core,b_epsilondev_yz_inner_core, &
                                            b_eps_trace_over_3_inner_core,&
                                            b_alphaval,b_betaval,b_gammaval, &
-                                           factor_common_inner_core,ATT5_VAL) !!!! , .true. )
+                                           factor_common_inner_core,ATT5_VAL)
       else
         ! no Deville optimization
         ! crust/mantle region
@@ -509,7 +503,6 @@
                                          NSPEC_CRUST_MANTLE_STR_AND_ATT, &
                                          b_deltat, &
                                          b_displ_crust_mantle, &
-!                                        b_veloc_crust_mantle, &
                                          b_accel_crust_mantle, &
                                          phase_is_inner, &
                                          b_R_xx_crust_mantle,b_R_yy_crust_mantle,b_R_xy_crust_mantle, &
@@ -521,13 +514,12 @@
                                          b_epsilondev_xz_crust_mantle,b_epsilondev_yz_crust_mantle, &
                                          b_eps_trace_over_3_crust_mantle, &
                                          b_alphaval,b_betaval,b_gammaval, &
-                                         factor_common_crust_mantle,ATT4_VAL) !!!!!!!!!!!! , .true. )
+                                         factor_common_crust_mantle,ATT4_VAL)
         ! inner core region
         call compute_forces_inner_core(NSPEC_INNER_CORE_ADJOINT,NGLOB_INNER_CORE_ADJOINT, &
                                        NSPEC_INNER_CORE_STR_AND_ATT, &
                                        b_deltat, &
                                        b_displ_inner_core, &
-!                                      b_veloc_inner_core, &
                                        b_accel_inner_core, &
                                        phase_is_inner, &
                                        b_R_xx_inner_core,b_R_yy_inner_core,b_R_xy_inner_core, &
@@ -538,7 +530,7 @@
                                        b_epsilondev_xz_inner_core,b_epsilondev_yz_inner_core, &
                                        b_eps_trace_over_3_inner_core,&
                                        b_alphaval,b_betaval,b_gammaval, &
-                                       factor_common_inner_core,ATT5_VAL) !!!!!!!!!!! , .true. )
+                                       factor_common_inner_core,ATT5_VAL)
       endif
     else
       ! on GPU

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/setup_GLL_points.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/setup_GLL_points.f90	2013-10-23 13:03:00 UTC (rev 22970)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/setup_GLL_points.f90	2013-10-23 14:27:26 UTC (rev 22971)
@@ -40,18 +40,16 @@
                                  hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
                                  wgllwgll_xy,wgllwgll_xz,wgllwgll_yz,wgll_cube)
 
-  ! define a 3D extension in order to be able to force vectorization in the compute_forces routines
-  if( FORCE_VECTORIZATION_VAL ) then
-    do k = 1,NGLLZ
-      do j = 1,NGLLY
-        do i = 1,NGLLX
-          wgllwgll_yz_3D(i,j,k) = wgllwgll_yz(j,k)
-          wgllwgll_xz_3D(i,j,k) = wgllwgll_xz(i,k)
-          wgllwgll_xy_3D(i,j,k) = wgllwgll_xy(i,j)
-        enddo
+  ! define a 3D extension in order to be able to force vectorization in the compute_forces_**_Dev routines
+  do k = 1,NGLLZ
+    do j = 1,NGLLY
+      do i = 1,NGLLX
+        wgllwgll_yz_3D(i,j,k) = wgllwgll_yz(j,k)
+        wgllwgll_xz_3D(i,j,k) = wgllwgll_xz(i,k)
+        wgllwgll_xy_3D(i,j,k) = wgllwgll_xy(i,j)
       enddo
     enddo
-  endif
+  enddo
 
   if( USE_DEVILLE_PRODUCTS_VAL ) then
     ! check that optimized routines from Deville et al. (2002) can be used

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/setup_sources_receivers.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/setup_sources_receivers.f90	2013-10-23 13:03:00 UTC (rev 22970)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/setup_sources_receivers.f90	2013-10-23 14:27:26 UTC (rev 22971)
@@ -365,7 +365,7 @@
   ! count number of receivers located in this slice
   nrec_local = 0
   if (SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then
-    ! note: for 1-chunk simulations, nrec is now the actual number of receivers found in this chunk 
+    ! note: for 1-chunk simulations, nrec is now the actual number of receivers found in this chunk
     !       (excludes stations located outside of chunk)
     nrec_simulation = nrec
     do irec = 1,nrec

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/specfem3D_par.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/specfem3D_par.F90	2013-10-23 13:03:00 UTC (rev 22970)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/specfem3D_par.F90	2013-10-23 14:27:26 UTC (rev 22971)
@@ -68,11 +68,12 @@
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLX) :: hprime_xxT,hprimewgll_xxT
   real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLY) :: hprime_yy,hprimewgll_yy
   real(kind=CUSTOM_REAL), dimension(NGLLZ,NGLLZ) :: hprime_zz,hprimewgll_zz
+
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY) :: wgllwgll_xy
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLZ) :: wgllwgll_xz
   real(kind=CUSTOM_REAL), dimension(NGLLY,NGLLZ) :: wgllwgll_yz
 
-  ! arrays for force_vectorization
+  ! arrays for Deville and force_vectorization
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: wgllwgll_xy_3D,wgllwgll_xz_3D,wgllwgll_yz_3D
 
   !-----------------------------------------------------------------