[cig-commits] r20536 - in seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER: UTILS src/cuda src/shared src/specfem3D

Mon Jul 23 14:58:26 PDT 2012

Author: danielpeter
Date: 2012-07-23 14:58:26 -0700 (Mon, 23 Jul 2012)
New Revision: 20536

Modified:
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/UTILS/update_headers_change_word_f90.pl
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_constants_cuda.h
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/exit_mpi.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/check_simulation_stability.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core_Dev.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/get_attenuation.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90
Log:
updates texture usage for cuda routines

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/UTILS/update_headers_change_word_f90.pl
===================================================================

--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/UTILS/update_headers_change_word_f90.pl	2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/UTILS/update_headers_change_word_f90.pl	2012-07-23 21:58:26 UTC (rev 20536)
@@ -22,7 +22,7 @@
 #
 
 
-@objects = `ls src/*/*.f90 src/*/*.F90 src/*/*.h.in src/*/*.h src/*/*.c src/*/*.cu`;
+@objects = `ls src/*/*.f90 src/*/*.F90 src/*/*.h.in src/*/*.h src/*/*.c src/*/*.cu setup/*.h.in`;
 
 foreach $name (@objects) {
   chop $name;

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu	2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu	2012-07-23 21:58:26 UTC (rev 20536)
@@ -39,9 +39,7 @@
 
 #include "config.h"
 #include "mesh_constants_cuda.h"
-#include "prepare_constants_cuda.h"
 
-
 /* ----------------------------------------------------------------------------------------------- */
 
 // Helper functions

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu	2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu	2012-07-23 21:58:26 UTC (rev 20536)
@@ -38,226 +38,18 @@
 #include "mesh_constants_cuda.h"
 
 
-//  cuda constant arrays
-__device__ realw d_hprime_xx[NGLL2];
-__device__ realw d_hprime_yy[NGLL2];
-__device__ realw d_hprime_zz[NGLL2];
-__device__ realw d_hprimewgll_xx[NGLL2];
-__device__ realw d_hprimewgll_yy[NGLL2];
-__device__ realw d_hprimewgll_zz[NGLL2];
-__device__ realw d_wgllwgll_xy[NGLL2];
-__device__ realw d_wgllwgll_xz[NGLL2];
-__device__ realw d_wgllwgll_yz[NGLL2];
+#ifdef USE_TEXTURES_FIELDS
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_displ_cm_tex;
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_accel_cm_tex;
+#endif
 
-__constant__ realw d_wgll_cube[NGLL3]; // needed only for gravity case
+#ifdef USE_TEXTURES_CONSTANTS
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_hprime_xx_cm_tex;
+#endif
 
-/* ----------------------------------------------------------------------------------------------- */
 
-// CONSTANT arrays setup
-
 /* ----------------------------------------------------------------------------------------------- */
 
-/* note:
- constant arrays when used in other compute_forces_***_cuda.cu routines stay zero,
- constant declaration and cudaMemcpyToSymbol would have to be in the same file...
-
- extern keyword doesn't work for __constant__ declarations.
-
- also:
- cudaMemcpyToSymbol("deviceCaseParams", caseParams, sizeof(CaseParams));
- ..
- and compile with -arch=sm_20
-
- see also: http://stackoverflow.com/questions/4008031/how-to-use-cuda-constant-memory-in-a-programmer-pleasant-way
- doesn't seem to work.
-
- we could keep arrays separated for acoustic and elastic routines...
-
- workaround:
-
-    for now, we store pointers with cudaGetSymbolAddress() function calls.
-    we pass those pointers in all other compute_forces_..() routines
-
-    in this file, we can use the above constant array declarations without need of the pointers.
-
- */
-
-// constant arrays
-
-void setConst_hprime_xx(realw* array,Mesh* mp)
-{
-
-  cudaError_t err = cudaMemcpyToSymbol(d_hprime_xx, array, NGLL2*sizeof(realw));
-  if (err != cudaSuccess)
-  {
-    fprintf(stderr, "Error in setConst_hprime_xx: %s\n", cudaGetErrorString(err));
-    fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
-    exit(1);
-  }
-
-  err = cudaGetSymbolAddress((void**)&(mp->d_hprime_xx),"d_hprime_xx");
-  if(err != cudaSuccess) {
-    fprintf(stderr, "Error with d_hprime_xx: %s\n", cudaGetErrorString(err));
-    exit(1);
-  }
-}
-
-void setConst_hprime_yy(realw* array,Mesh* mp)
-{
-
-  cudaError_t err = cudaMemcpyToSymbol(d_hprime_yy, array, NGLL2*sizeof(realw));
-  if (err != cudaSuccess)
-  {
-    fprintf(stderr, "Error in setConst_hprime_yy: %s\n", cudaGetErrorString(err));
-    fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
-    exit(1);
-  }
-
-  err = cudaGetSymbolAddress((void**)&(mp->d_hprime_yy),"d_hprime_yy");
-  if(err != cudaSuccess) {
-    fprintf(stderr, "Error with d_hprime_yy: %s\n", cudaGetErrorString(err));
-    exit(1);
-  }
-}
-
-void setConst_hprime_zz(realw* array,Mesh* mp)
-{
-
-  cudaError_t err = cudaMemcpyToSymbol(d_hprime_zz, array, NGLL2*sizeof(realw));
-  if (err != cudaSuccess)
-  {
-    fprintf(stderr, "Error in setConst_hprime_zz: %s\n", cudaGetErrorString(err));
-    fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
-    exit(1);
-  }
-
-  err = cudaGetSymbolAddress((void**)&(mp->d_hprime_zz),"d_hprime_zz");
-  if(err != cudaSuccess) {
-    fprintf(stderr, "Error with d_hprime_zz: %s\n", cudaGetErrorString(err));
-    exit(1);
-  }
-}
-
-
-void setConst_hprimewgll_xx(realw* array,Mesh* mp)
-{
-  cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_xx, array, NGLL2*sizeof(realw));
-  if (err != cudaSuccess)
-  {
-    fprintf(stderr, "Error in setConst_hprimewgll_xx: %s\n", cudaGetErrorString(err));
-    exit(1);
-  }
-
-  err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_xx),"d_hprimewgll_xx");
-  if(err != cudaSuccess) {
-    fprintf(stderr, "Error with d_hprimewgll_xx: %s\n", cudaGetErrorString(err));
-    exit(1);
-  }
-}
-
-void setConst_hprimewgll_yy(realw* array,Mesh* mp)
-{
-  cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_yy, array, NGLL2*sizeof(realw));
-  if (err != cudaSuccess)
-  {
-    fprintf(stderr, "Error in setConst_hprimewgll_yy: %s\n", cudaGetErrorString(err));
-    exit(1);
-  }
-
-  err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_yy),"d_hprimewgll_yy");
-  if(err != cudaSuccess) {
-    fprintf(stderr, "Error with d_hprimewgll_yy: %s\n", cudaGetErrorString(err));
-    exit(1);
-  }
-}
-
-void setConst_hprimewgll_zz(realw* array,Mesh* mp)
-{
-  cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_zz, array, NGLL2*sizeof(realw));
-  if (err != cudaSuccess)
-  {
-    fprintf(stderr, "Error in setConst_hprimewgll_zz: %s\n", cudaGetErrorString(err));
-    exit(1);
-  }
-
-  err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_zz),"d_hprimewgll_zz");
-  if(err != cudaSuccess) {
-    fprintf(stderr, "Error with d_hprimewgll_zz: %s\n", cudaGetErrorString(err));
-    exit(1);
-  }
-}
-
-void setConst_wgllwgll_xy(realw* array,Mesh* mp)
-{
-  cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_xy, array, NGLL2*sizeof(realw));
-  if (err != cudaSuccess)
-  {
-    fprintf(stderr, "Error in setConst_wgllwgll_xy: %s\n", cudaGetErrorString(err));
-    exit(1);
-  }
-  //mp->d_wgllwgll_xy = d_wgllwgll_xy;
-  err = cudaGetSymbolAddress((void**)&(mp->d_wgllwgll_xy),"d_wgllwgll_xy");
-  if(err != cudaSuccess) {
-    fprintf(stderr, "Error with d_wgllwgll_xy: %s\n", cudaGetErrorString(err));
-    exit(1);
-  }
-
-}
-
-void setConst_wgllwgll_xz(realw* array,Mesh* mp)
-{
-  cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_xz, array, NGLL2*sizeof(realw));
-  if (err != cudaSuccess)
-  {
-    fprintf(stderr, "Error in  setConst_wgllwgll_xz: %s\n", cudaGetErrorString(err));
-    exit(1);
-  }
-  //mp->d_wgllwgll_xz = d_wgllwgll_xz;
-  err = cudaGetSymbolAddress((void**)&(mp->d_wgllwgll_xz),"d_wgllwgll_xz");
-  if(err != cudaSuccess) {
-    fprintf(stderr, "Error with d_wgllwgll_xz: %s\n", cudaGetErrorString(err));
-    exit(1);
-  }
-
-}
-
-void setConst_wgllwgll_yz(realw* array,Mesh* mp)
-{
-  cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_yz, array, NGLL2*sizeof(realw));
-  if (err != cudaSuccess)
-  {
-    fprintf(stderr, "Error in setConst_wgllwgll_yz: %s\n", cudaGetErrorString(err));
-    exit(1);
-  }
-  //mp->d_wgllwgll_yz = d_wgllwgll_yz;
-  err = cudaGetSymbolAddress((void**)&(mp->d_wgllwgll_yz),"d_wgllwgll_yz");
-  if(err != cudaSuccess) {
-    fprintf(stderr, "Error with d_wgllwgll_yz: %s\n", cudaGetErrorString(err));
-    exit(1);
-  }
-
-}
-
-void setConst_wgll_cube(realw* array,Mesh* mp)
-{
-  cudaError_t err = cudaMemcpyToSymbol(d_wgll_cube, array, NGLL3*sizeof(realw));
-  if (err != cudaSuccess)
-  {
-    fprintf(stderr, "Error in setConst_wgll_cube: %s\n", cudaGetErrorString(err));
-    exit(1);
-  }
-  //mp->d_wgll_cube = d_wgll_cube;
-  err = cudaGetSymbolAddress((void**)&(mp->d_wgll_cube),"d_wgll_cube");
-  if(err != cudaSuccess) {
-    fprintf(stderr, "Error with d_wgll_cube: %s\n", cudaGetErrorString(err));
-    exit(1);
-  }
-
-}
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
 // elemental routines
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -270,31 +62,29 @@
                                               realw* R_xy,
                                               realw* R_xz,
                                               realw* R_yz,
-                                              reald* sigma_xx,
-                                              reald* sigma_yy,
-                                              reald* sigma_zz,
-                                              reald* sigma_xy,
-                                              reald* sigma_xz,
-                                              reald* sigma_yz) {
+                                              realw* sigma_xx,
+                                              realw* sigma_yy,
+                                              realw* sigma_zz,
+                                              realw* sigma_xy,
+                                              realw* sigma_xz,
+                                              realw* sigma_yz) {
 
-  int i_sls,offset;
-  reald R_xx_val,R_yy_val;
+  int i_sls;
+  realw R_xx_val,R_yy_val;
 
   for(i_sls = 0; i_sls < N_SLS; i_sls++){
     // index
     // note: index for R_xx,.. here is (i_sls,i,j,k,ispec) and not (i,j,k,ispec,i_sls) as in local version
     //          local version: offset_sls = tx + NGLL3*(working_element + NSPEC*i_sls);
-    offset = i_sls + N_SLS*(tx + NGLL3*working_element);
+    R_xx_val = R_xx[i_sls + N_SLS*(tx + NGLL3*working_element)];
+    R_yy_val = R_yy[i_sls + N_SLS*(tx + NGLL3*working_element)];
 
-    R_xx_val = R_xx[offset];
-    R_yy_val = R_yy[offset];
-
     *sigma_xx = *sigma_xx - R_xx_val;
     *sigma_yy = *sigma_yy - R_yy_val;
     *sigma_zz = *sigma_zz + R_xx_val + R_yy_val;
-    *sigma_xy = *sigma_xy - R_xy[offset];
-    *sigma_xz = *sigma_xz - R_xz[offset];
-    *sigma_yz = *sigma_yz - R_yz[offset];
+    *sigma_xy = *sigma_xy - R_xy[i_sls + N_SLS*(tx + NGLL3*working_element)];
+    *sigma_xz = *sigma_xz - R_xz[i_sls + N_SLS*(tx + NGLL3*working_element)];
+    *sigma_yz = *sigma_yz - R_yz[i_sls + N_SLS*(tx + NGLL3*working_element)];
   }
   return;
 }
@@ -310,29 +100,23 @@
                                               realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
                                               realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
                                               realw* epsilondev_xz,realw* epsilondev_yz,
-                                              reald epsilondev_xx_loc,reald epsilondev_yy_loc,reald epsilondev_xy_loc,
-                                              reald epsilondev_xz_loc,reald epsilondev_yz_loc,
+                                              realw epsilondev_xx_loc,realw epsilondev_yy_loc,realw epsilondev_xy_loc,
+                                              realw epsilondev_xz_loc,realw epsilondev_yz_loc,
                                               int ANISOTROPY,
                                               realw* d_c44store,
                                               int ATTENUATION_3D
                                               ){
 
   int i_sls;
-  int ijk_ispec;
-  int offset_align,offset;
-  reald fac;
-  reald alphaval_loc,betaval_loc,gammaval_loc;
-  reald factor_loc,Sn,Snp1;
+  realw fac;
+  realw alphaval_loc,betaval_loc,gammaval_loc;
+  realw factor_loc,Sn,Snp1;
 
-  // indices
-  offset_align = tx + NGLL3_PADDED * working_element;
-  ijk_ispec = tx + NGLL3 * working_element;
-
   // shear moduli for common factor (only Q_mu attenuation)
   if( ANISOTROPY ){
-    fac = d_c44store[offset_align];
+    fac = d_c44store[tx + NGLL3_PADDED * working_element];
   }else{
-    fac = d_muvstore[offset_align];
+    fac = d_muvstore[tx + NGLL3_PADDED * working_element];
   }
 
   // use Runge-Kutta scheme to march in time
@@ -340,48 +124,49 @@
     // indices
     // note: index for R_xx,... here is (i_sls,i,j,k,ispec) and not (i,j,k,ispec,i_sls) as in local version
     //          local version: offset_sls = tx + NGLL3*(working_element + NSPEC*i_sls);
-    // index for (i_sls,i,j,k,ispec)
-    offset = i_sls + N_SLS*(tx + NGLL3*working_element);
-    // index for (i,j,k,ispec,i_sls)
-    //offset_sls = tx + NGLL3*(working_element + NSPEC*i_sls);
-
+    //
     // either mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
     // or       factor_common(i_sls,:,:,:,ispec) * c44store(:,:,:,ispec)
     if( ATTENUATION_3D ){
-      factor_loc = fac * factor_common[offset];
+      factor_loc = fac * factor_common[i_sls + N_SLS*(tx + NGLL3*working_element)];
     }else{
-      factor_loc = fac * factor_common[i_sls + N_SLS*working_element];    
+      factor_loc = fac * factor_common[i_sls + N_SLS*working_element];
     }
-    
+
     alphaval_loc = alphaval[i_sls]; // (i_sls)
     betaval_loc = betaval[i_sls];
     gammaval_loc = gammaval[i_sls];
 
     // term in xx
-    Sn   = factor_loc * epsilondev_xx[ijk_ispec]; //(i,j,k,ispec)
+    Sn   = factor_loc * epsilondev_xx[tx + NGLL3 * working_element]; //(i,j,k,ispec)
     Snp1   = factor_loc * epsilondev_xx_loc; //(i,j,k)
-    R_xx[offset] = alphaval_loc * R_xx[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
+    R_xx[i_sls + N_SLS*(tx + NGLL3*working_element)] =
+      alphaval_loc * R_xx[i_sls + N_SLS*(tx + NGLL3*working_element)] + betaval_loc * Sn + gammaval_loc * Snp1;
 
     // term in yy
-    Sn   = factor_loc * epsilondev_yy[ijk_ispec];
+    Sn   = factor_loc * epsilondev_yy[tx + NGLL3 * working_element];
     Snp1   = factor_loc * epsilondev_yy_loc;
-    R_yy[offset] = alphaval_loc * R_yy[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
+    R_yy[i_sls + N_SLS*(tx + NGLL3*working_element)] =
+      alphaval_loc * R_yy[i_sls + N_SLS*(tx + NGLL3*working_element)] + betaval_loc * Sn + gammaval_loc * Snp1;
     // term in zz not computed since zero trace
 
     // term in xy
-    Sn   = factor_loc * epsilondev_xy[ijk_ispec];
+    Sn   = factor_loc * epsilondev_xy[tx + NGLL3 * working_element];
     Snp1   = factor_loc * epsilondev_xy_loc;
-    R_xy[offset] = alphaval_loc * R_xy[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
+    R_xy[i_sls + N_SLS*(tx + NGLL3*working_element)] =
+      alphaval_loc * R_xy[i_sls + N_SLS*(tx + NGLL3*working_element)] + betaval_loc * Sn + gammaval_loc * Snp1;
 
     // term in xz
-    Sn   = factor_loc * epsilondev_xz[ijk_ispec];
+    Sn   = factor_loc * epsilondev_xz[tx + NGLL3 * working_element];
     Snp1   = factor_loc * epsilondev_xz_loc;
-    R_xz[offset] = alphaval_loc * R_xz[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
+    R_xz[i_sls + N_SLS*(tx + NGLL3*working_element)] =
+      alphaval_loc * R_xz[i_sls + N_SLS*(tx + NGLL3*working_element)] + betaval_loc * Sn + gammaval_loc * Snp1;
 
     // term in yz
-    Sn   = factor_loc * epsilondev_yz[ijk_ispec];
+    Sn   = factor_loc * epsilondev_yz[tx + NGLL3 * working_element];
     Snp1   = factor_loc * epsilondev_yz_loc;
-    R_yz[offset] = alphaval_loc * R_yz[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
+    R_yz[i_sls + N_SLS*(tx + NGLL3*working_element)] =
+      alphaval_loc * R_yz[i_sls + N_SLS*(tx + NGLL3*working_element)] + betaval_loc * Sn + gammaval_loc * Snp1;
   }
   return;
 }
@@ -397,38 +182,38 @@
                                           realw* d_minus_deriv_gravity_table,
                                           realw* d_density_table,
                                           realw* wgll_cube,
-                                          reald jacobianl,
-                                          reald* s_dummyx_loc,
-                                          reald* s_dummyy_loc,
-                                          reald* s_dummyz_loc,
-                                          reald* sigma_xx,
-                                          reald* sigma_yy,
-                                          reald* sigma_zz,
-                                          reald* sigma_xy,
-                                          reald* sigma_yx,
-                                          reald* sigma_xz,
-                                          reald* sigma_zx,
-                                          reald* sigma_yz,
-                                          reald* sigma_zy,
-                                          reald* rho_s_H1,
-                                          reald* rho_s_H2,
-                                          reald* rho_s_H3){
+                                          realw jacobianl,
+                                          realw* s_dummyx_loc,
+                                          realw* s_dummyy_loc,
+                                          realw* s_dummyz_loc,
+                                          realw* sigma_xx,
+                                          realw* sigma_yy,
+                                          realw* sigma_zz,
+                                          realw* sigma_xy,
+                                          realw* sigma_yx,
+                                          realw* sigma_xz,
+                                          realw* sigma_zx,
+                                          realw* sigma_yz,
+                                          realw* sigma_zy,
+                                          realw* rho_s_H1,
+                                          realw* rho_s_H2,
+                                          realw* rho_s_H3){
 
-  reald radius,theta,phi;
-  reald cos_theta,sin_theta,cos_phi,sin_phi;
-  reald minus_g,minus_dg;
-  reald rho;
-  reald gxl,gyl,gzl;
-  reald minus_g_over_radius,minus_dg_plus_g_over_radius;
-  reald cos_theta_sq,sin_theta_sq,cos_phi_sq,sin_phi_sq;
-  reald Hxxl,Hyyl,Hzzl,Hxyl,Hxzl,Hyzl;
-  reald sx_l,sy_l,sz_l;
-  reald factor;
+  realw radius,theta,phi;
+  realw cos_theta,sin_theta,cos_phi,sin_phi;
+  realw minus_g,minus_dg;
+  realw rho;
+  realw gxl,gyl,gzl;
+  realw minus_g_over_radius,minus_dg_plus_g_over_radius;
+  realw cos_theta_sq,sin_theta_sq,cos_phi_sq,sin_phi_sq;
+  realw Hxxl,Hyyl,Hzzl,Hxyl,Hxzl,Hyzl;
+  realw sx_l,sy_l,sz_l;
+  realw factor;
 
   // R_EARTH_KM is the radius of the bottom of the oceans (radius of Earth in km)
-  const reald R_EARTH_KM = 6371.0f;
+  //const realw R_EARTH_KM = 6371.0f;
   // uncomment line below for PREM with oceans
-  //const reald R_EARTH_KM = 6368.0f;
+  //const realw R_EARTH_KM = 6368.0f;
 
   // compute non-symmetric terms for gravity
 
@@ -521,18 +306,17 @@
                                          realw* d_c44store,realw* d_c45store,realw* d_c46store,
                                          realw* d_c55store,realw* d_c56store,realw* d_c66store,
                                          int ATTENUATION,
-                                         reald minus_sum_beta,
-                                         reald duxdxl,reald duxdyl,reald duxdzl,
-                                         reald duydxl,reald duydyl,reald duydzl,
-                                         reald duzdxl,reald duzdyl,reald duzdzl,
-                                         reald duxdxl_plus_duydyl,reald duxdxl_plus_duzdzl,reald duydyl_plus_duzdzl,
-                                         reald duxdyl_plus_duydxl,reald duzdxl_plus_duxdzl,reald duzdyl_plus_duydzl,
-                                         reald* sigma_xx,reald* sigma_yy,reald* sigma_zz,
-                                         reald* sigma_xy,reald* sigma_xz,reald* sigma_yz
+                                         realw minus_sum_beta,
+                                         realw duxdxl,realw duxdyl,realw duxdzl,
+                                         realw duydxl,realw duydyl,realw duydzl,
+                                         realw duzdxl,realw duzdyl,realw duzdzl,
+                                         realw duxdyl_plus_duydxl,realw duzdxl_plus_duxdzl,realw duzdyl_plus_duydzl,
+                                         realw* sigma_xx,realw* sigma_yy,realw* sigma_zz,
+                                         realw* sigma_xy,realw* sigma_xz,realw* sigma_yz
                                          ){
 
-  reald c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66;
-  reald mul;
+  realw c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66;
+  realw mul;
 
   c11 = d_c11store[offset];
   c12 = d_c12store[offset];
@@ -592,14 +376,14 @@
 __device__ void compute_element_cm_iso(int offset,
                                        realw* d_kappavstore,realw* d_muvstore,
                                        int ATTENUATION,
-                                       reald one_minus_sum_beta_use,
-                                       reald duxdxl,reald duydyl,reald duzdzl,
-                                       reald duxdxl_plus_duydyl,reald duxdxl_plus_duzdzl,reald duydyl_plus_duzdzl,
-                                       reald duxdyl_plus_duydxl,reald duzdxl_plus_duxdzl,reald duzdyl_plus_duydzl,
-                                       reald* sigma_xx,reald* sigma_yy,reald* sigma_zz,
-                                       reald* sigma_xy,reald* sigma_xz,reald* sigma_yz){
+                                       realw one_minus_sum_beta_use,
+                                       realw duxdxl,realw duydyl,realw duzdzl,
+                                       realw duxdxl_plus_duydyl,realw duxdxl_plus_duzdzl,realw duydyl_plus_duzdzl,
+                                       realw duxdyl_plus_duydxl,realw duzdxl_plus_duxdzl,realw duzdyl_plus_duydzl,
+                                       realw* sigma_xx,realw* sigma_yy,realw* sigma_zz,
+                                       realw* sigma_xy,realw* sigma_xz,realw* sigma_yz){
 
-  reald lambdal,mul,lambdalplus2mul,kappal;
+  realw lambdal,mul,lambdalplus2mul,kappal;
 
   // compute elements with an elastic isotropic rheology
   kappal = d_kappavstore[offset];
@@ -630,31 +414,30 @@
                                         realw* d_kappavstore,realw* d_muvstore,
                                         realw* d_kappahstore,realw* d_muhstore,realw* d_eta_anisostore,
                                         int ATTENUATION,
-                                        reald one_minus_sum_beta_use,
-                                        reald duxdxl,reald duxdyl,reald duxdzl,
-                                        reald duydxl,reald duydyl,reald duydzl,
-                                        reald duzdxl,reald duzdyl,reald duzdzl,
-                                        reald duxdxl_plus_duydyl,reald duxdxl_plus_duzdzl,reald duydyl_plus_duzdzl,
-                                        reald duxdyl_plus_duydxl,reald duzdxl_plus_duxdzl,reald duzdyl_plus_duydzl,
+                                        realw one_minus_sum_beta_use,
+                                        realw duxdxl,realw duxdyl,realw duxdzl,
+                                        realw duydxl,realw duydyl,realw duydzl,
+                                        realw duzdxl,realw duzdyl,realw duzdzl,
+                                        realw duxdyl_plus_duydxl,realw duzdxl_plus_duxdzl,realw duzdyl_plus_duydzl,
                                         int iglob,int NGLOB,
                                         realw* d_ystore, realw* d_zstore,
-                                        reald* sigma_xx,reald* sigma_yy,reald* sigma_zz,
-                                        reald* sigma_xy,reald* sigma_xz,reald* sigma_yz){
+                                        realw* sigma_xx,realw* sigma_yy,realw* sigma_zz,
+                                        realw* sigma_xy,realw* sigma_xz,realw* sigma_yz){
 
-  reald kappavl,muvl,kappahl,muhl;
-  reald rhovpvsq,rhovphsq,rhovsvsq,rhovshsq,eta_aniso;
-  reald costheta,sintheta,cosphi,sinphi;
-  reald costhetasq,sinthetasq,cosphisq,sinphisq,costhetafour,sinthetafour,cosphifour,sinphifour;
-  reald costwotheta,sintwotheta,costwophi,sintwophi,cosfourtheta,cosfourphi;
-  reald costwothetasq,costwophisq,sintwophisq;
-  reald etaminone,twoetaminone;
-  reald two_eta_aniso,four_eta_aniso,six_eta_aniso;
-  reald two_rhovsvsq,two_rhovshsq; // two_rhovpvsq,two_rhovphsq
-  reald four_rhovsvsq,four_rhovshsq; // four_rhovpvsq,four_rhovphsq
-  reald c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66;
+  realw kappavl,muvl,kappahl,muhl;
+  realw rhovpvsq,rhovphsq,rhovsvsq,rhovshsq,eta_aniso;
+  realw costheta,sintheta,cosphi,sinphi;
+  realw costhetasq,sinthetasq,cosphisq,sinphisq,costhetafour,sinthetafour,cosphifour,sinphifour;
+  realw costwotheta,sintwotheta,costwophi,sintwophi,cosfourtheta,cosfourphi;
+  realw costwothetasq,costwophisq,sintwophisq;
+  realw etaminone,twoetaminone;
+  realw two_eta_aniso,four_eta_aniso,six_eta_aniso;
+  realw two_rhovsvsq,two_rhovshsq; // two_rhovpvsq,two_rhovphsq
+  realw four_rhovsvsq,four_rhovshsq; // four_rhovpvsq,four_rhovphsq
+  realw c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66;
 
   // cosine and sine function in CUDA only supported for float
-  reald theta,phi;
+  realw theta,phi;
 
   // use Kappa and mu from transversely isotropic model
   kappavl = d_kappavstore[offset];
@@ -699,10 +482,14 @@
     //sinphi = sinf(phi);
     sincosf(phi, &sinphi, &cosphi);
 
-    costwotheta = cosf(2.0f * theta);
-    sintwotheta = sinf(2.0f * theta);
-    costwophi = cosf(2.0f * phi);
-    sintwophi = sinf(2.0f * phi);
+    //costwotheta = cosf(2.0f * theta);
+    //sintwotheta = sinf(2.0f * theta);
+    sincosf(2.0f * theta, &sintwotheta, &costwotheta);
+
+    //costwophi = cosf(2.0f * phi);
+    //sintwophi = sinf(2.0f * phi);
+    sincosf(2.0f * phi, &sintwophi, &costwophi);
+
     cosfourtheta = cosf(4.0f * theta);
     cosfourphi = cosf(4.0f * phi);
   }else{
@@ -911,6 +698,9 @@
                                           realw* d_xix, realw* d_xiy, realw* d_xiz,
                                           realw* d_etax, realw* d_etay, realw* d_etaz,
                                           realw* d_gammax, realw* d_gammay, realw* d_gammaz,
+                                          realw* d_hprime_xx,
+                                          realw* d_hprimewgll_xx, realw* d_hprimewgll_yy, realw* d_hprimewgll_zz,
+                                          realw* d_wgllwgll_xy,realw* d_wgllwgll_xz,realw* d_wgllwgll_yz,
                                           realw* d_kappavstore, realw* d_muvstore,
                                           realw* d_kappahstore, realw* d_muhstore,
                                           realw* d_eta_anisostore,
@@ -947,11 +737,6 @@
   /* int bx = blockIdx.x; */
   int tx = threadIdx.x;
 
-  //const int NGLLX = 5;
-  // const int NGLL2 = 25;
-  //const int NGLL3 = NGLL3;
-  const int NGLL3_ALIGN = NGLL3_PADDED;
-
   int K = (tx/NGLL2);
   int J = ((tx-K*NGLL2)/NGLLX);
   int I = (tx-K*NGLL2-J*NGLLX);
@@ -960,680 +745,670 @@
   int iglob = 0;
   int working_element;
 
-  reald tempx1l,tempx2l,tempx3l,tempy1l,tempy2l,tempy3l,tempz1l,tempz2l,tempz3l;
-  reald xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
-  reald duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl;
-  reald duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl;
-  reald duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl;
+  realw tempx1l,tempx2l,tempx3l,tempy1l,tempy2l,tempy3l,tempz1l,tempz2l,tempz3l;
+  realw xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
+  realw duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl;
+  realw duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl;
+  realw duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl;
 
-  reald tempx1l_att,tempx2l_att,tempx3l_att,tempy1l_att,tempy2l_att,tempy3l_att,tempz1l_att,tempz2l_att,tempz3l_att;
-  reald duxdxl_att,duxdyl_att,duxdzl_att,duydxl_att,duydyl_att,duydzl_att,duzdxl_att,duzdyl_att,duzdzl_att;
-  reald duxdyl_plus_duydxl_att,duzdxl_plus_duxdzl_att,duzdyl_plus_duydzl_att;
+  realw tempx1l_att,tempx2l_att,tempx3l_att,tempy1l_att,tempy2l_att,tempy3l_att,tempz1l_att,tempz2l_att,tempz3l_att;
+  realw duxdxl_att,duxdyl_att,duxdzl_att,duydxl_att,duydyl_att,duydzl_att,duzdxl_att,duzdyl_att,duzdzl_att;
+  realw duxdyl_plus_duydxl_att,duzdxl_plus_duxdzl_att,duzdyl_plus_duydzl_att;
 
-  reald fac1,fac2,fac3;
-  reald minus_sum_beta,one_minus_sum_beta_use;
+  realw fac1,fac2,fac3;
+  realw minus_sum_beta,one_minus_sum_beta_use;
 
-  reald sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz;
-  reald epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc;
-  //reald c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66;
-  reald sum_terms1,sum_terms2,sum_terms3;
+  realw sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz;
+  realw epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc;
+  realw sum_terms1,sum_terms2,sum_terms3;
 
   // gravity variables
-  reald sigma_yx,sigma_zx,sigma_zy;
-  reald rho_s_H1,rho_s_H2,rho_s_H3;
+  realw sigma_yx,sigma_zx,sigma_zy;
+  realw rho_s_H1,rho_s_H2,rho_s_H3;
 
 #ifndef MANUALLY_UNROLLED_LOOPS
     int l;
-    realw hp1,hp2,hp3;
 #endif
 
-    __shared__ reald s_dummyx_loc[NGLL3];
-    __shared__ reald s_dummyy_loc[NGLL3];
-    __shared__ reald s_dummyz_loc[NGLL3];
+  __shared__ realw s_dummyx_loc[NGLL3];
+  __shared__ realw s_dummyy_loc[NGLL3];
+  __shared__ realw s_dummyz_loc[NGLL3];
 
-    __shared__ reald s_dummyx_loc_att[NGLL3];
-    __shared__ reald s_dummyy_loc_att[NGLL3];
-    __shared__ reald s_dummyz_loc_att[NGLL3];
+  __shared__ realw s_dummyx_loc_att[NGLL3];
+  __shared__ realw s_dummyy_loc_att[NGLL3];
+  __shared__ realw s_dummyz_loc_att[NGLL3];
 
-    __shared__ reald s_tempx1[NGLL3];
-    __shared__ reald s_tempx2[NGLL3];
-    __shared__ reald s_tempx3[NGLL3];
-    __shared__ reald s_tempy1[NGLL3];
-    __shared__ reald s_tempy2[NGLL3];
-    __shared__ reald s_tempy3[NGLL3];
-    __shared__ reald s_tempz1[NGLL3];
-    __shared__ reald s_tempz2[NGLL3];
-    __shared__ reald s_tempz3[NGLL3];
+  __shared__ realw s_tempx1[NGLL3];
+  __shared__ realw s_tempx2[NGLL3];
+  __shared__ realw s_tempx3[NGLL3];
+  __shared__ realw s_tempy1[NGLL3];
+  __shared__ realw s_tempy2[NGLL3];
+  __shared__ realw s_tempy3[NGLL3];
+  __shared__ realw s_tempz1[NGLL3];
+  __shared__ realw s_tempz2[NGLL3];
+  __shared__ realw s_tempz3[NGLL3];
 
+  __shared__ realw sh_hprime_xx[NGLL2];
+
 // use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads,
 // because we used memory padding from NGLL^3 = 125 to 128 to get coalescent memory accesses
-    active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0;
+  active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0;
 
 // copy from global memory to shared memory
 // each thread writes one of the NGLL^3 = 125 data points
-    if (active) {
+  if (active) {
 
 #ifdef USE_MESH_COLORING_GPU
-      working_element = bx;
+    working_element = bx;
 #else
-      //mesh coloring
-      if( use_mesh_coloring_gpu ){
-        working_element = bx;
-      }else{
-        // iphase-1 and working_element-1 for Fortran->C array conventions
-        working_element = d_phase_ispec_inner[bx + num_phase_ispec*(d_iphase-1)]-1;
-      }
+    //mesh coloring
+    if( use_mesh_coloring_gpu ){
+      working_element = bx;
+    }else{
+      // iphase-1 and working_element-1 for Fortran->C array conventions
+      working_element = d_phase_ispec_inner[bx + num_phase_ispec*(d_iphase-1)]-1;
+    }
 #endif
 
-      // iglob = d_ibool[working_element*NGLL3_ALIGN + tx]-1;
-      iglob = d_ibool[working_element*NGLL3 + tx]-1;
+    // iglob = d_ibool[working_element*NGLL3_PADDED + tx]-1;
+    iglob = d_ibool[working_element*NGLL3 + tx]-1;
 
-#ifdef USE_TEXTURES
-      s_dummyx_loc[tx] = tex1Dfetch(tex_displ, iglob);
-      s_dummyy_loc[tx] = tex1Dfetch(tex_displ, iglob + NGLOB);
-      s_dummyz_loc[tx] = tex1Dfetch(tex_displ, iglob + 2*NGLOB);
+#ifdef USE_TEXTURES_FIELDS
+    s_dummyx_loc[tx] = tex1Dfetch(d_displ_cm_tex, iglob*3);
+    s_dummyy_loc[tx] = tex1Dfetch(d_displ_cm_tex, iglob*3 + 1);
+    s_dummyz_loc[tx] = tex1Dfetch(d_displ_cm_tex, iglob*3 + 2);
 #else
-      // changing iglob indexing to match fortran row changes fast style
-      s_dummyx_loc[tx] = d_displ[iglob*3];
-      s_dummyy_loc[tx] = d_displ[iglob*3 + 1];
-      s_dummyz_loc[tx] = d_displ[iglob*3 + 2];
+    // changing iglob indexing to match fortran row changes fast style
+    s_dummyx_loc[tx] = d_displ[iglob*3];
+    s_dummyy_loc[tx] = d_displ[iglob*3 + 1];
+    s_dummyz_loc[tx] = d_displ[iglob*3 + 2];
 #endif
 
-      if(ATTENUATION){
-        if(ATTENUATION_NEW){
-          // takes new routines
-          // use first order Taylor expansion of displacement for local storage of stresses
-          // at this current time step, to fix attenuation in a consistent way
-#ifdef USE_TEXTURES
-          s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * tex1Dfetch(tex_veloc, iglob);
-          s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * tex1Dfetch(tex_veloc, iglob + NGLOB);
-          s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * tex1Dfetch(tex_veloc, iglob + 2*NGLOB);
+    if(ATTENUATION){
+      if(ATTENUATION_NEW){
+        // takes new routines
+        // use first order Taylor expansion of displacement for local storage of stresses
+        // at this current time step, to fix attenuation in a consistent way
+#ifdef USE_TEXTURES_FIELDS
+        s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * d_veloc[iglob*3]; // Taylor term uses velocity, not the displacement texture
+        s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * d_veloc[iglob*3 + 1]; // same global read as the #else branch
+        s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * d_veloc[iglob*3 + 2]; // (no velocity texture is visible in this scope)
 #else
-          s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * d_veloc[iglob*3];
-          s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * d_veloc[iglob*3 + 1];
-          s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * d_veloc[iglob*3 + 2];
+        s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * d_veloc[iglob*3];
+        s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * d_veloc[iglob*3 + 1];
+        s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * d_veloc[iglob*3 + 2];
 #endif
-        }
-        else{
+      }
+      else{
           // takes old routines
           s_dummyx_loc_att[tx] = s_dummyx_loc[tx];
           s_dummyy_loc_att[tx] = s_dummyy_loc[tx];
           s_dummyz_loc_att[tx] = s_dummyz_loc[tx];
-        }
       }
     }
+  }
 
+  if (tx < NGLL2) {
+#ifdef USE_TEXTURES_CONSTANTS
+    sh_hprime_xx[tx] = tex1Dfetch(d_hprime_xx_cm_tex,tx);
+#else
+    sh_hprime_xx[tx] = d_hprime_xx[tx];
+#endif
+  }
+
 // synchronize all the threads (one thread for each of the NGLL grid points of the
 // current spectral element) because we need the whole element to be ready in order
 // to be able to compute the matrix products along cut planes of the 3D element below
-    __syncthreads();
+  __syncthreads();
 
-#ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
+  if (active) {
 
-    if (active) {
-
 #ifndef MANUALLY_UNROLLED_LOOPS
 
-      tempx1l = 0.f;
-      tempx2l = 0.f;
-      tempx3l = 0.f;
+    tempx1l = 0.f;
+    tempx2l = 0.f;
+    tempx3l = 0.f;
 
-      tempy1l = 0.f;
-      tempy2l = 0.f;
-      tempy3l = 0.f;
+    tempy1l = 0.f;
+    tempy2l = 0.f;
+    tempy3l = 0.f;
 
-      tempz1l = 0.f;
-      tempz2l = 0.f;
-      tempz3l = 0.f;
+    tempz1l = 0.f;
+    tempz2l = 0.f;
+    tempz3l = 0.f;
 
-      for (l=0;l<NGLLX;l++) {
-          hp1 = d_hprime_xx[l*NGLLX+I];
-          offset = K*NGLL2+J*NGLLX+l;
-          tempx1l += s_dummyx_loc[offset]*hp1;
-          tempy1l += s_dummyy_loc[offset]*hp1;
-          tempz1l += s_dummyz_loc[offset]*hp1;
+    for (l=0;l<NGLLX;l++) {
+        fac1 = sh_hprime_xx[l*NGLLX+I];
+        tempx1l += s_dummyx_loc[K*NGLL2+J*NGLLX+l]*fac1;
+        tempy1l += s_dummyy_loc[K*NGLL2+J*NGLLX+l]*fac1;
+        tempz1l += s_dummyz_loc[K*NGLL2+J*NGLLX+l]*fac1;
 
-          hp2 = d_hprime_xx[l*NGLLX+J];
-          offset = K*NGLL2+l*NGLLX+I;
-          tempx2l += s_dummyx_loc[offset]*hp2;
-          tempy2l += s_dummyy_loc[offset]*hp2;
-          tempz2l += s_dummyz_loc[offset]*hp2;
+        fac2 = sh_hprime_xx[l*NGLLX+J];
+        tempx2l += s_dummyx_loc[K*NGLL2+l*NGLLX+I]*fac2;
+        tempy2l += s_dummyy_loc[K*NGLL2+l*NGLLX+I]*fac2;
+        tempz2l += s_dummyz_loc[K*NGLL2+l*NGLLX+I]*fac2;
 
-          hp3 = d_hprime_xx[l*NGLLX+K];
-          offset = l*NGLL2+J*NGLLX+I;
-          tempx3l += s_dummyx_loc[offset]*hp3;
-          tempy3l += s_dummyy_loc[offset]*hp3;
-          tempz3l += s_dummyz_loc[offset]*hp3;
+        fac3 = sh_hprime_xx[l*NGLLX+K];
+        tempx3l += s_dummyx_loc[l*NGLL2+J*NGLLX+I]*fac3;
+        tempy3l += s_dummyy_loc[l*NGLL2+J*NGLLX+I]*fac3;
+        tempz3l += s_dummyz_loc[l*NGLL2+J*NGLLX+I]*fac3;
+    }
 
-      }
 
+    if( ATTENUATION){
+      // temporary variables used for fixing attenuation in a consistent way
+      tempx1l_att = 0.f;
+      tempx2l_att = 0.f;
+      tempx3l_att = 0.f;
 
-      if( ATTENUATION){
-        // temporary variables used for fixing attenuation in a consistent way
-        tempx1l_att = 0.f;
-        tempx2l_att = 0.f;
-        tempx3l_att = 0.f;
+      tempy1l_att = 0.f;
+      tempy2l_att = 0.f;
+      tempy3l_att = 0.f;
 
-        tempy1l_att = 0.f;
-        tempy2l_att = 0.f;
-        tempy3l_att = 0.f;
+      tempz1l_att = 0.f;
+      tempz2l_att = 0.f;
+      tempz3l_att = 0.f;
 
-        tempz1l_att = 0.f;
-        tempz2l_att = 0.f;
-        tempz3l_att = 0.f;
+      for (l=0;l<NGLLX;l++) {
+        fac1 = sh_hprime_xx[l*NGLLX+I];
+        tempx1l_att += s_dummyx_loc_att[K*NGLL2+J*NGLLX+l]*fac1;
+        tempy1l_att += s_dummyy_loc_att[K*NGLL2+J*NGLLX+l]*fac1;
+        tempz1l_att += s_dummyz_loc_att[K*NGLL2+J*NGLLX+l]*fac1;
 
-        for (l=0;l<NGLLX;l++) {
-                hp1 = d_hprime_xx[l*NGLLX+I];
-                offset = K*NGLL2+J*NGLLX+l;
-                tempx1l_att += s_dummyx_loc_att[offset]*hp1;
-                tempy1l_att += s_dummyy_loc_att[offset]*hp1;
-                tempz1l_att += s_dummyz_loc_att[offset]*hp1;
+        fac2 = sh_hprime_xx[l*NGLLX+J];
+        tempx2l_att += s_dummyx_loc_att[K*NGLL2+l*NGLLX+I]*fac2;
+        tempy2l_att += s_dummyy_loc_att[K*NGLL2+l*NGLLX+I]*fac2;
+        tempz2l_att += s_dummyz_loc_att[K*NGLL2+l*NGLLX+I]*fac2;
 
-                hp2 = d_hprime_xx[l*NGLLX+J];
-                offset = K*NGLL2+l*NGLLX+I;
-                tempx2l_att += s_dummyx_loc_att[offset]*hp2;
-                tempy2l_att += s_dummyy_loc_att[offset]*hp2;
-                tempz2l_att += s_dummyz_loc_att[offset]*hp2;
-
-                hp3 = d_hprime_xx[l*NGLLX+K];
-                offset = l*NGLL2+J*NGLLX+I;
-                tempx3l_att += s_dummyx_loc_att[offset]*hp3;
-                tempy3l_att += s_dummyy_loc_att[offset]*hp3;
-                tempz3l_att += s_dummyz_loc_att[offset]*hp3;
-
-        }
+        fac3 = sh_hprime_xx[l*NGLLX+K];
+        tempx3l_att += s_dummyx_loc_att[l*NGLL2+J*NGLLX+I]*fac3;
+        tempy3l_att += s_dummyy_loc_att[l*NGLL2+J*NGLLX+I]*fac3;
+        tempz3l_att += s_dummyz_loc_att[l*NGLL2+J*NGLLX+I]*fac3;
       }
+    }
 #else
 
-      tempx1l = s_dummyx_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
-              + s_dummyx_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
-              + s_dummyx_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
-              + s_dummyx_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
-              + s_dummyx_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+    tempx1l = s_dummyx_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+            + s_dummyx_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+            + s_dummyx_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+            + s_dummyx_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+            + s_dummyx_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
 
-      tempy1l = s_dummyy_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
-              + s_dummyy_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
-              + s_dummyy_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
-              + s_dummyy_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
-              + s_dummyy_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+    tempy1l = s_dummyy_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+            + s_dummyy_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+            + s_dummyy_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+            + s_dummyy_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+            + s_dummyy_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
 
-      tempz1l = s_dummyz_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
-              + s_dummyz_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
-              + s_dummyz_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
-              + s_dummyz_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
-              + s_dummyz_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+    tempz1l = s_dummyz_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+            + s_dummyz_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+            + s_dummyz_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+            + s_dummyz_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+            + s_dummyz_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
 
-      tempx2l = s_dummyx_loc[K*NGLL2+I]*d_hprime_xx[J]
-              + s_dummyx_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
-              + s_dummyx_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
-              + s_dummyx_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
-              + s_dummyx_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+    tempx2l = s_dummyx_loc[K*NGLL2+I]*d_hprime_xx[J]
+            + s_dummyx_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+            + s_dummyx_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+            + s_dummyx_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+            + s_dummyx_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
 
-      tempy2l = s_dummyy_loc[K*NGLL2+I]*d_hprime_xx[J]
-              + s_dummyy_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
-              + s_dummyy_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
-              + s_dummyy_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
-              + s_dummyy_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+    tempy2l = s_dummyy_loc[K*NGLL2+I]*d_hprime_xx[J]
+            + s_dummyy_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+            + s_dummyy_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+            + s_dummyy_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+            + s_dummyy_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
 
-      tempz2l = s_dummyz_loc[K*NGLL2+I]*d_hprime_xx[J]
-              + s_dummyz_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
-              + s_dummyz_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
-              + s_dummyz_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
-              + s_dummyz_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+    tempz2l = s_dummyz_loc[K*NGLL2+I]*d_hprime_xx[J]
+            + s_dummyz_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+            + s_dummyz_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+            + s_dummyz_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+            + s_dummyz_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
 
-      tempx3l = s_dummyx_loc[J*NGLLX+I]*d_hprime_xx[K]
-              + s_dummyx_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
-              + s_dummyx_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
-              + s_dummyx_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
-              + s_dummyx_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+    tempx3l = s_dummyx_loc[J*NGLLX+I]*d_hprime_xx[K]
+            + s_dummyx_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+            + s_dummyx_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+            + s_dummyx_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+            + s_dummyx_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
 
-      tempy3l = s_dummyy_loc[J*NGLLX+I]*d_hprime_xx[K]
-              + s_dummyy_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
-              + s_dummyy_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
-              + s_dummyy_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
-              + s_dummyy_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+    tempy3l = s_dummyy_loc[J*NGLLX+I]*d_hprime_xx[K]
+            + s_dummyy_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+            + s_dummyy_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+            + s_dummyy_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+            + s_dummyy_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
 
-      tempz3l = s_dummyz_loc[J*NGLLX+I]*d_hprime_xx[K]
-              + s_dummyz_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
-              + s_dummyz_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
-              + s_dummyz_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
-              + s_dummyz_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+    tempz3l = s_dummyz_loc[J*NGLLX+I]*d_hprime_xx[K]
+            + s_dummyz_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+            + s_dummyz_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+            + s_dummyz_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+            + s_dummyz_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
 
-      if( ATTENUATION){
-        // temporary variables used for fixing attenuation in a consistent way
-        tempx1l_att = s_dummyx_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
-          + s_dummyx_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
-          + s_dummyx_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
-          + s_dummyx_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
-          + s_dummyx_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+    if( ATTENUATION){
+      // temporary variables used for fixing attenuation in a consistent way
+      tempx1l_att = s_dummyx_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+        + s_dummyx_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+        + s_dummyx_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+        + s_dummyx_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+        + s_dummyx_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
 
-        tempy1l_att = s_dummyy_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
-          + s_dummyy_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
-          + s_dummyy_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
-          + s_dummyy_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
-          + s_dummyy_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+      tempy1l_att = s_dummyy_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+        + s_dummyy_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+        + s_dummyy_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+        + s_dummyy_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+        + s_dummyy_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
 
-        tempz1l_att = s_dummyz_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
-          + s_dummyz_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
-          + s_dummyz_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
-          + s_dummyz_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
-          + s_dummyz_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+      tempz1l_att = s_dummyz_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+        + s_dummyz_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+        + s_dummyz_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+        + s_dummyz_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+        + s_dummyz_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
 
-        tempx2l_att = s_dummyx_loc_att[K*NGLL2+I]*d_hprime_xx[J]
-          + s_dummyx_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
-          + s_dummyx_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
-          + s_dummyx_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
-          + s_dummyx_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+      tempx2l_att = s_dummyx_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+        + s_dummyx_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+        + s_dummyx_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+        + s_dummyx_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+        + s_dummyx_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
 
-        tempy2l_att = s_dummyy_loc_att[K*NGLL2+I]*d_hprime_xx[J]
-          + s_dummyy_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
-          + s_dummyy_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
-          + s_dummyy_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
-          + s_dummyy_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+      tempy2l_att = s_dummyy_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+        + s_dummyy_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+        + s_dummyy_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+        + s_dummyy_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+        + s_dummyy_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
 
-        tempz2l_att = s_dummyz_loc_att[K*NGLL2+I]*d_hprime_xx[J]
-          + s_dummyz_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
-          + s_dummyz_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
-          + s_dummyz_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
-          + s_dummyz_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+      tempz2l_att = s_dummyz_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+        + s_dummyz_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+        + s_dummyz_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+        + s_dummyz_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+        + s_dummyz_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
 
-        tempx3l_att = s_dummyx_loc_att[J*NGLLX+I]*d_hprime_xx[K]
-          + s_dummyx_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
-          + s_dummyx_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
-          + s_dummyx_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
-          + s_dummyx_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+      tempx3l_att = s_dummyx_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+        + s_dummyx_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+        + s_dummyx_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+        + s_dummyx_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+        + s_dummyx_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
 
-        tempy3l_att = s_dummyy_loc_att[J*NGLLX+I]*d_hprime_xx[K]
-          + s_dummyy_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
-          + s_dummyy_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
-          + s_dummyy_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
-          + s_dummyy_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+      tempy3l_att = s_dummyy_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+        + s_dummyy_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+        + s_dummyy_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+        + s_dummyy_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+        + s_dummyy_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
 
-        tempz3l_att = s_dummyz_loc_att[J*NGLLX+I]*d_hprime_xx[K]
-          + s_dummyz_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
-          + s_dummyz_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
-          + s_dummyz_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
-          + s_dummyz_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
-      }
+      tempz3l_att = s_dummyz_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+        + s_dummyz_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+        + s_dummyz_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+        + s_dummyz_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+        + s_dummyz_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+    }
 
 #endif
 
-      // compute derivatives of ux, uy and uz with respect to x, y and z
-      offset = working_element*NGLL3_ALIGN + tx;
+    // compute derivatives of ux, uy and uz with respect to x, y and z
+    offset = working_element*NGLL3_PADDED + tx;
 
-      xixl = d_xix[offset];
-      xiyl = d_xiy[offset];
-      xizl = d_xiz[offset];
-      etaxl = d_etax[offset];
-      etayl = d_etay[offset];
-      etazl = d_etaz[offset];
-      gammaxl = d_gammax[offset];
-      gammayl = d_gammay[offset];
-      gammazl = d_gammaz[offset];
+    xixl = d_xix[offset];
+    xiyl = d_xiy[offset];
+    xizl = d_xiz[offset];
+    etaxl = d_etax[offset];
+    etayl = d_etay[offset];
+    etazl = d_etaz[offset];
+    gammaxl = d_gammax[offset];
+    gammayl = d_gammay[offset];
+    gammazl = d_gammaz[offset];
 
-      duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l;
-      duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l;
-      duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l;
+    duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l;
+    duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l;
+    duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l;
 
-      duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l;
-      duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l;
-      duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l;
+    duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l;
+    duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l;
+    duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l;
 
-      duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l;
-      duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l;
-      duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l;
+    duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l;
+    duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l;
+    duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l;
 
-      // precompute some sums to save CPU time
-      duxdxl_plus_duydyl = duxdxl + duydyl;
-      duxdxl_plus_duzdzl = duxdxl + duzdzl;
-      duydyl_plus_duzdzl = duydyl + duzdzl;
-      duxdyl_plus_duydxl = duxdyl + duydxl;
-      duzdxl_plus_duxdzl = duzdxl + duxdzl;
-      duzdyl_plus_duydzl = duzdyl + duydzl;
+    // precompute some sums to save CPU time
+    duxdxl_plus_duydyl = duxdxl + duydyl;
+    duxdxl_plus_duzdzl = duxdxl + duzdzl;
+    duydyl_plus_duzdzl = duydyl + duzdzl;
+    duxdyl_plus_duydxl = duxdyl + duydxl;
+    duzdxl_plus_duxdzl = duzdxl + duxdzl;
+    duzdyl_plus_duydzl = duzdyl + duydzl;
 
-      if( ATTENUATION){
-        // temporary variables used for fixing attenuation in a consistent way
-        duxdxl_att = xixl*tempx1l_att + etaxl*tempx2l_att + gammaxl*tempx3l_att;
-        duxdyl_att = xiyl*tempx1l_att + etayl*tempx2l_att + gammayl*tempx3l_att;
-        duxdzl_att = xizl*tempx1l_att + etazl*tempx2l_att + gammazl*tempx3l_att;
+    if( ATTENUATION){
+      // temporary variables used for fixing attenuation in a consistent way
+      duxdxl_att = xixl*tempx1l_att + etaxl*tempx2l_att + gammaxl*tempx3l_att;
+      duxdyl_att = xiyl*tempx1l_att + etayl*tempx2l_att + gammayl*tempx3l_att;
+      duxdzl_att = xizl*tempx1l_att + etazl*tempx2l_att + gammazl*tempx3l_att;
 
-        duydxl_att = xixl*tempy1l_att + etaxl*tempy2l_att + gammaxl*tempy3l_att;
-        duydyl_att = xiyl*tempy1l_att + etayl*tempy2l_att + gammayl*tempy3l_att;
-        duydzl_att = xizl*tempy1l_att + etazl*tempy2l_att + gammazl*tempy3l_att;
+      duydxl_att = xixl*tempy1l_att + etaxl*tempy2l_att + gammaxl*tempy3l_att;
+      duydyl_att = xiyl*tempy1l_att + etayl*tempy2l_att + gammayl*tempy3l_att;
+      duydzl_att = xizl*tempy1l_att + etazl*tempy2l_att + gammazl*tempy3l_att;
 
-        duzdxl_att = xixl*tempz1l_att + etaxl*tempz2l_att + gammaxl*tempz3l_att;
-        duzdyl_att = xiyl*tempz1l_att + etayl*tempz2l_att + gammayl*tempz3l_att;
-        duzdzl_att = xizl*tempz1l_att + etazl*tempz2l_att + gammazl*tempz3l_att;
+      duzdxl_att = xixl*tempz1l_att + etaxl*tempz2l_att + gammaxl*tempz3l_att;
+      duzdyl_att = xiyl*tempz1l_att + etayl*tempz2l_att + gammayl*tempz3l_att;
+      duzdzl_att = xizl*tempz1l_att + etazl*tempz2l_att + gammazl*tempz3l_att;
 
-        // precompute some sums to save CPU time
-        duxdyl_plus_duydxl_att = duxdyl_att + duydxl_att;
-        duzdxl_plus_duxdzl_att = duzdxl_att + duxdzl_att;
-        duzdyl_plus_duydzl_att = duzdyl_att + duydzl_att;
+      // precompute some sums to save CPU time
+      duxdyl_plus_duydxl_att = duxdyl_att + duydxl_att;
+      duzdxl_plus_duxdzl_att = duzdxl_att + duxdzl_att;
+      duzdyl_plus_duydzl_att = duzdyl_att + duydzl_att;
 
-        // computes deviatoric strain attenuation and/or for kernel calculations
-        if(COMPUTE_AND_STORE_STRAIN) {
-          realw templ = 0.33333333333333333333f * (duxdxl_att + duydyl_att + duzdzl_att); // 1./3. = 0.33333
+      // computes deviatoric strain attenuation and/or for kernel calculations
+      if(COMPUTE_AND_STORE_STRAIN) {
+        realw templ = 0.33333333333333333333f * (duxdxl_att + duydyl_att + duzdzl_att); // 1./3. = 0.33333
 
-          // local storage: stresses at this current time step
-          epsilondev_xx_loc = duxdxl_att - templ;
-          epsilondev_yy_loc = duydyl_att - templ;
-          epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl_att;
-          epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl_att;
-          epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl_att;
+        // local storage: stresses at this current time step
+        epsilondev_xx_loc = duxdxl_att - templ;
+        epsilondev_yy_loc = duydyl_att - templ;
+        epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl_att;
+        epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl_att;
+        epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl_att;
 
-          if(NSPEC_CRUST_MANTLE_STRAIN_ONLY == 1) {
-            epsilon_trace_over_3[tx] = templ;
-          }else{
-            epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
-          }
+        if(NSPEC_CRUST_MANTLE_STRAIN_ONLY == 1) {
+          epsilon_trace_over_3[tx] = templ;
+        }else{
+          epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
         }
-      }else{
-        // computes deviatoric strain attenuation and/or for kernel calculations
-        if(COMPUTE_AND_STORE_STRAIN) {
-          realw templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
+      }
+    }else{
+      // computes deviatoric strain attenuation and/or for kernel calculations
+      if(COMPUTE_AND_STORE_STRAIN) {
+        realw templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
 
-          // local storage: stresses at this current time step
-          epsilondev_xx_loc = duxdxl - templ;
-          epsilondev_yy_loc = duydyl - templ;
-          epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl;
-          epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl;
-          epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl;
+        // local storage: stresses at this current time step
+        epsilondev_xx_loc = duxdxl - templ;
+        epsilondev_yy_loc = duydyl - templ;
+        epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl;
+        epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl;
+        epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl;
 
-          if(NSPEC_CRUST_MANTLE_STRAIN_ONLY == 1) {
-            epsilon_trace_over_3[tx] = templ;
-          }else{
-            epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
-          }
+        if(NSPEC_CRUST_MANTLE_STRAIN_ONLY == 1) {
+          epsilon_trace_over_3[tx] = templ;
+        }else{
+          epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
         }
       }
+    }
 
-      // attenuation
-      if(ATTENUATION){
-        // use unrelaxed parameters if attenuation
-        if( ATTENUATION_3D ){
-          one_minus_sum_beta_use = one_minus_sum_beta[tx+working_element*NGLL3]; // (i,j,k,ispec)
-        }else{
-          one_minus_sum_beta_use = one_minus_sum_beta[working_element]; // (1,1,1,ispec)        
-        }
-        minus_sum_beta = one_minus_sum_beta_use - 1.0f;
+    // attenuation
+    if(ATTENUATION){
+      // use unrelaxed parameters if attenuation
+      if( ATTENUATION_3D ){
+        one_minus_sum_beta_use = one_minus_sum_beta[tx+working_element*NGLL3]; // (i,j,k,ispec)
+      }else{
+        one_minus_sum_beta_use = one_minus_sum_beta[working_element]; // (1,1,1,ispec)
       }
+      minus_sum_beta = one_minus_sum_beta_use - 1.0f;
+    }
 
-      // computes stresses
-      if(ANISOTROPY){
-        // full anisotropic case, stress calculations
-        compute_element_cm_aniso(offset,
-                              d_c11store,d_c12store,d_c13store,d_c14store,d_c15store,d_c16store,d_c22store,
-                              d_c23store,d_c24store,d_c25store,d_c26store,d_c33store,d_c34store,d_c35store,
-                              d_c36store,d_c44store,d_c45store,d_c46store,d_c55store,d_c56store,d_c66store,
-                              ATTENUATION,
-                              minus_sum_beta,
-                              duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl,
-                              duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl,
-                              duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl,
-                              &sigma_xx,&sigma_yy,&sigma_zz,
-                              &sigma_xy,&sigma_xz,&sigma_yz);
+    // computes stresses
+    if(ANISOTROPY){
+      // full anisotropic case, stress calculations
+      compute_element_cm_aniso(offset,
+                            d_c11store,d_c12store,d_c13store,d_c14store,d_c15store,d_c16store,d_c22store,
+                            d_c23store,d_c24store,d_c25store,d_c26store,d_c33store,d_c34store,d_c35store,
+                            d_c36store,d_c44store,d_c45store,d_c46store,d_c55store,d_c56store,d_c66store,
+                            ATTENUATION,
+                            minus_sum_beta,
+                            duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl,
+                            duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl,
+                            &sigma_xx,&sigma_yy,&sigma_zz,
+                            &sigma_xy,&sigma_xz,&sigma_yz);
 
+    }else{
+      if( ! d_ispec_is_tiso[working_element] ){
+        // isotropic case
+        compute_element_cm_iso(offset,
+                            d_kappavstore,d_muvstore,
+                            ATTENUATION,
+                            one_minus_sum_beta_use,
+                            duxdxl,duydyl,duzdzl,
+                            duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl,
+                            duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl,
+                            &sigma_xx,&sigma_yy,&sigma_zz,
+                            &sigma_xy,&sigma_xz,&sigma_yz);
       }else{
-        if( ! d_ispec_is_tiso[working_element] ){
-          // isotropic case
-          compute_element_cm_iso(offset,
+        // transverse isotropy
+        compute_element_cm_tiso(offset,
                               d_kappavstore,d_muvstore,
+                              d_kappahstore,d_muhstore,d_eta_anisostore,
                               ATTENUATION,
                               one_minus_sum_beta_use,
-                              duxdxl,duydyl,duzdzl,
-                              duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl,
+                              duxdxl,duxdyl,duxdzl,
+                              duydxl,duydyl,duydzl,
+                              duzdxl,duzdyl,duzdzl,
                               duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl,
+                              iglob, NGLOB,
+                              d_ystore,d_zstore,
                               &sigma_xx,&sigma_yy,&sigma_zz,
                               &sigma_xy,&sigma_xz,&sigma_yz);
-        }else{
-          // transverse isotropy
-          compute_element_cm_tiso(offset,
-                                d_kappavstore,d_muvstore,
-                                d_kappahstore,d_muhstore,d_eta_anisostore,
-                                ATTENUATION,
-                                one_minus_sum_beta_use,
-                                duxdxl,duxdyl,duxdzl,
-                                duydxl,duydyl,duydzl,
-                                duzdxl,duzdyl,duzdzl,
-                                duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl,
-                                duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl,
-                                iglob, NGLOB,
-                                d_ystore,d_zstore,
-                                &sigma_xx,&sigma_yy,&sigma_zz,
-                                &sigma_xy,&sigma_xz,&sigma_yz);
-        }
-      } // ! end of test whether isotropic or anisotropic element
+      }
+    } // ! end of test whether isotropic or anisotropic element
 
 
-      if(ATTENUATION && (! USE_ATTENUATION_MIMIC ) ){
-        // subtracts memory variables if attenuation
-        compute_element_cm_att_stress(tx,working_element,
-                                      R_xx,R_yy,R_xy,R_xz,R_yz,
-                                      &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_xz,&sigma_yz);
-      }
+    if(ATTENUATION && (! USE_ATTENUATION_MIMIC ) ){
+      // subtracts memory variables if attenuation
+      compute_element_cm_att_stress(tx,working_element,
+                                    R_xx,R_yy,R_xy,R_xz,R_yz,
+                                    &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_xz,&sigma_yz);
+    }
 
-      // define symmetric components (needed for non-symmetric dot product and sigma for gravity)
-      sigma_yx = sigma_xy;
-      sigma_zx = sigma_xz;
-      sigma_zy = sigma_yz;
+    // define symmetric components (needed for non-symmetric dot product and sigma for gravity)
+    sigma_yx = sigma_xy;
+    sigma_zx = sigma_xz;
+    sigma_zy = sigma_yz;
 
-      // jacobian
-      jacobianl = 1.0f / (xixl*(etayl*gammazl-etazl*gammayl)
-                          -xiyl*(etaxl*gammazl-etazl*gammaxl)
-                          +xizl*(etaxl*gammayl-etayl*gammaxl));
+    // jacobian
+    jacobianl = 1.0f / (xixl*(etayl*gammazl-etazl*gammayl)
+                        -xiyl*(etaxl*gammazl-etazl*gammaxl)
+                        +xizl*(etaxl*gammayl-etayl*gammaxl));
 
-      if( GRAVITY ){
-        //  computes non-symmetric terms for gravity
-        compute_element_cm_gravity(tx,working_element,
-                                   d_ibool,d_xstore,d_ystore,d_zstore,
-                                   d_minus_gravity_table,d_minus_deriv_gravity_table,d_density_table,
-                                   wgll_cube,jacobianl,
-                                   s_dummyx_loc,s_dummyy_loc,s_dummyz_loc,
-                                   &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_yx,
-                                   &sigma_xz,&sigma_zx,&sigma_yz,&sigma_zy,
-                                   &rho_s_H1,&rho_s_H2,&rho_s_H3);
-      }
+    if( GRAVITY ){
+      //  computes non-symmetric terms for gravity
+      compute_element_cm_gravity(tx,working_element,
+                                 d_ibool,d_xstore,d_ystore,d_zstore,
+                                 d_minus_gravity_table,d_minus_deriv_gravity_table,d_density_table,
+                                 wgll_cube,jacobianl,
+                                 s_dummyx_loc,s_dummyy_loc,s_dummyz_loc,
+                                 &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_yx,
+                                 &sigma_xz,&sigma_zx,&sigma_yz,&sigma_zy,
+                                 &rho_s_H1,&rho_s_H2,&rho_s_H3);
+    }
 
-      // form dot product with test vector, non-symmetric form
-      s_tempx1[tx] = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl);
-      s_tempy1[tx] = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl);
-      s_tempz1[tx] = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl);
+    // form dot product with test vector, non-symmetric form
+    s_tempx1[tx] = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl);
+    s_tempy1[tx] = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl);
+    s_tempz1[tx] = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl);
 
-      s_tempx2[tx] = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl);
-      s_tempy2[tx] = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl);
-      s_tempz2[tx] = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl);
+    s_tempx2[tx] = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl);
+    s_tempy2[tx] = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl);
+    s_tempz2[tx] = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl);
 
-      s_tempx3[tx] = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl);
-      s_tempy3[tx] = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl);
-      s_tempz3[tx] = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl);
+    s_tempx3[tx] = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl);
+    s_tempy3[tx] = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl);
+    s_tempz3[tx] = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl);
 
-    }
+  }
 
 // synchronize all the threads (one thread for each of the NGLL grid points of the
 // current spectral element) because we need the whole element to be ready in order
 // to be able to compute the matrix products along cut planes of the 3D element below
-    __syncthreads();
+  __syncthreads();
 
-    if (active) {
+  if (active) {
 
 #ifndef MANUALLY_UNROLLED_LOOPS
 
-      tempx1l = 0.f;
-      tempy1l = 0.f;
-      tempz1l = 0.f;
+    tempx1l = 0.f;
+    tempy1l = 0.f;
+    tempz1l = 0.f;
 
-      tempx2l = 0.f;
-      tempy2l = 0.f;
-      tempz2l = 0.f;
+    tempx2l = 0.f;
+    tempy2l = 0.f;
+    tempz2l = 0.f;
 
-      tempx3l = 0.f;
-      tempy3l = 0.f;
-      tempz3l = 0.f;
+    tempx3l = 0.f;
+    tempy3l = 0.f;
+    tempz3l = 0.f;
 
-      for (l=0;l<NGLLX;l++) {
+    for (l=0;l<NGLLX;l++) {
 
-        fac1 = d_hprimewgll_xx[I*NGLLX+l];
-        offset = K*NGLL2+J*NGLLX+l;
-        tempx1l += s_tempx1[offset]*fac1;
-        tempy1l += s_tempy1[offset]*fac1;
-        tempz1l += s_tempz1[offset]*fac1;
+      fac1 = d_hprimewgll_xx[I*NGLLX+l];
+      tempx1l += s_tempx1[K*NGLL2+J*NGLLX+l]*fac1;
+      tempy1l += s_tempy1[K*NGLL2+J*NGLLX+l]*fac1;
+      tempz1l += s_tempz1[K*NGLL2+J*NGLLX+l]*fac1;
 
-        fac2 = d_hprimewgll_yy[J*NGLLX+l];
-        offset = K*NGLL2+l*NGLLX+I;
-        tempx2l += s_tempx2[offset]*fac2;
-        tempy2l += s_tempy2[offset]*fac2;
-        tempz2l += s_tempz2[offset]*fac2;
+      fac2 = d_hprimewgll_yy[J*NGLLX+l];
+      tempx2l += s_tempx2[K*NGLL2+l*NGLLX+I]*fac2;
+      tempy2l += s_tempy2[K*NGLL2+l*NGLLX+I]*fac2;
+      tempz2l += s_tempz2[K*NGLL2+l*NGLLX+I]*fac2;
 
-        fac3 = d_hprimewgll_zz[K*NGLLX+l];
-        offset = l*NGLL2+J*NGLLX+I;
-        tempx3l += s_tempx3[offset]*fac3;
-        tempy3l += s_tempy3[offset]*fac3;
-        tempz3l += s_tempz3[offset]*fac3;
+      fac3 = d_hprimewgll_zz[K*NGLLX+l];
+      tempx3l += s_tempx3[l*NGLL2+J*NGLLX+I]*fac3;
+      tempy3l += s_tempy3[l*NGLL2+J*NGLLX+I]*fac3;
+      tempz3l += s_tempz3[l*NGLL2+J*NGLLX+I]*fac3;
 
-      }
+    }
 #else
 
-      tempx1l = s_tempx1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
-              + s_tempx1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
-              + s_tempx1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
-              + s_tempx1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
-              + s_tempx1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+    tempx1l = s_tempx1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+            + s_tempx1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+            + s_tempx1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+            + s_tempx1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+            + s_tempx1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
 
-      tempy1l = s_tempy1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
-              + s_tempy1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
-              + s_tempy1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
-              + s_tempy1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
-              + s_tempy1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+    tempy1l = s_tempy1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+            + s_tempy1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+            + s_tempy1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+            + s_tempy1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+            + s_tempy1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
 
-      tempz1l = s_tempz1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
-              + s_tempz1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
-              + s_tempz1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
-              + s_tempz1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
-              + s_tempz1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+    tempz1l = s_tempz1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+            + s_tempz1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+            + s_tempz1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+            + s_tempz1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+            + s_tempz1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
 
-      tempx2l = s_tempx2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
-              + s_tempx2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
-              + s_tempx2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
-              + s_tempx2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
-              + s_tempx2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
+    tempx2l = s_tempx2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
+            + s_tempx2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
+            + s_tempx2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
+            + s_tempx2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
+            + s_tempx2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
 
-      tempy2l = s_tempy2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
-              + s_tempy2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
-              + s_tempy2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
-              + s_tempy2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
-              + s_tempy2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
+    tempy2l = s_tempy2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
+            + s_tempy2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
+            + s_tempy2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
+            + s_tempy2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
+            + s_tempy2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
 
-      tempz2l = s_tempz2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
-              + s_tempz2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
-              + s_tempz2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
-              + s_tempz2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
-              + s_tempz2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
+    tempz2l = s_tempz2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
+            + s_tempz2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
+            + s_tempz2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
+            + s_tempz2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
+            + s_tempz2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
 
-      tempx3l = s_tempx3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
-              + s_tempx3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
-              + s_tempx3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
-              + s_tempx3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
-              + s_tempx3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
+    tempx3l = s_tempx3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
+            + s_tempx3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
+            + s_tempx3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
+            + s_tempx3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
+            + s_tempx3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
 
-      tempy3l = s_tempy3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
-              + s_tempy3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
-              + s_tempy3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
-              + s_tempy3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
-              + s_tempy3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
+    tempy3l = s_tempy3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
+            + s_tempy3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
+            + s_tempy3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
+            + s_tempy3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
+            + s_tempy3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
 
-      tempz3l = s_tempz3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
-              + s_tempz3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
-              + s_tempz3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
-              + s_tempz3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
-              + s_tempz3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
+    tempz3l = s_tempz3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
+            + s_tempz3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
+            + s_tempz3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
+            + s_tempz3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
+            + s_tempz3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
 
 #endif
 
-      fac1 = d_wgllwgll_yz[K*NGLLX+J];
-      fac2 = d_wgllwgll_xz[K*NGLLX+I];
-      fac3 = d_wgllwgll_xy[J*NGLLX+I];
+    fac1 = d_wgllwgll_yz[K*NGLLX+J];
+    fac2 = d_wgllwgll_xz[K*NGLLX+I];
+    fac3 = d_wgllwgll_xy[J*NGLLX+I];
 
-      sum_terms1 = - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
-      sum_terms2 = - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
-      sum_terms3 = - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
+    sum_terms1 = - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
+    sum_terms2 = - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
+    sum_terms3 = - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
 
-      // adds gravity term
-      if( GRAVITY ){
-        sum_terms1 += rho_s_H1;
-        sum_terms2 += rho_s_H2;
-        sum_terms3 += rho_s_H3;
-      }
+    // adds gravity term
+    if( GRAVITY ){
+      sum_terms1 += rho_s_H1;
+      sum_terms2 += rho_s_H2;
+      sum_terms3 += rho_s_H3;
+    }
 
-#ifdef USE_TEXTURES
-      d_accel[iglob] = tex1Dfetch(tex_accel, iglob) + sum_terms1 ;
-      d_accel[iglob + NGLOB] = tex1Dfetch(tex_accel, iglob + NGLOB) + sum_terms2 ;
-      d_accel[iglob + 2*NGLOB] = tex1Dfetch(tex_accel, iglob + 2*NGLOB) + sum_terms3 ;
+
+#ifdef USE_MESH_COLORING_GPU
+    // no atomic operation needed, colors don't share global points between elements
+
+#ifdef USE_TEXTURES_FIELDS
+    d_accel[iglob*3]     = tex1Dfetch(d_accel_cm_tex, iglob*3) + sum_terms1;
+    d_accel[iglob*3 + 1] = tex1Dfetch(d_accel_cm_tex, iglob*3 + 1) + sum_terms2;
+    d_accel[iglob*3 + 2] = tex1Dfetch(d_accel_cm_tex, iglob*3 + 2) + sum_terms3;
 #else
-  /* OLD/To be implemented version that uses coloring to get around race condition. About 1.6x faster */
+    d_accel[iglob*3]     += sum_terms1;
+    d_accel[iglob*3 + 1] += sum_terms2;
+    d_accel[iglob*3 + 2] += sum_terms3;
+#endif // USE_TEXTURES_FIELDS
 
+#else // MESH_COLORING
 
-#ifdef USE_MESH_COLORING_GPU
+    //mesh coloring
+    if( use_mesh_coloring_gpu ){
+
       // no atomic operation needed, colors don't share global points between elements
+#ifdef USE_TEXTURES_FIELDS
+      d_accel[iglob*3]     = tex1Dfetch(d_accel_cm_tex, iglob*3) + sum_terms1;
+      d_accel[iglob*3 + 1] = tex1Dfetch(d_accel_cm_tex, iglob*3 + 1) + sum_terms2;
+      d_accel[iglob*3 + 2] = tex1Dfetch(d_accel_cm_tex, iglob*3 + 2) + sum_terms3;
+#else
       d_accel[iglob*3]     += sum_terms1;
       d_accel[iglob*3 + 1] += sum_terms2;
       d_accel[iglob*3 + 2] += sum_terms3;
-#else
-      //mesh coloring
-      if( use_mesh_coloring_gpu ){
+#endif // USE_TEXTURES_FIELDS
 
-       // no atomic operation needed, colors don't share global points between elements
-        d_accel[iglob*3]     += sum_terms1;
-        d_accel[iglob*3 + 1] += sum_terms2;
-        d_accel[iglob*3 + 2] += sum_terms3;
+    }else{
 
-      }else{
+      // for testing purposes only: w/out atomic updates
+      //d_accel[iglob*3] -= (0.00000001f*tempx1l + 0.00000001f*tempx2l + 0.00000001f*tempx3l);
+      //d_accel[iglob*3 + 1] -= (0.00000001f*tempy1l + 0.00000001f*tempy2l + 0.00000001f*tempy3l);
+      //d_accel[iglob*3 + 2] -= (0.00000001f*tempz1l + 0.00000001f*tempz2l + 0.00000001f*tempz3l);
 
-        // for testing purposes only: w/out atomic updates
-        //d_accel[iglob*3] -= (0.00000001f*tempx1l + 0.00000001f*tempx2l + 0.00000001f*tempx3l);
-        //d_accel[iglob*3 + 1] -= (0.00000001f*tempy1l + 0.00000001f*tempy2l + 0.00000001f*tempy3l);
-        //d_accel[iglob*3 + 2] -= (0.00000001f*tempz1l + 0.00000001f*tempz2l + 0.00000001f*tempz3l);
+      atomicAdd(&d_accel[iglob*3], sum_terms1);
+      atomicAdd(&d_accel[iglob*3+1], sum_terms2);
+      atomicAdd(&d_accel[iglob*3+2], sum_terms3);
 
-        atomicAdd(&d_accel[iglob*3], sum_terms1);
-        atomicAdd(&d_accel[iglob*3+1], sum_terms2);
-        atomicAdd(&d_accel[iglob*3+2], sum_terms3);
+    }
+#endif // MESH_COLORING
 
-      }
-#endif
-
-#endif
-
-      // update memory variables based upon the Runge-Kutta scheme
-      if( ATTENUATION && ( ! USE_ATTENUATION_MIMIC ) ){
-        compute_element_cm_att_memory(tx,working_element,
-                                  d_muvstore,
-                                  factor_common,alphaval,betaval,gammaval,
-                                  R_xx,R_yy,R_xy,R_xz,R_yz,
-                                  epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz,
-                                  epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc,
-                                  ANISOTROPY,d_c44store,ATTENUATION_3D);
-      }
-
-      // save deviatoric strain for Runge-Kutta scheme
-      if( COMPUTE_AND_STORE_STRAIN ){
-        int ijk_ispec = tx + working_element*NGLL3;
-
-        // fortran: epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
-        epsilondev_xx[ijk_ispec] = epsilondev_xx_loc;
-        epsilondev_yy[ijk_ispec] = epsilondev_yy_loc;
-        epsilondev_xy[ijk_ispec] = epsilondev_xy_loc;
-        epsilondev_xz[ijk_ispec] = epsilondev_xz_loc;
-        epsilondev_yz[ijk_ispec] = epsilondev_yz_loc;
-      }
-
+    // update memory variables based upon the Runge-Kutta scheme
+    if( ATTENUATION && ( ! USE_ATTENUATION_MIMIC ) ){
+      compute_element_cm_att_memory(tx,working_element,
+                                d_muvstore,
+                                factor_common,alphaval,betaval,gammaval,
+                                R_xx,R_yy,R_xy,R_xz,R_yz,
+                                epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz,
+                                epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc,
+                                ANISOTROPY,d_c44store,ATTENUATION_3D);
     }
 
-#else  // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
-    d_accel[iglob] -= 0.00000001f;
-    d_accel[iglob + NGLOB] -= 0.00000001f;
-    d_accel[iglob + 2*NGLOB] -= 0.00000001f;
-#endif // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
-
+    // save deviatoric strain for Runge-Kutta scheme
+    if( COMPUTE_AND_STORE_STRAIN ){
+      // fortran: epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+      epsilondev_xx[tx + working_element*NGLL3] = epsilondev_xx_loc;
+      epsilondev_yy[tx + working_element*NGLL3] = epsilondev_yy_loc;
+      epsilondev_xy[tx + working_element*NGLL3] = epsilondev_xy_loc;
+      epsilondev_xz[tx + working_element*NGLL3] = epsilondev_xz_loc;
+      epsilondev_yz[tx + working_element*NGLL3] = epsilondev_yz_loc;
+    }
+  }
 }
+
 /* ----------------------------------------------------------------------------------------------- */
 
 void Kernel_2_crust_mantle(int nb_blocks_to_compute,Mesh* mp,
@@ -1707,98 +1482,104 @@
   // cudaEventRecord( start, 0 );
 
   Kernel_2_crust_mantle_impl<<<grid,threads>>>(nb_blocks_to_compute,
-                                  mp->NGLOB_CRUST_MANTLE,
-                                  d_ibool,
-                                  d_ispec_is_tiso,
-                                  mp->d_phase_ispec_inner_crust_mantle,
-                                  mp->num_phase_ispec_crust_mantle,
-                                  d_iphase,
-                                  d_deltat,
-                                  mp->use_mesh_coloring_gpu,
-                                  mp->d_displ_crust_mantle,
-                                  mp->d_veloc_crust_mantle,
-                                  mp->d_accel_crust_mantle,
-                                  d_xix, d_xiy, d_xiz,
-                                  d_etax, d_etay, d_etaz,
-                                  d_gammax, d_gammay, d_gammaz,
-                                  d_kappavstore, d_muvstore,
-                                  d_kappahstore, d_muhstore,
-                                  d_eta_anisostore,
-                                  mp->compute_and_store_strain,
-                                  d_epsilondev_xx,d_epsilondev_yy,d_epsilondev_xy,
-                                  d_epsilondev_xz,d_epsilondev_yz,
-                                  d_epsilon_trace_over_3,
-                                  mp->simulation_type,
-                                  mp->attenuation,
-                                  mp->attenuation_new,
-                                  mp->use_attenuation_mimic,
-                                  mp->attenuation_3D,
-                                  d_one_minus_sum_beta,d_factor_common,
-                                  d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
-                                  mp->d_alphaval,mp->d_betaval,mp->d_gammaval,
-                                  mp->anisotropic_3D_mantle,
-                                  d_c11store,d_c12store,d_c13store,
-                                  d_c14store,d_c15store,d_c16store,
-                                  d_c22store,d_c23store,d_c24store,
-                                  d_c25store,d_c26store,d_c33store,
-                                  d_c34store,d_c35store,d_c36store,
-                                  d_c44store,d_c45store,d_c46store,
-                                  d_c55store,d_c56store,d_c66store,
-                                  mp->gravity,
-                                  mp->d_xstore_crust_mantle,mp->d_ystore_crust_mantle,mp->d_zstore_crust_mantle,
-                                  mp->d_minus_gravity_table,
-                                  mp->d_minus_deriv_gravity_table,
-                                  mp->d_density_table,
-                                  mp->d_wgll_cube,
-                                  mp->NSPEC_CRUST_MANTLE_STRAIN_ONLY);
+                                                mp->NGLOB_CRUST_MANTLE,
+                                                d_ibool,
+                                                d_ispec_is_tiso,
+                                                mp->d_phase_ispec_inner_crust_mantle,
+                                                mp->num_phase_ispec_crust_mantle,
+                                                d_iphase,
+                                                d_deltat,
+                                                mp->use_mesh_coloring_gpu,
+                                                mp->d_displ_crust_mantle,
+                                                mp->d_veloc_crust_mantle,
+                                                mp->d_accel_crust_mantle,
+                                                d_xix, d_xiy, d_xiz,
+                                                d_etax, d_etay, d_etaz,
+                                                d_gammax, d_gammay, d_gammaz,
+                                                mp->d_hprime_xx,
+                                                mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
+                                                mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+                                                d_kappavstore, d_muvstore,
+                                                d_kappahstore, d_muhstore,
+                                                d_eta_anisostore,
+                                                mp->compute_and_store_strain,
+                                                d_epsilondev_xx,d_epsilondev_yy,d_epsilondev_xy,
+                                                d_epsilondev_xz,d_epsilondev_yz,
+                                                d_epsilon_trace_over_3,
+                                                mp->simulation_type,
+                                                mp->attenuation,
+                                                mp->attenuation_new,
+                                                mp->use_attenuation_mimic,
+                                                mp->attenuation_3D,
+                                                d_one_minus_sum_beta,d_factor_common,
+                                                d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
+                                                mp->d_alphaval,mp->d_betaval,mp->d_gammaval,
+                                                mp->anisotropic_3D_mantle,
+                                                d_c11store,d_c12store,d_c13store,
+                                                d_c14store,d_c15store,d_c16store,
+                                                d_c22store,d_c23store,d_c24store,
+                                                d_c25store,d_c26store,d_c33store,
+                                                d_c34store,d_c35store,d_c36store,
+                                                d_c44store,d_c45store,d_c46store,
+                                                d_c55store,d_c56store,d_c66store,
+                                                mp->gravity,
+                                                mp->d_xstore_crust_mantle,mp->d_ystore_crust_mantle,mp->d_zstore_crust_mantle,
+                                                mp->d_minus_gravity_table,
+                                                mp->d_minus_deriv_gravity_table,
+                                                mp->d_density_table,
+                                                mp->d_wgll_cube,
+                                                mp->NSPEC_CRUST_MANTLE_STRAIN_ONLY);
 
 
   if(mp->simulation_type == 3) {
     Kernel_2_crust_mantle_impl<<< grid,threads>>>(nb_blocks_to_compute,
-                                     mp->NGLOB_CRUST_MANTLE,
-                                     d_ibool,
-                                     d_ispec_is_tiso,
-                                     mp->d_phase_ispec_inner_crust_mantle,
-                                     mp->num_phase_ispec_crust_mantle,
-                                     d_iphase,
-                                     d_deltat,
-                                     mp->use_mesh_coloring_gpu,
-                                     mp->d_b_displ_crust_mantle,
-                                     mp->d_b_veloc_crust_mantle,
-                                     mp->d_b_accel_crust_mantle,
-                                     d_xix, d_xiy, d_xiz,
-                                     d_etax, d_etay, d_etaz,
-                                     d_gammax, d_gammay, d_gammaz,
-                                     d_kappavstore, d_muvstore,
-                                     d_kappahstore, d_muhstore,
-                                     d_eta_anisostore,
-                                     mp->compute_and_store_strain,
-                                     d_b_epsilondev_xx,d_b_epsilondev_yy,d_b_epsilondev_xy,
-                                     d_b_epsilondev_xz,d_b_epsilondev_yz,
-                                     d_b_epsilon_trace_over_3,
-                                     mp->simulation_type,
-                                     mp->attenuation,
-                                     mp->attenuation_new,
-                                     mp->use_attenuation_mimic,
-                                     mp->attenuation_3D,
-                                     d_one_minus_sum_beta,d_factor_common,
-                                     d_b_R_xx,d_b_R_yy,d_b_R_xy,d_b_R_xz,d_b_R_yz,
-                                     mp->d_b_alphaval,mp->d_b_betaval,mp->d_b_gammaval,
-                                     mp->anisotropic_3D_mantle,
-                                     d_c11store,d_c12store,d_c13store,
-                                     d_c14store,d_c15store,d_c16store,
-                                     d_c22store,d_c23store,d_c24store,
-                                     d_c25store,d_c26store,d_c33store,
-                                     d_c34store,d_c35store,d_c36store,
-                                     d_c44store,d_c45store,d_c46store,
-                                     d_c55store,d_c56store,d_c66store,
-                                     mp->gravity,
-                                     mp->d_xstore_crust_mantle,mp->d_ystore_crust_mantle,mp->d_zstore_crust_mantle,
-                                     mp->d_minus_gravity_table,
-                                     mp->d_minus_deriv_gravity_table,
-                                     mp->d_density_table,
-                                     mp->d_wgll_cube,
-                                     mp->NSPEC_CRUST_MANTLE_STRAIN_ONLY);
+                                                   mp->NGLOB_CRUST_MANTLE,
+                                                   d_ibool,
+                                                   d_ispec_is_tiso,
+                                                   mp->d_phase_ispec_inner_crust_mantle,
+                                                   mp->num_phase_ispec_crust_mantle,
+                                                   d_iphase,
+                                                   d_deltat,
+                                                   mp->use_mesh_coloring_gpu,
+                                                   mp->d_b_displ_crust_mantle,
+                                                   mp->d_b_veloc_crust_mantle,
+                                                   mp->d_b_accel_crust_mantle,
+                                                   d_xix, d_xiy, d_xiz,
+                                                   d_etax, d_etay, d_etaz,
+                                                   d_gammax, d_gammay, d_gammaz,
+                                                   mp->d_hprime_xx,
+                                                   mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
+                                                   mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+                                                   d_kappavstore, d_muvstore,
+                                                   d_kappahstore, d_muhstore,
+                                                   d_eta_anisostore,
+                                                   mp->compute_and_store_strain,
+                                                   d_b_epsilondev_xx,d_b_epsilondev_yy,d_b_epsilondev_xy,
+                                                   d_b_epsilondev_xz,d_b_epsilondev_yz,
+                                                   d_b_epsilon_trace_over_3,
+                                                   mp->simulation_type,
+                                                   mp->attenuation,
+                                                   mp->attenuation_new,
+                                                   mp->use_attenuation_mimic,
+                                                   mp->attenuation_3D,
+                                                   d_one_minus_sum_beta,d_factor_common,
+                                                   d_b_R_xx,d_b_R_yy,d_b_R_xy,d_b_R_xz,d_b_R_yz,
+                                                   mp->d_b_alphaval,mp->d_b_betaval,mp->d_b_gammaval,
+                                                   mp->anisotropic_3D_mantle,
+                                                   d_c11store,d_c12store,d_c13store,
+                                                   d_c14store,d_c15store,d_c16store,
+                                                   d_c22store,d_c23store,d_c24store,
+                                                   d_c25store,d_c26store,d_c33store,
+                                                   d_c34store,d_c35store,d_c36store,
+                                                   d_c44store,d_c45store,d_c46store,
+                                                   d_c55store,d_c56store,d_c66store,
+                                                   mp->gravity,
+                                                   mp->d_xstore_crust_mantle,mp->d_ystore_crust_mantle,mp->d_zstore_crust_mantle,
+                                                   mp->d_minus_gravity_table,
+                                                   mp->d_minus_deriv_gravity_table,
+                                                   mp->d_density_table,
+                                                   mp->d_wgll_cube,
+                                                   mp->NSPEC_CRUST_MANTLE_STRAIN_ONLY);
   }
 
   // cudaEventRecord( stop, 0 );
@@ -1876,7 +1657,7 @@
       if( mp->attenuation_3D ){
         color_offset_nonpadded_att2 = (mp->nspec_outer_crust_mantle) * NGLL3 * N_SLS;
       }else{
-        color_offset_nonpadded_att2 = (mp->nspec_outer_crust_mantle) * 1 * N_SLS;      
+        color_offset_nonpadded_att2 = (mp->nspec_outer_crust_mantle) * 1 * N_SLS;
       }
       color_offset_ispec = mp->nspec_outer_crust_mantle;
     }
@@ -1966,7 +1747,7 @@
       if( mp->attenuation_3D ){
         color_offset_nonpadded_att2 += nb_blocks_to_compute * NGLL3 * N_SLS;
       }else{
-        color_offset_nonpadded_att2 += nb_blocks_to_compute * 1 * N_SLS;      
+        color_offset_nonpadded_att2 += nb_blocks_to_compute * 1 * N_SLS;
       }
       // for array(ispec)
       color_offset_ispec += nb_blocks_to_compute;

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu	2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu	2012-07-23 21:58:26 UTC (rev 20536)
@@ -37,6 +37,16 @@
 #include "config.h"
 #include "mesh_constants_cuda.h"
 
+#ifdef USE_TEXTURES_FIELDS
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_displ_ic_tex;
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_accel_ic_tex;
+#endif
+
+#ifdef USE_TEXTURES_CONSTANTS
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_hprime_xx_ic_tex;
+#endif
+
+
 /* ----------------------------------------------------------------------------------------------- */
 
 // elemental routines
@@ -51,15 +61,15 @@
                                              realw* R_xy,
                                              realw* R_xz,
                                              realw* R_yz,
-                                             reald* sigma_xx,
-                                             reald* sigma_yy,
-                                             reald* sigma_zz,
-                                             reald* sigma_xy,
-                                             reald* sigma_xz,
-                                             reald* sigma_yz) {
+                                             realw* sigma_xx,
+                                             realw* sigma_yy,
+                                             realw* sigma_zz,
+                                             realw* sigma_xy,
+                                             realw* sigma_xz,
+                                             realw* sigma_yz) {
 
   int i_sls,offset;
-  reald R_xx_val,R_yy_val;
+  realw R_xx_val,R_yy_val;
 
   for(i_sls = 0; i_sls < N_SLS; i_sls++){
     // index
@@ -91,24 +101,19 @@
                                               realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
                                               realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
                                               realw* epsilondev_xz,realw* epsilondev_yz,
-                                              reald epsilondev_xx_loc,reald epsilondev_yy_loc,reald epsilondev_xy_loc,
-                                              reald epsilondev_xz_loc,reald epsilondev_yz_loc,
+                                              realw epsilondev_xx_loc,realw epsilondev_yy_loc,realw epsilondev_xy_loc,
+                                              realw epsilondev_xz_loc,realw epsilondev_yz_loc,
                                               int ATTENUATION_3D
                                               ){
 
   int i_sls;
-  int ijk_ispec;
-  int offset_align,offset;
-  reald mul;
-  reald alphaval_loc,betaval_loc,gammaval_loc;
-  reald factor_loc,Sn,Snp1;
+  int offset;
+  realw mul;
+  realw alphaval_loc,betaval_loc,gammaval_loc;
+  realw factor_loc,Sn,Snp1;
 
-  // indices
-  offset_align = tx + NGLL3_PADDED * working_element;
-  ijk_ispec = tx + NGLL3 * working_element;
+  mul = d_muv[tx + NGLL3_PADDED * working_element];
 
-  mul = d_muv[offset_align];
-
   // use Runge-Kutta scheme to march in time
   for(i_sls = 0; i_sls < N_SLS; i_sls++){
 
@@ -128,28 +133,28 @@
     gammaval_loc = gammaval[i_sls];
 
     // term in xx
-    Sn   = factor_loc * epsilondev_xx[ijk_ispec]; //(i,j,k,ispec)
+    Sn   = factor_loc * epsilondev_xx[tx + NGLL3 * working_element]; //(i,j,k,ispec)
     Snp1   = factor_loc * epsilondev_xx_loc; //(i,j,k)
     R_xx[offset] = alphaval_loc * R_xx[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
 
     // term in yy
-    Sn   = factor_loc * epsilondev_yy[ijk_ispec];
+    Sn   = factor_loc * epsilondev_yy[tx + NGLL3 * working_element];
     Snp1   = factor_loc * epsilondev_yy_loc;
     R_yy[offset] = alphaval_loc * R_yy[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
     // term in zz not computed since zero trace
 
     // term in xy
-    Sn   = factor_loc * epsilondev_xy[ijk_ispec];
+    Sn   = factor_loc * epsilondev_xy[tx + NGLL3 * working_element];
     Snp1   = factor_loc * epsilondev_xy_loc;
     R_xy[offset] = alphaval_loc * R_xy[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
 
     // term in xz
-    Sn   = factor_loc * epsilondev_xz[ijk_ispec];
+    Sn   = factor_loc * epsilondev_xz[tx + NGLL3 * working_element];
     Snp1   = factor_loc * epsilondev_xz_loc;
     R_xz[offset] = alphaval_loc * R_xz[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
 
     // term in yz
-    Sn   = factor_loc * epsilondev_yz[ijk_ispec];
+    Sn   = factor_loc * epsilondev_yz[tx + NGLL3 * working_element];
     Snp1   = factor_loc * epsilondev_yz_loc;
     R_yz[offset] = alphaval_loc * R_yz[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
   }
@@ -167,40 +172,40 @@
                                            realw* d_minus_deriv_gravity_table,
                                            realw* d_density_table,
                                            realw* wgll_cube,
-                                           reald jacobianl,
-                                           reald* s_dummyx_loc,
-                                           reald* s_dummyy_loc,
-                                           reald* s_dummyz_loc,
-                                           reald* sigma_xx,
-                                           reald* sigma_yy,
-                                           reald* sigma_zz,
-                                           reald* sigma_xy,
-                                           reald* sigma_yx,
-                                           reald* sigma_xz,
-                                           reald* sigma_zx,
-                                           reald* sigma_yz,
-                                           reald* sigma_zy,
-                                           reald* rho_s_H1,
-                                           reald* rho_s_H2,
-                                           reald* rho_s_H3){
+                                           realw jacobianl,
+                                           realw* s_dummyx_loc,
+                                           realw* s_dummyy_loc,
+                                           realw* s_dummyz_loc,
+                                           realw* sigma_xx,
+                                           realw* sigma_yy,
+                                           realw* sigma_zz,
+                                           realw* sigma_xy,
+                                           realw* sigma_yx,
+                                           realw* sigma_xz,
+                                           realw* sigma_zx,
+                                           realw* sigma_yz,
+                                           realw* sigma_zy,
+                                           realw* rho_s_H1,
+                                           realw* rho_s_H2,
+                                           realw* rho_s_H3){
 
-  reald radius,theta,phi;
-  reald cos_theta,sin_theta,cos_phi,sin_phi;
-  reald minus_g,minus_dg;
-  reald rho;
-  reald gxl,gyl,gzl;
-  reald minus_g_over_radius,minus_dg_plus_g_over_radius;
-  reald cos_theta_sq,sin_theta_sq,cos_phi_sq,sin_phi_sq;
-  reald Hxxl,Hyyl,Hzzl,Hxyl,Hxzl,Hyzl;
-  reald sx_l,sy_l,sz_l;
-  reald factor;
+  realw radius,theta,phi;
+  realw cos_theta,sin_theta,cos_phi,sin_phi;
+  realw minus_g,minus_dg;
+  realw rho;
+  realw gxl,gyl,gzl;
+  realw minus_g_over_radius,minus_dg_plus_g_over_radius;
+  realw cos_theta_sq,sin_theta_sq,cos_phi_sq,sin_phi_sq;
+  realw Hxxl,Hyyl,Hzzl,Hxyl,Hxzl,Hyzl;
+  realw sx_l,sy_l,sz_l;
+  realw factor;
 
   // R_EARTH_KM is the radius of the bottom of the oceans
-  const reald R_EARTH = 6371000.0f; // in m
-  const reald R_EARTH_KM = 6371.0f; // in km
+  //const realw R_EARTH = 6371000.0f; // in m
+  //const realw R_EARTH_KM = 6371.0f; // in km
   // uncomment line below for PREM with oceans
-  //const reald R_EARTH = 6368000.0f;
-  //const reald R_EARTH_KM = 6368.0f;
+  //const realw R_EARTH = 6368000.0f;
+  //const realw R_EARTH_KM = 6368.0f;
 
   // compute non-symmetric terms for gravity
 
@@ -211,7 +216,7 @@
   radius = d_xstore[iglob];
   // make sure radius is never zero even for points at center of cube
   // because we later divide by radius
-  if(radius < 100.f / R_EARTH){ radius = 100.f / R_EARTH; }
+  if(radius < 100.f / (R_EARTH_KM*1000.0f)){ radius = 100.f / (R_EARTH_KM*1000.0f); }
 
   theta = d_ystore[iglob];
   phi = d_zstore[iglob];
@@ -310,7 +315,7 @@
                                          realw* d_xix, realw* d_xiy, realw* d_xiz,
                                          realw* d_etax, realw* d_etay, realw* d_etaz,
                                          realw* d_gammax, realw* d_gammay, realw* d_gammaz,
-                                         realw* d_hprime_xx, realw* d_hprime_yy, realw* d_hprime_zz,
+                                         realw* d_hprime_xx,
                                          realw* d_hprimewgll_xx, realw* d_hprimewgll_yy, realw* d_hprimewgll_zz,
                                          realw* d_wgllwgll_xy,realw* d_wgllwgll_xz,realw* d_wgllwgll_yz,
                                          realw* d_kappav,
@@ -342,10 +347,6 @@
   /* int bx = blockIdx.x; */
   int tx = threadIdx.x;
 
-  //const int NGLLX = 5;
-  // const int NGLL2 = 25;
-  //const int NGLL3 = NGLL3;
-  const int NGLL3_ALIGN = NGLL3_PADDED;
   const int IFLAG_IN_FICTITIOUS_CUBE = 11; // from constants.h
 
   int K = (tx/NGLL2);
@@ -356,697 +357,700 @@
   int iglob = 0;
   int working_element;
 
-  reald tempx1l,tempx2l,tempx3l,tempy1l,tempy2l,tempy3l,tempz1l,tempz2l,tempz3l;
-  reald xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
-  reald duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl;
-  reald duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl;
-  reald duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl;
+  realw tempx1l,tempx2l,tempx3l,tempy1l,tempy2l,tempy3l,tempz1l,tempz2l,tempz3l;
+  realw xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
+  realw duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl;
+  realw duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl;
+  realw duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl;
 
-  reald tempx1l_att,tempx2l_att,tempx3l_att,tempy1l_att,tempy2l_att,tempy3l_att,tempz1l_att,tempz2l_att,tempz3l_att;
-  reald duxdxl_att,duxdyl_att,duxdzl_att,duydxl_att,duydyl_att,duydzl_att,duzdxl_att,duzdyl_att,duzdzl_att;
-  reald duxdyl_plus_duydxl_att,duzdxl_plus_duxdzl_att,duzdyl_plus_duydzl_att;
+  realw tempx1l_att,tempx2l_att,tempx3l_att,tempy1l_att,tempy2l_att,tempy3l_att,tempz1l_att,tempz2l_att,tempz3l_att;
+  realw duxdxl_att,duxdyl_att,duxdzl_att,duydxl_att,duydyl_att,duydzl_att,duzdxl_att,duzdyl_att,duzdzl_att;
+  realw duxdyl_plus_duydxl_att,duzdxl_plus_duxdzl_att,duzdyl_plus_duydzl_att;
 
-  reald fac1,fac2,fac3;
-  reald lambdal,mul,lambdalplus2mul,kappal;
-  reald mul_iso,mul_aniso;
-  reald sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz;
-  reald epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc;
-  reald c11,c12,c13,c33,c44;
-  reald sum_terms1,sum_terms2,sum_terms3;
+  realw fac1,fac2,fac3;
+  realw lambdal,mul,lambdalplus2mul,kappal;
+  realw mul_iso,mul_aniso;
+  realw sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz;
+  realw epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc;
+  realw c11,c12,c13,c33,c44;
+  realw sum_terms1,sum_terms2,sum_terms3;
 
   // gravity variables
-  reald sigma_yx,sigma_zx,sigma_zy;
-  reald rho_s_H1,rho_s_H2,rho_s_H3;
+  realw sigma_yx,sigma_zx,sigma_zy;
+  realw rho_s_H1,rho_s_H2,rho_s_H3;
 
 #ifndef MANUALLY_UNROLLED_LOOPS
-    int l;
-    realw hp1,hp2,hp3;
+  int l;
 #endif
 
-    __shared__ reald s_dummyx_loc[NGLL3];
-    __shared__ reald s_dummyy_loc[NGLL3];
-    __shared__ reald s_dummyz_loc[NGLL3];
+  __shared__ realw s_dummyx_loc[NGLL3];
+  __shared__ realw s_dummyy_loc[NGLL3];
+  __shared__ realw s_dummyz_loc[NGLL3];
 
-    __shared__ reald s_dummyx_loc_att[NGLL3];
-    __shared__ reald s_dummyy_loc_att[NGLL3];
-    __shared__ reald s_dummyz_loc_att[NGLL3];
+  __shared__ realw s_dummyx_loc_att[NGLL3];
+  __shared__ realw s_dummyy_loc_att[NGLL3];
+  __shared__ realw s_dummyz_loc_att[NGLL3];
 
-    __shared__ reald s_tempx1[NGLL3];
-    __shared__ reald s_tempx2[NGLL3];
-    __shared__ reald s_tempx3[NGLL3];
-    __shared__ reald s_tempy1[NGLL3];
-    __shared__ reald s_tempy2[NGLL3];
-    __shared__ reald s_tempy3[NGLL3];
-    __shared__ reald s_tempz1[NGLL3];
-    __shared__ reald s_tempz2[NGLL3];
-    __shared__ reald s_tempz3[NGLL3];
+  __shared__ realw s_tempx1[NGLL3];
+  __shared__ realw s_tempx2[NGLL3];
+  __shared__ realw s_tempx3[NGLL3];
+  __shared__ realw s_tempy1[NGLL3];
+  __shared__ realw s_tempy2[NGLL3];
+  __shared__ realw s_tempy3[NGLL3];
+  __shared__ realw s_tempz1[NGLL3];
+  __shared__ realw s_tempz2[NGLL3];
+  __shared__ realw s_tempz3[NGLL3];
 
+  __shared__ realw sh_hprime_xx[NGLL2];
+
 // use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads,
 // because we used memory padding from NGLL^3 = 125 to 128 to get coalescent memory accesses
-    active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0;
+  active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0;
 
 // copy from global memory to shared memory
 // each thread writes one of the NGLL^3 = 125 data points
-    if (active) {
+  if (active) {
 
 #ifdef USE_MESH_COLORING_GPU
+    working_element = bx;
+#else
+    //mesh coloring
+    if( use_mesh_coloring_gpu ){
       working_element = bx;
-#else
-      //mesh coloring
-      if( use_mesh_coloring_gpu ){
-        working_element = bx;
-      }else{
-        // iphase-1 and working_element-1 for Fortran->C array conventions
-        working_element = d_phase_ispec_inner[bx + num_phase_ispec*(d_iphase-1)]-1;
-      }
+    }else{
+      // iphase-1 and working_element-1 for Fortran->C array conventions
+      working_element = d_phase_ispec_inner[bx + num_phase_ispec*(d_iphase-1)]-1;
+    }
 #endif
 
-      // exclude fictitious elements in central cube
-      if( d_idoubling[working_element] == IFLAG_IN_FICTITIOUS_CUBE ){
-        active = 0;
-      }else{
-        // iglob = d_ibool[working_element*NGLL3_ALIGN + tx]-1;
-        iglob = d_ibool[working_element*NGLL3 + tx]-1;
+    // exclude fictitious elements in central cube
+    if( d_idoubling[working_element] == IFLAG_IN_FICTITIOUS_CUBE ){
+      active = 0;
+    }else{
+      // iglob = d_ibool[working_element*NGLL3_PADDED + tx]-1;
+      iglob = d_ibool[working_element*NGLL3 + tx]-1;
 
-#ifdef USE_TEXTURES
-        s_dummyx_loc[tx] = tex1Dfetch(tex_displ, iglob);
-        s_dummyy_loc[tx] = tex1Dfetch(tex_displ, iglob + NGLOB);
-        s_dummyz_loc[tx] = tex1Dfetch(tex_displ, iglob + 2*NGLOB);
+#ifdef USE_TEXTURES_FIELDS
+      s_dummyx_loc[tx] = tex1Dfetch(d_displ_ic_tex, iglob*3);
+      s_dummyy_loc[tx] = tex1Dfetch(d_displ_ic_tex, iglob*3 + 1);
+      s_dummyz_loc[tx] = tex1Dfetch(d_displ_ic_tex, iglob*3 + 2);
 #else
-        // changing iglob indexing to match fortran row changes fast style
-        s_dummyx_loc[tx] = d_displ[iglob*3];
-        s_dummyy_loc[tx] = d_displ[iglob*3 + 1];
-        s_dummyz_loc[tx] = d_displ[iglob*3 + 2];
+      // index as iglob*3 + component to match the Fortran layout (first index varies fastest)
+      s_dummyx_loc[tx] = d_displ[iglob*3];
+      s_dummyy_loc[tx] = d_displ[iglob*3 + 1];
+      s_dummyz_loc[tx] = d_displ[iglob*3 + 2];
 #endif
 
-        if(ATTENUATION){
-          if(ATTENUATION_NEW){
-            // takes new routines
-            // use first order Taylor expansion of displacement for local storage of stresses
-            // at this current time step, to fix attenuation in a consistent way
-#ifdef USE_TEXTURES
-            s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * tex1Dfetch(tex_veloc, iglob);
-            s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * tex1Dfetch(tex_veloc, iglob + NGLOB);
-            s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * tex1Dfetch(tex_veloc, iglob + 2*NGLOB);
+      if(ATTENUATION){
+        if(ATTENUATION_NEW){
+          // takes new routines
+          // use first order Taylor expansion of displacement for local storage of stresses
+          // at this current time step, to fix attenuation in a consistent way
+#ifdef USE_TEXTURES_FIELDS
+          s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * d_veloc[iglob*3]; // Taylor term needs veloc, not displ; no veloc texture is declared next to d_displ_ic_tex, so read global memory
+          s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * d_veloc[iglob*3 + 1];
+          s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * d_veloc[iglob*3 + 2];
 #else
-            s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * d_veloc[iglob*3];
-            s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * d_veloc[iglob*3 + 1];
-            s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * d_veloc[iglob*3 + 2];
+          s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * d_veloc[iglob*3];
+          s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * d_veloc[iglob*3 + 1];
+          s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * d_veloc[iglob*3 + 2];
 #endif
-          }
-          else{
-            // takes old routines
-            s_dummyx_loc_att[tx] = s_dummyx_loc[tx];
-            s_dummyy_loc_att[tx] = s_dummyy_loc[tx];
-            s_dummyz_loc_att[tx] = s_dummyz_loc[tx];
-          }
         }
+        else{
+          // takes old routines
+          s_dummyx_loc_att[tx] = s_dummyx_loc[tx];
+          s_dummyy_loc_att[tx] = s_dummyy_loc[tx];
+          s_dummyz_loc_att[tx] = s_dummyz_loc[tx];
+        }
       }
     }
+  }
 
+  if (tx < NGLL2) {
+#ifdef USE_TEXTURES_CONSTANTS
+    sh_hprime_xx[tx] = tex1Dfetch(d_hprime_xx_ic_tex,tx);
+#else
+    sh_hprime_xx[tx] = d_hprime_xx[tx];
+#endif
+  }
+
 // synchronize all the threads (one thread for each of the NGLL grid points of the
 // current spectral element) because we need the whole element to be ready in order
 // to be able to compute the matrix products along cut planes of the 3D element below
-    __syncthreads();
+  __syncthreads();
 
-#ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
+  if (active) {
 
-    if (active) {
-
 #ifndef MANUALLY_UNROLLED_LOOPS
 
-      tempx1l = 0.f;
-      tempx2l = 0.f;
-      tempx3l = 0.f;
+    tempx1l = 0.f;
+    tempx2l = 0.f;
+    tempx3l = 0.f;
 
-      tempy1l = 0.f;
-      tempy2l = 0.f;
-      tempy3l = 0.f;
+    tempy1l = 0.f;
+    tempy2l = 0.f;
+    tempy3l = 0.f;
 
-      tempz1l = 0.f;
-      tempz2l = 0.f;
-      tempz3l = 0.f;
+    tempz1l = 0.f;
+    tempz2l = 0.f;
+    tempz3l = 0.f;
 
-      for (l=0;l<NGLLX;l++) {
-          hp1 = d_hprime_xx[l*NGLLX+I];
-          offset = K*NGLL2+J*NGLLX+l;
-          tempx1l += s_dummyx_loc[offset]*hp1;
-          tempy1l += s_dummyy_loc[offset]*hp1;
-          tempz1l += s_dummyz_loc[offset]*hp1;
+    for (l=0;l<NGLLX;l++) {
+        fac1 = sh_hprime_xx[l*NGLLX+I];
+        offset = K*NGLL2+J*NGLLX+l;
+        tempx1l += s_dummyx_loc[offset]*fac1;
+        tempy1l += s_dummyy_loc[offset]*fac1;
+        tempz1l += s_dummyz_loc[offset]*fac1;
 
-          hp2 = d_hprime_xx[l*NGLLX+J];
-          offset = K*NGLL2+l*NGLLX+I;
-          tempx2l += s_dummyx_loc[offset]*hp2;
-          tempy2l += s_dummyy_loc[offset]*hp2;
-          tempz2l += s_dummyz_loc[offset]*hp2;
+        fac2 = sh_hprime_xx[l*NGLLX+J];
+        offset = K*NGLL2+l*NGLLX+I;
+        tempx2l += s_dummyx_loc[offset]*fac2;
+        tempy2l += s_dummyy_loc[offset]*fac2;
+        tempz2l += s_dummyz_loc[offset]*fac2;
 
-          hp3 = d_hprime_xx[l*NGLLX+K];
-          offset = l*NGLL2+J*NGLLX+I;
-          tempx3l += s_dummyx_loc[offset]*hp3;
-          tempy3l += s_dummyy_loc[offset]*hp3;
-          tempz3l += s_dummyz_loc[offset]*hp3;
+        fac3 = sh_hprime_xx[l*NGLLX+K];
+        offset = l*NGLL2+J*NGLLX+I;
+        tempx3l += s_dummyx_loc[offset]*fac3;
+        tempy3l += s_dummyy_loc[offset]*fac3;
+        tempz3l += s_dummyz_loc[offset]*fac3;
 
-      }
+    }
 
-      if( ATTENUATION ){
-        // temporary variables used for fixing attenuation in a consistent way
-        tempx1l_att = 0.f;
-        tempx2l_att = 0.f;
-        tempx3l_att = 0.f;
+    if( ATTENUATION ){
+      // temporary variables used for fixing attenuation in a consistent way
+      tempx1l_att = 0.f;
+      tempx2l_att = 0.f;
+      tempx3l_att = 0.f;
 
-        tempy1l_att = 0.f;
-        tempy2l_att = 0.f;
-        tempy3l_att = 0.f;
+      tempy1l_att = 0.f;
+      tempy2l_att = 0.f;
+      tempy3l_att = 0.f;
 
-        tempz1l_att = 0.f;
-        tempz2l_att = 0.f;
-        tempz3l_att = 0.f;
+      tempz1l_att = 0.f;
+      tempz2l_att = 0.f;
+      tempz3l_att = 0.f;
 
-        for (l=0;l<NGLLX;l++) {
-                hp1 = d_hprime_xx[l*NGLLX+I];
-                offset = K*NGLL2+J*NGLLX+l;
-                tempx1l_att += s_dummyx_loc_att[offset]*hp1;
-                tempy1l_att += s_dummyy_loc_att[offset]*hp1;
-                tempz1l_att += s_dummyz_loc_att[offset]*hp1;
+      for (l=0;l<NGLLX;l++) {
+        fac1 = d_hprime_xx[l*NGLLX+I];
+        offset = K*NGLL2+J*NGLLX+l;
+        tempx1l_att += s_dummyx_loc_att[offset]*fac1;
+        tempy1l_att += s_dummyy_loc_att[offset]*fac1;
+        tempz1l_att += s_dummyz_loc_att[offset]*fac1;
 
-                hp2 = d_hprime_xx[l*NGLLX+J];
-                offset = K*NGLL2+l*NGLLX+I;
-                tempx2l_att += s_dummyx_loc_att[offset]*hp2;
-                tempy2l_att += s_dummyy_loc_att[offset]*hp2;
-                tempz2l_att += s_dummyz_loc_att[offset]*hp2;
+        fac2 = d_hprime_xx[l*NGLLX+J];
+        offset = K*NGLL2+l*NGLLX+I;
+        tempx2l_att += s_dummyx_loc_att[offset]*fac2;
+        tempy2l_att += s_dummyy_loc_att[offset]*fac2;
+        tempz2l_att += s_dummyz_loc_att[offset]*fac2;
 
-                hp3 = d_hprime_xx[l*NGLLX+K];
-                offset = l*NGLL2+J*NGLLX+I;
-                tempx3l_att += s_dummyx_loc_att[offset]*hp3;
-                tempy3l_att += s_dummyy_loc_att[offset]*hp3;
-                tempz3l_att += s_dummyz_loc_att[offset]*hp3;
-
-        }
+        fac3 = d_hprime_xx[l*NGLLX+K];
+        offset = l*NGLL2+J*NGLLX+I;
+        tempx3l_att += s_dummyx_loc_att[offset]*fac3;
+        tempy3l_att += s_dummyy_loc_att[offset]*fac3;
+        tempz3l_att += s_dummyz_loc_att[offset]*fac3;
       }
+    }
 #else
 
-      tempx1l = s_dummyx_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
-              + s_dummyx_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
-              + s_dummyx_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
-              + s_dummyx_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
-              + s_dummyx_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+    tempx1l = s_dummyx_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+            + s_dummyx_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+            + s_dummyx_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+            + s_dummyx_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+            + s_dummyx_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
 
-      tempy1l = s_dummyy_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
-              + s_dummyy_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
-              + s_dummyy_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
-              + s_dummyy_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
-              + s_dummyy_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+    tempy1l = s_dummyy_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+            + s_dummyy_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+            + s_dummyy_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+            + s_dummyy_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+            + s_dummyy_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
 
-      tempz1l = s_dummyz_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
-              + s_dummyz_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
-              + s_dummyz_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
-              + s_dummyz_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
-              + s_dummyz_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+    tempz1l = s_dummyz_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+            + s_dummyz_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+            + s_dummyz_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+            + s_dummyz_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+            + s_dummyz_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
 
-      tempx2l = s_dummyx_loc[K*NGLL2+I]*d_hprime_xx[J]
-              + s_dummyx_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
-              + s_dummyx_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
-              + s_dummyx_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
-              + s_dummyx_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+    tempx2l = s_dummyx_loc[K*NGLL2+I]*d_hprime_xx[J]
+            + s_dummyx_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+            + s_dummyx_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+            + s_dummyx_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+            + s_dummyx_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
 
-      tempy2l = s_dummyy_loc[K*NGLL2+I]*d_hprime_xx[J]
-              + s_dummyy_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
-              + s_dummyy_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
-              + s_dummyy_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
-              + s_dummyy_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+    tempy2l = s_dummyy_loc[K*NGLL2+I]*d_hprime_xx[J]
+            + s_dummyy_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+            + s_dummyy_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+            + s_dummyy_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+            + s_dummyy_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
 
-      tempz2l = s_dummyz_loc[K*NGLL2+I]*d_hprime_xx[J]
-              + s_dummyz_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
-              + s_dummyz_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
-              + s_dummyz_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
-              + s_dummyz_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+    tempz2l = s_dummyz_loc[K*NGLL2+I]*d_hprime_xx[J]
+            + s_dummyz_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+            + s_dummyz_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+            + s_dummyz_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+            + s_dummyz_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
 
-      tempx3l = s_dummyx_loc[J*NGLLX+I]*d_hprime_xx[K]
-              + s_dummyx_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
-              + s_dummyx_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
-              + s_dummyx_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
-              + s_dummyx_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+    tempx3l = s_dummyx_loc[J*NGLLX+I]*d_hprime_xx[K]
+            + s_dummyx_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+            + s_dummyx_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+            + s_dummyx_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+            + s_dummyx_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
 
-      tempy3l = s_dummyy_loc[J*NGLLX+I]*d_hprime_xx[K]
-              + s_dummyy_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
-              + s_dummyy_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
-              + s_dummyy_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
-              + s_dummyy_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+    tempy3l = s_dummyy_loc[J*NGLLX+I]*d_hprime_xx[K]
+            + s_dummyy_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+            + s_dummyy_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+            + s_dummyy_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+            + s_dummyy_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
 
-      tempz3l = s_dummyz_loc[J*NGLLX+I]*d_hprime_xx[K]
-              + s_dummyz_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
-              + s_dummyz_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
-              + s_dummyz_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
-              + s_dummyz_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+    tempz3l = s_dummyz_loc[J*NGLLX+I]*d_hprime_xx[K]
+            + s_dummyz_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+            + s_dummyz_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+            + s_dummyz_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+            + s_dummyz_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
 
-      if( ATTENUATION ){
-        // temporary variables used for fixing attenuation in a consistent way
-        tempx1l_att = s_dummyx_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
-          + s_dummyx_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
-          + s_dummyx_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
-          + s_dummyx_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
-          + s_dummyx_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+    if( ATTENUATION ){
+      // temporary variables used for fixing attenuation in a consistent way
+      tempx1l_att = s_dummyx_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+        + s_dummyx_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+        + s_dummyx_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+        + s_dummyx_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+        + s_dummyx_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
 
-        tempy1l_att = s_dummyy_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
-          + s_dummyy_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
-          + s_dummyy_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
-          + s_dummyy_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
-          + s_dummyy_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+      tempy1l_att = s_dummyy_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+        + s_dummyy_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+        + s_dummyy_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+        + s_dummyy_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+        + s_dummyy_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
 
-        tempz1l_att = s_dummyz_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
-          + s_dummyz_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
-          + s_dummyz_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
-          + s_dummyz_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
-          + s_dummyz_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+      tempz1l_att = s_dummyz_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+        + s_dummyz_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+        + s_dummyz_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+        + s_dummyz_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+        + s_dummyz_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
 
-        tempx2l_att = s_dummyx_loc_att[K*NGLL2+I]*d_hprime_xx[J]
-          + s_dummyx_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
-          + s_dummyx_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
-          + s_dummyx_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
-          + s_dummyx_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+      tempx2l_att = s_dummyx_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+        + s_dummyx_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+        + s_dummyx_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+        + s_dummyx_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+        + s_dummyx_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
 
-        tempy2l_att = s_dummyy_loc_att[K*NGLL2+I]*d_hprime_xx[J]
-          + s_dummyy_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
-          + s_dummyy_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
-          + s_dummyy_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
-          + s_dummyy_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+      tempy2l_att = s_dummyy_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+        + s_dummyy_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+        + s_dummyy_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+        + s_dummyy_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+        + s_dummyy_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
 
-        tempz2l_att = s_dummyz_loc_att[K*NGLL2+I]*d_hprime_xx[J]
-          + s_dummyz_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
-          + s_dummyz_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
-          + s_dummyz_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
-          + s_dummyz_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+      tempz2l_att = s_dummyz_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+        + s_dummyz_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+        + s_dummyz_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+        + s_dummyz_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+        + s_dummyz_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
 
-        tempx3l_att = s_dummyx_loc_att[J*NGLLX+I]*d_hprime_xx[K]
-          + s_dummyx_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
-          + s_dummyx_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
-          + s_dummyx_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
-          + s_dummyx_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+      tempx3l_att = s_dummyx_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+        + s_dummyx_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+        + s_dummyx_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+        + s_dummyx_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+        + s_dummyx_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
 
-        tempy3l_att = s_dummyy_loc_att[J*NGLLX+I]*d_hprime_xx[K]
-          + s_dummyy_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
-          + s_dummyy_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
-          + s_dummyy_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
-          + s_dummyy_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+      tempy3l_att = s_dummyy_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+        + s_dummyy_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+        + s_dummyy_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+        + s_dummyy_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+        + s_dummyy_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
 
-        tempz3l_att = s_dummyz_loc_att[J*NGLLX+I]*d_hprime_xx[K]
-          + s_dummyz_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
-          + s_dummyz_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
-          + s_dummyz_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
-          + s_dummyz_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
-      }
+      tempz3l_att = s_dummyz_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+        + s_dummyz_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+        + s_dummyz_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+        + s_dummyz_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+        + s_dummyz_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+    }
 
 #endif
 
 // compute derivatives of ux, uy and uz with respect to x, y and z
-      offset = working_element*NGLL3_ALIGN + tx;
+    offset = working_element*NGLL3_PADDED + tx;
 
-      xixl = d_xix[offset];
-      xiyl = d_xiy[offset];
-      xizl = d_xiz[offset];
-      etaxl = d_etax[offset];
-      etayl = d_etay[offset];
-      etazl = d_etaz[offset];
-      gammaxl = d_gammax[offset];
-      gammayl = d_gammay[offset];
-      gammazl = d_gammaz[offset];
+    xixl = d_xix[offset];
+    xiyl = d_xiy[offset];
+    xizl = d_xiz[offset];
+    etaxl = d_etax[offset];
+    etayl = d_etay[offset];
+    etazl = d_etaz[offset];
+    gammaxl = d_gammax[offset];
+    gammayl = d_gammay[offset];
+    gammazl = d_gammaz[offset];
 
-      duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l;
-      duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l;
-      duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l;
+    duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l;
+    duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l;
+    duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l;
 
-      duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l;
-      duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l;
-      duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l;
+    duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l;
+    duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l;
+    duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l;
 
-      duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l;
-      duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l;
-      duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l;
+    duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l;
+    duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l;
+    duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l;
 
-      // precompute some sums to save CPU time
-      duxdxl_plus_duydyl = duxdxl + duydyl;
-      duxdxl_plus_duzdzl = duxdxl + duzdzl;
-      duydyl_plus_duzdzl = duydyl + duzdzl;
-      duxdyl_plus_duydxl = duxdyl + duydxl;
-      duzdxl_plus_duxdzl = duzdxl + duxdzl;
-      duzdyl_plus_duydzl = duzdyl + duydzl;
+    // precompute some sums to save CPU time
+    duxdxl_plus_duydyl = duxdxl + duydyl;
+    duxdxl_plus_duzdzl = duxdxl + duzdzl;
+    duydyl_plus_duzdzl = duydyl + duzdzl;
+    duxdyl_plus_duydxl = duxdyl + duydxl;
+    duzdxl_plus_duxdzl = duzdxl + duxdzl;
+    duzdyl_plus_duydzl = duzdyl + duydzl;
 
-      if(ATTENUATION){
-        // temporary variables used for fixing attenuation in a consistent way
-        duxdxl_att = xixl*tempx1l_att + etaxl*tempx2l_att + gammaxl*tempx3l_att;
-        duxdyl_att = xiyl*tempx1l_att + etayl*tempx2l_att + gammayl*tempx3l_att;
-        duxdzl_att = xizl*tempx1l_att + etazl*tempx2l_att + gammazl*tempx3l_att;
+    if(ATTENUATION){
+      // temporary variables used for fixing attenuation in a consistent way
+      duxdxl_att = xixl*tempx1l_att + etaxl*tempx2l_att + gammaxl*tempx3l_att;
+      duxdyl_att = xiyl*tempx1l_att + etayl*tempx2l_att + gammayl*tempx3l_att;
+      duxdzl_att = xizl*tempx1l_att + etazl*tempx2l_att + gammazl*tempx3l_att;
 
-        duydxl_att = xixl*tempy1l_att + etaxl*tempy2l_att + gammaxl*tempy3l_att;
-        duydyl_att = xiyl*tempy1l_att + etayl*tempy2l_att + gammayl*tempy3l_att;
-        duydzl_att = xizl*tempy1l_att + etazl*tempy2l_att + gammazl*tempy3l_att;
+      duydxl_att = xixl*tempy1l_att + etaxl*tempy2l_att + gammaxl*tempy3l_att;
+      duydyl_att = xiyl*tempy1l_att + etayl*tempy2l_att + gammayl*tempy3l_att;
+      duydzl_att = xizl*tempy1l_att + etazl*tempy2l_att + gammazl*tempy3l_att;
 
-        duzdxl_att = xixl*tempz1l_att + etaxl*tempz2l_att + gammaxl*tempz3l_att;
-        duzdyl_att = xiyl*tempz1l_att + etayl*tempz2l_att + gammayl*tempz3l_att;
-        duzdzl_att = xizl*tempz1l_att + etazl*tempz2l_att + gammazl*tempz3l_att;
+      duzdxl_att = xixl*tempz1l_att + etaxl*tempz2l_att + gammaxl*tempz3l_att;
+      duzdyl_att = xiyl*tempz1l_att + etayl*tempz2l_att + gammayl*tempz3l_att;
+      duzdzl_att = xizl*tempz1l_att + etazl*tempz2l_att + gammazl*tempz3l_att;
 
-        // precompute some sums to save CPU time
-        duxdyl_plus_duydxl_att = duxdyl_att + duydxl_att;
-        duzdxl_plus_duxdzl_att = duzdxl_att + duxdzl_att;
-        duzdyl_plus_duydzl_att = duzdyl_att + duydzl_att;
+      // precompute some sums to save CPU time
+      duxdyl_plus_duydxl_att = duxdyl_att + duydxl_att;
+      duzdxl_plus_duxdzl_att = duzdxl_att + duxdzl_att;
+      duzdyl_plus_duydzl_att = duzdyl_att + duydzl_att;
 
-        // computes deviatoric strain attenuation and/or for kernel calculations
-        if(COMPUTE_AND_STORE_STRAIN) {
-          realw templ = 0.33333333333333333333f * (duxdxl_att + duydyl_att + duzdzl_att); // 1./3. = 0.33333
+      // computes deviatoric strain attenuation and/or for kernel calculations
+      if(COMPUTE_AND_STORE_STRAIN) {
+        realw templ = 0.33333333333333333333f * (duxdxl_att + duydyl_att + duzdzl_att); // 1./3. = 0.33333
 
-          // local storage: stresses at this current time step
-          epsilondev_xx_loc = duxdxl_att - templ;
-          epsilondev_yy_loc = duydyl_att - templ;
-          epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl_att;
-          epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl_att;
-          epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl_att;
+        // local storage: stresses at this current time step
+        epsilondev_xx_loc = duxdxl_att - templ;
+        epsilondev_yy_loc = duydyl_att - templ;
+        epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl_att;
+        epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl_att;
+        epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl_att;
 
-          if(SIMULATION_TYPE == 3) {
-            epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
-          }
+        if(SIMULATION_TYPE == 3) {
+          epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
         }
-      }else{
-        // computes deviatoric strain attenuation and/or for kernel calculations
-        if(COMPUTE_AND_STORE_STRAIN) {
-          realw templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
+      }
+    }else{
+      // computes deviatoric strain attenuation and/or for kernel calculations
+      if(COMPUTE_AND_STORE_STRAIN) {
+        realw templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
 
-          // local storage: stresses at this current time step
-          epsilondev_xx_loc = duxdxl - templ;
-          epsilondev_yy_loc = duydyl - templ;
-          epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl;
-          epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl;
-          epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl;
+        // local storage: stresses at this current time step
+        epsilondev_xx_loc = duxdxl - templ;
+        epsilondev_yy_loc = duydyl - templ;
+        epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl;
+        epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl;
+        epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl;
 
-          if(SIMULATION_TYPE == 3) {
-            epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
-          }
+        if(SIMULATION_TYPE == 3) {
+          epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
         }
       }
+    }
 
-      // compute elements with an elastic isotropic rheology
-      kappal = d_kappav[offset];
-      mul = d_muv[offset];
+    // compute elements with an elastic isotropic rheology
+    kappal = d_kappav[offset];
+    mul = d_muv[offset];
 
-      // attenuation
-      if(ATTENUATION){
-        // use unrelaxed parameters if attenuation
-        if( ATTENUATION_3D ){
-          mul_iso  = mul * one_minus_sum_beta[tx+working_element*NGLL3]; // (i,j,k,ispec)
-          mul_aniso = mul *( one_minus_sum_beta[tx+working_element*NGLL3] - 1.0f );
-        }else{
-          mul_iso  = mul * one_minus_sum_beta[working_element]; // (1,1,1,ispec)
-          mul_aniso = mul *( one_minus_sum_beta[working_element] - 1.0f );        
-        }
+    // attenuation
+    if(ATTENUATION){
+      // use unrelaxed parameters if attenuation
+      if( ATTENUATION_3D ){
+        mul_iso  = mul * one_minus_sum_beta[tx+working_element*NGLL3]; // (i,j,k,ispec)
+        mul_aniso = mul *( one_minus_sum_beta[tx+working_element*NGLL3] - 1.0f );
       }else{
-        mul_iso = mul;
+        mul_iso  = mul * one_minus_sum_beta[working_element]; // (1,1,1,ispec)
+        mul_aniso = mul *( one_minus_sum_beta[working_element] - 1.0f );
       }
+    }else{
+      mul_iso = mul;
+    }
 
-      // full anisotropic case, stress calculations
-      if(ANISOTROPY){
+    // full anisotropic case, stress calculations
+    if(ANISOTROPY){
 
-        // elastic tensor for hexagonal symmetry in reduced notation:
-        //
-        //      c11 c12 c13  0   0        0
-        //      c12 c11 c13  0   0        0
-        //      c13 c13 c33  0   0        0
-        //       0   0   0  c44  0        0
-        //       0   0   0   0  c44       0
-        //       0   0   0   0   0  (c11-c12)/2
-        //
-        //       in terms of the A, C, L, N and F of Love (1927):
-        //
-        //       c11 = A
-        //       c12 = A-2N
-        //       c13 = F
-        //       c33 = C
-        //       c44 = L
+      // elastic tensor for hexagonal symmetry in reduced notation:
+      //
+      //      c11 c12 c13  0   0        0
+      //      c12 c11 c13  0   0        0
+      //      c13 c13 c33  0   0        0
+      //       0   0   0  c44  0        0
+      //       0   0   0   0  c44       0
+      //       0   0   0   0   0  (c11-c12)/2
+      //
+      //       in terms of the A, C, L, N and F of Love (1927):
+      //
+      //       c11 = A
+      //       c12 = A-2N
+      //       c13 = F
+      //       c33 = C
+      //       c44 = L
 
-        c11 = d_c11store[offset];
-        c12 = d_c12store[offset];
-        c13 = d_c13store[offset];
-        c33 = d_c33store[offset];
-        c44 = d_c44store[offset];
+      c11 = d_c11store[offset];
+      c12 = d_c12store[offset];
+      c13 = d_c13store[offset];
+      c33 = d_c33store[offset];
+      c44 = d_c44store[offset];
 
-        // use unrelaxed parameters if attenuation
-        if( ATTENUATION){
-          c11 = c11 + 1.33333333333333333333f * mul_aniso; // FOUR_THIRDS = 1.33333
-          c12 = c12 - 0.66666666666666666666f * mul_aniso; // TWO_THIRDS = 0.66666666666666666666f
-          c13 = c13 - 0.66666666666666666666f * mul_aniso;
-          c33 = c33 + 1.33333333333333333333f * mul_aniso;
-          c44 = c44 + mul_aniso;
-        }
+      // use unrelaxed parameters if attenuation
+      if( ATTENUATION){
+        c11 = c11 + 1.33333333333333333333f * mul_aniso; // FOUR_THIRDS = 1.33333
+        c12 = c12 - 0.66666666666666666666f * mul_aniso; // TWO_THIRDS = 0.66666666666666666666f
+        c13 = c13 - 0.66666666666666666666f * mul_aniso;
+        c33 = c33 + 1.33333333333333333333f * mul_aniso;
+        c44 = c44 + mul_aniso;
+      }
 
-        sigma_xx = c11*duxdxl + c12*duydyl + c13*duzdzl;
-        sigma_yy = c12*duxdxl + c11*duydyl + c13*duzdzl;
-        sigma_zz = c13*duxdxl + c13*duydyl + c33*duzdzl;
-        sigma_xy = 0.5f*(c11-c12)*duxdyl_plus_duydxl;
-        sigma_xz = c44*duzdxl_plus_duxdzl;
-        sigma_yz = c44*duzdyl_plus_duydzl;
+      sigma_xx = c11*duxdxl + c12*duydyl + c13*duzdzl;
+      sigma_yy = c12*duxdxl + c11*duydyl + c13*duzdzl;
+      sigma_zz = c13*duxdxl + c13*duydyl + c33*duzdzl;
+      sigma_xy = 0.5f*(c11-c12)*duxdyl_plus_duydxl;
+      sigma_xz = c44*duzdxl_plus_duxdzl;
+      sigma_yz = c44*duzdyl_plus_duydzl;
 
-      }else{
+    }else{
 
-        // isotropic case
+      // isotropic case
 
-        lambdalplus2mul = kappal + 1.33333333333333333333f * mul_iso;  // 4./3. = 1.3333333
-        lambdal = lambdalplus2mul - 2.0f * mul_iso;
+      lambdalplus2mul = kappal + 1.33333333333333333333f * mul_iso;  // 4./3. = 1.3333333
+      lambdal = lambdalplus2mul - 2.0f * mul_iso;
 
-        // compute the six components of the stress tensor sigma
-        sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl;
-        sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl;
-        sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl;
+      // compute the six components of the stress tensor sigma
+      sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl;
+      sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl;
+      sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl;
 
-        sigma_xy = mul*duxdyl_plus_duydxl;
-        sigma_xz = mul*duzdxl_plus_duxdzl;
-        sigma_yz = mul*duzdyl_plus_duydzl;
-      }
+      sigma_xy = mul*duxdyl_plus_duydxl;
+      sigma_xz = mul*duzdxl_plus_duxdzl;
+      sigma_yz = mul*duzdyl_plus_duydzl;
+    }
 
-      if(ATTENUATION && ( ! USE_ATTENUATION_MIMIC ) ){
-        // subtracts memory variables if attenuation
-        compute_element_ic_att_stress(tx,working_element,
-                                      R_xx,R_yy,R_xy,R_xz,R_yz,
-                                      &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_xz,&sigma_yz);
-      }
+    if(ATTENUATION && ( ! USE_ATTENUATION_MIMIC ) ){
+      // subtracts memory variables if attenuation
+      compute_element_ic_att_stress(tx,working_element,
+                                    R_xx,R_yy,R_xy,R_xz,R_yz,
+                                    &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_xz,&sigma_yz);
+    }
 
-      // define symmetric components (needed for non-symmetric dot product and sigma for gravity)
-      sigma_yx = sigma_xy;
-      sigma_zx = sigma_xz;
-      sigma_zy = sigma_yz;
+    // define symmetric components (needed for non-symmetric dot product and sigma for gravity)
+    sigma_yx = sigma_xy;
+    sigma_zx = sigma_xz;
+    sigma_zy = sigma_yz;
 
-      // jacobian
-      jacobianl = 1.0f / (xixl*(etayl*gammazl-etazl*gammayl)
-                          -xiyl*(etaxl*gammazl-etazl*gammaxl)
-                          +xizl*(etaxl*gammayl-etayl*gammaxl));
+    // jacobian
+    jacobianl = 1.0f / (xixl*(etayl*gammazl-etazl*gammayl)
+                        -xiyl*(etaxl*gammazl-etazl*gammaxl)
+                        +xizl*(etaxl*gammayl-etayl*gammaxl));
 
-      if( GRAVITY ){
-        //  computes non-symmetric terms for gravity
-        compute_element_ic_gravity(tx,working_element,
-                                   d_ibool,d_xstore,d_ystore,d_zstore,
-                                   d_minus_gravity_table,d_minus_deriv_gravity_table,d_density_table,
-                                   wgll_cube,jacobianl,
-                                   s_dummyx_loc,s_dummyy_loc,s_dummyz_loc,
-                                   &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_yx,
-                                   &sigma_xz,&sigma_zx,&sigma_yz,&sigma_zy,
-                                   &rho_s_H1,&rho_s_H2,&rho_s_H3);
-      }
+    if( GRAVITY ){
+      //  computes non-symmetric terms for gravity
+      compute_element_ic_gravity(tx,working_element,
+                                 d_ibool,d_xstore,d_ystore,d_zstore,
+                                 d_minus_gravity_table,d_minus_deriv_gravity_table,d_density_table,
+                                 wgll_cube,jacobianl,
+                                 s_dummyx_loc,s_dummyy_loc,s_dummyz_loc,
+                                 &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_yx,
+                                 &sigma_xz,&sigma_zx,&sigma_yz,&sigma_zy,
+                                 &rho_s_H1,&rho_s_H2,&rho_s_H3);
+    }
 
-      // form dot product with test vector, non-symmetric form
-      s_tempx1[tx] = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl);
-      s_tempy1[tx] = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl);
-      s_tempz1[tx] = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl);
+    // form dot product with test vector, non-symmetric form
+    s_tempx1[tx] = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl);
+    s_tempy1[tx] = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl);
+    s_tempz1[tx] = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl);
 
-      s_tempx2[tx] = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl);
-      s_tempy2[tx] = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl);
-      s_tempz2[tx] = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl);
+    s_tempx2[tx] = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl);
+    s_tempy2[tx] = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl);
+    s_tempz2[tx] = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl);
 
-      s_tempx3[tx] = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl);
-      s_tempy3[tx] = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl);
-      s_tempz3[tx] = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl);
+    s_tempx3[tx] = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl);
+    s_tempy3[tx] = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl);
+    s_tempz3[tx] = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl);
 
-    }
+  }
 
 // synchronize all the threads (one thread for each of the NGLL grid points of the
 // current spectral element) because we need the whole element to be ready in order
 // to be able to compute the matrix products along cut planes of the 3D element below
-    __syncthreads();
+  __syncthreads();
 
-    if (active) {
+  if (active) {
 
 #ifndef MANUALLY_UNROLLED_LOOPS
 
-      tempx1l = 0.f;
-      tempy1l = 0.f;
-      tempz1l = 0.f;
+    tempx1l = 0.f;
+    tempy1l = 0.f;
+    tempz1l = 0.f;
 
-      tempx2l = 0.f;
-      tempy2l = 0.f;
-      tempz2l = 0.f;
+    tempx2l = 0.f;
+    tempy2l = 0.f;
+    tempz2l = 0.f;
 
-      tempx3l = 0.f;
-      tempy3l = 0.f;
-      tempz3l = 0.f;
+    tempx3l = 0.f;
+    tempy3l = 0.f;
+    tempz3l = 0.f;
 
-      for (l=0;l<NGLLX;l++) {
+    for (l=0;l<NGLLX;l++) {
 
-        fac1 = d_hprimewgll_xx[I*NGLLX+l];
-        offset = K*NGLL2+J*NGLLX+l;
-        tempx1l += s_tempx1[offset]*fac1;
-        tempy1l += s_tempy1[offset]*fac1;
-        tempz1l += s_tempz1[offset]*fac1;
+      fac1 = d_hprimewgll_xx[I*NGLLX+l];
+      offset = K*NGLL2+J*NGLLX+l;
+      tempx1l += s_tempx1[offset]*fac1;
+      tempy1l += s_tempy1[offset]*fac1;
+      tempz1l += s_tempz1[offset]*fac1;
 
-        fac2 = d_hprimewgll_yy[J*NGLLX+l];
-        offset = K*NGLL2+l*NGLLX+I;
-        tempx2l += s_tempx2[offset]*fac2;
-        tempy2l += s_tempy2[offset]*fac2;
-        tempz2l += s_tempz2[offset]*fac2;
+      fac2 = d_hprimewgll_yy[J*NGLLX+l];
+      offset = K*NGLL2+l*NGLLX+I;
+      tempx2l += s_tempx2[offset]*fac2;
+      tempy2l += s_tempy2[offset]*fac2;
+      tempz2l += s_tempz2[offset]*fac2;
 
-        fac3 = d_hprimewgll_zz[K*NGLLX+l];
-        offset = l*NGLL2+J*NGLLX+I;
-        tempx3l += s_tempx3[offset]*fac3;
-        tempy3l += s_tempy3[offset]*fac3;
-        tempz3l += s_tempz3[offset]*fac3;
+      fac3 = d_hprimewgll_zz[K*NGLLX+l];
+      offset = l*NGLL2+J*NGLLX+I;
+      tempx3l += s_tempx3[offset]*fac3;
+      tempy3l += s_tempy3[offset]*fac3;
+      tempz3l += s_tempz3[offset]*fac3;
 
-      }
+    }
 #else
 
-      tempx1l = s_tempx1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
-              + s_tempx1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
-              + s_tempx1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
-              + s_tempx1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
-              + s_tempx1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+    tempx1l = s_tempx1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+            + s_tempx1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+            + s_tempx1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+            + s_tempx1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+            + s_tempx1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
 
-      tempy1l = s_tempy1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
-              + s_tempy1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
-              + s_tempy1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
-              + s_tempy1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
-              + s_tempy1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+    tempy1l = s_tempy1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+            + s_tempy1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+            + s_tempy1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+            + s_tempy1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+            + s_tempy1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
 
-      tempz1l = s_tempz1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
-              + s_tempz1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
-              + s_tempz1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
-              + s_tempz1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
-              + s_tempz1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+    tempz1l = s_tempz1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+            + s_tempz1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+            + s_tempz1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+            + s_tempz1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+            + s_tempz1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
 
-      tempx2l = s_tempx2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
-              + s_tempx2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
-              + s_tempx2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
-              + s_tempx2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
-              + s_tempx2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
+    tempx2l = s_tempx2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
+            + s_tempx2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
+            + s_tempx2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
+            + s_tempx2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
+            + s_tempx2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
 
-      tempy2l = s_tempy2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
-              + s_tempy2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
-              + s_tempy2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
-              + s_tempy2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
-              + s_tempy2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
+    tempy2l = s_tempy2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
+            + s_tempy2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
+            + s_tempy2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
+            + s_tempy2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
+            + s_tempy2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
 
-      tempz2l = s_tempz2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
-              + s_tempz2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
-              + s_tempz2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
-              + s_tempz2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
-              + s_tempz2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
+    tempz2l = s_tempz2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
+            + s_tempz2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
+            + s_tempz2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
+            + s_tempz2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
+            + s_tempz2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
 
-      tempx3l = s_tempx3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
-              + s_tempx3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
-              + s_tempx3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
-              + s_tempx3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
-              + s_tempx3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
+    tempx3l = s_tempx3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
+            + s_tempx3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
+            + s_tempx3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
+            + s_tempx3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
+            + s_tempx3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
 
-      tempy3l = s_tempy3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
-              + s_tempy3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
-              + s_tempy3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
-              + s_tempy3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
-              + s_tempy3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
+    tempy3l = s_tempy3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
+            + s_tempy3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
+            + s_tempy3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
+            + s_tempy3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
+            + s_tempy3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
 
-      tempz3l = s_tempz3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
-              + s_tempz3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
-              + s_tempz3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
-              + s_tempz3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
-              + s_tempz3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
+    tempz3l = s_tempz3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
+            + s_tempz3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
+            + s_tempz3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
+            + s_tempz3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
+            + s_tempz3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
 
 #endif
 
-      fac1 = d_wgllwgll_yz[K*NGLLX+J];
-      fac2 = d_wgllwgll_xz[K*NGLLX+I];
-      fac3 = d_wgllwgll_xy[J*NGLLX+I];
+    fac1 = d_wgllwgll_yz[K*NGLLX+J];
+    fac2 = d_wgllwgll_xz[K*NGLLX+I];
+    fac3 = d_wgllwgll_xy[J*NGLLX+I];
 
-      sum_terms1 = - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
-      sum_terms2 = - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
-      sum_terms3 = - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
+    sum_terms1 = - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
+    sum_terms2 = - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
+    sum_terms3 = - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
 
-      // adds gravity term
-      if( GRAVITY ){
-        sum_terms1 += rho_s_H1;
-        sum_terms2 += rho_s_H2;
-        sum_terms3 += rho_s_H3;
-      }
+    // adds gravity term
+    if( GRAVITY ){
+      sum_terms1 += rho_s_H1;
+      sum_terms2 += rho_s_H2;
+      sum_terms3 += rho_s_H3;
+    }
 
-#ifdef USE_TEXTURES
-      d_accel[iglob] = tex1Dfetch(tex_accel, iglob) + sum_terms1);
-      d_accel[iglob + NGLOB] = tex1Dfetch(tex_accel, iglob + NGLOB) + sum_terms2);
-      d_accel[iglob + 2*NGLOB] = tex1Dfetch(tex_accel, iglob + 2*NGLOB) + sum_terms3);
+
+#ifdef USE_MESH_COLORING_GPU
+    // no atomic operation needed, colors don't share global points between elements
+
+#ifdef USE_TEXTURES_FIELDS
+    d_accel[iglob*3]     = tex1Dfetch(d_accel_ic_tex, iglob*3) + sum_terms1;
+    d_accel[iglob*3 + 1] = tex1Dfetch(d_accel_ic_tex, iglob*3 + 1) + sum_terms2;
+    d_accel[iglob*3 + 2] = tex1Dfetch(d_accel_ic_tex, iglob*3 + 2) + sum_terms3;
 #else
-  /* OLD/To be implemented version that uses coloring to get around race condition. About 1.6x faster */
+    d_accel[iglob*3]     += sum_terms1;
+    d_accel[iglob*3 + 1] += sum_terms2;
+    d_accel[iglob*3 + 2] += sum_terms3;
+#endif // USE_TEXTURES_FIELDS
 
+#else // MESH_COLORING
 
-#ifdef USE_MESH_COLORING_GPU
-      // no atomic operation needed, colors don't share global points between elements
+    //mesh coloring
+    if( use_mesh_coloring_gpu ){
+
+     // no atomic operation needed, colors don't share global points between elements
+#ifdef USE_TEXTURES_FIELDS
+      d_accel[iglob*3]     = tex1Dfetch(d_accel_ic_tex, iglob*3) + sum_terms1;
+      d_accel[iglob*3 + 1] = tex1Dfetch(d_accel_ic_tex, iglob*3 + 1) + sum_terms2;
+      d_accel[iglob*3 + 2] = tex1Dfetch(d_accel_ic_tex, iglob*3 + 2) + sum_terms3;
+#else
       d_accel[iglob*3]     += sum_terms1;
       d_accel[iglob*3 + 1] += sum_terms2;
       d_accel[iglob*3 + 2] += sum_terms3;
-#else
-      //mesh coloring
-      if( use_mesh_coloring_gpu ){
+#endif // USE_TEXTURES_FIELDS
 
-       // no atomic operation needed, colors don't share global points between elements
-        d_accel[iglob*3]     += sum_terms1;
-        d_accel[iglob*3 + 1] += sum_terms2;
-        d_accel[iglob*3 + 2] += sum_terms3;
+    }else{
 
-      }else{
+      // for testing purposes only: w/out atomic updates
+      //d_accel[iglob*3] -= (0.00000001f*tempx1l + 0.00000001f*tempx2l + 0.00000001f*tempx3l);
+      //d_accel[iglob*3 + 1] -= (0.00000001f*tempy1l + 0.00000001f*tempy2l + 0.00000001f*tempy3l);
+      //d_accel[iglob*3 + 2] -= (0.00000001f*tempz1l + 0.00000001f*tempz2l + 0.00000001f*tempz3l);
 
-        // for testing purposes only: w/out atomic updates
-        //d_accel[iglob*3] -= (0.00000001f*tempx1l + 0.00000001f*tempx2l + 0.00000001f*tempx3l);
-        //d_accel[iglob*3 + 1] -= (0.00000001f*tempy1l + 0.00000001f*tempy2l + 0.00000001f*tempy3l);
-        //d_accel[iglob*3 + 2] -= (0.00000001f*tempz1l + 0.00000001f*tempz2l + 0.00000001f*tempz3l);
+      atomicAdd(&d_accel[iglob*3], sum_terms1);
+      atomicAdd(&d_accel[iglob*3+1], sum_terms2);
+      atomicAdd(&d_accel[iglob*3+2], sum_terms3);
 
-        atomicAdd(&d_accel[iglob*3], sum_terms1);
-        atomicAdd(&d_accel[iglob*3+1], sum_terms2);
-        atomicAdd(&d_accel[iglob*3+2], sum_terms3);
+    }
+#endif // MESH_COLORING
 
-      }
-#endif
+    // update memory variables based upon the Runge-Kutta scheme
+    if( ATTENUATION && ! USE_ATTENUATION_MIMIC ){
+      compute_element_ic_att_memory(tx,working_element,
+                                d_muv,
+                                factor_common,alphaval,betaval,gammaval,
+                                R_xx,R_yy,R_xy,R_xz,R_yz,
+                                epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz,
+                                epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc,
+                                ATTENUATION_3D);
+    }
 
-#endif
-
-      // update memory variables based upon the Runge-Kutta scheme
-      if( ATTENUATION && ! USE_ATTENUATION_MIMIC ){
-        compute_element_ic_att_memory(tx,working_element,
-                                  d_muv,
-                                  factor_common,alphaval,betaval,gammaval,
-                                  R_xx,R_yy,R_xy,R_xz,R_yz,
-                                  epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz,
-                                  epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc,
-                                  ATTENUATION_3D);
-      }
-
-      // save deviatoric strain for Runge-Kutta scheme
-      if( COMPUTE_AND_STORE_STRAIN ){
-        int ijk_ispec = tx + working_element*NGLL3;
-
-        // fortran: epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
-        epsilondev_xx[ijk_ispec] = epsilondev_xx_loc;
-        epsilondev_yy[ijk_ispec] = epsilondev_yy_loc;
-        epsilondev_xy[ijk_ispec] = epsilondev_xy_loc;
-        epsilondev_xz[ijk_ispec] = epsilondev_xz_loc;
-        epsilondev_yz[ijk_ispec] = epsilondev_yz_loc;
-      }
-
+    // save deviatoric strain for Runge-Kutta scheme
+    if( COMPUTE_AND_STORE_STRAIN ){
+      // fortran: epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+      epsilondev_xx[tx + working_element*NGLL3] = epsilondev_xx_loc;
+      epsilondev_yy[tx + working_element*NGLL3] = epsilondev_yy_loc;
+      epsilondev_xy[tx + working_element*NGLL3] = epsilondev_xy_loc;
+      epsilondev_xz[tx + working_element*NGLL3] = epsilondev_xz_loc;
+      epsilondev_yz[tx + working_element*NGLL3] = epsilondev_yz_loc;
     }
-
-#else  // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
-    d_accel[iglob] -= 0.00000001f;
-    d_accel[iglob + NGLOB] -= 0.00000001f;
-    d_accel[iglob + 2*NGLOB] -= 0.00000001f;
-#endif // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
+  }
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -1130,7 +1134,7 @@
                                              d_xix, d_xiy, d_xiz,
                                              d_etax, d_etay, d_etaz,
                                              d_gammax, d_gammay, d_gammaz,
-                                             mp->d_hprime_xx, mp->d_hprime_yy, mp->d_hprime_zz,
+                                             mp->d_hprime_xx,
                                              mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
                                              mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
                                              d_kappav, d_muv,
@@ -1177,7 +1181,7 @@
                                                 d_xix, d_xiy, d_xiz,
                                                 d_etax, d_etay, d_etaz,
                                                 d_gammax, d_gammay, d_gammaz,
-                                                mp->d_hprime_xx, mp->d_hprime_yy, mp->d_hprime_zz,
+                                                mp->d_hprime_xx,
                                                 mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
                                                 mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
                                                 d_kappav, d_muv,
@@ -1283,7 +1287,7 @@
       if( mp->attenuation_3D ){
         color_offset_nonpadded_att2 = (mp->nspec_outer_inner_core) * NGLL3 * N_SLS;
       }else{
-        color_offset_nonpadded_att2 = (mp->nspec_outer_inner_core) * 1 * N_SLS;      
+        color_offset_nonpadded_att2 = (mp->nspec_outer_inner_core) * 1 * N_SLS;
       }
       color_offset_ispec = mp->nspec_outer_inner_core;
     }
@@ -1354,7 +1358,7 @@
       if( mp->attenuation_3D ){
         color_offset_nonpadded_att2 += nb_blocks_to_compute * NGLL3 * N_SLS;
       }else{
-        color_offset_nonpadded_att2 += nb_blocks_to_compute * 1 * N_SLS;      
+        color_offset_nonpadded_att2 += nb_blocks_to_compute * 1 * N_SLS;
       }
       // for array(ispec)
       color_offset_ispec += nb_blocks_to_compute;

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu	2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu	2012-07-23 21:58:26 UTC (rev 20536)
@@ -37,6 +37,15 @@
 #include "config.h"
 #include "mesh_constants_cuda.h"
 
+#ifdef USE_TEXTURES_FIELDS
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_displ_oc_tex;
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_accel_oc_tex;
+#endif
+
+#ifdef USE_TEXTURES_CONSTANTS
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_hprime_xx_oc_tex;
+#endif
+
 /* ----------------------------------------------------------------------------------------------- */
 
 // elemental routines
@@ -51,14 +60,14 @@
                                             realw deltat,
                                             realw* d_A_array_rotation,
                                             realw* d_B_array_rotation,
-                                            reald dpotentialdxl, reald dpotentialdyl,
-                                            reald* dpotentialdx_with_rot,
-                                            reald* dpotentialdy_with_rot) {
+                                            realw dpotentialdxl, realw dpotentialdyl,
+                                            realw* dpotentialdx_with_rot,
+                                            realw* dpotentialdy_with_rot) {
 
-  reald two_omega_deltat,cos_two_omega_t,sin_two_omega_t;
-  reald A_rotation,B_rotation;
-  reald ux_rotation,uy_rotation;
-  reald source_euler_A,source_euler_B;
+  realw two_omega_deltat,cos_two_omega_t,sin_two_omega_t;
+  realw A_rotation,B_rotation;
+  realw ux_rotation,uy_rotation;
+  realw source_euler_A,source_euler_B;
 
   // non-padded offset
   int offset_nonpadded = tx + working_element*NGLL3;
@@ -108,7 +117,7 @@
                                        realw* d_xix, realw* d_xiy, realw* d_xiz,
                                        realw* d_etax, realw* d_etay, realw* d_etaz,
                                        realw* d_gammax, realw* d_gammay, realw* d_gammaz,
-                                       realw* hprime_xx, realw* hprime_yy, realw* hprime_zz,
+                                       realw* d_hprime_xx,
                                        realw* hprimewgll_xx, realw* hprimewgll_yy, realw* hprimewgll_zz,
                                        realw* wgllwgll_xy,realw* wgllwgll_xz,realw* wgllwgll_yz,
                                        int GRAVITY,
@@ -125,12 +134,10 @@
   int bx = blockIdx.y*gridDim.x+blockIdx.x;
   int tx = threadIdx.x;
 
-  //const int NGLL3 = NGLL3;
-  const int NGLL3_ALIGN = NGLL3_PADDED;
   // R_EARTH_KM is the radius of the bottom of the oceans (radius of Earth in km)
-  const reald R_EARTH_KM = 6371.0f;
+  //const realw R_EARTH_KM = 6371.0f;
   // uncomment line below for PREM with oceans
-  //const reald R_EARTH_KM = 6368.0f;
+  //const realw R_EARTH_KM = 6368.0f;
 
   int K = (tx/NGLL2);
   int J = ((tx-K*NGLL2)/NGLLX);
@@ -139,32 +146,34 @@
   int active,offset;
   int iglob = 0;
   int working_element;
-  reald temp1l,temp2l,temp3l;
-  reald xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
-  reald dpotentialdxl,dpotentialdyl,dpotentialdzl;
-  reald dpotentialdx_with_rot,dpotentialdy_with_rot;
-  reald fac1,fac2,fac3;
-  reald sum_terms;
-  reald gravity_term;
-  reald gxl,gyl,gzl;
-  reald radius,theta,phi;
-  reald cos_theta,sin_theta,cos_phi,sin_phi;
-  reald grad_x_ln_rho,grad_y_ln_rho,grad_z_ln_rho;
+
+  realw temp1l,temp2l,temp3l;
+  realw xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
+  realw dpotentialdxl,dpotentialdyl,dpotentialdzl;
+  realw dpotentialdx_with_rot,dpotentialdy_with_rot;
+  realw fac1,fac2,fac3;
+  realw sum_terms;
+  realw gravity_term;
+  realw gxl,gyl,gzl;
+  realw radius,theta,phi;
+  realw cos_theta,sin_theta,cos_phi,sin_phi;
+  realw grad_x_ln_rho,grad_y_ln_rho,grad_z_ln_rho;
   int int_radius;
 
 
 #ifndef MANUALLY_UNROLLED_LOOPS
   int l;
   int offset1,offset2,offset3;
-  realw hp1,hp2,hp3;
 #endif
 
-  __shared__ reald s_dummy_loc[NGLL3];
+  __shared__ realw s_dummy_loc[NGLL3];
 
-  __shared__ reald s_temp1[NGLL3];
-  __shared__ reald s_temp2[NGLL3];
-  __shared__ reald s_temp3[NGLL3];
+  __shared__ realw s_temp1[NGLL3];
+  __shared__ realw s_temp2[NGLL3];
+  __shared__ realw s_temp3[NGLL3];
 
+  __shared__ realw sh_hprime_xx[NGLL2];
+
 // use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads,
 // because we used memory padding from NGLL^3 = 125 to 128 to get coalescent memory accesses
   active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0;
@@ -185,31 +194,32 @@
     }
 #endif
 
-    // iglob = d_ibool[working_element*NGLL3_ALIGN + tx]-1;
+    // iglob = d_ibool[working_element*NGLL3_PADDED + tx]-1;
     iglob = d_ibool[working_element*NGLL3 + tx]-1;
 
-#ifdef USE_TEXTURES
-    s_dummy_loc[tx] = tex1Dfetch(tex_potential, iglob);
+#ifdef USE_TEXTURES_FIELDS
+    s_dummy_loc[tx] = tex1Dfetch(d_displ_oc_tex, iglob);
 #else
     // changing iglob indexing to match fortran row changes fast style
     s_dummy_loc[tx] = d_potential[iglob];
 #endif
   }
 
+  if (tx < NGLL2) {
+#ifdef USE_TEXTURES_CONSTANTS
+    sh_hprime_xx[tx] = tex1Dfetch(d_hprime_xx_oc_tex,tx);
+#else
+    sh_hprime_xx[tx] = d_hprime_xx[tx];
+#endif
+  }
+
 // synchronize all the threads (one thread for each of the NGLL grid points of the
 // current spectral element) because we need the whole element to be ready in order
 // to be able to compute the matrix products along cut planes of the 3D element below
   __syncthreads();
 
-#ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
-
   if (active) {
 
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-//      if(iglob == 0 )printf("kernel 2: iglob %i  hprime_xx %f %f %f \n",iglob,hprime_xx[0],hprime_xx[1],hprime_xx[2]);
-#endif
-
-
 #ifndef MANUALLY_UNROLLED_LOOPS
 
     temp1l = 0.f;
@@ -217,43 +227,43 @@
     temp3l = 0.f;
 
     for (l=0;l<NGLLX;l++) {
-        hp1 = hprime_xx[l*NGLLX+I];
-        offset1 = K*NGLL2+J*NGLLX+l;
-        temp1l += s_dummy_loc[offset1]*hp1;
+      fac1 = sh_hprime_xx[l*NGLLX+I];
+      offset1 = K*NGLL2+J*NGLLX+l;
+      temp1l += s_dummy_loc[offset1]*fac1;
 
-        //no more assumes that hprime_xx = hprime_yy = hprime_zz
-        hp2 = hprime_yy[l*NGLLX+J];
-        offset2 = K*NGLL2+l*NGLLX+I;
-        temp2l += s_dummy_loc[offset2]*hp2;
+      // assumes hprime_xx = hprime_yy = hprime_zz: the single shared table sh_hprime_xx is reused for all three directions
+      fac2 = sh_hprime_xx[l*NGLLX+J];
+      offset2 = K*NGLL2+l*NGLLX+I;
+      temp2l += s_dummy_loc[offset2]*fac2;
 
-        hp3 = hprime_zz[l*NGLLX+K];
-        offset3 = l*NGLL2+J*NGLLX+I;
-        temp3l += s_dummy_loc[offset3]*hp3;
+      fac3 = sh_hprime_xx[l*NGLLX+K];
+      offset3 = l*NGLL2+J*NGLLX+I;
+      temp3l += s_dummy_loc[offset3]*fac3;
     }
 #else
 
-    temp1l = s_dummy_loc[K*NGLL2+J*NGLLX]*hprime_xx[I]
-            + s_dummy_loc[K*NGLL2+J*NGLLX+1]*hprime_xx[NGLLX+I]
-            + s_dummy_loc[K*NGLL2+J*NGLLX+2]*hprime_xx[2*NGLLX+I]
-            + s_dummy_loc[K*NGLL2+J*NGLLX+3]*hprime_xx[3*NGLLX+I]
-            + s_dummy_loc[K*NGLL2+J*NGLLX+4]*hprime_xx[4*NGLLX+I];
+    temp1l = s_dummy_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+            + s_dummy_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+            + s_dummy_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+            + s_dummy_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+            + s_dummy_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
 
-    temp2l = s_dummy_loc[K*NGLL2+I]*hprime_yy[J]
-            + s_dummy_loc[K*NGLL2+NGLLX+I]*hprime_yy[NGLLX+J]
-            + s_dummy_loc[K*NGLL2+2*NGLLX+I]*hprime_yy[2*NGLLX+J]
-            + s_dummy_loc[K*NGLL2+3*NGLLX+I]*hprime_yy[3*NGLLX+J]
-            + s_dummy_loc[K*NGLL2+4*NGLLX+I]*hprime_yy[4*NGLLX+J];
+    temp2l = s_dummy_loc[K*NGLL2+I]*d_hprime_xx[J]
+            + s_dummy_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+            + s_dummy_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+            + s_dummy_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+            + s_dummy_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
 
-    temp3l = s_dummy_loc[J*NGLLX+I]*hprime_zz[K]
-            + s_dummy_loc[NGLL2+J*NGLLX+I]*hprime_zz[NGLLX+K]
-            + s_dummy_loc[2*NGLL2+J*NGLLX+I]*hprime_zz[2*NGLLX+K]
-            + s_dummy_loc[3*NGLL2+J*NGLLX+I]*hprime_zz[3*NGLLX+K]
-            + s_dummy_loc[4*NGLL2+J*NGLLX+I]*hprime_zz[4*NGLLX+K];
+    temp3l = s_dummy_loc[J*NGLLX+I]*d_hprime_xx[K]
+            + s_dummy_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+            + s_dummy_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+            + s_dummy_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+            + s_dummy_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
 
 #endif
 
     // compute derivatives of ux, uy and uz with respect to x, y and z
-    offset = working_element*NGLL3_ALIGN + tx;
+    offset = working_element*NGLL3_PADDED + tx;
 
     xixl = d_xix[offset];
     xiyl = d_xiy[offset];
@@ -414,41 +424,41 @@
     fac3 = wgllwgll_xy[J*NGLLX+I];
 
     sum_terms = -(fac1*temp1l + fac2*temp2l + fac3*temp3l);
+
     if( GRAVITY ) sum_terms += gravity_term;
 
-    iglob = d_ibool[working_element*NGLL3 + tx]-1;
+    //iglob = d_ibool[working_element*NGLL3 + tx]-1;
 
-#ifdef USE_TEXTURES
-    d_potential_dot_dot[iglob] = tex1Dfetch(tex_potential_dot_dot, iglob)
-                                            + sum_terms;
-#else
-
 #ifdef USE_MESH_COLORING_GPU
     // no atomic operation needed, colors don't share global points between elements
+
+#ifdef USE_TEXTURES_FIELDS
+    d_potential_dot_dot[iglob] = tex1Dfetch(d_accel_oc_tex, iglob) + sum_terms;
+#else
     d_potential_dot_dot[iglob] += sum_terms;
-#else
+#endif // USE_TEXTURES_FIELDS
+
+#else // MESH_COLORING
+
     //mesh coloring
     if( use_mesh_coloring_gpu ){
 
       // no atomic operation needed, colors don't share global points between elements
+#ifdef USE_TEXTURES_FIELDS
+      d_potential_dot_dot[iglob] = tex1Dfetch(d_accel_oc_tex, iglob) + sum_terms;
+#else
       d_potential_dot_dot[iglob] += sum_terms;
+#endif // USE_TEXTURES_FIELDS
 
     }else{
 
       atomicAdd(&d_potential_dot_dot[iglob],sum_terms);
 
     }
-#endif
-
-#endif
+#endif // MESH_COLORING
   }
-
-#else  // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
-  d_potential_dot_dot[iglob] = 123.123f;
-#endif // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
 }
 
-
 /* ----------------------------------------------------------------------------------------------- */
 
 void Kernel_2_outer_core(int nb_blocks_to_compute, Mesh* mp,
@@ -488,45 +498,18 @@
   // cudaEventRecord( start, 0 );
 
   Kernel_2_outer_core_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,
-                                                        mp->NGLOB_OUTER_CORE,
-                                                        d_ibool,
-                                                        mp->d_phase_ispec_inner_outer_core,
-                                                        mp->num_phase_ispec_outer_core,
-                                                        d_iphase,
-                                                        mp->use_mesh_coloring_gpu,
-                                                        mp->d_displ_outer_core,
-                                                        mp->d_accel_outer_core,
-                                                        d_xix, d_xiy, d_xiz,
-                                                        d_etax, d_etay, d_etaz,
-                                                        d_gammax, d_gammay, d_gammaz,
-                                                        mp->d_hprime_xx, mp->d_hprime_yy, mp->d_hprime_zz,
-                                                        mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
-                                                        mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
-                                                        mp->gravity,
-                                                        mp->d_xstore_outer_core,mp->d_ystore_outer_core,mp->d_zstore_outer_core,
-                                                        mp->d_d_ln_density_dr_table,
-                                                        mp->d_minus_rho_g_over_kappa_fluid,
-                                                        mp->d_wgll_cube,
-                                                        mp->rotation,
-                                                        time,
-                                                        mp->d_two_omega_earth,
-                                                        mp->d_deltat,
-                                                        d_A_array_rotation,d_B_array_rotation);
-
-  if(mp->simulation_type == 3) {
-    Kernel_2_outer_core_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,
                                                           mp->NGLOB_OUTER_CORE,
                                                           d_ibool,
                                                           mp->d_phase_ispec_inner_outer_core,
                                                           mp->num_phase_ispec_outer_core,
                                                           d_iphase,
                                                           mp->use_mesh_coloring_gpu,
-                                                          mp->d_b_displ_outer_core,
-                                                          mp->d_b_accel_outer_core,
+                                                          mp->d_displ_outer_core,
+                                                          mp->d_accel_outer_core,
                                                           d_xix, d_xiy, d_xiz,
                                                           d_etax, d_etay, d_etaz,
                                                           d_gammax, d_gammay, d_gammaz,
-                                                          mp->d_hprime_xx, mp->d_hprime_yy, mp->d_hprime_zz,
+                                                          mp->d_hprime_xx,
                                                           mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
                                                           mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
                                                           mp->gravity,
@@ -535,10 +518,37 @@
                                                           mp->d_minus_rho_g_over_kappa_fluid,
                                                           mp->d_wgll_cube,
                                                           mp->rotation,
-                                                          b_time,
-                                                          mp->d_b_two_omega_earth,
-                                                          mp->d_b_deltat,
-                                                          d_b_A_array_rotation,d_b_B_array_rotation);
+                                                          time,
+                                                          mp->d_two_omega_earth,
+                                                          mp->d_deltat,
+                                                          d_A_array_rotation,d_B_array_rotation);
+
+  if(mp->simulation_type == 3) {
+    Kernel_2_outer_core_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,
+                                                            mp->NGLOB_OUTER_CORE,
+                                                            d_ibool,
+                                                            mp->d_phase_ispec_inner_outer_core,
+                                                            mp->num_phase_ispec_outer_core,
+                                                            d_iphase,
+                                                            mp->use_mesh_coloring_gpu,
+                                                            mp->d_b_displ_outer_core,
+                                                            mp->d_b_accel_outer_core,
+                                                            d_xix, d_xiy, d_xiz,
+                                                            d_etax, d_etay, d_etaz,
+                                                            d_gammax, d_gammay, d_gammaz,
+                                                            mp->d_hprime_xx,
+                                                            mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
+                                                            mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+                                                            mp->gravity,
+                                                            mp->d_xstore_outer_core,mp->d_ystore_outer_core,mp->d_zstore_outer_core,
+                                                            mp->d_d_ln_density_dr_table,
+                                                            mp->d_minus_rho_g_over_kappa_fluid,
+                                                            mp->d_wgll_cube,
+                                                            mp->rotation,
+                                                            b_time,
+                                                            mp->d_b_two_omega_earth,
+                                                            mp->d_b_deltat,
+                                                            d_b_A_array_rotation,d_b_B_array_rotation);
   }
 
   // cudaEventRecord( stop, 0 );

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu	2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu	2012-07-23 21:58:26 UTC (rev 20536)
@@ -352,8 +352,6 @@
                                         realw* scalar_field,
                                         realw* vector_field_element,
                                         realw* hprime_xx,
-                                        realw* hprime_yy,
-                                        realw* hprime_zz,
                                         realw* d_xix,
                                         realw* d_xiy,
                                         realw* d_xiz,
@@ -370,7 +368,7 @@
   int l,offset,offset1,offset2,offset3;
 
   //const int NGLLX = 5;
-  const int NGLL3_ALIGN = NGLL3_PADDED;
+  //const int NGLL3_ALIGN = NGLL3_PADDED;
 
   int K = (ijk/NGLL2);
   int J = ((ijk-K*NGLL2)/NGLLX);
@@ -387,7 +385,8 @@
   // derivative along y
   temp2l = 0.f;
   for( l=0; l<NGLLX;l++){
-    hp2 = hprime_yy[l*NGLLX+J];
+    //assumes that hprime_xx = hprime_yy = hprime_zz
+    hp2 = hprime_xx[l*NGLLX+J];
     offset2 = K*NGLL2+l*NGLLX+I;
     temp2l += scalar_field[offset2]*hp2;
   }
@@ -395,12 +394,13 @@
   // derivative along z
   temp3l = 0.f;
   for( l=0; l<NGLLX;l++){
-    hp3 = hprime_zz[l*NGLLX+K];
+    //assumes that hprime_xx = hprime_yy = hprime_zz
+    hp3 = hprime_xx[l*NGLLX+K];
     offset3 = l*NGLL2+J*NGLLX+I;
     temp3l += scalar_field[offset3]*hp3;
   }
 
-  offset = ispec*NGLL3_ALIGN + ijk;
+  offset = ispec*NGLL3_PADDED + ijk;
 
   xixl = d_xix[offset];
   xiyl = d_xiy[offset];
@@ -429,8 +429,6 @@
                                                 realw* rhostore,
                                                 realw* kappastore,
                                                 realw* hprime_xx,
-                                                realw* hprime_yy,
-                                                realw* hprime_zz,
                                                 realw* d_xix,
                                                 realw* d_xiy,
                                                 realw* d_xiz,
@@ -476,12 +474,12 @@
 
     // displacement vector from backward field
     compute_gradient_kernel(ijk,ispec,scalar_field_displ,b_displ_elm,
-                            hprime_xx,hprime_yy,hprime_zz,
+                            hprime_xx,
                             d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz);
 
     // acceleration vector
     compute_gradient_kernel(ijk,ispec,scalar_field_accel,accel_elm,
-                            hprime_xx,hprime_yy,hprime_zz,
+                            hprime_xx,
                             d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz);
 
     // gets material parameter
@@ -530,8 +528,6 @@
                                                     mp->d_rhostore_outer_core,
                                                     mp->d_kappavstore_outer_core,
                                                     mp->d_hprime_xx,
-                                                    mp->d_hprime_yy,
-                                                    mp->d_hprime_zz,
                                                     mp->d_xix_outer_core,
                                                     mp->d_xiy_outer_core,
                                                     mp->d_xiz_outer_core,

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu	2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu	2012-07-23 21:58:26 UTC (rev 20536)
@@ -39,7 +39,6 @@
 
 #include "config.h"
 #include "mesh_constants_cuda.h"
-#include "prepare_constants_cuda.h"
 
 /* ----------------------------------------------------------------------------------------------- */
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2012-07-23 21:58:26 UTC (rev 20536)
@@ -112,23 +112,58 @@
 #define IREGION_CRUST_MANTLE  1
 #define IREGION_INNER_CORE  3
 
+// R_EARTH_KM is the radius of the bottom of the oceans (radius of Earth in km)
+#define R_EARTH_KM 6371.0f
+// uncomment line below for PREM with oceans
+//#define R_EARTH_KM 6368.0f
+
+
 /* ----------------------------------------------------------------------------------------------- */
 
-//typedef float real;   // type of variables passed into function
-typedef float realw;  // type of "working" variables
+// type of "working" variables: see also CUSTOM_REAL
+// double precision temporary variables lead to a 10% performance decrease
+// in Kernel_2_impl (not very much..)
+typedef float realw;
 
-// double precision temporary variables leads to 10% performance
-// decrease in Kernel_2_impl (not very much..)
-typedef float reald;
 
+/* ----------------------------------------------------------------------------------------------- */
+
 // (optional) pre-processing directive used in kernels: if defined check that it is also set in src/shared/constants.h:
 // leads up to ~ 5% performance increase
 //#define USE_MESH_COLORING_GPU
 
+/* ----------------------------------------------------------------------------------------------- */
+
+// Texture memory usage:
+// requires CUDA version >= 4.0, see check below
+// Use textures for d_displ and d_accel -- 10% performance boost
+#define USE_TEXTURES_FIELDS
+
+// Using texture memory for the hprime-style constants is slower on
+// Fermi generation hardware, but *may* be faster on Kepler
+// generation.
+// Use textures for hprime_xx
+#define USE_TEXTURES_CONSTANTS
+
+// CUDA version >= 4.0 needed for cudaTextureType1D and cudaDeviceSynchronize()
+#if CUDA_VERSION < 4000
+#undef USE_TEXTURES_FIELDS
+#undef USE_TEXTURES_CONSTANTS
+#endif
+
+#ifdef USE_TEXTURES_FIELDS
+#pragma message ("\nCompiling with: USE_TEXTURES_FIELDS enabled\n")
+#endif
+#ifdef USE_TEXTURES_CONSTANTS
+#pragma message ("\nCompiling with: USE_TEXTURES_CONSTANTS enabled\n")
+#endif
+
 // (optional) unrolling loops
 // leads up to ~1% performance increase
 //#define MANUALLY_UNROLLED_LOOPS
 
+/* ----------------------------------------------------------------------------------------------- */
+
 // cuda kernel block size for updating displacements/potential (newmark time scheme)
 // current hardware: 128 is slightly faster than 256 ( ~ 4%)
 #define BLOCKSIZE_KERNEL1 128
@@ -221,6 +256,12 @@
   // backward/reconstructed elastic wavefield
   realw* d_b_displ_crust_mantle; realw* d_b_veloc_crust_mantle; realw* d_b_accel_crust_mantle;
 
+#ifdef USE_TEXTURES_FIELDS
+  // Texture references for fast non-coalesced scattered access
+  const textureReference* d_displ_cm_tex_ref_ptr;
+  const textureReference* d_accel_cm_tex_ref_ptr;
+#endif
+
   // attenuation
   realw* d_R_xx_crust_mantle;
   realw* d_R_yy_crust_mantle;
@@ -305,6 +346,12 @@
   // backward/reconstructed elastic wavefield
   realw* d_b_displ_outer_core; realw* d_b_veloc_outer_core; realw* d_b_accel_outer_core;
 
+#ifdef USE_TEXTURES_FIELDS
+  // Texture references for fast non-coalesced scattered access
+  const textureReference* d_displ_oc_tex_ref_ptr;
+  const textureReference* d_accel_oc_tex_ref_ptr;
+#endif
+
   // kernels
   realw* d_rho_kl_outer_core;
   realw* d_alpha_kl_outer_core;
@@ -369,6 +416,12 @@
   // backward/reconstructed elastic wavefield
   realw* d_b_displ_inner_core; realw* d_b_veloc_inner_core; realw* d_b_accel_inner_core;
 
+#ifdef USE_TEXTURES_FIELDS
+  // Texture references for fast non-coalesced scattered access
+  const textureReference* d_displ_ic_tex_ref_ptr;
+  const textureReference* d_accel_ic_tex_ref_ptr;
+#endif
+
   // attenuation
   realw* d_R_xx_inner_core;
   realw* d_R_yy_inner_core;
@@ -447,7 +500,15 @@
   // ------------------------------------------------------------------ //
 
   // pointers to constant memory arrays
-  realw* d_hprime_xx; realw* d_hprime_yy; realw* d_hprime_zz;
+  realw* d_hprime_xx;
+  //realw* d_hprime_yy; // only needed if NGLLX != NGLLY != NGLLZ
+  //realw* d_hprime_zz; // only needed if NGLLX != NGLLY != NGLLZ
+
+#ifdef USE_TEXTURES_CONSTANTS
+  const textureReference* d_hprime_xx_tex_ptr;
+  realw* d_hprime_xx_tex;
+#endif
+
   realw* d_hprimewgll_xx; realw* d_hprimewgll_yy; realw* d_hprimewgll_zz;
   realw* d_wgllwgll_xy; realw* d_wgllwgll_xz; realw* d_wgllwgll_yz;
   realw* d_wgll_cube;
@@ -461,12 +522,12 @@
   // simulation flags
   int save_forward;
   int absorbing_conditions;
-  
+
   int attenuation;
   int attenuation_new;
   int use_attenuation_mimic;
   int attenuation_3D;
-  
+
   int compute_and_store_strain;
   int anisotropic_3D_mantle;
   int gravity;

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_constants_cuda.h	2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_constants_cuda.h	2012-07-23 21:58:26 UTC (rev 20536)
@@ -33,111 +33,231 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-// setters for these const arrays (very ugly hack, but will have to do)
+// CONSTANT arrays setup
 
-// elastic
-void setConst_hprime_xx(realw* array,Mesh* mp);
-void setConst_hprime_yy(realw* array,Mesh* mp);
-void setConst_hprime_zz(realw* array,Mesh* mp);
+/* ----------------------------------------------------------------------------------------------- */
 
-void setConst_hprimewgll_xx(realw* array,Mesh* mp);
-void setConst_hprimewgll_yy(realw* array,Mesh* mp);
-void setConst_hprimewgll_zz(realw* array,Mesh* mp);
+/* note:
+ constant arrays when used in other compute_forces_***_cuda.cu routines stay zero,
+ constant declaration and cudaMemcpyToSymbol would have to be in the same file...
 
-void setConst_wgllwgll_xy(realw* array,Mesh* mp);
-void setConst_wgllwgll_xz(realw* array, Mesh* mp);
-void setConst_wgllwgll_yz(realw* array, Mesh* mp);
+ extern keyword doesn't work for __constant__ declarations.
 
-void setConst_wgll_cube(realw* array, Mesh* mp);
+ also:
+ cudaMemcpyToSymbol("deviceCaseParams", caseParams, sizeof(CaseParams));
+ ..
+ and compile with -arch=sm_20
 
-/* ----------------------------------------------------------------------------------------------- */
+ see also: http://stackoverflow.com/questions/4008031/how-to-use-cuda-constant-memory-in-a-programmer-pleasant-way
+ doesn't seem to work.
 
-/* CUDA specific things from specfem3D_kernels.cu */
+ we could keep arrays separated for acoustic and elastic routines...
 
-#ifdef USE_TEXTURES
-  // declaration of textures
-  texture<realw, 1, cudaReadModeElementType> tex_displ;
-  texture<realw, 1, cudaReadModeElementType> tex_veloc;
-  texture<realw, 1, cudaReadModeElementType> tex_accel;
+ workaround:
 
-  texture<realw, 1, cudaReadModeElementType> tex_potential;
-  texture<realw, 1, cudaReadModeElementType> tex_potential_dot_dot;
+ for now, we store pointers with cudaGetSymbolAddress() function calls.
+ we pass those pointers in all other compute_forces_..() routines
 
-  // for binding the textures
+ in this file, we can use the above constant array declarations without need of the pointers.
 
-  void bindTexturesDispl(realw* d_displ)
-  {
-    cudaError_t err;
+ */
 
-    cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
+// cuda constant arrays
+//
+// note: we declare these arrays as __device__ so that they use global memory rather than constant memory registers,
+//          to avoid over-loading registers; this should help increase the occupancy on the GPU
 
-    err = cudaBindTexture(NULL,tex_displ, d_displ, channelDescFloat, NDIM*NGLOB*sizeof(realw));
-    if (err != cudaSuccess)
-    {
-      fprintf(stderr, "Error in bindTexturesDispl for displ: %s\n", cudaGetErrorString(err));
-      exit(1);
-    }
-  }
+__device__ realw d_hprime_xx[NGLL2];
+//__device__ realw d_hprime_yy[NGLL2]; // only needed if NGLLX != NGLLY != NGLLZ
+//__device__ realw d_hprime_zz[NGLL2]; // only needed if NGLLX != NGLLY != NGLLZ
 
-  void bindTexturesVeloc(realw* d_veloc)
+__device__ realw d_hprimewgll_xx[NGLL2];
+__device__ realw d_hprimewgll_yy[NGLL2];
+__device__ realw d_hprimewgll_zz[NGLL2];
+
+__device__ realw d_wgllwgll_xy[NGLL2];
+__device__ realw d_wgllwgll_xz[NGLL2];
+__device__ realw d_wgllwgll_yz[NGLL2];
+
+__device__ realw d_wgll_cube[NGLL3]; // needed only for gravity case
+
+
+// setup functions
+void setConst_hprime_xx(realw* array,Mesh* mp)
+{
+
+  cudaError_t err = cudaMemcpyToSymbol(d_hprime_xx, array, NGLL2*sizeof(realw));
+  if (err != cudaSuccess)
   {
-    cudaError_t err;
+    fprintf(stderr, "Error in setConst_hprime_xx: %s\n", cudaGetErrorString(err));
+    fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
+    exit(1);
+  }
 
-    cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
+  err = cudaGetSymbolAddress((void**)&(mp->d_hprime_xx),"d_hprime_xx");
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_hprime_xx: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+}
 
-    err = cudaBindTexture(NULL,tex_veloc, d_veloc, channelDescFloat, NDIM*NGLOB*sizeof(realw));
-    if (err != cudaSuccess)
-    {
-      fprintf(stderr, "Error in bindTexturesVeloc for veloc: %s\n", cudaGetErrorString(err));
-      exit(1);
-    }
+/*
+ // only needed if NGLLX != NGLLY != NGLLZ
+ void setConst_hprime_yy(realw* array,Mesh* mp)
+ {
+
+ cudaError_t err = cudaMemcpyToSymbol(d_hprime_yy, array, NGLL2*sizeof(realw));
+ if (err != cudaSuccess)
+ {
+ fprintf(stderr, "Error in setConst_hprime_yy: %s\n", cudaGetErrorString(err));
+ fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
+ exit(1);
+ }
+
+ err = cudaGetSymbolAddress((void**)&(mp->d_hprime_yy),"d_hprime_yy");
+ if(err != cudaSuccess) {
+ fprintf(stderr, "Error with d_hprime_yy: %s\n", cudaGetErrorString(err));
+ exit(1);
+ }
+ }
+ */
+
+/*
+ // only needed if NGLLX != NGLLY != NGLLZ
+ void setConst_hprime_zz(realw* array,Mesh* mp)
+ {
+
+ cudaError_t err = cudaMemcpyToSymbol(d_hprime_zz, array, NGLL2*sizeof(realw));
+ if (err != cudaSuccess)
+ {
+ fprintf(stderr, "Error in setConst_hprime_zz: %s\n", cudaGetErrorString(err));
+ fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
+ exit(1);
+ }
+
+ err = cudaGetSymbolAddress((void**)&(mp->d_hprime_zz),"d_hprime_zz");
+ if(err != cudaSuccess) {
+ fprintf(stderr, "Error with d_hprime_zz: %s\n", cudaGetErrorString(err));
+ exit(1);
+ }
+ }
+ */
+
+void setConst_hprimewgll_xx(realw* array,Mesh* mp)
+{
+  cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_xx, array, NGLL2*sizeof(realw));
+  if (err != cudaSuccess)
+  {
+    fprintf(stderr, "Error in setConst_hprimewgll_xx: %s\n", cudaGetErrorString(err));
+    exit(1);
   }
 
-  void bindTexturesAccel(realw* d_accel)
+  err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_xx),"d_hprimewgll_xx");
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_hprimewgll_xx: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+}
+
+void setConst_hprimewgll_yy(realw* array,Mesh* mp)
+{
+  cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_yy, array, NGLL2*sizeof(realw));
+  if (err != cudaSuccess)
   {
-    cudaError_t err;
+    fprintf(stderr, "Error in setConst_hprimewgll_yy: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
 
-    cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
+  err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_yy),"d_hprimewgll_yy");
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_hprimewgll_yy: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+}
 
-    err = cudaBindTexture(NULL,tex_accel, d_accel, channelDescFloat, NDIM*NGLOB*sizeof(realw));
-    if (err != cudaSuccess)
-    {
-      fprintf(stderr, "Error in bindTexturesAccel for accel: %s\n", cudaGetErrorString(err));
-      exit(1);
-    }
+void setConst_hprimewgll_zz(realw* array,Mesh* mp)
+{
+  cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_zz, array, NGLL2*sizeof(realw));
+  if (err != cudaSuccess)
+  {
+    fprintf(stderr, "Error in setConst_hprimewgll_zz: %s\n", cudaGetErrorString(err));
+    exit(1);
   }
 
-  void bindTexturesPotential(realw* d_potential)
+  err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_zz),"d_hprimewgll_zz");
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_hprimewgll_zz: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+}
+
+void setConst_wgllwgll_xy(realw* array,Mesh* mp)
+{
+  cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_xy, array, NGLL2*sizeof(realw));
+  if (err != cudaSuccess)
   {
-    cudaError_t err;
+    fprintf(stderr, "Error in setConst_wgllwgll_xy: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
 
-    cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
+  err = cudaGetSymbolAddress((void**)&(mp->d_wgllwgll_xy),"d_wgllwgll_xy");
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_wgllwgll_xy: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
 
-    err = cudaBindTexture(NULL,tex_potential, d_potential,
-                          channelDescFloat, NGLOB*sizeof(realw));
-    if (err != cudaSuccess)
-    {
-      fprintf(stderr, "Error in bindTexturesPotential for potential: %s\n", cudaGetErrorString(err));
-      exit(1);
-    }
+}
+
+void setConst_wgllwgll_xz(realw* array,Mesh* mp)
+{
+  cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_xz, array, NGLL2*sizeof(realw));
+  if (err != cudaSuccess)
+  {
+    fprintf(stderr, "Error in  setConst_wgllwgll_xz: %s\n", cudaGetErrorString(err));
+    exit(1);
   }
 
-  void bindTexturesPotential_dot_dot(realw* d_potential_dot_dot)
+  err = cudaGetSymbolAddress((void**)&(mp->d_wgllwgll_xz),"d_wgllwgll_xz");
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_wgllwgll_xz: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
+
+}
+
+void setConst_wgllwgll_yz(realw* array,Mesh* mp)
+{
+  cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_yz, array, NGLL2*sizeof(realw));
+  if (err != cudaSuccess)
   {
-    cudaError_t err;
+    fprintf(stderr, "Error in setConst_wgllwgll_yz: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
 
-    cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
+  err = cudaGetSymbolAddress((void**)&(mp->d_wgllwgll_yz),"d_wgllwgll_yz");
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_wgllwgll_yz: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
 
-    err = cudaBindTexture(NULL,tex_potential_dot_dot, d_potential_dot_dot,
-                          channelDescFloat, NGLOB*sizeof(realw));
-    if (err != cudaSuccess)
-    {
-      fprintf(stderr, "Error in bindTexturesPotential_dot_dot for potential_dot_dot: %s\n", cudaGetErrorString(err));
-      exit(1);
-    }
+}
+
+void setConst_wgll_cube(realw* array,Mesh* mp)
+{
+  cudaError_t err = cudaMemcpyToSymbol(d_wgll_cube, array, NGLL3*sizeof(realw));
+  if (err != cudaSuccess)
+  {
+    fprintf(stderr, "Error in setConst_wgll_cube: %s\n", cudaGetErrorString(err));
+    exit(1);
   }
 
-#endif // USE_TEXTURES
+  err = cudaGetSymbolAddress((void**)&(mp->d_wgll_cube),"d_wgll_cube");
+  if(err != cudaSuccess) {
+    fprintf(stderr, "Error with d_wgll_cube: %s\n", cudaGetErrorString(err));
+    exit(1);
+  }
 
+}
 
+
 #endif //CUDA_HEADER_H

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2012-07-23 21:58:26 UTC (rev 20536)
@@ -42,7 +42,6 @@
 #include "prepare_constants_cuda.h"
 
 
-
 /* ----------------------------------------------------------------------------------------------- */
 
 // GPU preparation
@@ -101,15 +100,41 @@
 
   // sets constant arrays
   setConst_hprime_xx(h_hprime_xx,mp);
-  setConst_hprime_yy(h_hprime_yy,mp);
-  setConst_hprime_zz(h_hprime_zz,mp);
+  //setConst_hprime_yy(h_hprime_yy,mp); // only needed if NGLLX != NGLLY != NGLLZ
+  //setConst_hprime_zz(h_hprime_zz,mp); // only needed if NGLLX != NGLLY != NGLLZ
+
   setConst_hprimewgll_xx(h_hprimewgll_xx,mp);
   setConst_hprimewgll_yy(h_hprimewgll_yy,mp);
   setConst_hprimewgll_zz(h_hprimewgll_zz,mp);
+
   setConst_wgllwgll_xy(h_wgllwgll_xy,mp);
   setConst_wgllwgll_xz(h_wgllwgll_xz,mp);
   setConst_wgllwgll_yz(h_wgllwgll_yz,mp);
 
+  // Using texture memory for the hprime-style constants is slower on
+  // Fermi generation hardware, but *may* be faster on Kepler
+  // generation. We will reevaluate this again, so might as well leave
+  // in the code with USE_TEXTURES_CONSTANTS not defined.
+  #ifdef USE_TEXTURES_CONSTANTS
+  {
+    print_CUDA_error_if_any(cudaGetTextureReference(&(mp->d_hprime_xx_tex_ptr), "d_hprime_xx_cm_tex"), 1101);
+    cudaChannelFormatDesc channelDesc1 = cudaCreateChannelDesc<realw>();
+    print_CUDA_error_if_any(cudaBindTexture(0, mp->d_hprime_xx_tex_ptr, mp->d_hprime_xx,
+                                            &channelDesc1, sizeof(realw)*(NGLL2)), 1102);
+
+    print_CUDA_error_if_any(cudaGetTextureReference(&(mp->d_hprime_xx_tex_ptr), "d_hprime_xx_oc_tex"), 1103);
+    cudaChannelFormatDesc channelDesc2 = cudaCreateChannelDesc<realw>();
+    print_CUDA_error_if_any(cudaBindTexture(0, mp->d_hprime_xx_tex_ptr, mp->d_hprime_xx,
+                                            &channelDesc2, sizeof(realw)*(NGLL2)), 1104);
+
+    print_CUDA_error_if_any(cudaGetTextureReference(&(mp->d_hprime_xx_tex_ptr), "d_hprime_xx_ic_tex"), 1105);
+    cudaChannelFormatDesc channelDesc3 = cudaCreateChannelDesc<realw>();
+    print_CUDA_error_if_any(cudaBindTexture(0, mp->d_hprime_xx_tex_ptr, mp->d_hprime_xx,
+                                            &channelDesc3, sizeof(realw)*(NGLL2)), 1106);
+  }
+  #endif
+
+
   // sets global parameters
   mp->NSPEC_CRUST_MANTLE = *NSPEC_CRUST_MANTLE;
   mp->NGLOB_CRUST_MANTLE = *NGLOB_CRUST_MANTLE;
@@ -130,12 +155,12 @@
   mp->oceans = *OCEANS_f;
   mp->gravity = *GRAVITY_f;
   mp->rotation = *ROTATION_f;
-  
+
   mp->attenuation = *ATTENUATION_f;
   mp->attenuation_new = *ATTENUATION_NEW_f;
   mp->use_attenuation_mimic = *USE_ATTENUATION_MIMIC_f;
   mp->attenuation_3D = *ATTENUATION_3D_VAL_f;
-  
+
   mp->compute_and_store_strain = *COMPUTE_AND_STORE_STRAIN_f;
   mp->anisotropic_3D_mantle = *ANISOTROPIC_3D_MANTLE_f;
   mp->anisotropic_inner_core = *ANISOTROPIC_INNER_CORE_f;
@@ -408,9 +433,9 @@
   }else{
     R_size1 = N_SLS*NGLL3*mp->NSPEC_CRUST_MANTLE;
     R_size2 = 1*mp->NSPEC_CRUST_MANTLE;
-    R_size3 = N_SLS*1*mp->NSPEC_CRUST_MANTLE;    
+    R_size3 = N_SLS*1*mp->NSPEC_CRUST_MANTLE;
   }
-  
+
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_one_minus_sum_beta_crust_mantle,
                                      R_size2*sizeof(realw)),4430);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_one_minus_sum_beta_crust_mantle,one_minus_sum_beta_crust_mantle,
@@ -455,9 +480,9 @@
   }else{
     R_size1 = N_SLS*NGLL3*mp->NSPEC_INNER_CORE;
     R_size2 = 1*mp->NSPEC_INNER_CORE;
-    R_size3 = N_SLS*1*mp->NSPEC_INNER_CORE;    
+    R_size3 = N_SLS*1*mp->NSPEC_INNER_CORE;
   }
-  
+
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_one_minus_sum_beta_inner_core,
                                      R_size2*sizeof(realw)),4430);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_one_minus_sum_beta_inner_core,one_minus_sum_beta_inner_core,
@@ -1497,6 +1522,23 @@
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel_crust_mantle),sizeof(realw)*size),4003);
   }
 
+  #ifdef USE_TEXTURES_FIELDS
+  {
+    print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_displ_cm_tex_ref_ptr, "d_displ_cm_tex"), 4001);
+    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<realw>();
+    print_CUDA_error_if_any(cudaBindTexture(0, mp->d_displ_cm_tex_ref_ptr, mp->d_displ_crust_mantle,
+                                            &channelDesc, sizeof(realw)*size), 4001);
+  }
+
+  {
+    print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_accel_cm_tex_ref_ptr, "d_accel_cm_tex"), 4003);
+    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<realw>();
+    print_CUDA_error_if_any(cudaBindTexture(0, mp->d_accel_cm_tex_ref_ptr, mp->d_accel_crust_mantle,
+                                            &channelDesc, sizeof(realw)*size), 4003);
+  }
+  #endif
+
+
   // mass matrices
   if( *NCHUNKS_VAL != 6 && mp->absorbing_conditions){
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmassx_crust_mantle),sizeof(realw)*size_glob),2005);
@@ -1696,6 +1738,23 @@
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel_outer_core),sizeof(realw)*size),4003);
   }
 
+  #ifdef USE_TEXTURES_FIELDS
+  {
+    print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_displ_oc_tex_ref_ptr, "d_displ_oc_tex"), 4001);
+    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<realw>();
+    print_CUDA_error_if_any(cudaBindTexture(0, mp->d_displ_oc_tex_ref_ptr, mp->d_displ_outer_core,
+                                            &channelDesc, sizeof(realw)*size), 4001);
+  }
+
+  {
+    print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_accel_oc_tex_ref_ptr, "d_accel_oc_tex"), 4003);
+    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<realw>();
+    print_CUDA_error_if_any(cudaBindTexture(0, mp->d_accel_oc_tex_ref_ptr, mp->d_accel_outer_core,
+                                            &channelDesc, sizeof(realw)*size), 4003);
+  }
+  #endif
+
+
   // mass matrix
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_outer_core),sizeof(realw)*size_glob),2005);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_outer_core,h_rmass,
@@ -1892,6 +1951,23 @@
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel_inner_core),sizeof(realw)*size),4003);
   }
 
+  #ifdef USE_TEXTURES_FIELDS
+  {
+    print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_displ_ic_tex_ref_ptr, "d_displ_ic_tex"), 4001);
+    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<realw>();
+    print_CUDA_error_if_any(cudaBindTexture(0, mp->d_displ_ic_tex_ref_ptr, mp->d_displ_inner_core,
+                                            &channelDesc, sizeof(realw)*size), 4001);
+  }
+
+  {
+    print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_accel_ic_tex_ref_ptr, "d_accel_ic_tex"), 4003);
+    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<realw>();
+    print_CUDA_error_if_any(cudaBindTexture(0, mp->d_accel_ic_tex_ref_ptr, mp->d_accel_inner_core,
+                                            &channelDesc, sizeof(realw)*size), 4003);
+  }
+  #endif
+
+
   // mass matrix
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_inner_core),sizeof(realw)*size_glob),2005);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_inner_core,h_rmass,
@@ -1953,362 +2029,10 @@
 #endif
 }
 
-/* ----------------------------------------------------------------------------------------------- */
 
-// for ELASTIC simulations
 
 /* ----------------------------------------------------------------------------------------------- */
 
-/*
-extern "C"
-void FC_FUNC_(prepare_fields_elastic_device,
-              PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer_f,
-                                             int* size,
-                                             realw* rmass,
-                                             realw* rho_vp,
-                                             realw* rho_vs,
-                                             int* num_phase_ispec_elastic,
-                                             int* phase_ispec_inner_elastic,
-                                             int* ispec_is_elastic,
-                                             int* ABSORBING_CONDITIONS,
-                                             realw* h_b_absorb_field,
-                                             int* h_b_reclen_field,
-                                             int* SIMULATION_TYPE,int* SAVE_FORWARD,
-                                             int* COMPUTE_AND_STORE_STRAIN,
-                                             realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
-                                             realw* epsilondev_xz,realw* epsilondev_yz,
-                                             int* ATTENUATION,
-                                             int* R_size,
-                                             realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
-                                             realw* one_minus_sum_beta,realw* factor_common,
-                                             realw* alphaval,realw* betaval,realw* gammaval,
-                                             int* OCEANS,
-                                             realw* rmass_ocean_load,
-                                             int* NOISE_TOMOGRAPHY,
-                                             realw* free_surface_normal,
-                                             int* free_surface_ispec,
-                                             int* free_surface_ijk,
-                                             int* num_free_surface_faces,
-                                             int* ACOUSTIC_SIMULATION,
-                                             int* num_colors_outer_elastic,
-                                             int* num_colors_inner_elastic,
-                                             int* num_elem_colors_elastic,
-                                             int* ANISOTROPY,
-                                             realw *c11store,
-                                             realw *c12store,
-                                             realw *c13store,
-                                             realw *c14store,
-                                             realw *c15store,
-                                             realw *c16store,
-                                             realw *c22store,
-                                             realw *c23store,
-                                             realw *c24store,
-                                             realw *c25store,
-                                             realw *c26store,
-                                             realw *c33store,
-                                             realw *c34store,
-                                             realw *c35store,
-                                             realw *c36store,
-                                             realw *c44store,
-                                             realw *c45store,
-                                             realw *c46store,
-                                             realw *c55store,
-                                             realw *c56store,
-                                             realw *c66store){
-
-TRACE("prepare_fields_elastic_device");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
-  // Assuming NGLLX==5. Padded is then 128 (5^3+3)
-  int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
-  int size_nonpadded = NGLL3 * (mp->NSPEC_AB);
-
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ),sizeof(realw)*(*size)),4001);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(realw)*(*size)),4002);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel),sizeof(realw)*(*size)),4003);
-
-  // mpi buffer
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),
-                        3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),4004);
-
-  // mass matrix
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass),sizeof(realw)*mp->NGLOB_AB),4005);
-  // transfer element data
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass,rmass,
-                                     sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),4010);
-
-
-  // element indices
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_elastic),mp->NSPEC_AB*sizeof(int)),4009);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_elastic,ispec_is_elastic,
-                                     mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),4012);
-
-  // phase elements
-  mp->num_phase_ispec_elastic = *num_phase_ispec_elastic;
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_elastic),
-                                     mp->num_phase_ispec_elastic*2*sizeof(int)),4008);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_elastic,phase_ispec_inner_elastic,
-                                     mp->num_phase_ispec_elastic*2*sizeof(int),cudaMemcpyHostToDevice),4011);
-
-  // for seismograms
-  if( mp->nrec_local > 0 ){
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_field),
-                                     3*NGLL3*(mp->nrec_local)*sizeof(realw)),4015);
-
-    mp->h_station_seismo_field = (realw*) malloc( 3*NGLL3*(mp->nrec_local)*sizeof(realw) );
-    if( mp->h_station_seismo_field == NULL) exit_on_error("h_station_seismo_field not allocated \n");
-  }
-
-  // absorbing conditions
-  if( *ABSORBING_CONDITIONS && mp->d_num_abs_boundary_faces > 0){
-    // non-padded arrays
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vp),size_nonpadded*sizeof(realw)),4006);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vs),size_nonpadded*sizeof(realw)),4007);
-
-    // rho_vp, rho_vs non-padded; they are needed for stacey boundary condition
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vp, rho_vp,
-                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4013);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vs, rho_vs,
-                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4014);
-
-    // absorb_field array used for file i/o
-    if(*SIMULATION_TYPE == 3 || ( *SIMULATION_TYPE == 1 && *SAVE_FORWARD )){
-      mp->d_b_reclen_field = *h_b_reclen_field;
-      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_field),
-                                       mp->d_b_reclen_field),4016);
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field, h_b_absorb_field,
-                                       mp->d_b_reclen_field,cudaMemcpyHostToDevice),4017);
-    }
-  }
-
-  // strains used for attenuation and kernel simulations
-  if( *COMPUTE_AND_STORE_STRAIN ){
-    // strains
-    int epsilondev_size = NGLL3*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
-
-    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xx,
-                                       epsilondev_size*sizeof(realw)),4301);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xx,epsilondev_xx,epsilondev_size*sizeof(realw),
-                                       cudaMemcpyHostToDevice),4302);
-    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yy,
-                                       epsilondev_size*sizeof(realw)),4302);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yy,epsilondev_yy,epsilondev_size*sizeof(realw),
-                                       cudaMemcpyHostToDevice),4303);
-    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xy,
-                                       epsilondev_size*sizeof(realw)),4304);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xy,epsilondev_xy,epsilondev_size*sizeof(realw),
-                                       cudaMemcpyHostToDevice),4305);
-    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xz,
-                                       epsilondev_size*sizeof(realw)),4306);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xz,epsilondev_xz,epsilondev_size*sizeof(realw),
-                                       cudaMemcpyHostToDevice),4307);
-    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yz,
-                                       epsilondev_size*sizeof(realw)),4308);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yz,epsilondev_yz,epsilondev_size*sizeof(realw),
-                                       cudaMemcpyHostToDevice),4309);
-
-  }
-
-  // attenuation memory variables
-  if( *ATTENUATION ){
-    // memory arrays
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xx),
-                                       (*R_size)*sizeof(realw)),4401);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xx,R_xx,(*R_size)*sizeof(realw),
-                                       cudaMemcpyHostToDevice),4402);
-
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yy),
-                                       (*R_size)*sizeof(realw)),4403);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yy,R_yy,(*R_size)*sizeof(realw),
-                                       cudaMemcpyHostToDevice),4404);
-
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xy),
-                                       (*R_size)*sizeof(realw)),4405);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xy,R_xy,(*R_size)*sizeof(realw),
-                                       cudaMemcpyHostToDevice),4406);
-
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xz),
-                                       (*R_size)*sizeof(realw)),4407);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xz,R_xz,(*R_size)*sizeof(realw),
-                                       cudaMemcpyHostToDevice),4408);
-
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yz),
-                                       (*R_size)*sizeof(realw)),4409);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yz,R_yz,(*R_size)*sizeof(realw),
-                                       cudaMemcpyHostToDevice),4410);
-
-    // attenuation factors
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_one_minus_sum_beta),
-                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),4430);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_one_minus_sum_beta ,one_minus_sum_beta,
-                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4431);
-
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_factor_common),
-                                       N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw)),4432);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_factor_common ,factor_common,
-                                       N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4433);
-
-    // alpha,beta,gamma factors
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_alphaval),
-                                       N_SLS*sizeof(realw)),4434);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_alphaval ,alphaval,
-                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4435);
-
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_betaval),
-                                       N_SLS*sizeof(realw)),4436);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_betaval ,betaval,
-                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4437);
-
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_gammaval),
-                                       N_SLS*sizeof(realw)),4438);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaval ,gammaval,
-                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4439);
-
-  }
-
-  // anisotropy
-  if( *ANISOTROPY ){
-    // allocates memory on GPU
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c11store),
-                                       size_padded*sizeof(realw)),4700);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c12store),
-                                       size_padded*sizeof(realw)),4701);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c13store),
-                                       size_padded*sizeof(realw)),4702);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c14store),
-                                       size_padded*sizeof(realw)),4703);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c15store),
-                                       size_padded*sizeof(realw)),4704);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c16store),
-                                       size_padded*sizeof(realw)),4705);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c22store),
-                                       size_padded*sizeof(realw)),4706);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c23store),
-                                       size_padded*sizeof(realw)),4707);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c24store),
-                                       size_padded*sizeof(realw)),4708);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c25store),
-                                       size_padded*sizeof(realw)),4709);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c26store),
-                                       size_padded*sizeof(realw)),4710);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c33store),
-                                       size_padded*sizeof(realw)),4711);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c34store),
-                                       size_padded*sizeof(realw)),4712);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c35store),
-                                       size_padded*sizeof(realw)),4713);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c36store),
-                                       size_padded*sizeof(realw)),4714);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c44store),
-                                       size_padded*sizeof(realw)),4715);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c45store),
-                                       size_padded*sizeof(realw)),4716);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c46store),
-                                       size_padded*sizeof(realw)),4717);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c55store),
-                                       size_padded*sizeof(realw)),4718);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c56store),
-                                       size_padded*sizeof(realw)),4719);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c66store),
-                                       size_padded*sizeof(realw)),4720);
-
-    // transfer constant element data with padding
-    for(int i=0;i < mp->NSPEC_AB;i++) {
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_c11store + i*NGLL3_PADDED, &c11store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4800);
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_c12store + i*NGLL3_PADDED, &c12store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4801);
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_c13store + i*NGLL3_PADDED, &c13store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4802);
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_c14store + i*NGLL3_PADDED, &c14store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4803);
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_c15store + i*NGLL3_PADDED, &c15store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4804);
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_c16store + i*NGLL3_PADDED, &c16store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4805);
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_c22store + i*NGLL3_PADDED, &c22store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4806);
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_c23store + i*NGLL3_PADDED, &c23store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4807);
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_c24store + i*NGLL3_PADDED, &c24store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4808);
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_c25store + i*NGLL3_PADDED, &c25store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4809);
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_c26store + i*NGLL3_PADDED, &c26store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4810);
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_c33store + i*NGLL3_PADDED, &c33store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4811);
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_c34store + i*NGLL3_PADDED, &c34store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4812);
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_c35store + i*NGLL3_PADDED, &c35store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4813);
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_c36store + i*NGLL3_PADDED, &c36store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4814);
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_c44store + i*NGLL3_PADDED, &c44store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4815);
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_c45store + i*NGLL3_PADDED, &c45store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4816);
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_c46store + i*NGLL3_PADDED, &c46store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4817);
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_c55store + i*NGLL3_PADDED, &c55store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4818);
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_c56store + i*NGLL3_PADDED, &c56store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4819);
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_c66store + i*NGLL3_PADDED, &c66store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4820);
-    }
-  }
-
-  // ocean load approximation
-  if( *OCEANS ){
-    // oceans needs a free surface
-    mp->num_free_surface_faces = *num_free_surface_faces;
-    if( mp->num_free_surface_faces > 0 ){
-      // mass matrix
-      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_ocean_load),
-                                         sizeof(realw)*mp->NGLOB_AB),4501);
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_ocean_load,rmass_ocean_load,
-                                         sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),4502);
-      // surface normal
-      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_normal),
-                                         3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw)),4503);
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_normal,free_surface_normal,
-                                         3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice),4504);
-
-      // temporary global array: used to synchronize updates on global accel array
-      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_updated_dof_ocean_load),
-                                         sizeof(int)*mp->NGLOB_AB),4505);
-
-      if( *NOISE_TOMOGRAPHY == 0 && *ACOUSTIC_SIMULATION == 0 ){
-        print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec),
-                                          mp->num_free_surface_faces*sizeof(int)),4601);
-        print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec,
-                                          mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4603);
-        print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk),
-                                          3*NGLL2*mp->num_free_surface_faces*sizeof(int)),4602);
-        print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
-                                          3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4604);
-      }
-    }
-  }
-
-  // mesh coloring
-  if( mp->use_mesh_coloring_gpu ){
-    mp->num_colors_outer_elastic = *num_colors_outer_elastic;
-    mp->num_colors_inner_elastic = *num_colors_inner_elastic;
-    mp->h_num_elem_colors_elastic = (int*) num_elem_colors_elastic;
-  }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("prepare_fields_elastic_device");
-#endif
-}
-*/
-
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
 // cleanup
 
 /* ----------------------------------------------------------------------------------------------- */

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2012-07-23 21:58:26 UTC (rev 20536)
@@ -1,4 +1,4 @@
-/*
+/* 
 !=====================================================================
 !
 !          S p e c f e m 3 D  G l o b e  V e r s i o n  5 . 1
@@ -34,8 +34,8 @@
 
 typedef float realw;
 
+ 
 
-
 //
 // src/cuda/assemble_MPI_scalar_cuda.cu
 //
@@ -43,12 +43,12 @@
 void FC_FUNC_(transfer_boun_pot_from_device,
               TRANSFER_BOUN_POT_FROM_DEVICE)(long* Mesh_pointer_f,
                                              realw* send_potential_dot_dot_buffer,
-                                             int* FORWARD_OR_ADJOINT){}
+                                             int* FORWARD_OR_ADJOINT){} 
 
 void FC_FUNC_(transfer_asmbl_pot_to_device,
               TRANSFER_ASMBL_POT_TO_DEVICE)(long* Mesh_pointer,
                                             realw* buffer_recv_scalar,
-                                            int* FORWARD_OR_ADJOINT) {}
+                                            int* FORWARD_OR_ADJOINT) {} 
 
 
 //
@@ -59,13 +59,13 @@
               TRANSFER_BOUN_ACCEL_FROM_DEVICE)(long* Mesh_pointer_f,
                                                   realw* send_accel_buffer,
                                                   int* IREGION,
-                                                  int* FORWARD_OR_ADJOINT){}
+                                                  int* FORWARD_OR_ADJOINT){} 
 
 void FC_FUNC_(transfer_asmbl_accel_to_device,
               TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer,
                                               realw* buffer_recv_vector,
                                               int* IREGION,
-                                              int* FORWARD_OR_ADJOINT) {}
+                                              int* FORWARD_OR_ADJOINT) {} 
 
 
 //
@@ -73,58 +73,58 @@
 //
 
 void FC_FUNC_(pause_for_debug,
-              PAUSE_FOR_DEBUG)() {}
+              PAUSE_FOR_DEBUG)() {} 
 
 void FC_FUNC_(output_free_device_memory,
-              OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {}
+              OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {} 
 
 void FC_FUNC_(get_free_device_memory,
-              get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {}
+              get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {} 
 
 void FC_FUNC_(check_max_norm_displ_gpu,
-              CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {}
+              CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {} 
 
 void FC_FUNC_(check_max_norm_vector,
-              CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {}
+              CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {} 
 
 void FC_FUNC_(check_max_norm_displ,
-              CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {}
+              CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {} 
 
 void FC_FUNC_(check_max_norm_b_displ_gpu,
-              CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {}
+              CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {} 
 
 void FC_FUNC_(check_max_norm_b_accel_gpu,
-              CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {}
+              CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {} 
 
 void FC_FUNC_(check_max_norm_b_veloc_gpu,
-              CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {}
+              CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {} 
 
 void FC_FUNC_(check_max_norm_b_displ,
-              CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {}
+              CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {} 
 
 void FC_FUNC_(check_max_norm_b_accel,
-              CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {}
+              CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {} 
 
 void FC_FUNC_(check_error_vectors,
-              CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {}
+              CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {} 
 
 void FC_FUNC_(get_max_accel,
-              GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {}
+              GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {} 
 
 void FC_FUNC_(check_norm_acoustic_from_device,
               CHECK_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
                                                   long* Mesh_pointer_f,
-                                                  int* SIMULATION_TYPE) {}
+                                                  int* SIMULATION_TYPE) {} 
 
 void FC_FUNC_(check_norm_elastic_from_device,
               CHECK_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
                                               long* Mesh_pointer_f,
-                                              int* SIMULATION_TYPE) {}
+                                              int* SIMULATION_TYPE) {} 
 
 void FC_FUNC_(check_norm_strain_from_device,
               CHECK_NORM_STRAIN_FROM_DEVICE)(realw* strain_norm,
                                              realw* strain_norm2,
-                                             long* Mesh_pointer_f) {}
+                                             long* Mesh_pointer_f) {} 
 
 
 //
@@ -134,12 +134,12 @@
 void FC_FUNC_(compute_add_sources_el_cuda,
               COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f,
                                            int* NSOURCESf,
-                                           double* h_stf_pre_compute) {}
+                                           double* h_stf_pre_compute) {} 
 
 void FC_FUNC_(compute_add_sources_el_s3_cuda,
               COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer_f,
                                               int* NSOURCESf,
-                                              double* h_stf_pre_compute) {}
+                                              double* h_stf_pre_compute) {} 
 
 void FC_FUNC_(add_sources_el_sim_type_2_or_3,
               ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
@@ -147,7 +147,7 @@
                                               realw* h_adj_sourcearrays,
                                               int* h_islice_selected_rec,
                                               int* h_ispec_selected_rec,
-                                              int* time_index) {}
+                                              int* time_index) {} 
 
 
 //
@@ -155,26 +155,26 @@
 //
 
 void FC_FUNC_(compute_coupling_fluid_cmb_cuda,
-              COMPUTE_COUPLING_FLUID_CMB_CUDA)(long* Mesh_pointer_f) {}
+              COMPUTE_COUPLING_FLUID_CMB_CUDA)(long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(compute_coupling_fluid_icb_cuda,
-              COMPUTE_COUPLING_FLUID_ICB_CUDA)(long* Mesh_pointer_f) {}
+              COMPUTE_COUPLING_FLUID_ICB_CUDA)(long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(compute_coupling_cmb_fluid_cuda,
               COMPUTE_COUPLING_CMB_FLUID_CUDA)(long* Mesh_pointer_f,
                                                double RHO_TOP_OC,
                                                realw minus_g_cmb,
-                                               int GRAVITY_VAL) {}
+                                               int GRAVITY_VAL) {} 
 
 void FC_FUNC_(compute_coupling_icb_fluid_cuda,
               COMPUTE_COUPLING_ICB_FLUID_CUDA)(long* Mesh_pointer_f,
                                                double RHO_BOTTOM_OC,
                                                realw minus_g_icb,
-                                               int GRAVITY_VAL) {}
+                                               int GRAVITY_VAL) {} 
 
 void FC_FUNC_(compute_coupling_ocean_cuda,
               COMPUTE_COUPLING_OCEAN_CUDA)(long* Mesh_pointer_f,
-             int* NCHUNKS_VAL) {}
+             int* NCHUNKS_VAL) {} 
 
 
 //
@@ -184,7 +184,7 @@
 void FC_FUNC_(compute_forces_crust_mantle_cuda,
               COMPUTE_FORCES_CRUST_MANTLE_CUDA)(long* Mesh_pointer_f,
                                                 realw* deltat,
-                                                int* iphase) {}
+                                                int* iphase) {} 
 
 
 //
@@ -193,8 +193,8 @@
 
 void FC_FUNC_(compute_forces_inner_core_cuda,
               COMPUTE_FORCES_INNER_CORE_CUDA)(long* Mesh_pointer_f,
-                realw* deltat,
-                int* iphase) {}
+                                              realw* deltat,
+                                              int* iphase) {} 
 
 
 //
@@ -205,7 +205,7 @@
               COMPUTE_FORCES_OUTER_CORE_CUDA)(long* Mesh_pointer_f,
                                             int* iphase,
                                             realw* time_f,
-                                            realw* b_time_f) {}
+                                            realw* b_time_f) {} 
 
 
 //
@@ -213,22 +213,22 @@
 //
 
 void FC_FUNC_(compute_kernels_cm_cuda,
-              COMPUTE_KERNELS_CM_CUDA)(long* Mesh_pointer,realw* deltat_f) {}
+              COMPUTE_KERNELS_CM_CUDA)(long* Mesh_pointer,realw* deltat_f) {} 
 
 void FC_FUNC_(compute_kernels_ic_cuda,
-              COMPUTE_KERNELS_IC_CUDA)(long* Mesh_pointer,realw* deltat_f) {}
+              COMPUTE_KERNELS_IC_CUDA)(long* Mesh_pointer,realw* deltat_f) {} 
 
 void FC_FUNC_(compute_kernels_oc_cuda,
-              COMPUTE_KERNELS_OC_CUDA)(long* Mesh_pointer,realw* deltat_f) {}
+              COMPUTE_KERNELS_OC_CUDA)(long* Mesh_pointer,realw* deltat_f) {} 
 
 void FC_FUNC_(compute_kernels_strgth_noise_cu,
               COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer,
                                                realw* h_noise_surface_movie,
-                                               realw* deltat_f) {}
+                                               realw* deltat_f) {} 
 
 void FC_FUNC_(compute_kernels_hess_cuda,
               COMPUTE_KERNELS_HESS_CUDA)(long* Mesh_pointer,
-                                         realw* deltat_f) {}
+                                         realw* deltat_f) {} 
 
 
 //
@@ -238,7 +238,7 @@
 void FC_FUNC_(compute_stacey_acoustic_cuda,
               COMPUTE_STACEY_ACOUSTIC_CUDA)(long* Mesh_pointer_f,
                                             realw* absorb_potential,
-                                            int* itype) {}
+                                            int* itype) {} 
 
 
 //
@@ -248,7 +248,7 @@
 void FC_FUNC_(compute_stacey_elastic_cuda,
               COMPUTE_STACEY_ELASTIC_CUDA)(long* Mesh_pointer_f,
                                                 realw* absorb_field,
-                                                int* itype) {}
+                                                int* itype) {} 
 
 
 //
@@ -256,10 +256,10 @@
 //
 
 void FC_FUNC_(initialize_cuda_device,
-              INITIALIZE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) {
+              INITIALIZE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) { 
  fprintf(stderr,"ERROR: GPU_MODE enabled without GPU/CUDA Support. To enable GPU support, reconfigure with --with-cuda flag.\n");
  exit(1);
-}
+} 
 
 
 //
@@ -273,7 +273,7 @@
                                              realw* deltatover2_F,
                                              realw* b_deltat_F,
                                              realw* b_deltatsqover2_F,
-                                             realw* b_deltatover2_F) {}
+                                             realw* b_deltatover2_F) {} 
 
 void FC_FUNC_(it_update_displacement_cm_cuda,
               IT_UPDATE_DISPLACMENT_CM_CUDA)(long* Mesh_pointer_f,
@@ -282,7 +282,7 @@
                                              realw* deltatover2_F,
                                              realw* b_deltat_F,
                                              realw* b_deltatsqover2_F,
-                                             realw* b_deltatover2_F) {}
+                                             realw* b_deltatover2_F) {} 
 
 void FC_FUNC_(it_update_displacement_oc_cuda,
               IT_UPDATE_DISPLACEMENT_OC_cuda)(long* Mesh_pointer_f,
@@ -291,7 +291,7 @@
                                                realw* deltatover2_F,
                                                realw* b_deltat_F,
                                                realw* b_deltatsqover2_F,
-                                               realw* b_deltatover2_F) {}
+                                               realw* b_deltatover2_F) {} 
 
 void FC_FUNC_(kernel_3_a_cuda,
               KERNEL_3_A_CUDA)(long* Mesh_pointer,
@@ -299,49 +299,49 @@
                                int* SIMULATION_TYPE_f,
                                realw* b_deltatover2_F,
                                int* OCEANS,
-             int* NCHUNKS_VAL) {}
+             int* NCHUNKS_VAL) {} 
 
 void FC_FUNC_(kernel_3_b_cuda,
               KERNEL_3_B_CUDA)(long* Mesh_pointer,
                                realw* deltatover2_F,
                                int* SIMULATION_TYPE_f,
                                realw* b_deltatover2_F,
-                               int* OCEANS) {}
+                               int* OCEANS) {} 
 
 void FC_FUNC_(kernel_3_outer_core_cuda,
               KERNEL_3_OUTER_CORE_CUDA)(long* Mesh_pointer,
                                         realw* deltatover2_F,
                                         int* SIMULATION_TYPE_f,
-                                        realw* b_deltatover2_F) {}
+                                        realw* b_deltatover2_F) {} 
 
 
 //
 // src/cuda/noise_tomography_cuda.cu
 //
 
-void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){}
+void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){} 
 
-void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id) {}
+void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id) {} 
 
-void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) {}
+void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) {} 
 
-void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val) {}
+void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val) {} 
 
-void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) {}
+void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) {} 
 
 void FC_FUNC_(noise_transfer_surface_to_host,
               NOISE_TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer_f,
-                                              realw* h_noise_surface_movie) {}
+                                              realw* h_noise_surface_movie) {} 
 
 void FC_FUNC_(noise_add_source_master_rec_cu,
               NOISE_ADD_SOURCE_MASTER_REC_CU)(long* Mesh_pointer_f,
                                               int* it_f,
                                               int* irec_master_noise_f,
-                                              int* islice_selected_rec) {}
+                                              int* islice_selected_rec) {} 
 
 void FC_FUNC_(noise_add_surface_movie_cuda,
               NOISE_ADD_SURFACE_MOVIE_CUDA)(long* Mesh_pointer_f,
-                                            realw* h_noise_surface_movie) {}
+                                            realw* h_noise_surface_movie) {} 
 
 
 //
@@ -376,13 +376,14 @@
                                         int* ATTENUATION_f,
                                         int* ATTENUATION_NEW_f,
                                         int* USE_ATTENUATION_MIMIC_f,
+                                        int* ATTENUATION_3D_VAL_f,
                                         int* COMPUTE_AND_STORE_STRAIN_f,
                                         int* ANISOTROPIC_3D_MANTLE_f,
                                         int* ANISOTROPIC_INNER_CORE_f,
                                         int* SAVE_BOUNDARY_MESH_f,
                                         int* USE_MESH_COLORING_GPU_f,
                                         int* ANISOTROPIC_KL_f,
-                                        int* APPROXIMATE_HESS_KL_f) {}
+                                        int* APPROXIMATE_HESS_KL_f) {} 
 
 void FC_FUNC_(prepare_fields_rotation_device,
               PREPARE_FIELDS_ROTATION_DEVICE)(long* Mesh_pointer_f,
@@ -395,7 +396,7 @@
                                               realw* b_A_array_rotation,
                                               realw* b_B_array_rotation,
                                               int* NSPEC_OUTER_CORE_ROTATION
-                                              ) {}
+                                              ) {} 
 
 void FC_FUNC_(prepare_fields_gravity_device,
               PREPARE_FIELDS_gravity_DEVICE)(long* Mesh_pointer_f,
@@ -406,7 +407,7 @@
                                              realw* density_table,
                                              realw* h_wgll_cube,
                                              int* NRAD_GRAVITY
-                                             ) {}
+                                             ) {} 
 
 void FC_FUNC_(prepare_fields_attenuat_device,
               PREPARE_FIELDS_ATTENUAT_DEVICE)(long* Mesh_pointer_f,
@@ -426,7 +427,7 @@
                                                  realw* one_minus_sum_beta_inner_core,
                                                  realw* alphaval,realw* betaval,realw* gammaval,
                                                  realw* b_alphaval,realw* b_betaval,realw* b_gammaval
-                                                 ) {}
+                                                 ) {} 
 
 void FC_FUNC_(prepare_fields_strain_device,
               PREPARE_FIELDS_STRAIN_DEVICE)(long* Mesh_pointer_f,
@@ -454,7 +455,7 @@
                                             realw* b_epsilondev_yz_inner_core,
                                             realw* eps_trace_over_3_inner_core,
                                             realw* b_eps_trace_over_3_inner_core
-                                            ) {}
+                                            ) {} 
 
 void FC_FUNC_(prepare_fields_absorb_device,
               PREPARE_FIELDS_ABSORB_DEVICE)(long* Mesh_pointer_f,
@@ -486,7 +487,7 @@
                                             realw* jacobian2D_ymin_outer_core, realw* jacobian2D_ymax_outer_core,
                                             realw* jacobian2D_bottom_outer_core,
                                             realw* vp_outer_core
-                                            ) {}
+                                            ) {} 
 
 void FC_FUNC_(prepare_mpi_buffers_device,
               PREPARE_MPI_BUFFERS_DEVICE)(long* Mesh_pointer_f,
@@ -502,7 +503,7 @@
                                           int* max_nibool_interfaces_outer_core,
                                           int* nibool_interfaces_outer_core,
                                           int* ibool_interfaces_outer_core
-                                          ){}
+                                          ){} 
 
 void FC_FUNC_(prepare_fields_noise_device,
               PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f,
@@ -514,7 +515,7 @@
                                            realw* normal_y_noise,
                                            realw* normal_z_noise,
                                            realw* mask_noise,
-                                           realw* jacobian2D_top_crust_mantle) {}
+                                           realw* jacobian2D_top_crust_mantle) {} 
 
 void FC_FUNC_(prepare_crust_mantle_device,
              PREPARE_CRUST_MANTLE_DEVICE)(long* Mesh_pointer_f,
@@ -548,7 +549,7 @@
              int* NSPEC2D_TOP_CM,
        int* NSPEC2D_BOTTOM_CM,
        int* NCHUNKS_VAL
-             ) {}
+             ) {} 
 
 void FC_FUNC_(prepare_outer_core_device,
               PREPARE_OUTER_CORE_DEVICE)(long* Mesh_pointer_f,
@@ -571,7 +572,7 @@
                                          int* nspec_inner,
            int* NSPEC2D_TOP_OC,
            int* NSPEC2D_BOTTOM_OC
-                                         ) {}
+                                         ) {} 
 
 void FC_FUNC_(prepare_inner_core_device,
               PREPARE_INNER_CORE_DEVICE)(long* Mesh_pointer_f,
@@ -590,70 +591,15 @@
            int* phase_ispec_inner,
            int* nspec_outer,
            int* nspec_inner,
-           int* NSPEC2D_TOP_IC) {}
+           int* NSPEC2D_TOP_IC) {} 
 
 void FC_FUNC_(prepare_oceans_device,
               PREPARE_OCEANS_DEVICE)(long* Mesh_pointer_f,
-             realw* h_rmass_ocean_load) {}
+             realw* h_rmass_ocean_load) {} 
 
-void FC_FUNC_(prepare_fields_elastic_device,
-              PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer_f,
-                                             int* size,
-                                             realw* rmass,
-                                             realw* rho_vp,
-                                             realw* rho_vs,
-                                             int* num_phase_ispec_elastic,
-                                             int* phase_ispec_inner_elastic,
-                                             int* ispec_is_elastic,
-                                             int* ABSORBING_CONDITIONS,
-                                             realw* h_b_absorb_field,
-                                             int* h_b_reclen_field,
-                                             int* SIMULATION_TYPE,int* SAVE_FORWARD,
-                                             int* COMPUTE_AND_STORE_STRAIN,
-                                             realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
-                                             realw* epsilondev_xz,realw* epsilondev_yz,
-                                             int* ATTENUATION,
-                                             int* R_size,
-                                             realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
-                                             realw* one_minus_sum_beta,realw* factor_common,
-                                             realw* alphaval,realw* betaval,realw* gammaval,
-                                             int* OCEANS,
-                                             realw* rmass_ocean_load,
-                                             int* NOISE_TOMOGRAPHY,
-                                             realw* free_surface_normal,
-                                             int* free_surface_ispec,
-                                             int* free_surface_ijk,
-                                             int* num_free_surface_faces,
-                                             int* ACOUSTIC_SIMULATION,
-                                             int* num_colors_outer_elastic,
-                                             int* num_colors_inner_elastic,
-                                             int* num_elem_colors_elastic,
-                                             int* ANISOTROPY,
-                                             realw *c11store,
-                                             realw *c12store,
-                                             realw *c13store,
-                                             realw *c14store,
-                                             realw *c15store,
-                                             realw *c16store,
-                                             realw *c22store,
-                                             realw *c23store,
-                                             realw *c24store,
-                                             realw *c25store,
-                                             realw *c26store,
-                                             realw *c33store,
-                                             realw *c34store,
-                                             realw *c35store,
-                                             realw *c36store,
-                                             realw *c44store,
-                                             realw *c45store,
-                                             realw *c46store,
-                                             realw *c55store,
-                                             realw *c56store,
-                                             realw *c66store){}
-
 void FC_FUNC_(prepare_cleanup_device,
               PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f,
-              int* NCHUNKS_VAL) {}
+              int* NCHUNKS_VAL) {} 
 
 
 //
@@ -661,82 +607,82 @@
 //
 
 void FC_FUNC_(transfer_fields_cm_to_device,
-              TRANSFER_FIELDS_CM_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_FIELDS_CM_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_fields_ic_to_device,
-              TRANSFER_FIELDS_IC_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_FIELDS_IC_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_fields_oc_to_device,
-              TRANSFER_FIELDS_OC_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_FIELDS_OC_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_fields_cm_to_device,
               TRANSFER_FIELDS_B_CM_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
-                                              long* Mesh_pointer_f) {}
+                                              long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_fields_ic_to_device,
               TRANSFER_FIELDS_B_IC_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
-                                              long* Mesh_pointer_f) {}
+                                              long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_fields_oc_to_device,
               TRANSFER_FIELDS_B_OC_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
-                                              long* Mesh_pointer_f) {}
+                                              long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_fields_cm_from_device,
-              TRANSFER_FIELDS_CM_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_FIELDS_CM_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_fields_ic_from_device,
-              TRANSFER_FIELDS_IC_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_FIELDS_IC_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_fields_oc_from_device,
-              TRANSFER_FIELDS_OC_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_FIELDS_OC_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_fields_cm_from_device,
               TRANSFER_B_FIELDS_CM_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
-                                                long* Mesh_pointer_f) {}
+                                                long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_fields_ic_from_device,
               TRANSFER_B_FIELDS_IC_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
-                                                long* Mesh_pointer_f) {}
+                                                long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_fields_oc_from_device,
               TRANSFER_B_FIELDS_OC_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
-                                                long* Mesh_pointer_f) {}
+                                                long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_accel_cm_to_device,
-              TRANSFER_ACCEL_CM_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_ACCEL_CM_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_displ_cm_from_device,
-              TRANSFER_DISPL_CM_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
+              TRANSFER_DISPL_CM_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_displ_cm_from_device,
-              TRANSFER_B_DISPL_CM_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
+              TRANSFER_B_DISPL_CM_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_displ_ic_from_device,
-              TRANSFER_DISPL_IC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
+              TRANSFER_DISPL_IC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_displ_ic_from_device,
-              TRANSFER_B_DISPL_IC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
+              TRANSFER_B_DISPL_IC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_displ_oc_from_device,
-              TRANSFER_DISPL_OC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
+              TRANSFER_DISPL_OC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_displ_oc_from_device,
-              TRANSFER_B_DISPL_OC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
+              TRANSFER_B_DISPL_OC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_veloc_cm_from_device,
-              TRANSFER_DISPL_CM_FROM_DEVICE)(int* size, realw* veloc, long* Mesh_pointer_f) {}
+              TRANSFER_DISPL_CM_FROM_DEVICE)(int* size, realw* veloc, long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_accel_cm_from_device,
-              TRANSFER_ACCEL_CM_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_ACCEL_CM_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_accel_cm_from_device,
-              TRANSFER_B_ACCEL_CM_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {}
+              TRANSFER_B_ACCEL_CM_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_accel_ic_from_device,
-              TRANSFER_ACCEL_IC_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_ACCEL_IC_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_accel_oc_from_device,
-              TRANSFER_ACCEL_OC_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+              TRANSFER_ACCEL_OC_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_strain_cm_from_device,
               TRANSFER_STRAIN_CM_FROM_DEVICE)(long* Mesh_pointer,
@@ -745,7 +691,7 @@
                                                   realw* epsilondev_yy,
                                                   realw* epsilondev_xy,
                                                   realw* epsilondev_xz,
-                                                  realw* epsilondev_yz) {}
+                                                  realw* epsilondev_yz) {} 
 
 void FC_FUNC_(transfer_b_strain_cm_to_device,
               TRANSFER_B_STRAIN_CM_TO_DEVICE)(long* Mesh_pointer,
@@ -753,7 +699,7 @@
                                               realw* epsilondev_yy,
                                               realw* epsilondev_xy,
                                               realw* epsilondev_xz,
-                                              realw* epsilondev_yz) {}
+                                              realw* epsilondev_yz) {} 
 
 void FC_FUNC_(transfer_strain_ic_from_device,
               TRANSFER_STRAIN_IC_FROM_DEVICE)(long* Mesh_pointer,
@@ -762,7 +708,7 @@
                                               realw* epsilondev_yy,
                                               realw* epsilondev_xy,
                                               realw* epsilondev_xz,
-                                              realw* epsilondev_yz) {}
+                                              realw* epsilondev_yz) {} 
 
 void FC_FUNC_(transfer_b_strain_ic_to_device,
               TRANSFER_B_STRAIN_IC_TO_DEVICE)(long* Mesh_pointer,
@@ -770,17 +716,17 @@
                                               realw* epsilondev_yy,
                                               realw* epsilondev_xy,
                                               realw* epsilondev_xz,
-                                              realw* epsilondev_yz) {}
+                                              realw* epsilondev_yz) {} 
 
 void FC_FUNC_(transfer_rotation_from_device,
               TRANSFER_ROTATION_FROM_DEVICE)(long* Mesh_pointer,
                                              realw* A_array_rotation,
-                                             realw* B_array_rotation) {}
+                                             realw* B_array_rotation) {} 
 
 void FC_FUNC_(transfer_b_rotation_to_device,
               TRANSFER_B_ROTATION_TO_DEVICE)(long* Mesh_pointer,
                                               realw* A_array_rotation,
-                                              realw* B_array_rotation) {}
+                                              realw* B_array_rotation) {} 
 
 void FC_FUNC_(transfer_kernels_cm_to_host,
               TRANSFER_KERNELS_CM_TO_HOST)(long* Mesh_pointer,
@@ -788,30 +734,30 @@
                                            realw* h_alpha_kl,
                                            realw* h_beta_kl,
                                            realw* h_cijkl_kl,
-                                           int* NSPEC) {}
+                                           int* NSPEC) {} 
 
 void FC_FUNC_(transfer_kernels_ic_to_host,
               TRANSFER_KERNELS_IC_TO_HOST)(long* Mesh_pointer,
                                                     realw* h_rho_kl,
                                                     realw* h_alpha_kl,
                                                     realw* h_beta_kl,
-                                                    int* NSPEC) {}
+                                                    int* NSPEC) {} 
 
 void FC_FUNC_(transfer_kernels_oc_to_host,
               TRANSFER_KERNELS_OC_TO_HOST)(long* Mesh_pointer,
                                            realw* h_rho_kl,
                                            realw* h_alpha_kl,
-                                           int* NSPEC) {}
+                                           int* NSPEC) {} 
 
 void FC_FUNC_(transfer_kernels_noise_to_host,
               TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer,
                                               realw* h_Sigma_kl,
-                                              int* NSPEC) {}
+                                              int* NSPEC) {} 
 
 void FC_FUNC_(transfer_kernels_hess_cm_tohost,
               TRANSFER_KERNELS_HESS_CM_TOHOST)(long* Mesh_pointer,
                                               realw* h_hess_kl,
-                                              int* NSPEC) {}
+                                              int* NSPEC) {} 
 
 
 //
@@ -831,7 +777,7 @@
                                                int* number_receiver_global,
                                                int* ispec_selected_rec,
                                                int* ispec_selected_source,
-                                               int* ibool) {}
+                                               int* ibool) {} 
 
 void FC_FUNC_(transfer_station_ac_from_device,
               TRANSFER_STATION_AC_FROM_DEVICE)(
@@ -846,5 +792,5 @@
                                                 int* ispec_selected_rec,
                                                 int* ispec_selected_source,
                                                 int* ibool,
-                                                int* SIMULATION_TYPEf) {}
+                                                int* SIMULATION_TYPEf) {} 
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/transfer_fields_cuda.cu	2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/transfer_fields_cuda.cu	2012-07-23 21:58:26 UTC (rev 20536)
@@ -35,7 +35,6 @@
 
 #include "config.h"
 #include "mesh_constants_cuda.h"
-#include "prepare_constants_cuda.h"
 
 /* ----------------------------------------------------------------------------------------------- */
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/exit_mpi.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/exit_mpi.f90	2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/exit_mpi.f90	2012-07-23 21:58:26 UTC (rev 20536)
@@ -263,6 +263,28 @@
 !-------------------------------------------------------------------------------------------------
 !
 
+  subroutine max_all_cr(sendbuf, recvbuf)
+
+  implicit none
+
+! standard include of the MPI library
+  include 'mpif.h'
+
+  include "constants.h"
+  include "precision.h"
+
+  real(kind=CUSTOM_REAL) :: sendbuf, recvbuf
+  integer :: ier
+
+  call MPI_REDUCE(sendbuf,recvbuf,1,CUSTOM_MPI_TYPE, &
+                  MPI_MAX,0,MPI_COMM_WORLD,ier)
+
+  end subroutine max_all_cr
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
   subroutine sum_all_dp(sendbuf, recvbuf)
 
   implicit none

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/check_simulation_stability.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/check_simulation_stability.f90	2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/check_simulation_stability.f90	2012-07-23 21:58:26 UTC (rev 20536)
@@ -38,9 +38,6 @@
 
   implicit none
 
-  include 'mpif.h'
-  include "precision.h"
-
   ! time step
   integer it,NSTEP,myrank
 
@@ -56,8 +53,6 @@
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STRAIN_ONLY) :: &
     eps_trace_over_3_crust_mantle
 
-!  real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_OR_ATT) ::  &
-!    epsilondev_crust_mantle
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_OR_ATT) ::  &
     epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
     epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle
@@ -67,8 +62,6 @@
 
   double precision :: time_start,DT,t0
 
-!  logical COMPUTE_AND_STORE_STRAIN
-
   ! local parameters
   ! maximum of the norm of the displacement and of the potential in the fluid
   real(kind=CUSTOM_REAL) Usolidnorm,Usolidnorm_all,Ufluidnorm,Ufluidnorm_all
@@ -129,10 +122,8 @@
     call exit_MPI(myrank,'forward simulation became unstable in fluid and blew up')
 
   ! compute the maximum of the maxima for all the slices using an MPI reduction
-  call MPI_REDUCE(Usolidnorm,Usolidnorm_all,1,CUSTOM_MPI_TYPE,MPI_MAX,0, &
-                      MPI_COMM_WORLD,ier)
-  call MPI_REDUCE(Ufluidnorm,Ufluidnorm_all,1,CUSTOM_MPI_TYPE,MPI_MAX,0, &
-                      MPI_COMM_WORLD,ier)
+  call max_all_cr(Usolidnorm,Usolidnorm_all)
+  call max_all_cr(Ufluidnorm,Ufluidnorm_all)
 
   if (SIMULATION_TYPE == 3) then
     if( .not. GPU_MODE) then
@@ -157,10 +148,8 @@
       call exit_MPI(myrank,'backward simulation became unstable and blew up  in the fluid')
 
     ! compute the maximum of the maxima for all the slices using an MPI reduction
-    call MPI_REDUCE(b_Usolidnorm,b_Usolidnorm_all,1,CUSTOM_MPI_TYPE,MPI_MAX,0, &
-             MPI_COMM_WORLD,ier)
-    call MPI_REDUCE(b_Ufluidnorm,b_Ufluidnorm_all,1,CUSTOM_MPI_TYPE,MPI_MAX,0, &
-             MPI_COMM_WORLD,ier)
+    call max_all_cr(b_Usolidnorm,b_Usolidnorm_all)
+    call max_all_cr(b_Ufluidnorm,b_Ufluidnorm_all)
   endif
 
   if (COMPUTE_AND_STORE_STRAIN) then
@@ -177,10 +166,8 @@
       call check_norm_strain_from_device(Strain_norm,Strain2_norm,Mesh_pointer)
     endif
 
-    call MPI_REDUCE(Strain_norm,Strain_norm_all,1,CUSTOM_MPI_TYPE,MPI_MAX,0, &
-             MPI_COMM_WORLD,ier)
-    call MPI_REDUCE(Strain2_norm,Strain2_norm_all,1,CUSTOM_MPI_TYPE,MPI_MAX,0, &
-             MPI_COMM_WORLD,ier)
+    call max_all_cr(Strain_norm,Strain_norm_all)
+    call max_all_cr(Strain2_norm,Strain2_norm_all)
   endif
 
   if(myrank == 0) then

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core_Dev.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core_Dev.F90	2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core_Dev.F90	2012-07-23 21:58:26 UTC (rev 20536)
@@ -671,7 +671,7 @@
               if( ATTENUATION_VAL ) then
                 if( ATTENUATION_3D_VAL ) then
                   mul = mul * one_minus_sum_beta(i,j,k,ispec)
-                else 
+                else
                   mul = mul * one_minus_sum_beta(1,1,1,ispec)
                 endif
               endif

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/get_attenuation.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/get_attenuation.f90	2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/get_attenuation.f90	2012-07-23 21:58:26 UTC (rev 20536)
@@ -46,7 +46,7 @@
   integer :: i,j,k,ispec,ier
   double precision, dimension(N_SLS) :: tau_e, fc
   double precision :: omsb, Q_mu, sf, T_c_source, scale_t
-  
+
   ! checks if attenuation is on and anything to do
   if( .not. ATTENUATION_VAL) return
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2012-07-23 21:58:26 UTC (rev 20536)
@@ -714,7 +714,7 @@
   factor_common_crust_mantle_dble = 0.d0
   factor_scale_crust_mantle_dble = 0.d0
   tau_sigma_dble = 0.d0
-  
+
   call get_attenuation_model_3D_or_1D(myrank, prnamel, omsb_crust_mantle_dble, &
            factor_common_crust_mantle_dble,factor_scale_crust_mantle_dble,tau_sigma_dble, &
            ATT1,ATT2,ATT3,ATT4)
@@ -727,7 +727,7 @@
   factor_common_inner_core_dble = 0.d0
   factor_scale_inner_core_dble = 0.d0
   tau_sigma_dble = 0.d0
-  
+
   call get_attenuation_model_3D_or_1D(myrank, prnamel, omsb_inner_core_dble, &
            factor_common_inner_core_dble,factor_scale_inner_core_dble,tau_sigma_dble, &
            ATT1,ATT2,ATT3,ATT5)