[cig-commits] r19039 - in seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src: cuda specfem3D

Thu Oct 6 18:29:29 PDT 2011

Author: danielpeter
Date: 2011-10-06 18:29:29 -0700 (Thu, 06 Oct 2011)
New Revision: 19039

Modified:
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_kernels.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/initialize_simulation.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90
Log:
updates attenuation in elastic kernel; unravels elastic and noise simulation preparation

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu
===================================================================

--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu	2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu	2011-10-07 01:29:29 UTC (rev 19039)
@@ -172,7 +172,10 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-__global__ void add_source_master_rec_noise_cuda_kernel(int* ibool, int* ispec_selected_rec, int irec_master_noise, real* accel, real* noise_sourcearray, int it) {
+__global__ void add_source_master_rec_noise_cuda_kernel(int* ibool, int* ispec_selected_rec, 
+                                                        int irec_master_noise, 
+                                                        real* accel, real* noise_sourcearray, 
+                                                        int it) {
   int tx = threadIdx.x;
   int iglob = ibool[tx + 125*(ispec_selected_rec[irec_master_noise-1]-1)]-1;
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu	2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu	2011-10-07 01:29:29 UTC (rev 19039)
@@ -21,12 +21,33 @@
 __constant__ float d_wgllwgll_yz[NGLL2];
 
 
-/* ----------------------------------------------------------------------------------------------- */
-
 void Kernel_2(int nb_blocks_to_compute, Mesh* mp, int d_iphase,
-	      int COMPUTE_AND_STORE_STRAIN,int SIMULATION_TYPE);
+	      int COMPUTE_AND_STORE_STRAIN,int SIMULATION_TYPE,int ATTENUATION);
 
+__global__ void Kernel_test(float* d_debug_output,int* d_phase_ispec_inner_elastic, 
+                            int num_phase_ispec_elastic, int d_iphase, int* d_ibool);
 
+__global__ void Kernel_2_impl(int nb_blocks_to_compute,int NGLOB, int* d_ibool,
+                              int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic, int d_iphase,
+                              float* d_displ, float* d_accel, 
+                              float* d_xix, float* d_xiy, float* d_xiz, 
+                              float* d_etax, float* d_etay, float* d_etaz, 
+                              float* d_gammax, float* d_gammay, float* d_gammaz, 
+                              float* d_kappav, float* d_muv,
+                              float* d_debug,
+                              int COMPUTE_AND_STORE_STRAIN,
+                              float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,
+                              float* epsilondev_xz,float* epsilondev_yz,float* epsilon_trace_over_3,
+                              int SIMULATION_TYPE,
+                              int ATTENUATION,int NSPEC,
+                              float* one_minus_sum_beta,float* factor_common,                              
+                              float* R_xx,float* R_yy,float* R_xy,float* R_xz,float* R_yz,
+                              float* alphaval,float* betaval,float* gammaval);
+
+__global__ void kernel_3_cuda_device(real* veloc,
+                                     real* accel, int size,
+                                     real deltatover2, real* rmass);
+
 /* ----------------------------------------------------------------------------------------------- */
 
 // prepares a device array with with all inter-element edge-nodes -- this
@@ -237,7 +258,8 @@
                                            int* nspec_outer_elastic,
                                            int* nspec_inner_elastic,
                                            int* COMPUTE_AND_STORE_STRAIN,
-                                           int* SIMULATION_TYPE) {
+                                           int* SIMULATION_TYPE,
+                                           int* ATTENUATION) {
 
 TRACE("compute_forces_elastic_cuda");  
 // EPIK_TRACER("compute_forces_elastic_cuda");
@@ -256,23 +278,135 @@
   /* MPI_Comm_rank(MPI_COMM_WORLD,&myrank); */
   /* if(myrank==0) { */
 
-  Kernel_2(num_elements, mp, *iphase, *COMPUTE_AND_STORE_STRAIN,*SIMULATION_TYPE);
+  Kernel_2(num_elements,mp,*iphase,*COMPUTE_AND_STORE_STRAIN,*SIMULATION_TYPE,*ATTENUATION);
   
   
   cudaThreadSynchronize();
 /* MPI_Barrier(MPI_COMM_WORLD); */
 }
 
+
 /* ----------------------------------------------------------------------------------------------- */
 
-__global__ void Kernel_test(float* d_debug_output,int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic, int d_iphase, int* d_ibool);
+// updates stress
 
-__global__ void Kernel_2_impl(int nb_blocks_to_compute,int NGLOB, int* d_ibool,int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic, int d_iphase,float* d_displ, float* d_accel, float* d_xix, float* d_xiy, float* d_xiz, float* d_etax, float* d_etay, float* d_etaz, float* d_gammax, float* d_gammay, float* d_gammaz, float* d_kappav, float* d_muv,float* d_debug,int COMPUTE_AND_STORE_STRAIN,float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,float* epsilondev_xz,float* epsilondev_yz,float* epsilon_trace_over_3,int SIMULATION_TYPE);
+__device__ void compute_element_att_stress(int tx,int working_element,int NSPEC,
+                                           float* R_xx, 
+                                           float* R_yy, 
+                                           float* R_xy, 
+                                           float* R_xz, 
+                                           float* R_yz, 
+                                           reald* sigma_xx, 
+                                           reald* sigma_yy, 
+                                           reald* sigma_zz,
+                                           reald* sigma_xy, 
+                                           reald* sigma_xz,
+                                           reald* sigma_yz) {
 
+  int i_sls,offset_sls;
+  reald R_xx_val,R_yy_val;
+  
+  for(i_sls = 0; i_sls < N_SLS; i_sls++){
+    // index
+    offset_sls = tx + 125*(working_element + NSPEC*i_sls);
+    
+    R_xx_val = R_xx[offset_sls]; //(i,j,k,ispec,i_sls)
+    R_yy_val = R_yy[offset_sls];
+    
+    *sigma_xx = *sigma_xx - R_xx_val;
+    *sigma_yy = *sigma_yy - R_yy_val;
+    *sigma_zz = *sigma_zz + R_xx_val + R_yy_val;
+    *sigma_xy = *sigma_xy - R_xy[offset_sls];
+    *sigma_xz = *sigma_xz - R_xz[offset_sls];
+    *sigma_yz = *sigma_yz - R_yz[offset_sls];
+  }
+  return;
+}  
+
 /* ----------------------------------------------------------------------------------------------- */
 
+// updates R_memory
+
+__device__ void compute_element_att_memory(int tx,int working_element,int NSPEC,
+                                          float* d_muv,
+                                          float* factor_common,
+                                          float* alphaval,float* betaval,float* gammaval,
+                                          float* R_xx,float* R_yy,float* R_xy,float* R_xz,float* R_yz,
+                                          float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,
+                                          float* epsilondev_xz,float* epsilondev_yz,
+                                          reald epsilondev_xx_loc,reald epsilondev_yy_loc,reald epsilondev_xy_loc,
+                                          reald epsilondev_xz_loc,reald epsilondev_yz_loc
+                                          ){
+
+  int i_sls;
+  int ijk_ispec;
+  int offset_sls,offset_align,offset_common;
+  reald mul;
+  reald alphaval_loc,betaval_loc,gammaval_loc;
+  reald factor_loc,Sn,Snp1;
+  
+  // indices
+  offset_align = tx + 128 * working_element;
+  ijk_ispec = tx + 125 * working_element;
+  
+  mul = d_muv[offset_align];
+  
+  // use Runge-Kutta scheme to march in time
+  for(i_sls = 0; i_sls < N_SLS; i_sls++){
+
+    // indices
+    offset_common = i_sls + N_SLS*(tx + 125*working_element); // (i_sls,i,j,k,ispec)
+    offset_sls = tx + 125*(working_element + NSPEC*i_sls);   // (i,j,k,ispec,i_sls)
+    
+    factor_loc = mul * factor_common[offset_common]; //mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
+    
+    alphaval_loc = alphaval[i_sls]; // (i_sls)
+    betaval_loc = betaval[i_sls];
+    gammaval_loc = gammaval[i_sls];
+    
+    // term in xx
+    Sn   = factor_loc * epsilondev_xx[ijk_ispec]; //(i,j,k,ispec)
+    Snp1   = factor_loc * epsilondev_xx_loc; //(i,j,k)
+    
+    //R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) + 
+    //                          betaval_loc * Sn + gammaval_loc * Snp1;
+
+    R_xx[offset_sls] = alphaval_loc * R_xx[offset_sls] + 
+                       betaval_loc * Sn + gammaval_loc * Snp1;
+
+    // term in yy
+    Sn   = factor_loc * epsilondev_yy[ijk_ispec];
+    Snp1   = factor_loc * epsilondev_yy_loc;
+    R_yy[offset_sls] = alphaval_loc * R_yy[offset_sls] + 
+                        betaval_loc * Sn + gammaval_loc * Snp1;
+    // term in zz not computed since zero trace
+    // term in xy
+    Sn   = factor_loc * epsilondev_xy[ijk_ispec];
+    Snp1   = factor_loc * epsilondev_xy_loc;
+    R_xy[offset_sls] = alphaval_loc * R_xy[offset_sls] + 
+                        betaval_loc * Sn + gammaval_loc * Snp1;
+    // term in xz
+    Sn   = factor_loc * epsilondev_xz[ijk_ispec];
+    Snp1   = factor_loc * epsilondev_xz_loc;
+    R_xz[offset_sls] = alphaval_loc * R_xz[offset_sls] + 
+                        betaval_loc * Sn + gammaval_loc * Snp1;
+    // term in yz
+    Sn   = factor_loc * epsilondev_yz[ijk_ispec];
+    Snp1   = factor_loc * epsilondev_yz_loc;
+    R_yz[offset_sls] = alphaval_loc * R_yz[offset_sls] + 
+                        betaval_loc * Sn + gammaval_loc * Snp1;
+  }
+  return;  
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// KERNEL 2
+
+/* ----------------------------------------------------------------------------------------------- */
+
 void Kernel_2(int nb_blocks_to_compute, Mesh* mp, int d_iphase,
-	      int COMPUTE_AND_STORE_STRAIN,int SIMULATION_TYPE)
+              int COMPUTE_AND_STORE_STRAIN,int SIMULATION_TYPE,int ATTENUATION)
   {
     
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -314,7 +448,8 @@
 						 mp->d_xix, mp->d_xiy, mp->d_xiz,
 						 mp->d_etax, mp->d_etay, mp->d_etaz,
 						 mp->d_gammax, mp->d_gammay, mp->d_gammaz,
-						 mp->d_kappav, mp->d_muv,d_debug,
+						 mp->d_kappav, mp->d_muv,
+             d_debug,
 						 COMPUTE_AND_STORE_STRAIN,
 						 mp->d_epsilondev_xx,
 						 mp->d_epsilondev_yy,
@@ -322,8 +457,12 @@
 						 mp->d_epsilondev_xz,
 						 mp->d_epsilondev_yz,
 						 mp->d_epsilon_trace_over_3,
-						 // 1);
-						 SIMULATION_TYPE);
+						 SIMULATION_TYPE,
+             ATTENUATION,mp->NSPEC_AB,
+             mp->d_one_minus_sum_beta,mp->d_factor_common,
+             mp->d_R_xx,mp->d_R_yy,mp->d_R_xy,mp->d_R_xz,mp->d_R_yz,
+             mp->d_alphaval,mp->d_betaval,mp->d_gammaval
+             );
     
 
     // cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
@@ -348,7 +487,8 @@
 						   mp->d_xix, mp->d_xiy, mp->d_xiz,
 						   mp->d_etax, mp->d_etay, mp->d_etaz,
 						   mp->d_gammax, mp->d_gammay, mp->d_gammaz,
-						   mp->d_kappav, mp->d_muv,d_debug,
+						   mp->d_kappav, mp->d_muv,
+               d_debug,
 						   COMPUTE_AND_STORE_STRAIN,
 						   mp->d_b_epsilondev_xx,
 						   mp->d_b_epsilondev_yy,
@@ -356,7 +496,12 @@
 						   mp->d_b_epsilondev_xz,
 						   mp->d_b_epsilondev_yz,
 						   mp->d_b_epsilon_trace_over_3,
-						   SIMULATION_TYPE);
+						   SIMULATION_TYPE,
+               ATTENUATION,mp->NSPEC_AB,
+               mp->d_one_minus_sum_beta,mp->d_factor_common,               
+               mp->d_b_R_xx,mp->d_b_R_yy,mp->d_b_R_xy,mp->d_b_R_xz,mp->d_b_R_yz,
+               mp->d_b_alphaval,mp->d_b_betaval,mp->d_b_gammaval
+               );
     }
     
     // cudaEventRecord( stop, 0 );
@@ -381,7 +526,8 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-__global__ void Kernel_test(float* d_debug_output,int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic, int d_iphase, int* d_ibool) {
+__global__ void Kernel_test(float* d_debug_output,int* d_phase_ispec_inner_elastic, 
+                            int num_phase_ispec_elastic, int d_iphase, int* d_ibool) {
   int bx = blockIdx.x;
   int tx = threadIdx.x;
   int working_element;
@@ -411,18 +557,31 @@
 // doesn't seem to change the performance.
 // #define MANUALLY_UNROLLED_LOOPS
 
-/* ----------------------------------------------------------------------------------------------- */
 
-__global__ void Kernel_2_impl(int nb_blocks_to_compute,int NGLOB, int* d_ibool,int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic, int d_iphase,float* d_displ, float* d_accel, float* d_xix, float* d_xiy, float* d_xiz, float* d_etax, float* d_etay, float* d_etaz, float* d_gammax, float* d_gammay, float* d_gammaz, float* d_kappav, float* d_muv,float* d_debug,int COMPUTE_AND_STORE_STRAIN,float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,float* epsilondev_xz,float* epsilondev_yz,float* epsilon_trace_over_3,int SIMULATION_TYPE)
-{
+__global__ void Kernel_2_impl(int nb_blocks_to_compute,int NGLOB, int* d_ibool,
+                              int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic, int d_iphase,
+                              float* d_displ, float* d_accel, 
+                              float* d_xix, float* d_xiy, float* d_xiz, 
+                              float* d_etax, float* d_etay, float* d_etaz, 
+                              float* d_gammax, float* d_gammay, float* d_gammaz, 
+                              float* d_kappav, float* d_muv,
+                              float* d_debug,
+                              int COMPUTE_AND_STORE_STRAIN,
+                              float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,
+                              float* epsilondev_xz,float* epsilondev_yz,
+                              float* epsilon_trace_over_3,
+                              int SIMULATION_TYPE,
+                              int ATTENUATION, int NSPEC,
+                              float* one_minus_sum_beta,float* factor_common,                                   
+                              float* R_xx, float* R_yy, float* R_xy, float* R_xz, float* R_yz,
+                              float* alphaval,float* betaval,float* gammaval
+                              ){
     
   /* int bx = blockIdx.y*blockDim.x+blockIdx.x; //possible bug in original code*/
   int bx = blockIdx.y*gridDim.x+blockIdx.x;
   /* int bx = blockIdx.x; */
-  int tx = threadIdx.x;
-
+  int tx = threadIdx.x;  
   
-  
   //const int NGLLX = 5;
   // const int NGLL2 = 25; 
   const int NGLL3 = 125;
@@ -435,6 +594,7 @@
   int active,offset;
   int iglob = 0;
   int working_element;
+  
   reald tempx1l,tempx2l,tempx3l,tempy1l,tempy2l,tempy3l,tempz1l,tempz2l,tempz3l;
   reald xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
   reald duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl;
@@ -442,6 +602,7 @@
   reald duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl;
   reald fac1,fac2,fac3,lambdal,mul,lambdalplus2mul,kappal;
   reald sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz;
+  reald epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc;
 
 #ifndef MANUALLY_UNROLLED_LOOPS
     int l;
@@ -475,14 +636,14 @@
       iglob = d_ibool[working_element*125 + tx]-1;
       
 #ifdef USE_TEXTURES
-        s_dummyx_loc[tx] = tex1Dfetch(tex_displ, iglob);
-        s_dummyy_loc[tx] = tex1Dfetch(tex_displ, iglob + NGLOB);
-        s_dummyz_loc[tx] = tex1Dfetch(tex_displ, iglob + 2*NGLOB);
+      s_dummyx_loc[tx] = tex1Dfetch(tex_displ, iglob);
+      s_dummyy_loc[tx] = tex1Dfetch(tex_displ, iglob + NGLOB);
+      s_dummyz_loc[tx] = tex1Dfetch(tex_displ, iglob + 2*NGLOB);
 #else
-	// changing iglob indexing to match fortran row changes fast style
-	s_dummyx_loc[tx] = d_displ[iglob*3];
-	s_dummyy_loc[tx] = d_displ[iglob*3 + 1];
-        s_dummyz_loc[tx] = d_displ[iglob*3 + 2];
+      // changing iglob indexing to match fortran row changes fast style
+      s_dummyx_loc[tx] = d_displ[iglob*3];
+      s_dummyy_loc[tx] = d_displ[iglob*3 + 1];
+      s_dummyz_loc[tx] = d_displ[iglob*3 + 2];
 #endif
     }
 
@@ -497,179 +658,203 @@
 
 #ifndef MANUALLY_UNROLLED_LOOPS
 
-        tempx1l = 0.f;
-        tempx2l = 0.f;
-        tempx3l = 0.f;
+      tempx1l = 0.f;
+      tempx2l = 0.f;
+      tempx3l = 0.f;
 
-        tempy1l = 0.f;
-        tempy2l = 0.f;
-        tempy3l = 0.f;
+      tempy1l = 0.f;
+      tempy2l = 0.f;
+      tempy3l = 0.f;
 
-        tempz1l = 0.f;
-        tempz2l = 0.f;
-        tempz3l = 0.f;
+      tempz1l = 0.f;
+      tempz2l = 0.f;
+      tempz3l = 0.f;
 
-        for (l=0;l<NGLLX;l++) {
-            hp1 = d_hprime_xx[l*NGLLX+I];
-            offset = K*NGLL2+J*NGLLX+l;
-            tempx1l += s_dummyx_loc[offset]*hp1;
-            tempy1l += s_dummyy_loc[offset]*hp1;
-            tempz1l += s_dummyz_loc[offset]*hp1;
-	    
-            hp2 = d_hprime_xx[l*NGLLX+J];
-            offset = K*NGLL2+l*NGLLX+I;
-            tempx2l += s_dummyx_loc[offset]*hp2;
-            tempy2l += s_dummyy_loc[offset]*hp2;
-            tempz2l += s_dummyz_loc[offset]*hp2;
+      for (l=0;l<NGLLX;l++) {
+          hp1 = d_hprime_xx[l*NGLLX+I];
+          offset = K*NGLL2+J*NGLLX+l;
+          tempx1l += s_dummyx_loc[offset]*hp1;
+          tempy1l += s_dummyy_loc[offset]*hp1;
+          tempz1l += s_dummyz_loc[offset]*hp1;
+    
+          hp2 = d_hprime_xx[l*NGLLX+J];
+          offset = K*NGLL2+l*NGLLX+I;
+          tempx2l += s_dummyx_loc[offset]*hp2;
+          tempy2l += s_dummyy_loc[offset]*hp2;
+          tempz2l += s_dummyz_loc[offset]*hp2;
 
-            hp3 = d_hprime_xx[l*NGLLX+K];
-            offset = l*NGLL2+J*NGLLX+I;
-            tempx3l += s_dummyx_loc[offset]*hp3;
-            tempy3l += s_dummyy_loc[offset]*hp3;
-            tempz3l += s_dummyz_loc[offset]*hp3;
+          hp3 = d_hprime_xx[l*NGLLX+K];
+          offset = l*NGLL2+J*NGLLX+I;
+          tempx3l += s_dummyx_loc[offset]*hp3;
+          tempy3l += s_dummyy_loc[offset]*hp3;
+          tempz3l += s_dummyz_loc[offset]*hp3;
 
-	    // if(working_element == 169 && tx == 0) {
-	    //   atomicAdd(&d_debug[0],1.0);
-	    //   d_debug[1+3*l] = tempz3l;
-	    //   d_debug[2+3*l] = s_dummyz_loc[offset];	      
-	    //   d_debug[3+3*l] = hp3;	      
-	    // }
-	    
-        }
+    // if(working_element == 169 && tx == 0) {
+    //   atomicAdd(&d_debug[0],1.0);
+    //   d_debug[1+3*l] = tempz3l;
+    //   d_debug[2+3*l] = s_dummyz_loc[offset];	      
+    //   d_debug[3+3*l] = hp3;	      
+    // }
+    
+      }
 #else
 
-            tempx1l = s_dummyx_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
-                    + s_dummyx_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
-                    + s_dummyx_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
-                    + s_dummyx_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
-                    + s_dummyx_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];	    
-	    
-            tempy1l = s_dummyy_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
-                    + s_dummyy_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
-                    + s_dummyy_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
-                    + s_dummyy_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
-                    + s_dummyy_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+      tempx1l = s_dummyx_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+              + s_dummyx_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+              + s_dummyx_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+              + s_dummyx_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+              + s_dummyx_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];	    
 
-            tempz1l = s_dummyz_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
-                    + s_dummyz_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
-                    + s_dummyz_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
-                    + s_dummyz_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
-                    + s_dummyz_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+      tempy1l = s_dummyy_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+              + s_dummyy_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+              + s_dummyy_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+              + s_dummyy_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+              + s_dummyy_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
 
-            tempx2l = s_dummyx_loc[K*NGLL2+I]*d_hprime_xx[J]
-                    + s_dummyx_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
-                    + s_dummyx_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
-                    + s_dummyx_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
-                    + s_dummyx_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+      tempz1l = s_dummyz_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+              + s_dummyz_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+              + s_dummyz_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+              + s_dummyz_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+              + s_dummyz_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
 
-            tempy2l = s_dummyy_loc[K*NGLL2+I]*d_hprime_xx[J]
-                    + s_dummyy_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
-                    + s_dummyy_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
-                    + s_dummyy_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
-                    + s_dummyy_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+      tempx2l = s_dummyx_loc[K*NGLL2+I]*d_hprime_xx[J]
+              + s_dummyx_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+              + s_dummyx_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+              + s_dummyx_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+              + s_dummyx_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
 
-            tempz2l = s_dummyz_loc[K*NGLL2+I]*d_hprime_xx[J]
-                    + s_dummyz_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
-                    + s_dummyz_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
-                    + s_dummyz_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
-                    + s_dummyz_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+      tempy2l = s_dummyy_loc[K*NGLL2+I]*d_hprime_xx[J]
+              + s_dummyy_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+              + s_dummyy_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+              + s_dummyy_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+              + s_dummyy_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
 
-            tempx3l = s_dummyx_loc[J*NGLLX+I]*d_hprime_xx[K]
-                    + s_dummyx_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
-                    + s_dummyx_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
-                    + s_dummyx_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
-                    + s_dummyx_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+      tempz2l = s_dummyz_loc[K*NGLL2+I]*d_hprime_xx[J]
+              + s_dummyz_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+              + s_dummyz_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+              + s_dummyz_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+              + s_dummyz_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
 
-            tempy3l = s_dummyy_loc[J*NGLLX+I]*d_hprime_xx[K]
-                    + s_dummyy_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
-                    + s_dummyy_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
-                    + s_dummyy_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
-                    + s_dummyy_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+      tempx3l = s_dummyx_loc[J*NGLLX+I]*d_hprime_xx[K]
+              + s_dummyx_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+              + s_dummyx_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+              + s_dummyx_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+              + s_dummyx_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
 
-            tempz3l = s_dummyz_loc[J*NGLLX+I]*d_hprime_xx[K]
-                    + s_dummyz_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
-                    + s_dummyz_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
-                    + s_dummyz_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
-                    + s_dummyz_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+      tempy3l = s_dummyy_loc[J*NGLLX+I]*d_hprime_xx[K]
+              + s_dummyy_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+              + s_dummyy_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+              + s_dummyy_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+              + s_dummyy_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
 
+      tempz3l = s_dummyz_loc[J*NGLLX+I]*d_hprime_xx[K]
+              + s_dummyz_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+              + s_dummyz_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+              + s_dummyz_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+              + s_dummyz_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
 #endif
 
 // compute derivatives of ux, uy and uz with respect to x, y and z
-        offset = working_element*NGLL3_ALIGN + tx;
+      offset = working_element*NGLL3_ALIGN + tx;
 
-        xixl = d_xix[offset];
-        xiyl = d_xiy[offset];
-        xizl = d_xiz[offset];
-        etaxl = d_etax[offset];
-        etayl = d_etay[offset];
-        etazl = d_etaz[offset];
-        gammaxl = d_gammax[offset];
-        gammayl = d_gammay[offset];
-        gammazl = d_gammaz[offset];
+      xixl = d_xix[offset];
+      xiyl = d_xiy[offset];
+      xizl = d_xiz[offset];
+      etaxl = d_etax[offset];
+      etayl = d_etay[offset];
+      etazl = d_etaz[offset];
+      gammaxl = d_gammax[offset];
+      gammayl = d_gammay[offset];
+      gammazl = d_gammaz[offset];
 
-        duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l;	
-	duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l;
-        duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l;
+      duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l;	
+      duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l;
+      duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l;
 
-        duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l;
-        duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l;
-        duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l;
+      duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l;
+      duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l;
+      duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l;
 
-        duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l;
-        duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l;
-        duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l;
+      duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l;
+      duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l;
+      duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l;
 
-	
-	
-        duxdxl_plus_duydyl = duxdxl + duydyl;
-        duxdxl_plus_duzdzl = duxdxl + duzdzl;
-        duydyl_plus_duzdzl = duydyl + duzdzl;
-        duxdyl_plus_duydxl = duxdyl + duydxl;
-        duzdxl_plus_duxdzl = duzdxl + duxdzl;
-        duzdyl_plus_duydzl = duzdyl + duydzl;
 
-	if(COMPUTE_AND_STORE_STRAIN) {
-	  float templ = 1.0f/3.0f * (duxdxl + duydyl + duzdzl);
-	  epsilondev_xx[offset] = duxdxl - templ;
-	  epsilondev_yy[offset] = duydyl - templ;
-	  epsilondev_xy[offset] = 0.5 * duxdyl_plus_duydxl;
-	  epsilondev_xz[offset] = 0.5 * duzdxl_plus_duxdzl;
-	  epsilondev_yz[offset] = 0.5 * duzdyl_plus_duydzl;
-	  if(SIMULATION_TYPE == 3) {
-	    epsilon_trace_over_3[tx + working_element*125] = templ;
-	  }
-	}
 
-// compute elements with an elastic isotropic rheology
-        kappal = d_kappav[offset];
-        mul = d_muv[offset];
+      duxdxl_plus_duydyl = duxdxl + duydyl;
+      duxdxl_plus_duzdzl = duxdxl + duzdzl;
+      duydyl_plus_duzdzl = duydyl + duzdzl;
+      duxdyl_plus_duydxl = duxdyl + duydxl;
+      duzdxl_plus_duxdzl = duzdxl + duxdzl;
+      duzdyl_plus_duydzl = duzdyl + duydzl;
 
-        lambdalplus2mul = kappal + 1.33333333333333333333f * mul;  // 4./3. = 1.3333333
-        lambdal = lambdalplus2mul - 2.f*mul;
+      // computes deviatoric strain attenuation and/or for kernel calculations
+      if(COMPUTE_AND_STORE_STRAIN) {
+        float templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
+        /*
+        epsilondev_xx[offset] = duxdxl - templ;
+        epsilondev_yy[offset] = duydyl - templ;
+        epsilondev_xy[offset] = 0.5f * duxdyl_plus_duydxl;
+        epsilondev_xz[offset] = 0.5f * duzdxl_plus_duxdzl;
+        epsilondev_yz[offset] = 0.5f * duzdyl_plus_duydzl;
+              */
+        // local storage: stresses at this current time step      
+        epsilondev_xx_loc = duxdxl - templ;
+        epsilondev_yy_loc = duydyl - templ;
+        epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl;
+        epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl;
+        epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl;        
 
-// compute the six components of the stress tensor sigma
-        sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl;
-        sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl;
-        sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl;
+        if(SIMULATION_TYPE == 3) {
+          epsilon_trace_over_3[tx + working_element*125] = templ;
+        }
+      }
+
+      // compute elements with an elastic isotropic rheology
+      kappal = d_kappav[offset];
+      mul = d_muv[offset];
+
+      // attenuation
+      if(ATTENUATION){
+        // use unrelaxed parameters if attenuation
+        mul  = mul * one_minus_sum_beta[tx+working_element*125]; // (i,j,k,ispec)
+      }
+          
+      // isotropic case    
+      lambdalplus2mul = kappal + 1.33333333333333333333f * mul;  // 4./3. = 1.3333333
+      lambdal = lambdalplus2mul - 2.0f * mul;
+
+      // compute the six components of the stress tensor sigma
+      sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl;
+      sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl;
+      sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl;
 	
-        sigma_xy = mul*duxdyl_plus_duydxl;
-        sigma_xz = mul*duzdxl_plus_duxdzl;
-        sigma_yz = mul*duzdyl_plus_duydzl;
+      sigma_xy = mul*duxdyl_plus_duydxl;
+      sigma_xz = mul*duzdxl_plus_duxdzl;
+      sigma_yz = mul*duzdyl_plus_duydzl;
 
-        jacobianl = 1.f / (xixl*(etayl*gammazl-etazl*gammayl)-xiyl*(etaxl*gammazl-etazl*gammaxl)+xizl*(etaxl*gammayl-etayl*gammaxl));
+      if(ATTENUATION){
+        // subtract memory variables if attenuation
+        compute_element_att_stress(tx,working_element,NSPEC,
+                                   R_xx,R_yy,R_xy,R_xz,R_yz,
+                                   &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_xz,&sigma_yz);
+      }
+          
+      jacobianl = 1.0f / (xixl*(etayl*gammazl-etazl*gammayl)-xiyl*(etaxl*gammazl-etazl*gammaxl)+xizl*(etaxl*gammayl-etayl*gammaxl));
 
 // form the dot product with the test vector
-        s_tempx1[tx] = jacobianl * (sigma_xx*xixl + sigma_xy*xiyl + sigma_xz*xizl);	
-	s_tempy1[tx] = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_yz*xizl);
-        s_tempz1[tx] = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl);
+      s_tempx1[tx] = jacobianl * (sigma_xx*xixl + sigma_xy*xiyl + sigma_xz*xizl);	
+      s_tempy1[tx] = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_yz*xizl);
+      s_tempz1[tx] = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl);
 
-        s_tempx2[tx] = jacobianl * (sigma_xx*etaxl + sigma_xy*etayl + sigma_xz*etazl);
-        s_tempy2[tx] = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_yz*etazl);
-        s_tempz2[tx] = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl);
+      s_tempx2[tx] = jacobianl * (sigma_xx*etaxl + sigma_xy*etayl + sigma_xz*etazl);
+      s_tempy2[tx] = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_yz*etazl);
+      s_tempz2[tx] = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl);
 
-        s_tempx3[tx] = jacobianl * (sigma_xx*gammaxl + sigma_xy*gammayl + sigma_xz*gammazl);
-        s_tempy3[tx] = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_yz*gammazl);
-        s_tempz3[tx] = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl);
+      s_tempx3[tx] = jacobianl * (sigma_xx*gammaxl + sigma_xy*gammayl + sigma_xz*gammazl);
+      s_tempy3[tx] = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_yz*gammazl);
+      s_tempz3[tx] = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl);
 	
     }
 
@@ -682,152 +867,173 @@
 
 #ifndef MANUALLY_UNROLLED_LOOPS
 
-        tempx1l = 0.f;
-        tempy1l = 0.f;
-        tempz1l = 0.f;
+      tempx1l = 0.f;
+      tempy1l = 0.f;
+      tempz1l = 0.f;
 
-        tempx2l = 0.f;
-        tempy2l = 0.f;
-        tempz2l = 0.f;
+      tempx2l = 0.f;
+      tempy2l = 0.f;
+      tempz2l = 0.f;
 
-        tempx3l = 0.f;
-        tempy3l = 0.f;
-        tempz3l = 0.f;
+      tempx3l = 0.f;
+      tempy3l = 0.f;
+      tempz3l = 0.f;
 
-        for (l=0;l<NGLLX;l++) {	  	  
+      for (l=0;l<NGLLX;l++) {	  	  
 	  
-	  fac1 = d_hprimewgll_xx[I*NGLLX+l];
-	  offset = K*NGLL2+J*NGLLX+l;
-	  tempx1l += s_tempx1[offset]*fac1;
-	  tempy1l += s_tempy1[offset]*fac1;
-	  tempz1l += s_tempz1[offset]*fac1;
-	    
-	  fac2 = d_hprimewgll_xx[J*NGLLX+l];
-	  offset = K*NGLL2+l*NGLLX+I;
-	  tempx2l += s_tempx2[offset]*fac2;
-	  tempy2l += s_tempy2[offset]*fac2;
-	  tempz2l += s_tempz2[offset]*fac2;
+        fac1 = d_hprimewgll_xx[I*NGLLX+l];
+        offset = K*NGLL2+J*NGLLX+l;
+        tempx1l += s_tempx1[offset]*fac1;
+        tempy1l += s_tempy1[offset]*fac1;
+        tempz1l += s_tempz1[offset]*fac1;
+          
+        fac2 = d_hprimewgll_xx[J*NGLLX+l];
+        offset = K*NGLL2+l*NGLLX+I;
+        tempx2l += s_tempx2[offset]*fac2;
+        tempy2l += s_tempy2[offset]*fac2;
+        tempz2l += s_tempz2[offset]*fac2;
 
-	  fac3 = d_hprimewgll_xx[K*NGLLX+l];
-	  offset = l*NGLL2+J*NGLLX+I;
-	  tempx3l += s_tempx3[offset]*fac3;
-	  tempy3l += s_tempy3[offset]*fac3;
-	  tempz3l += s_tempz3[offset]*fac3;
+        fac3 = d_hprimewgll_xx[K*NGLLX+l];
+        offset = l*NGLL2+J*NGLLX+I;
+        tempx3l += s_tempx3[offset]*fac3;
+        tempy3l += s_tempy3[offset]*fac3;
+        tempz3l += s_tempz3[offset]*fac3;
 
-	  if(working_element == 169)
-	    if(l==0)
-	      if(I+J+K == 0) {
-		// atomicAdd(&d_debug[0],1.0);
-		// d_debug[0] = fac3;
-		// d_debug[1] = offset;
-		// d_debug[2] = s_tempz3[offset];
-	      }
-        }
+        if(working_element == 169)
+          if(l==0)
+            if(I+J+K == 0) {
+              // atomicAdd(&d_debug[0],1.0);
+              // d_debug[0] = fac3;
+              // d_debug[1] = offset;
+              // d_debug[2] = s_tempz3[offset];
+            }
+      }
 #else
 
-            tempx1l = s_tempx1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
-                    + s_tempx1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
-                    + s_tempx1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
-                    + s_tempx1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
-                    + s_tempx1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
-	    
-            tempy1l = s_tempy1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
-                    + s_tempy1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
-                    + s_tempy1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
-                    + s_tempy1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
-                    + s_tempy1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+      tempx1l = s_tempx1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+              + s_tempx1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+              + s_tempx1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+              + s_tempx1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+              + s_tempx1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
 
-            tempz1l = s_tempz1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
-                    + s_tempz1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
-                    + s_tempz1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
-                    + s_tempz1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
-                    + s_tempz1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+      tempy1l = s_tempy1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+              + s_tempy1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+              + s_tempy1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+              + s_tempy1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+              + s_tempy1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
 
-            tempx2l = s_tempx2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
-                    + s_tempx2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
-                    + s_tempx2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
-                    + s_tempx2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
-                    + s_tempx2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
+      tempz1l = s_tempz1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+              + s_tempz1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+              + s_tempz1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+              + s_tempz1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+              + s_tempz1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
 
-            tempy2l = s_tempy2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
-                    + s_tempy2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
-                    + s_tempy2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
-                    + s_tempy2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
-                    + s_tempy2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
+      tempx2l = s_tempx2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
+              + s_tempx2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
+              + s_tempx2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
+              + s_tempx2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
+              + s_tempx2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
 
-            tempz2l = s_tempz2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
-                    + s_tempz2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
-                    + s_tempz2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
-                    + s_tempz2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
-                    + s_tempz2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
+      tempy2l = s_tempy2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
+              + s_tempy2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
+              + s_tempy2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
+              + s_tempy2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
+              + s_tempy2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
 
-            tempx3l = s_tempx3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
-                    + s_tempx3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
-                    + s_tempx3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
-                    + s_tempx3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
-                    + s_tempx3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
+      tempz2l = s_tempz2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
+              + s_tempz2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
+              + s_tempz2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
+              + s_tempz2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
+              + s_tempz2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
 
-            tempy3l = s_tempy3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
-                    + s_tempy3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
-                    + s_tempy3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
-                    + s_tempy3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
-                    + s_tempy3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
+      tempx3l = s_tempx3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
+              + s_tempx3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
+              + s_tempx3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
+              + s_tempx3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
+              + s_tempx3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
 
-            tempz3l = s_tempz3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
-                    + s_tempz3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
-                    + s_tempz3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
-                    + s_tempz3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
-                    + s_tempz3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
+      tempy3l = s_tempy3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
+              + s_tempy3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
+              + s_tempy3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
+              + s_tempy3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
+              + s_tempy3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
 
+      tempz3l = s_tempz3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
+              + s_tempz3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
+              + s_tempz3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
+              + s_tempz3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
+              + s_tempz3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
+
 #endif
 
-        fac1 = d_wgllwgll_yz[K*NGLLX+J];
-        fac2 = d_wgllwgll_xz[K*NGLLX+I];
-        fac3 = d_wgllwgll_xy[J*NGLLX+I];
+      fac1 = d_wgllwgll_yz[K*NGLLX+J];
+      fac2 = d_wgllwgll_xz[K*NGLLX+I];
+      fac3 = d_wgllwgll_xy[J*NGLLX+I];
 
 #ifdef USE_TEXTURES
-        d_accel[iglob] = tex1Dfetch(tex_accel, iglob) - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
-        d_accel[iglob + NGLOB] = tex1Dfetch(tex_accel, iglob + NGLOB) - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
-        d_accel[iglob + 2*NGLOB] = tex1Dfetch(tex_accel, iglob + 2*NGLOB) - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
+      d_accel[iglob] = tex1Dfetch(tex_accel, iglob) - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
+      d_accel[iglob + NGLOB] = tex1Dfetch(tex_accel, iglob + NGLOB) - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
+      d_accel[iglob + 2*NGLOB] = tex1Dfetch(tex_accel, iglob + 2*NGLOB) - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
 #else
 	/* OLD/To be implemented version that uses coloring to get around race condition. About 1.6x faster */
-	// d_accel[iglob*3] -= (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
-        // d_accel[iglob*3 + 1] -= (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
-        // d_accel[iglob*3 + 2] -= (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);		
+      // d_accel[iglob*3] -= (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
+      // d_accel[iglob*3 + 1] -= (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
+      // d_accel[iglob*3 + 2] -= (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);		
 
-	if(iglob*3+2 == 41153) {
-	  // int ot = d_debug[5];
-	  // d_debug[0+1+ot] = d_accel[iglob*3+2];
-	  // // d_debug[1+1+ot] = fac1*tempz1l;
-	  // // d_debug[2+1+ot] = fac2*tempz2l;
-	  // // d_debug[3+1+ot] = fac3*tempz3l;
-	  // d_debug[1+1+ot] = fac1;
-	  // d_debug[2+1+ot] = fac2;
-	  // d_debug[3+1+ot] = fac3;
-	  // d_debug[4+1+ot] = d_accel[iglob*3+2]-(fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
-	  // atomicAdd(&d_debug[0],1.0);
-	  // d_debug[6+ot] = d_displ[iglob*3+2];
-	}
+      if(iglob*3+2 == 41153) {
+        // int ot = d_debug[5];
+        // d_debug[0+1+ot] = d_accel[iglob*3+2];
+        // // d_debug[1+1+ot] = fac1*tempz1l;
+        // // d_debug[2+1+ot] = fac2*tempz2l;
+        // // d_debug[3+1+ot] = fac3*tempz3l;
+        // d_debug[1+1+ot] = fac1;
+        // d_debug[2+1+ot] = fac2;
+        // d_debug[3+1+ot] = fac3;
+        // d_debug[4+1+ot] = d_accel[iglob*3+2]-(fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
+        // atomicAdd(&d_debug[0],1.0);
+        // d_debug[6+ot] = d_displ[iglob*3+2];
+      }
 	
-	atomicAdd(&d_accel[iglob*3],-(fac1*tempx1l + fac2*tempx2l + fac3*tempx3l));		
-	atomicAdd(&d_accel[iglob*3+1],-(fac1*tempy1l + fac2*tempy2l + fac3*tempy3l));
-	atomicAdd(&d_accel[iglob*3+2],-(fac1*tempz1l + fac2*tempz2l + fac3*tempz3l));
+      atomicAdd(&d_accel[iglob*3],-(fac1*tempx1l + fac2*tempx2l + fac3*tempx3l));		
+      atomicAdd(&d_accel[iglob*3+1],-(fac1*tempy1l + fac2*tempy2l + fac3*tempy3l));
+      atomicAdd(&d_accel[iglob*3+2],-(fac1*tempz1l + fac2*tempz2l + fac3*tempz3l));
 	
 #endif
+
+      // update memory variables based upon the Runge-Kutta scheme
+      if( ATTENUATION ){
+        compute_element_att_memory(tx,working_element,NSPEC,
+                                  d_muv,
+                                  factor_common,alphaval,betaval,gammaval,
+                                  R_xx,R_yy,R_xy,R_xz,R_yz,
+                                  epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz,
+                                  epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc);
+      }
+
+      // save deviatoric strain for Runge-Kutta scheme
+      if( COMPUTE_AND_STORE_STRAIN ){
+        int ijk_ispec = tx + working_element*125;
+        
+        // fortran: epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+        epsilondev_xx[ijk_ispec] = epsilondev_xx_loc;        
+        epsilondev_yy[ijk_ispec] = epsilondev_yy_loc;
+        epsilondev_xy[ijk_ispec] = epsilondev_xy_loc;
+        epsilondev_xz[ijk_ispec] = epsilondev_xz_loc;
+        epsilondev_yz[ijk_ispec] = epsilondev_yz_loc;
+      }
+
     }
 
 #else  // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
-        d_accel[iglob] -= 0.00000001f;
-        d_accel[iglob + NGLOB] -= 0.00000001f;
-        d_accel[iglob + 2*NGLOB] -= 0.00000001f;
+    d_accel[iglob] -= 0.00000001f;
+    d_accel[iglob + NGLOB] -= 0.00000001f;
+    d_accel[iglob + 2*NGLOB] -= 0.00000001f;
 #endif // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
 }
 
 /* ----------------------------------------------------------------------------------------------- */
 
-__global__ void kernel_3_cuda_device(real* veloc,
-				     real* accel, int size,
-				     real deltatover2, real* rmass);
+// KERNEL 3
 
 /* ----------------------------------------------------------------------------------------------- */
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu	2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu	2011-10-07 01:29:29 UTC (rev 19039)
@@ -159,29 +159,14 @@
 #endif
 }
 
+
 /* ----------------------------------------------------------------------------------------------- */
 
-extern "C" 
-void FC_FUNC_(transfer_sensitivity_kernels_to_host,
-              TRANSFER_SENSITIVITY_KERNELS_TO_HOST)(long* Mesh_pointer, float* h_rho_kl,
-                                                    float* h_mu_kl, float* h_kappa_kl,
-                                                    float* h_Sigma_kl,int* NSPEC_AB,int* NSPEC_AB_VAL) {
+// NOISE SIMULATIONS
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
-
-  print_CUDA_error_if_any(cudaMemcpy(h_rho_kl,mp->d_rho_kl,*NSPEC_AB*125*sizeof(float),
-				     cudaMemcpyDeviceToHost),1);
-  print_CUDA_error_if_any(cudaMemcpy(h_mu_kl,mp->d_mu_kl,*NSPEC_AB*125*sizeof(float),
-				     cudaMemcpyDeviceToHost),1);
-  print_CUDA_error_if_any(cudaMemcpy(h_kappa_kl,mp->d_kappa_kl,*NSPEC_AB*125*sizeof(float),
-				     cudaMemcpyDeviceToHost),1);
-  print_CUDA_error_if_any(cudaMemcpy(h_Sigma_kl,mp->d_Sigma_kl,125*(*NSPEC_AB_VAL)*sizeof(float),
-				     cudaMemcpyHostToDevice),4);
-  
-}
-
 /* ----------------------------------------------------------------------------------------------- */
 
+
 __global__ void compute_kernels_strength_noise_cuda_kernel(float* displ, 
                                                            int* free_surface_ispec,
                                                            int* free_surface_ijk,
@@ -236,6 +221,9 @@
                                                     float* h_noise_surface_movie,
                                                     int* num_free_surface_faces_f,
                                                     float* deltat) {
+
+TRACE("compute_kernels_strength_noise_cuda");
+                                                    
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
   int num_free_surface_faces = *num_free_surface_faces_f;
 
@@ -498,24 +486,3 @@
 #endif
 }
 
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C" 
-void FC_FUNC_(transfer_sensitivity_kernels_acoustic_to_host,
-              TRANSFER_SENSITIVITY_KERNELS_ACOUSTIC_TO_HOST)(long* Mesh_pointer, 
-                                                             float* h_rho_ac_kl,
-                                                             float* h_kappa_ac_kl,
-                                                             int* NSPEC_AB) {
-
-TRACE("transfer_sensitivity_kernels_acoustic_to_host");  
-  
-  //get mesh pointer out of fortran integer container  
-  Mesh* mp = (Mesh*)(*Mesh_pointer); 
-  int size = *NSPEC_AB*125;
-  
-  // copies kernel values over to CPU host
-  print_CUDA_error_if_any(cudaMemcpy(h_rho_ac_kl,mp->d_rho_ac_kl,size*sizeof(float),
-                                     cudaMemcpyDeviceToHost),911);
-  print_CUDA_error_if_any(cudaMemcpy(h_kappa_ac_kl,mp->d_kappa_ac_kl,size*sizeof(float),
-                                     cudaMemcpyDeviceToHost),922);  
-}

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu	2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu	2011-10-07 01:29:29 UTC (rev 19039)
@@ -53,7 +53,9 @@
       
       // determines bulk sound speed
       rhol = rhostore[INDEX4_PADDED(NGLLX,NGLLX,NGLLX,i,j,k,ispec)];
+
       kappal = kappastore[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)];
+
       cpl = sqrt( kappal / rhol );
             
       // gets associated, weighted jacobian      

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2011-10-07 01:29:29 UTC (rev 19039)
@@ -63,6 +63,7 @@
 #define NDIM 3
 #define NGLLX 5
 #define NGLL2 25
+#define N_SLS 3
 
 typedef float real;   // type of variables passed into function
 typedef float realw;  // type of "working" variables
@@ -156,13 +157,29 @@
   // surface movie elements to save for noise tomography
   float* d_noise_surface_movie;
 
+  // attenuation
+  float* d_R_xx;
+  float* d_R_yy;
+  float* d_R_xy;
+  float* d_R_xz;
+  float* d_R_yz;
+
+  float* d_one_minus_sum_beta;
+  float* d_factor_common;
+  
+  float* d_alphaval;
+  float* d_betaval;
+  float* d_gammaval;
+  
+  // attenuation & kernel
   float* d_epsilondev_xx;
   float* d_epsilondev_yy;
   float* d_epsilondev_xy;
   float* d_epsilondev_xz;
   float* d_epsilondev_yz;
   float* d_epsilon_trace_over_3;
-  
+      
+  // noise
   float* d_normal_x_noise;
   float* d_normal_y_noise;
   float* d_normal_z_noise;
@@ -171,17 +188,30 @@
 
   float* d_noise_sourcearray;
 
+  // attenuation & kernel backward fields
+  float* d_b_R_xx;
+  float* d_b_R_yy;
+  float* d_b_R_xy;
+  float* d_b_R_xz;
+  float* d_b_R_yz;
+  
   float* d_b_epsilondev_xx;
   float* d_b_epsilondev_yy;
   float* d_b_epsilondev_xy;
   float* d_b_epsilondev_xz;
   float* d_b_epsilondev_yz;
   float* d_b_epsilon_trace_over_3;
+
+  float* d_b_alphaval;
+  float* d_b_betaval;
+  float* d_b_gammaval;
   
   // sensitivity kernels
   float* d_rho_kl;
   float* d_mu_kl;
   float* d_kappa_kl;
+  
+  // noise sensitivity kernel
   float* d_Sigma_kl;
 
   // ------------------------------------------------------------------ //

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu	2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu	2011-10-07 01:29:29 UTC (rev 19039)
@@ -120,7 +120,17 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-__global__ void noise_read_add_surface_movie_cuda_kernel(real* accel, int* ibool, int* free_surface_ispec,int* free_surface_ijk, int num_free_surface_faces, real* noise_surface_movie, real* normal_x_noise, real* normal_y_noise, real* normal_z_noise, real* mask_noise, real* free_surface_jacobian2Dw, real* wgllwgll_xy,float* d_debug) {
+__global__ void noise_read_add_surface_movie_cuda_kernel(real* accel, int* ibool, 
+                                                         int* free_surface_ispec,int* free_surface_ijk, 
+                                                         int num_free_surface_faces, 
+                                                         real* noise_surface_movie, 
+                                                         real* normal_x_noise, 
+                                                         real* normal_y_noise, 
+                                                         real* normal_z_noise, 
+                                                         real* mask_noise, 
+                                                         real* free_surface_jacobian2Dw, 
+                                                         real* wgllwgll_xy,
+                                                         float* d_debug) {
 
   int iface = blockIdx.x + gridDim.x*blockIdx.y; // surface element id
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h	2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h	2011-10-07 01:29:29 UTC (rev 19039)
@@ -92,7 +92,7 @@
 void setConst_wgllwgll_xz(float* array, Mesh* mp);
 void setConst_wgllwgll_yz(float* array, Mesh* mp);
 
-void exit_on_cuda_error(char* kernel_name);
-void show_free_memory(char* info_str);
+//void exit_on_cuda_error(char* kernel_name);
+//void show_free_memory(char* info_str);
 
 #endif //CUDA_HEADER_H

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2011-10-07 01:29:29 UTC (rev 19039)
@@ -70,11 +70,24 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+void exit_on_error(char* info)
+{
+  printf("\nERROR: %s\n",info);
+  fflush(stdout);
+#ifdef USE_MPI
+  MPI_Abort(MPI_COMM_WORLD,1);
+#endif
+  exit(EXIT_FAILURE);
+  return;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
 void print_CUDA_error_if_any(cudaError_t err, int num)
 {
   if (cudaSuccess != err)
   {
-    printf("\nCUDA error !!!!! <%s> !!!!! at CUDA call # %d\n",cudaGetErrorString(err),num);
+    printf("\nCUDA error !!!!! <%s> !!!!! \nat CUDA call # %d\n",cudaGetErrorString(err),num);
     fflush(stdout);
 #ifdef USE_MPI
     MPI_Abort(MPI_COMM_WORLD,1);
@@ -114,17 +127,6 @@
   double free_db,used_db,total_db;
 
   get_free_memory(&free_db,&used_db,&total_db);
-  //  size_t free_byte ;
-  //  size_t total_byte ;
-  //  cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ;
-  //  if ( cudaSuccess != cuda_status ){
-  //    printf("Error: cudaMemGetInfo fails, %s \n", cudaGetErrorString(cuda_status) );
-  //    exit(1); 
-  //  }
-  // 
-  //  double free_db = (double)free_byte ;
-  //  double total_db = (double)total_byte ;
-  //  double used_db = total_db - free_db ;
   
   sprintf(filename,"../in_out_files/OUTPUT_FILES/gpu_mem_usage_proc_%03d.txt",myrank);
   fp = fopen(filename,"a+");
@@ -157,18 +159,6 @@
   
   get_free_memory(&free_db,&used_db,&total_db);
   
-//  size_t free_byte ;
-//  size_t total_byte ;
-//  cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ;
-//  if ( cudaSuccess != cuda_status ){
-//    printf("Error: cudaMemGetInfo fails, %s \n", cudaGetErrorString(cuda_status) );
-//    exit(1);
-//  }
-//
-//  double free_db = (double)free_byte ;
-//  double total_db = (double)total_byte ;
-//  double used_db = total_db - free_db ;
-
   printf("%d: @%s GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", myrank, info_str,
 	 used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
   
@@ -209,183 +199,6 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-//void prepare_constants(int NGLLX, int NSPEC_AB, int NGLOB_AB,
-//		       float* h_xix, float* h_xiy, float* h_xiz,
-//		       float** d_xix, float** d_xiy, float** d_xiz,
-//		       float* h_etax, float* h_etay, float* h_etaz,
-//		       float** d_etax, float** d_etay, float** d_etaz,
-//		       float* h_gammax, float* h_gammay, float* h_gammaz,
-//		       float** d_gammax, float** d_gammay, float** d_gammaz,
-//		       float* h_kappav, float* h_muv,
-//		       float** d_kappav, float** d_muv,
-//		       int* h_ibool, int** d_ibool,
-//		       //int* h_phase_ispec_inner_elastic, int** d_phase_ispec_inner_elastic,
-//		       //int num_phase_ispec_elastic,
-//		       //float* h_rmass, float** d_rmass,
-//		       int num_interfaces_ext_mesh, int max_nibool_interfaces_ext_mesh,
-//		       int* h_nibool_interfaces_ext_mesh, int** d_nibool_interfaces_ext_mesh,
-//		       int* h_ibool_interfaces_ext_mesh, int** d_ibool_interfaces_ext_mesh,		       
-//		       float* h_hprime_xx, float* h_hprimewgll_xx,
-//		       float* h_wgllwgll_xy, float* h_wgllwgll_xz,
-//		       float* h_wgllwgll_yz,
-//		       int* h_abs_boundary_ispec, int** d_abs_boundary_ispec,
-//		       int* h_abs_boundary_ijk, int** d_abs_boundary_ijk,
-//		       float* h_abs_boundary_normal, float** d_abs_boundary_normal,
-//		       //float* h_rho_vp,float** d_rho_vp,
-//		       //float* h_rho_vs,float** d_rho_vs,
-//		       float* h_abs_boundary_jacobian2Dw,float** d_abs_boundary_jacobian2Dw,
-//		       float* h_b_absorb_field,float** d_b_absorb_field,
-//		       int num_abs_boundary_faces, int b_num_abs_boundary_faces,
-//		       int* h_ispec_is_inner, int** d_ispec_is_inner,
-//		       //int* h_ispec_is_elastic, int** d_ispec_is_elastic,
-//		       int NSOURCES,
-//		       float* h_sourcearrays,float** d_sourcearrays,
-//		       int* h_islice_selected_source, int** d_islice_selected_source,
-//		       int* h_ispec_selected_source, int** d_ispec_selected_source,
-//           int SIMULATION_TYPE){
-//
-//TRACE("prepare_constants");
-//  
-//  // EPIK_USER_REG(r_name,"compute_forces");
-//  // EPIK_USER_REG(r_name,
-//  
-//  /* Assuming NGLLX=5. Padded is then 128 (5^3+3) */
-//  int size_padded = 128*NSPEC_AB;
-//  int size = NGLLX*NGLLX*NGLLX*NSPEC_AB;
-//
-//  // mesh  
-//  print_CUDA_error_if_any(cudaMalloc((void**) d_xix, size_padded*sizeof(float)),5);	 
-//  print_CUDA_error_if_any(cudaMalloc((void**) d_xiy, size_padded*sizeof(float)),6);	 
-//  print_CUDA_error_if_any(cudaMalloc((void**) d_xiz, size_padded*sizeof(float)),7);	 
-//  print_CUDA_error_if_any(cudaMalloc((void**) d_etax, size_padded*sizeof(float)),8);	 
-//  print_CUDA_error_if_any(cudaMalloc((void**) d_etay, size_padded*sizeof(float)),9);	 
-//  print_CUDA_error_if_any(cudaMalloc((void**) d_etaz, size_padded*sizeof(float)),10);	 
-//  print_CUDA_error_if_any(cudaMalloc((void**) d_gammax, size_padded*sizeof(float)),11);	 
-//  print_CUDA_error_if_any(cudaMalloc((void**) d_gammay, size_padded*sizeof(float)),12);	 
-//  print_CUDA_error_if_any(cudaMalloc((void**) d_gammaz, size_padded*sizeof(float)),13);	 
-//  print_CUDA_error_if_any(cudaMalloc((void**) d_kappav, size_padded*sizeof(float)),14); 
-//  print_CUDA_error_if_any(cudaMalloc((void**) d_muv, size_padded*sizeof(float)),15);	 
-//  print_CUDA_error_if_any(cudaMalloc((void**) d_ibool, size_padded*sizeof(int)),16);
-//
-////  print_CUDA_error_if_any(cudaMalloc((void**) d_phase_ispec_inner_elastic, num_phase_ispec_elastic*2*sizeof(int)),17);
-////  print_CUDA_error_if_any(cudaMalloc((void**) d_rmass, NGLOB_AB*sizeof(float)),17);
-//
-//  // absorbing boundaries
-//  if( num_abs_boundary_faces > 0 ){
-//    print_CUDA_error_if_any(cudaMalloc((void**) d_abs_boundary_ispec,
-//                                       num_abs_boundary_faces*sizeof(int)),769);
-//    print_CUDA_error_if_any(cudaMemcpy(*d_abs_boundary_ispec, h_abs_boundary_ispec,
-//                                       num_abs_boundary_faces*sizeof(int),cudaMemcpyHostToDevice),770);
-//    
-//    print_CUDA_error_if_any(cudaMalloc((void**) d_abs_boundary_ijk,
-//                                       3*25*num_abs_boundary_faces*sizeof(int)),772);
-//    print_CUDA_error_if_any(cudaMemcpy(*d_abs_boundary_ijk, h_abs_boundary_ijk,
-//                                       3*25*num_abs_boundary_faces*sizeof(int),cudaMemcpyHostToDevice),773);
-//    
-//    print_CUDA_error_if_any(cudaMalloc((void**) d_abs_boundary_normal,
-//                                       3*25*num_abs_boundary_faces*sizeof(int)),783);
-//    print_CUDA_error_if_any(cudaMemcpy(*d_abs_boundary_normal, h_abs_boundary_normal,
-//                                       3*25*num_abs_boundary_faces*sizeof(int),cudaMemcpyHostToDevice),783);
-//    
-//    print_CUDA_error_if_any(cudaMalloc((void**) d_abs_boundary_jacobian2Dw,
-//                                       25*num_abs_boundary_faces*sizeof(float)),784);
-//    print_CUDA_error_if_any(cudaMemcpy(*d_abs_boundary_jacobian2Dw, h_abs_boundary_jacobian2Dw,
-//                                       25*num_abs_boundary_faces*sizeof(float),cudaMemcpyHostToDevice),784);
-//  }
-//  
-//
-///*  
-//  print_CUDA_error_if_any(cudaMalloc((void**) d_rho_vp, size*sizeof(float)),5);
-//  print_CUDA_error_if_any(cudaMalloc((void**) d_rho_vs, size*sizeof(float)),6);
-//  print_CUDA_error_if_any(cudaMemcpy(*d_rho_vp,h_rho_vp,size*sizeof(float),
-//				     cudaMemcpyHostToDevice),5);
-//  print_CUDA_error_if_any(cudaMemcpy(*d_rho_vs,h_rho_vs,size*sizeof(float),
-//				     cudaMemcpyHostToDevice),5);
-//  print_CUDA_error_if_any(cudaMalloc((void**) d_b_absorb_field, 3*25*b_num_abs_boundary_faces*sizeof(float)),7);
-//  print_CUDA_error_if_any(cudaMemcpy(*d_b_absorb_field, h_b_absorb_field,
-//				     3*25*b_num_abs_boundary_faces*sizeof(float),
-//				     cudaMemcpyHostToDevice),7);
-//  
-//  print_CUDA_error_if_any(cudaMemcpy(*d_rmass,h_rmass,NGLOB_AB*sizeof(float),cudaMemcpyHostToDevice),18);
-//*/
-//
-//  // prepare interprocess-edge exchange information
-//  print_CUDA_error_if_any(cudaMalloc((void**) d_nibool_interfaces_ext_mesh,
-//				     num_interfaces_ext_mesh*sizeof(int)),19);
-//  print_CUDA_error_if_any(cudaMemcpy(*d_nibool_interfaces_ext_mesh,h_nibool_interfaces_ext_mesh,
-//				     num_interfaces_ext_mesh*sizeof(int),cudaMemcpyHostToDevice),19);
-//  
-//  print_CUDA_error_if_any(cudaMalloc((void**) d_ibool_interfaces_ext_mesh,
-//				     num_interfaces_ext_mesh*max_nibool_interfaces_ext_mesh*sizeof(int)),20);
-//  print_CUDA_error_if_any(cudaMemcpy(*d_ibool_interfaces_ext_mesh,h_ibool_interfaces_ext_mesh,
-//				     num_interfaces_ext_mesh*max_nibool_interfaces_ext_mesh*sizeof(int),
-//				     cudaMemcpyHostToDevice),20);
-//
-//  print_CUDA_error_if_any(cudaMalloc((void**) d_ispec_is_inner,NSPEC_AB*sizeof(int)),21);
-//  print_CUDA_error_if_any(cudaMemcpy(*d_ispec_is_inner, h_ispec_is_inner,
-//				     NSPEC_AB*sizeof(int),
-//				     cudaMemcpyHostToDevice),21);
-//
-///*  print_CUDA_error_if_any(cudaMalloc((void**) d_ispec_is_elastic,NSPEC_AB*sizeof(int)),21);
-//  print_CUDA_error_if_any(cudaMemcpy(*d_ispec_is_elastic, h_ispec_is_elastic,
-//				     NSPEC_AB*sizeof(int),
-//				     cudaMemcpyHostToDevice),21);
-//*/
-//
-//  print_CUDA_error_if_any(cudaMemcpy(*d_ibool, h_ibool,
-//				     size*sizeof(int)  ,cudaMemcpyHostToDevice),512);    
-//
-//  // sources
-//  if (SIMULATION_TYPE == 1  || SIMULATION_TYPE == 3){
-//    print_CUDA_error_if_any(cudaMalloc((void**)d_sourcearrays, sizeof(float)*NSOURCES*3*125),22);
-//    print_CUDA_error_if_any(cudaMemcpy(*d_sourcearrays, h_sourcearrays, sizeof(float)*NSOURCES*3*125,
-//                                       cudaMemcpyHostToDevice),522);
-//  }
-//  
-//
-//  print_CUDA_error_if_any(cudaMalloc((void**)d_islice_selected_source, sizeof(int)*NSOURCES),23);
-//  print_CUDA_error_if_any(cudaMemcpy(*d_islice_selected_source, h_islice_selected_source, sizeof(int)*NSOURCES,
-//                                     cudaMemcpyHostToDevice),523);
-//  
-//  print_CUDA_error_if_any(cudaMalloc((void**)d_ispec_selected_source, sizeof(int)*NSOURCES),24);
-//  print_CUDA_error_if_any(cudaMemcpy(*d_ispec_selected_source, h_ispec_selected_source,sizeof(int)*NSOURCES,
-//                                     cudaMemcpyHostToDevice),524);
-//  
-//  // transfer constant element data with padding
-//  for(int i=0;i<NSPEC_AB;i++) {
-//    print_CUDA_error_if_any(cudaMemcpy(*d_xix + i*128, &h_xix[i*125],
-//                                       125*sizeof(float),cudaMemcpyHostToDevice),70);
-//    print_CUDA_error_if_any(cudaMemcpy(*d_xiy+i*128,   &h_xiy[i*125],
-//                                       125*sizeof(float),cudaMemcpyHostToDevice),71);
-//    print_CUDA_error_if_any(cudaMemcpy(*d_xiz+i*128,   &h_xiz[i*125],
-//                                       125*sizeof(float),cudaMemcpyHostToDevice),72);
-//    print_CUDA_error_if_any(cudaMemcpy(*d_etax+i*128,  &h_etax[i*125],
-//                                       125*sizeof(float),cudaMemcpyHostToDevice),73);
-//    print_CUDA_error_if_any(cudaMemcpy(*d_etay+i*128,  &h_etay[i*125],
-//                                       125*sizeof(float),cudaMemcpyHostToDevice),74);
-//    print_CUDA_error_if_any(cudaMemcpy(*d_etaz+i*128,  &h_etaz[i*125],
-//                                       125*sizeof(float),cudaMemcpyHostToDevice),75);
-//    print_CUDA_error_if_any(cudaMemcpy(*d_gammax+i*128,&h_gammax[i*125],
-//                                       125*sizeof(float),cudaMemcpyHostToDevice),76);
-//    print_CUDA_error_if_any(cudaMemcpy(*d_gammay+i*128,&h_gammay[i*125],
-//                                       125*sizeof(float),cudaMemcpyHostToDevice),78);
-//    print_CUDA_error_if_any(cudaMemcpy(*d_gammaz+i*128,&h_gammaz[i*125],
-//                                       125*sizeof(float),cudaMemcpyHostToDevice),79);
-//    print_CUDA_error_if_any(cudaMemcpy(*d_kappav+i*128,&h_kappav[i*125],
-//                                       125*sizeof(float),cudaMemcpyHostToDevice),80);
-//    print_CUDA_error_if_any(cudaMemcpy(*d_muv+i*128,   &h_muv[i*125],
-//                                       125*sizeof(float),cudaMemcpyHostToDevice),81);      
-//  }
-//
-////  print_CUDA_error_if_any(cudaMemcpy(*d_phase_ispec_inner_elastic, h_phase_ispec_inner_elastic, num_phase_ispec_elastic*2*sizeof(int),cudaMemcpyHostToDevice),13);
-//  
-//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-//  exit_on_cuda_error("prepare_constants");
-//#endif        
-//}
-
-/* ----------------------------------------------------------------------------------------------- */
-
 extern "C"
 void FC_FUNC_(prepare_constants_device,
               PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
@@ -396,8 +209,6 @@
                                         float* h_gammax, float* h_gammay, float* h_gammaz,
                                         float* h_kappav, float* h_muv,
                                         int* h_ibool, 
-                                        //int* h_phase_ispec_inner_elastic,int* num_phase_ispec_elastic,
-                                        //float* h_rmass,
                                         int* num_interfaces_ext_mesh, int* max_nibool_interfaces_ext_mesh,
                                         int* h_nibool_interfaces_ext_mesh, int* h_ibool_interfaces_ext_mesh,
                                         float* h_hprime_xx, 
@@ -407,18 +218,12 @@
                                         float* h_wgllwgll_xy, 
                                         float* h_wgllwgll_xz,
                                         float* h_wgllwgll_yz,            
-                                        //float* h_hprime_xx, float* h_hprimewgll_xx,
-                                        //float* h_wgllwgll_xy, float* h_wgllwgll_xz,
-                                        //float* h_wgllwgll_yz,
                                         int* h_abs_boundary_ispec, int* h_abs_boundary_ijk,
                                         float* h_abs_boundary_normal,
-                                        //float* h_rho_vp,
-                                        //float* h_rho_vs,
                                         float* h_abs_boundary_jacobian2Dw,
                                         float* h_b_absorb_field,
                                         int* num_abs_boundary_faces, int* b_num_abs_boundary_faces,
                                         int* h_ispec_is_inner, 
-                                        //int* h_ispec_is_elastic,
                                         int* NSOURCES,
                                         float* h_sourcearrays,
                                         int* h_islice_selected_source,
@@ -431,14 +236,18 @@
 
 TRACE("prepare_constants_device");
   
-  int device_count,procid;
+  int procid;
+  int device_count = 0;
   
   // cuda initialization (needs -lcuda library)
-  cuInit(0);
-  
+  //cuInit(0);
+  CUresult status = cuInit(0);
+  if ( CUDA_SUCCESS != status ) exit_on_error("CUDA device initialization failed");
+      
   // Gets number of GPU devices     
   cudaGetDeviceCount(&device_count);
   //printf("Cuda Devices: %d\n", device_count);
+  if (device_count == 0) exit_on_error("There is no device supporting CUDA\n");    
   
   // Gets rank number of MPI process
   MPI_Comm_rank(MPI_COMM_WORLD, &procid);
@@ -451,15 +260,14 @@
     exit_on_cuda_error("cudaSetDevice");   
   }
 
-  //printf("GPU_MODE Active. Preparing Fields and Constants on Device.\n");
-
   // allocates mesh parameter structure  
   Mesh* mp = (Mesh*)malloc(sizeof(Mesh));
+  if (mp == NULL) exit_on_error("error allocating mesh pointer"); 
   *Mesh_pointer = (long)mp;
 
   // checks if NGLLX == 5
   if( *h_NGLLX != NGLLX ){
-    exit_on_cuda_error("NGLLX must be 5 for CUDA devices");   
+    exit_on_error("NGLLX must be 5 for CUDA devices");   
   }
   
   // sets global parameters  
@@ -476,150 +284,120 @@
   setConst_wgllwgll_xy(h_wgllwgll_xy,mp);
   setConst_wgllwgll_xz(h_wgllwgll_xz,mp);
   setConst_wgllwgll_yz(h_wgllwgll_yz,mp);
-  
-/*  setConst_hprime_xx    (h_hprime_xx    );
-  setConst_hprimewgll_xx(h_hprimewgll_xx);
-  setConst_wgllwgll_xy  (h_wgllwgll_xy,mp);
-  setConst_wgllwgll_xz  (h_wgllwgll_xz,mp);
-  setConst_wgllwgll_yz  (h_wgllwgll_yz,mp);
-*/
-  
+    
   /* Assuming NGLLX=5. Padded is then 128 (5^3+3) */
   int size_padded = 128 * (*NSPEC_AB);
   int size = 125 * (*NSPEC_AB);
 
   // mesh    
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xix, size_padded*sizeof(float)),5);	 
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiy, size_padded*sizeof(float)),6);	 
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiz, size_padded*sizeof(float)),7);	 
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etax, size_padded*sizeof(float)),8);	 
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etay, size_padded*sizeof(float)),9);	 
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etaz, size_padded*sizeof(float)),10);	 
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammax, size_padded*sizeof(float)),11);	 
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammay, size_padded*sizeof(float)),12);	 
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammaz, size_padded*sizeof(float)),13);	 
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_kappav, size_padded*sizeof(float)),14); 
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_muv, size_padded*sizeof(float)),15);	 
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool, size_padded*sizeof(int)),16);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xix, size_padded*sizeof(float)),1001);	 
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiy, size_padded*sizeof(float)),1002);	 
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiz, size_padded*sizeof(float)),1003);	 
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etax, size_padded*sizeof(float)),1004);	 
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etay, size_padded*sizeof(float)),1005);	 
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etaz, size_padded*sizeof(float)),1006);	 
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammax, size_padded*sizeof(float)),1007);	 
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammay, size_padded*sizeof(float)),1008);	 
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammaz, size_padded*sizeof(float)),1009);	 
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_kappav, size_padded*sizeof(float)),1010); 
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_muv, size_padded*sizeof(float)),1011);	 
 
+  // global indexing
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool, size_padded*sizeof(int)),1021);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool, h_ibool,
+                                     size*sizeof(int)  ,cudaMemcpyHostToDevice),1022);    
+
     
-//  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_phase_ispec_inner_elastic, *num_phase_ispec_elastic*2*sizeof(int)),17);
-//  print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_elastic, h_phase_ispec_inner_elastic, *num_phase_ispec_elastic*2*sizeof(int),cudaMemcpyHostToDevice),13);
-  
-//  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_rmass, *NGLOB_AB*sizeof(float)),17);
-//  print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass,h_rmass,*NGLOB_AB*sizeof(float),cudaMemcpyHostToDevice),18);
+  // prepare interprocess-edge exchange information
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nibool_interfaces_ext_mesh,
+				     *num_interfaces_ext_mesh*sizeof(int)),1201);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_nibool_interfaces_ext_mesh,h_nibool_interfaces_ext_mesh,
+				     *num_interfaces_ext_mesh*sizeof(int),cudaMemcpyHostToDevice),1202);
 
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_interfaces_ext_mesh,
+                                     *num_interfaces_ext_mesh* *max_nibool_interfaces_ext_mesh*
+                                     sizeof(int)),1203);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_interfaces_ext_mesh,h_ibool_interfaces_ext_mesh,
+				     *num_interfaces_ext_mesh* *max_nibool_interfaces_ext_mesh*sizeof(int),
+				     cudaMemcpyHostToDevice),1204);
+
+  // inner elements 
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ispec_is_inner,*NSPEC_AB*sizeof(int)),1205);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_inner, h_ispec_is_inner,
+				     *NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),1206);
+              
+
   // absorbing boundaries
   if( *num_abs_boundary_faces > 0 ){  
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_abs_boundary_ispec,
-               (*num_abs_boundary_faces)*sizeof(int)),771);
+                                       (*num_abs_boundary_faces)*sizeof(int)),1101);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_ispec, h_abs_boundary_ispec,
-               (*num_abs_boundary_faces)*sizeof(int),
-               cudaMemcpyHostToDevice),771);
+                                       (*num_abs_boundary_faces)*sizeof(int),
+                                       cudaMemcpyHostToDevice),1102);
     
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_abs_boundary_ijk,
-               3*25*(*num_abs_boundary_faces)*sizeof(int)),772);
+                                       3*25*(*num_abs_boundary_faces)*sizeof(int)),1103);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_ijk, h_abs_boundary_ijk,
-               3*25*(*num_abs_boundary_faces)*sizeof(int),
-               cudaMemcpyHostToDevice),772);
+                                       3*25*(*num_abs_boundary_faces)*sizeof(int),
+                                       cudaMemcpyHostToDevice),1104);
     
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_abs_boundary_normal,
-               3*25*(*num_abs_boundary_faces)*sizeof(int)),773);
+                                       3*25*(*num_abs_boundary_faces)*sizeof(int)),1105);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_normal, h_abs_boundary_normal,
-               3*25*(*num_abs_boundary_faces)*sizeof(int),
-               cudaMemcpyHostToDevice),773);
+                                       3*25*(*num_abs_boundary_faces)*sizeof(int),
+                                       cudaMemcpyHostToDevice),1106);
     
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_abs_boundary_jacobian2Dw,
-               25*(*num_abs_boundary_faces)*sizeof(float)),774);
+                                       25*(*num_abs_boundary_faces)*sizeof(float)),1107);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_jacobian2Dw, h_abs_boundary_jacobian2Dw,
-               25*(*num_abs_boundary_faces)*sizeof(float),
-               cudaMemcpyHostToDevice),774);  
+                                       25*(*num_abs_boundary_faces)*sizeof(float),
+                                       cudaMemcpyHostToDevice),1108);  
   }
-
-/*  
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_rho_vp, size*sizeof(float)),5);
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_rho_vs, size*sizeof(float)),6);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vp,h_rho_vp,size*sizeof(float),
-				     cudaMemcpyHostToDevice),5);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vs,h_rho_vs,size*sizeof(float),
-				     cudaMemcpyHostToDevice),5);
   
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_absorb_field, 3*25* *b_num_abs_boundary_faces*sizeof(float)),7);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field, h_b_absorb_field,
-				     3*25* *b_num_abs_boundary_faces*sizeof(float),
-				     cudaMemcpyHostToDevice),7);
-*/  
-
-  // prepare interprocess-edge exchange information
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nibool_interfaces_ext_mesh,
-				     *num_interfaces_ext_mesh*sizeof(int)),19);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_nibool_interfaces_ext_mesh,h_nibool_interfaces_ext_mesh,
-				     *num_interfaces_ext_mesh*sizeof(int),cudaMemcpyHostToDevice),19);
-
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_interfaces_ext_mesh,
-                                     *num_interfaces_ext_mesh* *max_nibool_interfaces_ext_mesh*
-                                     sizeof(int)),20);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_interfaces_ext_mesh,h_ibool_interfaces_ext_mesh,
-				     *num_interfaces_ext_mesh* *max_nibool_interfaces_ext_mesh*sizeof(int),
-				     cudaMemcpyHostToDevice),20);
-
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ispec_is_inner,*NSPEC_AB*sizeof(int)),21);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_inner, h_ispec_is_inner,
-				     *NSPEC_AB*sizeof(int),
-				     cudaMemcpyHostToDevice),21);
-             
-//  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ispec_is_elastic,*NSPEC_AB*sizeof(int)),21);
-//  print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_elastic, h_ispec_is_elastic,
-//				     *NSPEC_AB*sizeof(int),
-//				     cudaMemcpyHostToDevice),21);
-
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool, h_ibool,
-				     size*sizeof(int)  ,cudaMemcpyHostToDevice),22);    
-
   // sources
   if (*SIMULATION_TYPE == 1  || *SIMULATION_TYPE == 3){
     // not needed in case of pure adjoint simulations (SIMULATION_TYPE == 2)
-    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_sourcearrays, sizeof(float)* *NSOURCES*3*125),522);
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_sourcearrays, sizeof(float)* *NSOURCES*3*125),1301);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_sourcearrays, h_sourcearrays, sizeof(float)* *NSOURCES*3*125,
-                                       cudaMemcpyHostToDevice),522);
+                                       cudaMemcpyHostToDevice),1302);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_stf_pre_compute),
-                                       *NSOURCES*sizeof(double)),525);
+                                       *NSOURCES*sizeof(double)),1303);
   }
 
-  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_islice_selected_source, sizeof(int) * *NSOURCES),523);
+  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_islice_selected_source, sizeof(int) * *NSOURCES),1401);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_islice_selected_source, h_islice_selected_source, sizeof(int)* *NSOURCES,
-				     cudaMemcpyHostToDevice),523);
+				     cudaMemcpyHostToDevice),1402);
 
-  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_ispec_selected_source, sizeof(int)* *NSOURCES),524);
+  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_ispec_selected_source, sizeof(int)* *NSOURCES),1403);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_selected_source, h_ispec_selected_source,sizeof(int)* *NSOURCES,
-				     cudaMemcpyHostToDevice),524);
+				     cudaMemcpyHostToDevice),1404);
 
   
   // transfer constant element data with padding
   for(int i=0;i<*NSPEC_AB;i++) {
     print_CUDA_error_if_any(cudaMemcpy(mp->d_xix + i*128, &h_xix[i*125],
-				       125*sizeof(float),cudaMemcpyHostToDevice),70);
+				       125*sizeof(float),cudaMemcpyHostToDevice),1501);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_xiy+i*128,   &h_xiy[i*125],
-				       125*sizeof(float),cudaMemcpyHostToDevice),71);
+				       125*sizeof(float),cudaMemcpyHostToDevice),1502);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_xiz+i*128,   &h_xiz[i*125],
-				       125*sizeof(float),cudaMemcpyHostToDevice),72);
+				       125*sizeof(float),cudaMemcpyHostToDevice),1503);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_etax+i*128,  &h_etax[i*125],
-				       125*sizeof(float),cudaMemcpyHostToDevice),73);
+				       125*sizeof(float),cudaMemcpyHostToDevice),1504);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_etay+i*128,  &h_etay[i*125],
-				       125*sizeof(float),cudaMemcpyHostToDevice),74);
+				       125*sizeof(float),cudaMemcpyHostToDevice),1505);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_etaz+i*128,  &h_etaz[i*125],
-				       125*sizeof(float),cudaMemcpyHostToDevice),75);
+				       125*sizeof(float),cudaMemcpyHostToDevice),1506);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_gammax+i*128,&h_gammax[i*125],
-				       125*sizeof(float),cudaMemcpyHostToDevice),76);
+				       125*sizeof(float),cudaMemcpyHostToDevice),1507);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_gammay+i*128,&h_gammay[i*125],
-				       125*sizeof(float),cudaMemcpyHostToDevice),77);
+				       125*sizeof(float),cudaMemcpyHostToDevice),1508);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaz+i*128,&h_gammaz[i*125],
-				       125*sizeof(float),cudaMemcpyHostToDevice),78);
+				       125*sizeof(float),cudaMemcpyHostToDevice),1509);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_kappav+i*128,&h_kappav[i*125],
-				       125*sizeof(float),cudaMemcpyHostToDevice),79);
+				       125*sizeof(float),cudaMemcpyHostToDevice),1510);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_muv+i*128,   &h_muv[i*125],
-				       125*sizeof(float),cudaMemcpyHostToDevice),80);
+				       125*sizeof(float),cudaMemcpyHostToDevice),1511);
       
   }
         
@@ -633,15 +411,12 @@
   mp->nrec_local = nrec_local;
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_number_receiver_global),nrec_local*sizeof(int)),1);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_number_receiver_global,h_number_receiver_global,
-                                     nrec_local*sizeof(int),cudaMemcpyHostToDevice),602);  
+                                     nrec_local*sizeof(int),cudaMemcpyHostToDevice),1512);  
   
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_selected_rec),nrec*sizeof(int)),603);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_selected_rec),nrec*sizeof(int)),1513);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_selected_rec,h_ispec_selected_rec,
-                                     nrec*sizeof(int),cudaMemcpyHostToDevice),604);  
+                                     nrec*sizeof(int),cudaMemcpyHostToDevice),1514);  
 
-//  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_field),3*125*nrec_local*sizeof(float)),605);
-//  mp->h_station_seismo_field = (float*)malloc(3*125*nrec_local*sizeof(float));
-
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("prepare_constants_device");
 #endif            
@@ -649,254 +424,10 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-extern "C"
-void FC_FUNC_(prepare_and_transfer_noise_backward_fields,
-              PREPARE_AND_TRANSFER_NOISE_BACKWARD_FIELDS)(long* Mesh_pointer_f,
-                                                          int* size,
-                                                          real* b_displ,
-                                                          real* b_veloc,
-                                                          real* b_accel,
-                                                          real* b_epsilondev_xx,
-                                                          real* b_epsilondev_yy,
-                                                          real* b_epsilondev_xy,
-                                                          real* b_epsilondev_xz,
-                                                          real* b_epsilondev_yz,
-                                                          int* NSPEC_STRAIN_ONLY) {
+// purely adjoint & kernel simulations
 
-TRACE("prepare_and_transfer_noise_backward_fields_");
-                  
-  //show_free_memory("prep_and_xfer_noise_bwd_fields");
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
-  int epsilondev_size = 128*(*NSPEC_STRAIN_ONLY);
-  
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ),*size*sizeof(real)),1);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc),*size*sizeof(real)),2);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel),*size*sizeof(real)),3);
-  
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xx),
-				     epsilondev_size*sizeof(real)),4);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yy),
-				     epsilondev_size*sizeof(real)),4);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xy),
-				     epsilondev_size*sizeof(real)),4);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xz),
-				     epsilondev_size*sizeof(real)),4);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yz),
-				     epsilondev_size*sizeof(real)),4);
-
-  
-  cudaMemcpy(mp->d_b_displ,b_displ,*size*sizeof(real),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_veloc,b_veloc,*size*sizeof(real),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_accel,b_accel,*size*sizeof(real),cudaMemcpyHostToDevice);  
-
-  cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx,
-	     epsilondev_size*sizeof(real),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy,
-	     epsilondev_size*sizeof(real),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy,
-	     epsilondev_size*sizeof(real),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz,
-	     epsilondev_size*sizeof(real),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz,
-	     epsilondev_size*sizeof(real),cudaMemcpyHostToDevice);
-  
-}
-
 /* ----------------------------------------------------------------------------------------------- */
 
-extern "C"
-void FC_FUNC_(prepare_and_transfer_noise_backward_constants,
-              PREPARE_AND_TRANSFER_NOISE_BACKWARD_CONSTANTS)(long* Mesh_pointer_f,
-                                                            float* normal_x_noise,
-                                                            float* normal_y_noise,
-                                                            float* normal_z_noise,
-                                                            float* mask_noise,
-                                                            float* free_surface_jacobian2Dw,
-                                                            int* nfaces_surface_ext_mesh
-                                                            ) {
-
-TRACE("prepare_and_transfer_noise_backward_constants_");
-
-  //show_free_memory("prep_and_xfer_noise_bwd_constants");
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-  
-  int nface_size = 5*5*(*nfaces_surface_ext_mesh);
-  
-  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_x_noise,
-				     nface_size*sizeof(float)),1);
-  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_y_noise,
-				     nface_size*sizeof(float)),2);
-  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_z_noise,
-				     nface_size*sizeof(float)),3);
-  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_mask_noise, nface_size*sizeof(float)),4);
-
-  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_free_surface_jacobian2Dw,
-				     nface_size*sizeof(float)),5);
-
-  cudaMemcpy(mp->d_normal_x_noise, normal_x_noise, nface_size*sizeof(float),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_normal_y_noise, normal_y_noise, nface_size*sizeof(float),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_normal_z_noise, normal_z_noise, nface_size*sizeof(float),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_mask_noise, mask_noise, nface_size*sizeof(float),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_free_surface_jacobian2Dw, free_surface_jacobian2Dw, nface_size*sizeof(float),cudaMemcpyHostToDevice);
-  
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  printf("jacobian_size = %d\n",25*(*nfaces_surface_ext_mesh));
-  exit_on_cuda_error("prepare_and_transfer_noise_backward_constants_");  
-#endif  
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(prepare_noise_constants_device,
-              PREPARE_NOISE_CONSTANTS_DEVICE)(long* Mesh_pointer_f, 
-                                              int* h_NGLLX,
-                                              int* NSPEC_AB, int* NGLOB_AB,
-                                              int* free_surface_ispec,int* free_surface_ijk,
-                                              int* num_free_surface_faces,
-                                              int* size_free_surface_ijk, int* SIMULATION_TYPE) {
-
-TRACE("prepare_noise_constants_device_");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
-
-  mp->num_free_surface_faces = *num_free_surface_faces;
-
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ispec, *num_free_surface_faces*sizeof(int)),1);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec, free_surface_ispec, *num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),1);
-
-  // alloc storage for the surface buffer to be copied
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_noise_surface_movie, 3*25*(*num_free_surface_faces)*sizeof(float)),1);
-
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ijk, (*size_free_surface_ijk)*sizeof(float)),1);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,(*size_free_surface_ijk)*sizeof(float),cudaMemcpyHostToDevice),1);
-  
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(prepare_sensitivity_kernels,
-              PREPARE_SENSITIVITY_KERNELS)(long* Mesh_pointer_f,
-                                           float* rho_kl,
-                                           float* mu_kl,
-                                           float* kappa_kl,
-                                           float* epsilon_trace_over_3,
-                                           float* b_epsilon_trace_over_3,
-                                           float* Sigma_kl,
-                                           int* NSPEC_ADJOINTf) {
-
-TRACE("prepare_sensitivity_kernels_");
-               
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
-  int NSPEC_ADJOINT = *NSPEC_ADJOINTf;
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_kl),
-				     125*mp->NSPEC_AB*sizeof(float)),800);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_mu_kl),
-				     125*mp->NSPEC_AB*sizeof(float)),801);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_kl),
-				     125*mp->NSPEC_AB*sizeof(float)),802);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_epsilon_trace_over_3),
-				     125*mp->NSPEC_AB*sizeof(float)),803);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilon_trace_over_3),
-				     125*mp->NSPEC_AB*sizeof(float)),804);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_Sigma_kl),
-				     125*(mp->NSPEC_AB)*sizeof(float)),805);
-
-  cudaMemcpy(mp->d_rho_kl,rho_kl, 125*NSPEC_ADJOINT*sizeof(float),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_mu_kl,mu_kl, 125*NSPEC_ADJOINT*sizeof(float),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_kappa_kl,kappa_kl, 125*NSPEC_ADJOINT*sizeof(float),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_epsilon_trace_over_3,epsilon_trace_over_3,
-	     125*NSPEC_ADJOINT*sizeof(float),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_epsilon_trace_over_3 ,b_epsilon_trace_over_3,
-	     125*NSPEC_ADJOINT*sizeof(float),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_Sigma_kl, Sigma_kl, 125*(NSPEC_ADJOINT)*sizeof(float),
-	     cudaMemcpyHostToDevice);
-  
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("prepare_sensitivity_kernels");
-#endif
-}
-					     
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(prepare_adjoint_constants_device,
-              PREPARE_ADJOINT_CONSTANTS_DEVICE)(long* Mesh_pointer_f,
-                                                //int* ispec_selected_rec,
-                                                //int* islice_selected_rec,
-                                                //int* islice_selected_rec_size,
-                                                //int* nrec,
-                                                float* noise_sourcearray,
-                                                int* NSTEP,
-                                                float* epsilondev_xx,
-                                                float* epsilondev_yy,
-                                                float* epsilondev_xy,
-                                                float* epsilondev_xz,
-                                                float* epsilondev_yz,
-                                                int* NSPEC_STRAIN_ONLY
-                                                ) {
-TRACE("prepare_adjoint_constants_device_");
-              
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("prepare_adjoint_constants_device 1");  
-#endif
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
-  int epsilondev_size = 128*(*NSPEC_STRAIN_ONLY);
-  
-  // already done earlier
-  // print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_ispec_selected_rec,
-  // *nrec*sizeof(int)),1);
-  // cudaMemcpy(mp->d_ispec_selected_rec,ispec_selected_rec, *nrec*sizeof(int),  
-  // cudaMemcpyHostToDevice);
-
-  //print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_islice_selected_rec,
-	//			     *islice_selected_rec_size*sizeof(int)),901);
-  
-  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_noise_sourcearray,
-				     3*125*(*NSTEP)*sizeof(float)),902);
-
-  
-  cudaMemcpy(mp->d_noise_sourcearray, noise_sourcearray,
-	     3*125*(*NSTEP)*sizeof(float),
-	     cudaMemcpyHostToDevice);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("prepare_adjoint_constants_device 2");  
-#endif
-  
-  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xx,
-				     epsilondev_size*sizeof(float)),903);
-  cudaMemcpy(mp->d_epsilondev_xx,epsilondev_xx,epsilondev_size*sizeof(float),
-	     cudaMemcpyHostToDevice);
-  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yy,
-				     epsilondev_size*sizeof(float)),904);
-  cudaMemcpy(mp->d_epsilondev_yy,epsilondev_yy,epsilondev_size*sizeof(float),
-	     cudaMemcpyHostToDevice);
-  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xy,
-				     epsilondev_size*sizeof(float)),905);
-  cudaMemcpy(mp->d_epsilondev_xy,epsilondev_xy,epsilondev_size*sizeof(float),
-	     cudaMemcpyHostToDevice);
-  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xz,
-				     epsilondev_size*sizeof(float)),906);
-  cudaMemcpy(mp->d_epsilondev_xz,epsilondev_xz,epsilondev_size*sizeof(float),
-	     cudaMemcpyHostToDevice);
-  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yz,
-				     epsilondev_size*sizeof(float)),907);
-  cudaMemcpy(mp->d_epsilondev_yz,epsilondev_yz,epsilondev_size*sizeof(float),
-	     cudaMemcpyHostToDevice);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING       
-  exit_on_cuda_error("prepare_adjoint_constants_device 3");  
-#endif  
-  
-  // these don't seem necessary and crash code for NOISE_TOMOGRAPHY >
-  // 0 b/c rho_kl, etc not yet allocated when NT=1
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
 extern "C" 
 void FC_FUNC_(prepare_adjoint_sim2_or_3_constants_device,
               PREPARE_ADJOINT_SIM2_OR_3_CONSTANTS_DEVICE)(
@@ -910,11 +441,11 @@
   
   // allocates arrays for receivers
   print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_islice_selected_rec,
-                                     *islice_selected_rec_size*sizeof(int)),802);
+                                     *islice_selected_rec_size*sizeof(int)),7001);
   
   // copies arrays to GPU device
   print_CUDA_error_if_any(cudaMemcpy(mp->d_islice_selected_rec,islice_selected_rec, 
-                                     *islice_selected_rec_size*sizeof(int),cudaMemcpyHostToDevice),804);
+                                     *islice_selected_rec_size*sizeof(int),cudaMemcpyHostToDevice),7002);
   
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING       
   exit_on_cuda_error("prepare_adjoint_sim2_or_3_constants_device");  
@@ -923,28 +454,126 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-/*
-extern "C" {
-  void prepare_fields_device_(long* Mesh_pointer_f, int* size);
-  void transfer_fields_to_device_(int* size, float* displ, float* veloc, float* accel,long* Mesh_pointer_f);
-  void transfer_fields_from_device_(int* size, float* displ, float* veloc, float* accel,long* Mesh_pointer_f);
-}
-*/
+// for ACOUSTIC simulations
 
 /* ----------------------------------------------------------------------------------------------- */
 
-/*
-void prepare_fields_device_(long* Mesh_pointer_f, int* size) {
+extern "C"
+void FC_FUNC_(prepare_fields_acoustic_device,
+              PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f, 
+                                              float* rmass_acoustic, 
+                                              float* rhostore,
+                                              float* kappastore,
+                                              int* num_phase_ispec_acoustic, 
+                                              int* phase_ispec_inner_acoustic,
+                                              int* ispec_is_acoustic,
+                                              int* NOISE_TOMOGRAPHY,
+                                              int* num_free_surface_faces,
+                                              int* free_surface_ispec,
+                                              int* free_surface_ijk,
+                                              int* ABSORBING_CONDITIONS,
+                                              int* b_reclen_potential,
+                                              float* b_absorb_potential,
+                                              int* SIMULATION_TYPE,
+                                              float* rho_ac_kl,
+                                              float* kappa_ac_kl
+                                              ) {
   
+  TRACE("prepare_fields_acoustic_device");
+  
   Mesh* mp = (Mesh*)(*Mesh_pointer_f);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ),sizeof(float)*(*size)),0);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(float)*(*size)),1);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel),sizeof(float)*(*size)),2);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),sizeof(float)*(*size)),2);
-
+  /* Assuming NGLLX==5. Padded is then 128 (5^3+3) */
+  int size_padded = 128 * mp->NSPEC_AB;
+  int size_nonpadded = 125 * mp->NSPEC_AB;
+  int size = mp->NGLOB_AB;
+  
+  // allocates arrays on device (GPU)
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_acoustic),sizeof(float)*size),9001);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_acoustic),sizeof(float)*size),9002);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_dot_acoustic),sizeof(float)*size),9003);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_potential_dot_dot_buffer),sizeof(float)*size),9004);
+  
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_acoustic),sizeof(float)*size),9005);
+  
+  // padded array
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rhostore),size_padded*sizeof(float)),9006); 
+  
+  // non-padded array
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappastore),size_nonpadded*sizeof(float)),9007); 
+  
+  mp->num_phase_ispec_acoustic = *num_phase_ispec_acoustic;
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_acoustic), mp->num_phase_ispec_acoustic*2*sizeof(int)),9008);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_acoustic),mp->NSPEC_AB*sizeof(int)),9009);
+  
+  // free surface
+  if( *NOISE_TOMOGRAPHY == 0 ){
+    // allocate surface arrays
+    mp->num_free_surface_faces = *num_free_surface_faces;
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec),mp->num_free_surface_faces*sizeof(int)),9201);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk),3*25*mp->num_free_surface_faces*sizeof(int)),9202);
+    
+    // transfers values onto GPU
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec,
+                                       mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),9203);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
+                                       3*25*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),9204);    
+  }
+  
+  // absorbing boundaries
+  if( *ABSORBING_CONDITIONS == 1 ){
+    mp->d_b_reclen_potential = *b_reclen_potential;
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_potential),mp->d_b_reclen_potential),9301); 
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_potential,b_absorb_potential,
+                                       mp->d_b_reclen_potential,cudaMemcpyHostToDevice),9302);    
+  }
+  
+  // kernel simulations
+  if( *SIMULATION_TYPE == 3 ){
+    // allocates backward/reconstructed arrays on device (GPU)
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_acoustic),sizeof(float)*size),9014);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_acoustic),sizeof(float)*size),9015);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_dot_acoustic),sizeof(float)*size),9016);    
+    
+    // allocates kernels  
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_ac_kl),125*mp->NSPEC_AB*sizeof(float)),9017);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_ac_kl),125*mp->NSPEC_AB*sizeof(float)),9018);
+    // copies over initial values
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_ac_kl,rho_ac_kl, 
+                                       125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),9019);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_kappa_ac_kl,kappa_ac_kl, 
+                                       125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),9020);
+    
+  }
+  
+  // transfer element data
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_acoustic,rmass_acoustic,
+                                     sizeof(float)*size,cudaMemcpyHostToDevice),9100);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_acoustic,phase_ispec_inner_acoustic, 
+                                     mp->num_phase_ispec_acoustic*2*sizeof(int),cudaMemcpyHostToDevice),9101);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_acoustic,ispec_is_acoustic,
+                                     mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),9102);
+                                     
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_kappastore,kappastore,
+                                     size_nonpadded*sizeof(float),cudaMemcpyHostToDevice),9105);
+  
+  // transfer constant element data with padding
+  for(int i=0;i<mp->NSPEC_AB;i++) {  
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_rhostore+i*128, &rhostore[i*125],
+                                       125*sizeof(float),cudaMemcpyHostToDevice),9106);
+  }
+  
+  // for seismograms
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_potential),
+                                     mp->nrec_local*125*sizeof(float)),9107);
+  mp->h_station_seismo_potential = (float*)malloc(mp->nrec_local*125*sizeof(float));
+  if( mp->h_station_seismo_potential == NULL) exit_on_error("error allocating h_station_seismo_potential");
+  
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING       
+  exit_on_cuda_error("prepare_fields_acoustic_device");  
+#endif    
 }
-*/
 
+
 /* ----------------------------------------------------------------------------------------------- */
 
 // for ELASTIC simulations
@@ -963,164 +592,549 @@
                                              int* ispec_is_elastic,
                                              int* ABSORBING_CONDITIONS,
                                              float* h_b_absorb_field,
-                                             int* b_num_abs_boundary_faces){
+                                             int* b_num_abs_boundary_faces,
+                                             int* SIMULATION_TYPE,
+                                             float* rho_kl,
+                                             float* mu_kl,
+                                             float* kappa_kl,
+                                             int* COMPUTE_AND_STORE_STRAIN,
+                                             float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,
+                                             float* epsilondev_xz,float* epsilondev_yz,
+                                             float* epsilon_trace_over_3,
+                                             float* b_epsilondev_xx,float* b_epsilondev_yy,float* b_epsilondev_xy,
+                                             float* b_epsilondev_xz,float* b_epsilondev_yz,
+                                             float* b_epsilon_trace_over_3,
+                                             int* ATTENUATION, int* R_size,
+                                             float* R_xx,float* R_yy,float* R_xy,float* R_xz,float* R_yz,
+                                             float* b_R_xx,float* b_R_yy,float* b_R_xy,float* b_R_xz,float* b_R_yz,
+                                             float* one_minus_sum_beta,float* factor_common,
+                                             float* alphaval,float* betaval,float* gammaval,
+                                             float* b_alphaval,float* b_betaval,float* b_gammaval                                             
+                                             ){
   
 TRACE("prepare_fields_elastic_device");
   
   Mesh* mp = (Mesh*)(*Mesh_pointer_f);
   /* Assuming NGLLX==5. Padded is then 128 (5^3+3) */  
-  int size_padded = 128 * mp->NSPEC_AB;
+  //int size_padded = 128 * mp->NSPEC_AB;
   int size_nonpadded = 125 * mp->NSPEC_AB;
   
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ),sizeof(float)*(*size)),200);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(float)*(*size)),201);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel),sizeof(float)*(*size)),202);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),sizeof(float)*(*size)),203);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ),sizeof(float)*(*size)),8001);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(float)*(*size)),8002);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel),sizeof(float)*(*size)),8003);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),sizeof(float)*(*size)),8004);
   
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass),sizeof(float)*mp->NGLOB_AB),204);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vp),size_padded*sizeof(float)),205); 
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vs),size_padded*sizeof(float)),206); 
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass),sizeof(float)*mp->NGLOB_AB),8005);
   
+  // non-padded arrays
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vp),size_nonpadded*sizeof(float)),8006); 
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vs),size_nonpadded*sizeof(float)),8007); 
+  
+  // element indices
   mp->d_num_phase_ispec_elastic = *num_phase_ispec_elastic;
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_elastic), mp->d_num_phase_ispec_elastic*2*sizeof(int)),207);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_elastic),mp->NSPEC_AB*sizeof(int)),208);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_elastic), 
+                                     mp->d_num_phase_ispec_elastic*2*sizeof(int)),8008);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_elastic),mp->NSPEC_AB*sizeof(int)),8009);
   
   // transfer element data
   print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass,rmass,
-                                     sizeof(float)*mp->NGLOB_AB,cudaMemcpyHostToDevice),209);  
+                                     sizeof(float)*mp->NGLOB_AB,cudaMemcpyHostToDevice),8010);  
   print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_elastic,phase_ispec_inner_elastic, 
-                                     mp->d_num_phase_ispec_elastic*2*sizeof(int),cudaMemcpyHostToDevice),210);
+                                     mp->d_num_phase_ispec_elastic*2*sizeof(int),cudaMemcpyHostToDevice),8011);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_elastic,ispec_is_elastic,
-                                     mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),211);
+                                     mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),8012);
   
   // daniel: not sure if rho_vp, rho_vs needs padding... they are needed for stacey boundary condition
   print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vp, rho_vp,
-                                     size_nonpadded*sizeof(float),cudaMemcpyHostToDevice),212);
+                                     size_nonpadded*sizeof(float),cudaMemcpyHostToDevice),8013);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vs, rho_vs,
-                                     size_nonpadded*sizeof(float),cudaMemcpyHostToDevice),213);
+                                     size_nonpadded*sizeof(float),cudaMemcpyHostToDevice),8014);
   
   // for seismograms
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_field),3*125*(mp->nrec_local)*sizeof(float)),214);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_field),
+                                     3*125*(mp->nrec_local)*sizeof(float)),8015);
   mp->h_station_seismo_field = (float*)malloc(3*125*(mp->nrec_local)*sizeof(float));
   
   // absorbing conditions
   if( *ABSORBING_CONDITIONS == 1 ){
     mp->b_num_abs_boundary_faces = *b_num_abs_boundary_faces;
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_field), 
-                                       3*25*mp->b_num_abs_boundary_faces*sizeof(float)),791);
+                                       3*25*mp->b_num_abs_boundary_faces*sizeof(float)),8016);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field, h_b_absorb_field,
-                                       3*25*mp->b_num_abs_boundary_faces*sizeof(float),cudaMemcpyHostToDevice),792);
+                                       3*25*mp->b_num_abs_boundary_faces*sizeof(float),cudaMemcpyHostToDevice),8017);
   }
+
+  // kernel simulations
+  if( *SIMULATION_TYPE == 3 ){
+    // allocates backward/reconstructed arrays on device (GPU)
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ),sizeof(float)*(*size)),8201);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc),sizeof(float)*(*size)),8202);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel),sizeof(float)*(*size)),8203);
+    
+    // allocates kernels  
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_kl),125*mp->NSPEC_AB*sizeof(float)),8204);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_mu_kl),125*mp->NSPEC_AB*sizeof(float)),8205);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_kl),125*mp->NSPEC_AB*sizeof(float)),8206);
+    
+    // copies over initial values
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_kl,rho_kl, 
+                                       125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),8207);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_mu_kl,mu_kl, 
+                                       125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),8208);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_kappa_kl,kappa_kl, 
+                                       125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),8209);
+    
+  }
   
+  // strains used for attenuation and kernel simulations
+  if( *COMPUTE_AND_STORE_STRAIN == 1 ){
+    // strains
+    int epsilondev_size = 125*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
+    
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xx,
+                                       epsilondev_size*sizeof(float)),8301);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xx,epsilondev_xx,epsilondev_size*sizeof(float),
+                                       cudaMemcpyHostToDevice),8302);
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yy,
+                                       epsilondev_size*sizeof(float)),8302);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yy,epsilondev_yy,epsilondev_size*sizeof(float),
+                                       cudaMemcpyHostToDevice),8303);
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xy,
+                                       epsilondev_size*sizeof(float)),8304);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xy,epsilondev_xy,epsilondev_size*sizeof(float),
+                                       cudaMemcpyHostToDevice),8305);
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xz,
+                                       epsilondev_size*sizeof(float)),8306);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xz,epsilondev_xz,epsilondev_size*sizeof(float),
+                                       cudaMemcpyHostToDevice),8307);
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yz,
+                                       epsilondev_size*sizeof(float)),8308);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yz,epsilondev_yz,epsilondev_size*sizeof(float),
+                                       cudaMemcpyHostToDevice),8309);
+    
+    if( *SIMULATION_TYPE == 3 ){  
+      // solid pressure
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_epsilon_trace_over_3),
+                                         125*mp->NSPEC_AB*sizeof(float)),8310);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilon_trace_over_3,epsilon_trace_over_3,
+                                         125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),8311);
+      // backward solid pressure
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilon_trace_over_3),
+                                         125*mp->NSPEC_AB*sizeof(float)),8312);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilon_trace_over_3 ,b_epsilon_trace_over_3,
+                                         125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),8313);
+      // prepares backward strains
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xx),
+                                         epsilondev_size*sizeof(float)),8321);
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yy),
+                                         epsilondev_size*sizeof(float)),8322);
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xy),
+                                         epsilondev_size*sizeof(float)),8323);
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xz),
+                                         epsilondev_size*sizeof(float)),8324);
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yz),
+                                         epsilondev_size*sizeof(float)),8325);
+
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx,
+                                         epsilondev_size*sizeof(float),cudaMemcpyHostToDevice),8326);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy,
+                                         epsilondev_size*sizeof(float),cudaMemcpyHostToDevice),8327);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy,
+                                         epsilondev_size*sizeof(float),cudaMemcpyHostToDevice),8328);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz,
+                                         epsilondev_size*sizeof(float),cudaMemcpyHostToDevice),8329);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz,
+                                         epsilondev_size*sizeof(float),cudaMemcpyHostToDevice),8330);            
+    }
+  }
+  
+  // attenuation memory variables
+  if( *ATTENUATION == 1 ){
+    // memory arrays
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xx),
+                                       (*R_size)*sizeof(float)),8401);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xx,R_xx,(*R_size)*sizeof(float),
+                                       cudaMemcpyHostToDevice),8402);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yy),
+                                       (*R_size)*sizeof(float)),8403);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yy,R_yy,(*R_size)*sizeof(float),
+                                       cudaMemcpyHostToDevice),8404);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xy),
+                                       (*R_size)*sizeof(float)),8405);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xy,R_xy,(*R_size)*sizeof(float),
+                                       cudaMemcpyHostToDevice),8406);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xz),
+                                       (*R_size)*sizeof(float)),8407);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xz,R_xz,(*R_size)*sizeof(float),
+                                       cudaMemcpyHostToDevice),8408);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yz),
+                                       (*R_size)*sizeof(float)),8409);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yz,R_yz,(*R_size)*sizeof(float),
+                                       cudaMemcpyHostToDevice),8410);    
+    if( *SIMULATION_TYPE == 3 ){
+        print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xx),
+                                           (*R_size)*sizeof(float)),8421);
+        print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xx,b_R_xx,(*R_size)*sizeof(float),
+                                           cudaMemcpyHostToDevice),8422);
+        
+        print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yy),
+                                           (*R_size)*sizeof(float)),8423);
+        print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yy,b_R_yy,(*R_size)*sizeof(float),
+                                           cudaMemcpyHostToDevice),8424);
+        
+        print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xy),
+                                           (*R_size)*sizeof(float)),8425);
+        print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xy,b_R_xy,(*R_size)*sizeof(float),
+                                           cudaMemcpyHostToDevice),8426);
+        
+        print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xz),
+                                           (*R_size)*sizeof(float)),8427);
+        print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xz,b_R_xz,(*R_size)*sizeof(float),
+                                           cudaMemcpyHostToDevice),8428);
+        
+        print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yz),
+                                           (*R_size)*sizeof(float)),8429);
+        print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yz,b_R_yz,(*R_size)*sizeof(float),
+                                           cudaMemcpyHostToDevice),8420);              
+    }
+    
+    // attenuation factors
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_one_minus_sum_beta),
+                                       125*mp->NSPEC_AB*sizeof(float)),8430);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_one_minus_sum_beta ,one_minus_sum_beta,
+                                       125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),8431);          
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_factor_common),
+                                       N_SLS*125*mp->NSPEC_AB*sizeof(float)),8432);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_factor_common ,factor_common,
+                                       N_SLS*125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),8433);
+
+    // alpha,beta,gamma factors
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_alphaval),
+                                       N_SLS*sizeof(float)),8434);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_alphaval ,alphaval,
+                                       N_SLS*sizeof(float),cudaMemcpyHostToDevice),8435);
+                                       
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_betaval),
+                                       N_SLS*sizeof(float)),8436);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_betaval ,betaval,
+                                       N_SLS*sizeof(float),cudaMemcpyHostToDevice),8437);
+                                       
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_gammaval),
+                                       N_SLS*sizeof(float)),8438);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaval ,gammaval,
+                                       N_SLS*sizeof(float),cudaMemcpyHostToDevice),8439);
+    
+    if( *SIMULATION_TYPE == 3 ){
+      // alpha,beta,gamma factors for backward fields
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_alphaval),
+                                         N_SLS*sizeof(float)),8434);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_b_alphaval ,b_alphaval,
+                                         N_SLS*sizeof(float),cudaMemcpyHostToDevice),8435);
+      
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_betaval),
+                                         N_SLS*sizeof(float)),8436);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_b_betaval ,b_betaval,
+                                         N_SLS*sizeof(float),cudaMemcpyHostToDevice),8437);
+      
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_gammaval),
+                                         N_SLS*sizeof(float)),8438);
+      print_CUDA_error_if_any(cudaMemcpy(mp->d_b_gammaval ,b_gammaval,
+                                         N_SLS*sizeof(float),cudaMemcpyHostToDevice),8439);
+    }
+    
+    
+  }
+  
+  
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING       
   exit_on_cuda_error("prepare_fields_elastic_device");  
 #endif    
 }
 
 
+
 /* ----------------------------------------------------------------------------------------------- */
 
-// for ACOUSTIC simulations
+// for NOISE simulations
 
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
-void FC_FUNC_(prepare_fields_acoustic_device,
-              PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f, 
-                                               float* rmass_acoustic, 
-                                               float* rhostore,
-                                               float* kappastore,
-                                               int* num_phase_ispec_acoustic, 
-                                               int* phase_ispec_inner_acoustic,
-                                               int* ispec_is_acoustic,
-                                               int* num_free_surface_faces,
-                                               int* free_surface_ispec,
-                                               int* free_surface_ijk,
-                                               int* ABSORBING_CONDITIONS,
-                                               int* b_reclen_potential,
-                                               float* b_absorb_potential,
-                                               int* SIMULATION_TYPE,
-                                               float* rho_ac_kl,
-                                               float* kappa_ac_kl) {
+void FC_FUNC_(prepare_fields_noise_device,
+              PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f, 
+                                           int* NSPEC_AB, int* NGLOB_AB,
+                                           int* free_surface_ispec,int* free_surface_ijk,
+                                           int* num_free_surface_faces,
+                                           int* size_free_surface_ijk, 
+                                           int* SIMULATION_TYPE,
+                                           int* NOISE_TOMOGRAPHY,
+                                           int* NSTEP,
+                                           float* noise_sourcearray,
+                                           float* normal_x_noise,
+                                           float* normal_y_noise,
+                                           float* normal_z_noise,
+                                           float* mask_noise,
+                                           float* free_surface_jacobian2Dw,
+                                           float* Sigma_kl
+                                           ) {
   
-TRACE("prepare_fields_acoustic_device");
+  TRACE("prepare_fields_noise_device");
   
   Mesh* mp = (Mesh*)(*Mesh_pointer_f);
-  /* Assuming NGLLX==5. Padded is then 128 (5^3+3) */
-  int size_padded = 128 * mp->NSPEC_AB;
-  int size_nonpadded = 125 * mp->NSPEC_AB;
-  int size = mp->NGLOB_AB;
   
-  // allocates arrays on device (GPU)
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_acoustic),sizeof(float)*size),100);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_acoustic),sizeof(float)*size),101);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_dot_acoustic),sizeof(float)*size),102);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_potential_dot_dot_buffer),sizeof(float)*size),103);
+  // free surface
+  mp->num_free_surface_faces = *num_free_surface_faces;
   
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_acoustic),sizeof(float)*size),104);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rhostore),size_padded*sizeof(float)),105); 
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappastore),size_padded*sizeof(float)),106); 
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ispec, 
+                                     *num_free_surface_faces*sizeof(int)),4001);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec, free_surface_ispec, 
+                                     *num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4002);
   
-  mp->num_phase_ispec_acoustic = *num_phase_ispec_acoustic;
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_acoustic), mp->num_phase_ispec_acoustic*2*sizeof(int)),107);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_acoustic),mp->NSPEC_AB*sizeof(int)),108);
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ijk, 
+                                     (*size_free_surface_ijk)*sizeof(int)),4003);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
+                                     (*size_free_surface_ijk)*sizeof(int),cudaMemcpyHostToDevice),4004);
   
-  mp->num_free_surface_faces = *num_free_surface_faces;
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec),mp->num_free_surface_faces*sizeof(int)),109);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk),3*25*mp->num_free_surface_faces*sizeof(int)),110);
+  // alloc storage for the surface buffer to be copied
+  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_noise_surface_movie, 
+                                     3*25*(*num_free_surface_faces)*sizeof(float)),4005);
   
+  // prepares noise source array
+  if( *NOISE_TOMOGRAPHY == 1 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_noise_sourcearray,
+                                       3*125*(*NSTEP)*sizeof(float)),4101);  
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_noise_sourcearray, noise_sourcearray,
+                                       3*125*(*NSTEP)*sizeof(float),cudaMemcpyHostToDevice),4102);    
+  }
+  
+  // prepares noise directions
+  if( *NOISE_TOMOGRAPHY > 1 ){
+    int nface_size = 25*(*num_free_surface_faces);
+    // allocates memory on GPU
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_x_noise,
+                                       nface_size*sizeof(float)),4301);
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_y_noise,
+                                       nface_size*sizeof(float)),4302);
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_z_noise,
+                                       nface_size*sizeof(float)),4303);
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_mask_noise, 
+                                       nface_size*sizeof(float)),4304);    
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_free_surface_jacobian2Dw,
+                                       nface_size*sizeof(float)),4305);
+    // transfers data onto GPU
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_x_noise, normal_x_noise, 
+                                       nface_size*sizeof(float),cudaMemcpyHostToDevice),4306);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_y_noise, normal_y_noise, 
+                                       nface_size*sizeof(float),cudaMemcpyHostToDevice),4307);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_z_noise, normal_z_noise, 
+                                       nface_size*sizeof(float),cudaMemcpyHostToDevice),4308);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_mask_noise, mask_noise, 
+                                       nface_size*sizeof(float),cudaMemcpyHostToDevice),4309);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_jacobian2Dw, free_surface_jacobian2Dw, 
+                                       nface_size*sizeof(float),cudaMemcpyHostToDevice),4310);    
+  }
+  
+  // prepares noise strength kernel
+  if( *NOISE_TOMOGRAPHY == 3 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_Sigma_kl),
+                                       125*(mp->NSPEC_AB)*sizeof(float)),4401);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_Sigma_kl, Sigma_kl, 
+                                       125*(mp->NSPEC_AB)*sizeof(float),cudaMemcpyHostToDevice),4403);  
+  }  
+  
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //printf("jacobian_size = %d\n",25*(*num_free_surface_faces));
+  exit_on_cuda_error("prepare_fields_noise_device");  
+#endif    
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// cleanup
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(prepare_cleanup_device,
+              PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f,
+                                      int* num_abs_boundary_faces,
+                                      int* SIMULATION_TYPE,
+                                      int* ACOUSTIC_SIMULATION,
+                                      int* ELASTIC_SIMULATION,
+                                      int* ABSORBING_CONDITIONS,
+                                      int* NOISE_TOMOGRAPHY,
+                                      int* COMPUTE_AND_STORE_STRAIN,
+                                      int* ATTENUATION) {
+  
+TRACE("prepare_cleanup_device");
+
+  // frees allocated memory arrays  
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  // frees memory on GPU  
+  // mesh
+  cudaFree(mp->d_xix);
+  cudaFree(mp->d_xiy);
+  cudaFree(mp->d_xiz);
+  cudaFree(mp->d_etax);
+  cudaFree(mp->d_etay);
+  cudaFree(mp->d_etaz);
+  cudaFree(mp->d_gammax);
+  cudaFree(mp->d_gammay);
+  cudaFree(mp->d_gammaz);
+  cudaFree(mp->d_muv);
+  
   // absorbing boundaries
-  if( *ABSORBING_CONDITIONS == 1 ){
-    mp->d_b_reclen_potential = *b_reclen_potential;
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_potential),mp->d_b_reclen_potential),111); 
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_potential,b_absorb_potential,
-                                       mp->d_b_reclen_potential,cudaMemcpyHostToDevice),112);    
+  if( *num_abs_boundary_faces > 0 ){ 
+    cudaFree(mp->d_abs_boundary_ispec);
+    cudaFree(mp->d_abs_boundary_ijk);
+    cudaFree(mp->d_abs_boundary_normal);
+    cudaFree(mp->d_abs_boundary_jacobian2Dw);
   }
   
-  // kernel simulations
-  if( *SIMULATION_TYPE == 3 ){
-    // allocates backward/reconstructed arrays on device (GPU)
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_acoustic),sizeof(float)*size),113);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_acoustic),sizeof(float)*size),114);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_dot_acoustic),sizeof(float)*size),115);    
+  // interfaces
+  cudaFree(mp->d_nibool_interfaces_ext_mesh);
+  cudaFree(mp->d_ibool_interfaces_ext_mesh);
+
+  // global indexing
+  cudaFree(mp->d_ispec_is_inner);
+  cudaFree(mp->d_ibool);
+
+  // sources
+  if (*SIMULATION_TYPE == 1  || *SIMULATION_TYPE == 3){ 
+    cudaFree(mp->d_sourcearrays);
+    cudaFree(mp->d_stf_pre_compute);    
+  }
+  
+  cudaFree(mp->d_islice_selected_source);
+  cudaFree(mp->d_ispec_selected_source);
+
+  // receivers
+  cudaFree(mp->d_number_receiver_global);
+  cudaFree(mp->d_ispec_selected_rec);
+
+  // ACOUSTIC arrays
+  if( *ACOUSTIC_SIMULATION == 1 ){ 
+    cudaFree(mp->d_potential_acoustic);
+    cudaFree(mp->d_potential_dot_acoustic);
+    cudaFree(mp->d_potential_dot_dot_acoustic);
+    cudaFree(mp->d_send_potential_dot_dot_buffer);
+    cudaFree(mp->d_rmass_acoustic);
+    cudaFree(mp->d_rhostore);
+    cudaFree(mp->d_kappastore);
+    cudaFree(mp->d_phase_ispec_inner_acoustic);
+    cudaFree(mp->d_ispec_is_acoustic);
     
-    // allocates kernels  
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_ac_kl),125*mp->NSPEC_AB*sizeof(float)),181);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_ac_kl),125*mp->NSPEC_AB*sizeof(float)),182);
-    // copies over initial values
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_ac_kl,rho_ac_kl, 
-                                       125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),183);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_kappa_ac_kl,kappa_ac_kl, 
-                                       125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),184);
+    if( *NOISE_TOMOGRAPHY == 0 ){ 
+      cudaFree(mp->d_free_surface_ispec);
+      cudaFree(mp->d_free_surface_ijk);
+    }
     
+    if( *ABSORBING_CONDITIONS == 1 ) cudaFree(mp->d_b_absorb_potential);
+    
+    if( *SIMULATION_TYPE == 3 ) {
+      cudaFree(mp->d_b_potential_acoustic);
+      cudaFree(mp->d_b_potential_dot_acoustic);
+      cudaFree(mp->d_b_potential_dot_dot_acoustic);
+      cudaFree(mp->d_rho_ac_kl);
+      cudaFree(mp->d_kappa_ac_kl);
+    }
+    
+    cudaFree(mp->d_station_seismo_potential);
+
+    free(mp->h_station_seismo_potential);
   }
+
+  // ELASTIC arrays
+  if( *ELASTIC_SIMULATION == 1 ){ 
+    cudaFree(mp->d_displ);
+    cudaFree(mp->d_veloc);
+    cudaFree(mp->d_accel);
+    cudaFree(mp->d_send_accel_buffer);
+    cudaFree(mp->d_rmass);
+    cudaFree(mp->d_rho_vp);
+    cudaFree(mp->d_rho_vs);
+    cudaFree(mp->d_phase_ispec_inner_elastic);
+    cudaFree(mp->d_ispec_is_elastic);
+    cudaFree(mp->d_station_seismo_field);
+    
+    if( *ABSORBING_CONDITIONS == 1 ) cudaFree(mp->d_b_absorb_field);
+
+    if( *SIMULATION_TYPE == 3 ) {
+      cudaFree(mp->d_b_displ);
+      cudaFree(mp->d_b_veloc);
+      cudaFree(mp->d_b_accel);
+      cudaFree(mp->d_rho_kl);
+      cudaFree(mp->d_mu_kl);
+      cudaFree(mp->d_kappa_kl);
+    }
+
+    if( *COMPUTE_AND_STORE_STRAIN == 1 ){  
+      cudaFree(mp->d_epsilondev_xx);
+      cudaFree(mp->d_epsilondev_yy);
+      cudaFree(mp->d_epsilondev_xy);
+      cudaFree(mp->d_epsilondev_xz);
+      cudaFree(mp->d_epsilondev_yz);
+      if( *SIMULATION_TYPE == 3 ){ 
+        cudaFree(mp->d_epsilon_trace_over_3);
+        cudaFree(mp->d_b_epsilon_trace_over_3);
+        cudaFree(mp->d_b_epsilondev_xx);
+        cudaFree(mp->d_b_epsilondev_yy);
+        cudaFree(mp->d_b_epsilondev_xy);
+        cudaFree(mp->d_b_epsilondev_xz);
+        cudaFree(mp->d_b_epsilondev_yz);        
+      }    
+    }
+    
+    if( *ATTENUATION == 1 ){
+      cudaFree(mp->d_factor_common);
+      cudaFree(mp->d_one_minus_sum_beta);
+      cudaFree(mp->d_alphaval);
+      cudaFree(mp->d_betaval);
+      cudaFree(mp->d_gammaval);
+      cudaFree(mp->d_R_xx);
+      cudaFree(mp->d_R_yy);
+      cudaFree(mp->d_R_xy);
+      cudaFree(mp->d_R_xz);
+      cudaFree(mp->d_R_yz);
+      if( *SIMULATION_TYPE == 3){ 
+        cudaFree(mp->d_b_R_xx);
+        cudaFree(mp->d_b_R_yy);
+        cudaFree(mp->d_b_R_xy);
+        cudaFree(mp->d_b_R_xz);
+        cudaFree(mp->d_b_R_yz);
+        cudaFree(mp->d_b_alphaval);
+        cudaFree(mp->d_b_betaval);
+        cudaFree(mp->d_b_gammaval);
+      }
+    }
+  }
   
-  // transfer element data
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_acoustic,rmass_acoustic,
-                                     sizeof(float)*size,cudaMemcpyHostToDevice),116);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_acoustic,phase_ispec_inner_acoustic, 
-                                     mp->num_phase_ispec_acoustic*2*sizeof(int),cudaMemcpyHostToDevice),117);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_acoustic,ispec_is_acoustic,
-                                     mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),118);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec,
-                                     mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),119);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
-                                     3*25*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),120);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_kappastore,kappastore,
-                                     size_nonpadded*sizeof(float),cudaMemcpyHostToDevice),121);
-  
-  // transfer constant element data with padding
-  for(int i=0;i<mp->NSPEC_AB;i++) {  
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_rhostore+i*128, &rhostore[i*125],
-                                       125*sizeof(float),cudaMemcpyHostToDevice),122);
+  // purely adjoint & kernel array
+  if( *SIMULATION_TYPE == 2 || *SIMULATION_TYPE == 3 ) cudaFree(mp->d_islice_selected_rec);
+        
+  // NOISE arrays      
+  if( *NOISE_TOMOGRAPHY > 0 ){ 
+    cudaFree(mp->d_free_surface_ispec);
+    cudaFree(mp->d_free_surface_ijk);
+    cudaFree(mp->d_noise_surface_movie);
+    if( *NOISE_TOMOGRAPHY == 1 ) cudaFree(mp->d_noise_sourcearray);
+    if( *NOISE_TOMOGRAPHY > 1 ){
+      cudaFree(mp->d_normal_x_noise);
+      cudaFree(mp->d_normal_y_noise);
+      cudaFree(mp->d_normal_z_noise);
+      cudaFree(mp->d_mask_noise);
+      cudaFree(mp->d_free_surface_jacobian2Dw);
+    }  
+    if( *NOISE_TOMOGRAPHY == 3 ) cudaFree(mp->d_Sigma_kl);
   }
   
-  // for seismograms
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_potential),mp->nrec_local*125*sizeof(float)),123);
-  mp->h_station_seismo_potential = (float*)malloc(mp->nrec_local*125*sizeof(float));
-  
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING       
-  exit_on_cuda_error("prepare_fields_acoustic_device");  
-#endif    
-}
-
+  // mesh pointer - not needed anymore
+  free(mp);
+}
\ No newline at end of file

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu	2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu	2011-10-07 01:29:29 UTC (rev 19039)
@@ -24,8 +24,6 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-/* ----------------------------------------------------------------------------------------------- */
-
 extern "C"
 void FC_FUNC_(transfer_b_fields_to_device,
               TRANSFER_B_FIELDS_TO_DEVICE)(int* size, float* b_displ, float* b_veloc, float* b_accel,
@@ -50,9 +48,9 @@
     
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_displ,displ,sizeof(float)*(*size),cudaMemcpyHostToDevice),3);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_veloc,veloc,sizeof(float)*(*size),cudaMemcpyHostToDevice),4);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_accel,accel,sizeof(float)*(*size),cudaMemcpyHostToDevice),5);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_displ,displ,sizeof(float)*(*size),cudaMemcpyHostToDevice),40003);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_veloc,veloc,sizeof(float)*(*size),cudaMemcpyHostToDevice),40004);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_accel,accel,sizeof(float)*(*size),cudaMemcpyHostToDevice),40005);
   
 }
 
@@ -82,9 +80,9 @@
   
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
   
-  print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(float)*(*size),cudaMemcpyDeviceToHost),6);
-  print_CUDA_error_if_any(cudaMemcpy(veloc,mp->d_veloc,sizeof(float)*(*size),cudaMemcpyDeviceToHost),7);
-  print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel,sizeof(float)*(*size),cudaMemcpyDeviceToHost),8);
+  print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40006);
+  print_CUDA_error_if_any(cudaMemcpy(veloc,mp->d_veloc,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40007);
+  print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40008);
   
   // printf("Transfered Fields From Device\n");
   // int procid;
@@ -106,7 +104,7 @@
   
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
   
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_accel,accel,sizeof(float)*(*size),cudaMemcpyHostToDevice),6);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_accel,accel,sizeof(float)*(*size),cudaMemcpyHostToDevice),40016);
 
 }
 
@@ -120,7 +118,7 @@
   
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
   
-  print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel,sizeof(float)*(*size),cudaMemcpyDeviceToHost),6);
+  print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40026);
 
 }
 
@@ -134,7 +132,7 @@
   
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
   
-  print_CUDA_error_if_any(cudaMemcpy(b_accel,mp->d_b_accel,sizeof(float)*(*size),cudaMemcpyDeviceToHost),6);
+  print_CUDA_error_if_any(cudaMemcpy(b_accel,mp->d_b_accel,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40036);
 
 }
 
@@ -148,7 +146,7 @@
   
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
   
-  print_CUDA_error_if_any(cudaMemcpy(sigma_kl,mp->d_Sigma_kl,sizeof(float)*(*size),cudaMemcpyDeviceToHost),6);
+  print_CUDA_error_if_any(cudaMemcpy(sigma_kl,mp->d_Sigma_kl,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40046);
 
 }
 
@@ -162,7 +160,7 @@
   
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
   
-  print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(float)*(*size),cudaMemcpyDeviceToHost),6);
+  print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40056);
 
 }
 
@@ -176,7 +174,7 @@
   
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
   
-  print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(float)*(*size),cudaMemcpyDeviceToHost),6);
+  print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40066);
 
 }
 
@@ -198,7 +196,7 @@
 }
 
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 extern "C"
 void FC_FUNC_(transfer_compute_kernel_fields_from_device,
               TRANSFER_COMPUTE_KERNEL_FIELDS_FROM_DEVICE)(long* Mesh_pointer,
@@ -249,17 +247,132 @@
   exit_on_cuda_error("after transfer_compute_kernel_fields_from_device");
 #endif
 }
+*/
 
+/* ----------------------------------------------------------------------------------------------- */
 
+// attenuation fields
+
+extern "C"
+void FC_FUNC_(transfer_b_fields_att_to_device,
+              TRANSFER_B_FIELDS_ATT_TO_DEVICE)(long* Mesh_pointer,
+                                             float* b_R_xx,float* b_R_yy,float* b_R_xy,float* b_R_xz,float* b_R_yz,
+                                             int* size_R,
+                                             float* b_epsilondev_xx,
+                                             float* b_epsilondev_yy,
+                                             float* b_epsilondev_xy,
+                                             float* b_epsilondev_xz,
+                                             float* b_epsilondev_yz,
+                                             int* size_epsilondev) {
+  TRACE("transfer_b_fields_att_to_device");
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); 
+  
+  cudaMemcpy(mp->d_b_R_xx,b_R_xx,*size_R*sizeof(float),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_R_yy,b_R_yy,*size_R*sizeof(float),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_R_xy,b_R_xy,*size_R*sizeof(float),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_R_xz,b_R_xz,*size_R*sizeof(float),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_R_yz,b_R_yz,*size_R*sizeof(float),cudaMemcpyHostToDevice);
+  
+  cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx,*size_epsilondev*sizeof(float),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy,*size_epsilondev*sizeof(float),cudaMemcpyHostToDevice);  
+  cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy,*size_epsilondev*sizeof(float),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz,*size_epsilondev*sizeof(float),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz,*size_epsilondev*sizeof(float),cudaMemcpyHostToDevice);
+  
+  
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after transfer_b_fields_att_to_device");
+#endif
+}
+
 /* ----------------------------------------------------------------------------------------------- */
 
-// for ACOUSTIC simulations
+// attenuation fields
 
+extern "C"
+void FC_FUNC_(transfer_fields_att_from_device,
+              TRANSFER_FIELDS_ATT_FROM_DEVICE)(long* Mesh_pointer,
+                                               float* R_xx,float* R_yy,float* R_xy,float* R_xz,float* R_yz,
+                                               int* size_R,
+                                               float* epsilondev_xx,
+                                               float* epsilondev_yy,
+                                               float* epsilondev_xy,
+                                               float* epsilondev_xz,
+                                               float* epsilondev_yz,
+                                               int* size_epsilondev) {
+  TRACE("transfer_fields_att_from_device");
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); 
+
+  cudaMemcpy(R_xx,mp->d_R_xx,*size_R*sizeof(float),cudaMemcpyDeviceToHost);
+  cudaMemcpy(R_yy,mp->d_R_yy,*size_R*sizeof(float),cudaMemcpyDeviceToHost);
+  cudaMemcpy(R_xy,mp->d_R_xy,*size_R*sizeof(float),cudaMemcpyDeviceToHost);
+  cudaMemcpy(R_xz,mp->d_R_xz,*size_R*sizeof(float),cudaMemcpyDeviceToHost);
+  cudaMemcpy(R_yz,mp->d_R_yz,*size_R*sizeof(float),cudaMemcpyDeviceToHost);
+  
+  cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
+  
+  
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after transfer_fields_att_from_device");
+#endif
+}
+
+
 /* ----------------------------------------------------------------------------------------------- */
 
+extern "C" 
+void FC_FUNC_(transfer_sensitivity_kernels_to_host,
+              TRANSFER_SENSITIVITY_KERNELS_TO_HOST)(long* Mesh_pointer, 
+                                                    float* h_rho_kl,
+                                                    float* h_mu_kl, 
+                                                    float* h_kappa_kl,
+                                                    int* NSPEC_AB) {
+TRACE("transfer_sensitivity_kernels_to_host");
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer); 
+  
+  print_CUDA_error_if_any(cudaMemcpy(h_rho_kl,mp->d_rho_kl,*NSPEC_AB*125*sizeof(float),
+                                     cudaMemcpyDeviceToHost),40101);
+  print_CUDA_error_if_any(cudaMemcpy(h_mu_kl,mp->d_mu_kl,*NSPEC_AB*125*sizeof(float),
+                                     cudaMemcpyDeviceToHost),40102);
+  print_CUDA_error_if_any(cudaMemcpy(h_kappa_kl,mp->d_kappa_kl,*NSPEC_AB*125*sizeof(float),
+                                     cudaMemcpyDeviceToHost),40103);
+  
+}
+
 /* ----------------------------------------------------------------------------------------------- */
 
+// for NOISE simulations
 
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C" 
+void FC_FUNC_(transfer_sensitivity_kernels_noise_to_host,
+              TRANSFER_SENSITIVITY_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer, 
+                                                          float* h_Sigma_kl,
+                                                          int* NSPEC_AB) {
+TRACE("transfer_sensitivity_kernels_noise_to_host");
+  
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+  
+  print_CUDA_error_if_any(cudaMemcpy(h_Sigma_kl,mp->d_Sigma_kl,125*(*NSPEC_AB)*sizeof(float),
+                                     cudaMemcpyHostToDevice),40201);
+  
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// for ACOUSTIC simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
 extern "C"
 void FC_FUNC_(transfer_fields_acoustic_to_device,
               TRANSFER_FIELDS_ACOUSTIC_TO_DEVICE)(
@@ -273,11 +386,11 @@
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
   
   print_CUDA_error_if_any(cudaMemcpy(mp->d_potential_acoustic,potential_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyHostToDevice),110);
+                                     sizeof(float)*(*size),cudaMemcpyHostToDevice),50110);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_potential_dot_acoustic,potential_dot_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyHostToDevice),120);
+                                     sizeof(float)*(*size),cudaMemcpyHostToDevice),50120);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_potential_dot_dot_acoustic,potential_dot_dot_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyHostToDevice),130);
+                                     sizeof(float)*(*size),cudaMemcpyHostToDevice),50130);
   
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("after transfer_fields_acoustic_to_device");
@@ -299,11 +412,11 @@
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
   
   print_CUDA_error_if_any(cudaMemcpy(mp->d_b_potential_acoustic,b_potential_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyHostToDevice),110);
+                                     sizeof(float)*(*size),cudaMemcpyHostToDevice),51110);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_b_potential_dot_acoustic,b_potential_dot_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyHostToDevice),120);
+                                     sizeof(float)*(*size),cudaMemcpyHostToDevice),51120);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_b_potential_dot_dot_acoustic,b_potential_dot_dot_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyHostToDevice),130);
+                                     sizeof(float)*(*size),cudaMemcpyHostToDevice),51130);
   
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("after transfer_b_fields_acoustic_to_device");
@@ -325,11 +438,11 @@
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
   
   print_CUDA_error_if_any(cudaMemcpy(potential_acoustic,mp->d_potential_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyDeviceToHost),111);
+                                     sizeof(float)*(*size),cudaMemcpyDeviceToHost),52111);
   print_CUDA_error_if_any(cudaMemcpy(potential_dot_acoustic,mp->d_potential_dot_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyDeviceToHost),121);
+                                     sizeof(float)*(*size),cudaMemcpyDeviceToHost),52121);
   print_CUDA_error_if_any(cudaMemcpy(potential_dot_dot_acoustic,mp->d_potential_dot_dot_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyDeviceToHost),131);
+                                     sizeof(float)*(*size),cudaMemcpyDeviceToHost),52131);
   
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("after transfer_fields_acoustic_from_device");
@@ -351,11 +464,11 @@
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
   
   print_CUDA_error_if_any(cudaMemcpy(b_potential_acoustic,mp->d_b_potential_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyDeviceToHost),111);
+                                     sizeof(float)*(*size),cudaMemcpyDeviceToHost),53111);
   print_CUDA_error_if_any(cudaMemcpy(b_potential_dot_acoustic,mp->d_b_potential_dot_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyDeviceToHost),121);
+                                     sizeof(float)*(*size),cudaMemcpyDeviceToHost),53121);
   print_CUDA_error_if_any(cudaMemcpy(b_potential_dot_dot_acoustic,mp->d_b_potential_dot_dot_acoustic,
-                                     sizeof(float)*(*size),cudaMemcpyDeviceToHost),131);  
+                                     sizeof(float)*(*size),cudaMemcpyDeviceToHost),53131);  
   
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("after transfer_b_fields_acoustic_from_device");
@@ -363,3 +476,25 @@
 }
 
 
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C" 
+void FC_FUNC_(transfer_sensitivity_kernels_acoustic_to_host,
+              TRANSFER_SENSITIVITY_KERNELS_ACOUSTIC_TO_HOST)(long* Mesh_pointer, 
+                                                             float* h_rho_ac_kl,
+                                                             float* h_kappa_ac_kl,
+                                                             int* NSPEC_AB) {
+  
+  TRACE("transfer_sensitivity_kernels_acoustic_to_host");  
+  
+  //get mesh pointer out of fortran integer container  
+  Mesh* mp = (Mesh*)(*Mesh_pointer); 
+  int size = *NSPEC_AB*125;
+  
+  // copies kernel values over to CPU host
+  print_CUDA_error_if_any(cudaMemcpy(h_rho_ac_kl,mp->d_rho_ac_kl,size*sizeof(float),
+                                     cudaMemcpyDeviceToHost),54101);
+  print_CUDA_error_if_any(cudaMemcpy(h_kappa_ac_kl,mp->d_kappa_ac_kl,size*sizeof(float),
+                                     cudaMemcpyDeviceToHost),54102);  
+}
+

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90	2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90	2011-10-07 01:29:29 UTC (rev 19039)
@@ -81,7 +81,7 @@
           else ! GPU_MODE==.true.
              ! contains both forward SIM_TYPE==1 and backward SIM_TYPE==3 simulations
              call compute_forces_elastic_cuda(Mesh_pointer, iphase, nspec_outer_elastic, &
-                  nspec_inner_elastic,COMPUTE_AND_STORE_STRAIN,SIMULATION_TYPE)   
+                  nspec_inner_elastic,COMPUTE_AND_STORE_STRAIN,SIMULATION_TYPE,ATTENUATION)   
           endif
       else if (NGLLX == 6) then
       call compute_forces_elastic_Dev_6points(iphase, NSPEC_AB,NGLOB_AB,displ,accel, &

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_kernels.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_kernels.f90	2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_kernels.f90	2011-10-07 01:29:29 UTC (rev 19039)
@@ -63,7 +63,6 @@
 
   use specfem_par
   use specfem_par_elastic
-  use specfem_par_movie,only: nfaces_surface_ext_mesh
 
   implicit none
   ! local parameters

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/initialize_simulation.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/initialize_simulation.f90	2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/initialize_simulation.f90	2011-10-07 01:29:29 UTC (rev 19039)
@@ -205,9 +205,20 @@
       stop 'Deville et al. (2002) routines can only be used if NGLLX = NGLLY = NGLLZ is in [5-10]'
   endif
 
+  ! check for GPU runs
   if( GPU_MODE ) then
     if( NGLLX /= 5 .or. NGLLY /= 5 .or. NGLLZ /= 5 ) &
       stop 'GPU mode can only be used if NGLLX == NGLLY == NGLLZ == 5'
+    if( CUSTOM_REAL /= 4 ) &
+      stop 'GPU mode runs only with CUSTOM_REAL == 4'
+    if( SAVE_MOHO_MESH ) &
+      stop 'GPU mode does not support SAVE_MOHO_MESH yet'
+    if( ATTENUATION ) then
+      if( N_SLS /= 3 ) &
+        stop 'GPU mode does not support N_SLS /= 3 yet'   
+    endif
+    if( ANISOTROPY ) &  
+      stop 'GPU mode does not support ANISOTROPY yet'    
   endif
 
   ! absorbing surfaces

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90	2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90	2011-10-07 01:29:29 UTC (rev 19039)
@@ -468,10 +468,11 @@
       read(27) b_epsilondev_yz
        
       ! puts elastic attenuation arrays to GPU 
-      ! daniel: TODO transfer R_xx,... and epsilondev_xx,... as well
       if(GPU_MODE) &
-        call exit_MPI(myrank,'read forward arrays: not fully implemented yet for elastic domains with attenuation')
-       
+          call transfer_b_fields_att_to_device(Mesh_pointer, &
+                                            b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz,size(b_R_xx), &
+                                            b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy,b_epsilondev_xz,b_epsilondev_yz, &
+                                            size(b_epsilondev_xx))                 
     endif
 
   endif
@@ -523,10 +524,13 @@
                
         ! puts elastic fields onto GPU
         if(GPU_MODE) then
+          ! wavefields
           call transfer_b_fields_to_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel, Mesh_pointer)
-          
-          ! daniel: TODO transfer R_xx,... and epsilondev_xx,... as well
-          call exit_MPI(myrank,'store attenuation arrays: not fully implemented yet for elastic domains')
+          ! attenuation arrays
+          call transfer_b_fields_att_to_device(Mesh_pointer, &
+                                            b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz,size(b_R_xx), &
+                                            b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy,b_epsilondev_xz,b_epsilondev_yz, &
+                                            size(b_epsilondev_xx))          
         endif        
       endif
       
@@ -552,9 +556,12 @@
         ! gets elastic fields from GPU onto CPU
         if(GPU_MODE) then
           call transfer_fields_from_device(NDIM*NGLOB_AB,displ,veloc, accel, Mesh_pointer)
-          
-          ! daniel: TODO transfer R_xx,... and epsilondev_xx,... as well
-          call exit_MPI(myrank,'store attenuation arrays: not fully implemented yet for elastic domains')
+
+          ! attenuation arrays
+          call transfer_fields_att_from_device(Mesh_pointer, &
+                                            R_xx,R_yy,R_xy,R_xz,R_yz,size(R_xx), &
+                                            epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz, &
+                                            size(epsilondev_xx))          
         endif
         
         ! writes to disk file      
@@ -603,25 +610,58 @@
 
   implicit none
 
-  ! acoustic potentials
-  if( ACOUSTIC_SIMULATION ) then
-    call transfer_fields_acoustic_from_device(NGLOB_AB,potential_acoustic, &
+  ! to store forward wave fields
+  if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+
+    ! acoustic potentials
+    if( ACOUSTIC_SIMULATION ) &
+      call transfer_fields_acoustic_from_device(NGLOB_AB,potential_acoustic, &
                             potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)    
-    if( SIMULATION_TYPE == 3 ) then
-      call transfer_b_fields_acoustic_from_device(NGLOB_AB,b_potential_acoustic, &
-                            b_potential_dot_acoustic, b_potential_dot_dot_acoustic, Mesh_pointer)    
+                            
+    ! elastic wavefield
+    if( ELASTIC_SIMULATION ) then
+      call transfer_fields_from_device(NDIM*NGLOB_AB,displ,veloc, accel, Mesh_pointer)
+
+      if (ATTENUATION) &
+        call transfer_fields_att_from_device(Mesh_pointer, &
+                                            R_xx,R_yy,R_xy,R_xz,R_yz,size(R_xx), &
+                                            epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz, &
+                                            size(epsilondev_xx))
+        
+    endif
+  else if (SIMULATION_TYPE == 3) then
+
+    ! to store kernels
+    
+    if( ACOUSTIC_SIMULATION ) then
+      ! only in case needed...
+      !call transfer_b_fields_acoustic_from_device(NGLOB_AB,b_potential_acoustic, &
+      !                      b_potential_dot_acoustic, b_potential_dot_dot_acoustic, Mesh_pointer)    
+      
+      ! acoustic kernels                      
       call transfer_sensitivity_kernels_acoustic_to_host(Mesh_pointer,rho_ac_kl,kappa_ac_kl,NSPEC_AB)
     endif
+
+    if( ELASTIC_SIMULATION ) then
+      ! only in case needed...
+      !call transfer_b_fields_from_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel, Mesh_pointer)
+      
+      ! elastic kernels
+      call transfer_sensitivity_kernels_to_host(Mesh_pointer,rho_kl,mu_kl,kappa_kl,NSPEC_AB)
+    endif
+
+    ! specific noise strength kernel
+    if( NOISE_TOMOGRAPHY == 3 ) then
+      call transfer_sensitivity_kernels_to_host(Mesh_pointer,Sigma_kl,NSPEC_AB)  
+    endif
+
   endif
 
-  ! elastic wavefield
-  if( ELASTIC_SIMULATION ) then
-    call transfer_fields_from_device(NDIM*NGLOB_AB,displ,veloc, accel, Mesh_pointer)
-    if( SIMULATION_TYPE == 3 ) then
-      call transfer_b_fields_from_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel, Mesh_pointer)
-      call transfer_sensitivity_kernels_to_host(Mesh_pointer, rho_kl, mu_kl, kappa_kl,Sigma_kl, &
-                                               NSPEC_AB)
-    endif
-  endif    
   
+  ! frees allocated memory on GPU
+  call prepare_cleanup_device(Mesh_pointer,num_abs_boundary_faces, &
+                              SIMULATION_TYPE,ACOUSTIC_SIMULATION,ELASTIC_SIMULATION, &
+                              ABSORBING_CONDITIONS,NOISE_TOMOGRAPHY,COMPUTE_AND_STORE_STRAIN, &
+                              ATTENUATION)
+  
   end subroutine it_transfer_from_GPU

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2011-10-07 01:29:29 UTC (rev 19039)
@@ -732,30 +732,28 @@
   
   ! prepares general fields on GPU  
   call prepare_constants_device(Mesh_pointer, &
-                      NGLLX, NSPEC_AB, NGLOB_AB, &
-                      xix, xiy, xiz, etax,etay,etaz, gammax, gammay, gammaz, &
-                      kappastore, mustore,ibool, &
-                      num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, &
-                      nibool_interfaces_ext_mesh, ibool_interfaces_ext_mesh, &
-                      hprime_xx, hprime_yy, hprime_zz, &
-                      hprimewgll_xx, wgllwgll_xy, wgllwgll_xz, wgllwgll_yz, &
-                      abs_boundary_ispec, abs_boundary_ijk, &
-                      abs_boundary_normal, &
-                      abs_boundary_jacobian2Dw, &
-                      b_absorb_field, num_abs_boundary_faces, b_num_abs_boundary_faces, &
-                      ispec_is_inner, &
-                      NSOURCES, sourcearrays, islice_selected_source, ispec_selected_source, &
-                      number_receiver_global, ispec_selected_rec, nrec, nrec_local, &
-                      SIMULATION_TYPE)
+                                  NGLLX, NSPEC_AB, NGLOB_AB, &
+                                  xix, xiy, xiz, etax,etay,etaz, gammax, gammay, gammaz, &
+                                  kappastore, mustore,ibool, &
+                                  num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, &
+                                  nibool_interfaces_ext_mesh, ibool_interfaces_ext_mesh, &
+                                  hprime_xx, hprime_yy, hprime_zz, &
+                                  hprimewgll_xx, wgllwgll_xy, wgllwgll_xz, wgllwgll_yz, &
+                                  abs_boundary_ispec, abs_boundary_ijk, &
+                                  abs_boundary_normal, &
+                                  abs_boundary_jacobian2Dw, &
+                                  b_absorb_field, num_abs_boundary_faces, b_num_abs_boundary_faces, &
+                                  ispec_is_inner, &
+                                  NSOURCES, sourcearrays, islice_selected_source, ispec_selected_source, &
+                                  number_receiver_global, ispec_selected_rec, nrec, nrec_local, &
+                                  SIMULATION_TYPE)
 
-!    call prepare_fields_device(Mesh_pointer, NDIM*NGLOB_AB);
-
   ! prepares fields on GPU for acoustic simulations 
   if( ACOUSTIC_SIMULATION ) &
     call prepare_fields_acoustic_device(Mesh_pointer,rmass_acoustic,rhostore,kappastore, &
                                   num_phase_ispec_acoustic,phase_ispec_inner_acoustic, &
                                   ispec_is_acoustic, &
-                                  num_free_surface_faces,free_surface_ispec,free_surface_ijk, &
+                                  NOISE_TOMOGRAPHY,num_free_surface_faces,free_surface_ispec,free_surface_ijk, &
                                   ABSORBING_CONDITIONS,b_reclen_potential,b_absorb_potential, &
                                   SIMULATION_TYPE,rho_ac_kl,kappa_ac_kl)
   
@@ -765,59 +763,38 @@
                                   rmass,rho_vp,rho_vs, &
                                   num_phase_ispec_elastic,phase_ispec_inner_elastic, &
                                   ispec_is_elastic, &
-                                  ABSORBING_CONDITIONS,b_absorb_field,b_num_abs_boundary_faces)
+                                  ABSORBING_CONDITIONS,b_absorb_field,b_num_abs_boundary_faces, &
+                                  SIMULATION_TYPE,rho_kl,mu_kl,kappa_kl, &
+                                  COMPUTE_AND_STORE_STRAIN, &
+                                  epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz, &
+                                  epsilon_trace_over_3, &
+                                  b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy,b_epsilondev_xz,b_epsilondev_yz, &
+                                  b_epsilon_trace_over_3, &
+                                  ATTENUATION,size(R_xx), &
+                                  R_xx,R_yy,R_xy,R_xz,R_yz, &
+                                  b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz, &
+                                  one_minus_sum_beta,factor_common, &
+                                  alphaval,betaval,gammaval, &
+                                  b_alphaval,b_betaval,b_gammaval)
 
-  ! prepares receiver arrays for adjoint runs
-  if( SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3 ) then
+  ! prepares needed receiver array for adjoint runs
+  if( SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3 ) &
     call prepare_adjoint_sim2_or_3_constants_device(Mesh_pointer, &
-                                    islice_selected_rec, &
-                                    size(islice_selected_rec))
+                                  islice_selected_rec,size(islice_selected_rec))
   
-  endif
-
   ! prepares fields on GPU for noise simulations      
   if ( NOISE_TOMOGRAPHY > 0 ) then
     ! note: noise tomography is only supported for elastic domains so far.
     
-    ! copies noise (free) surface arrays to GPU
-    call prepare_noise_constants_device(Mesh_pointer, NGLLX, NSPEC_AB, NGLOB_AB, &
-             free_surface_ispec,free_surface_ijk,num_free_surface_faces,size(free_surface_ijk), &
-             SIMULATION_TYPE)
+    ! copies noise  arrays to GPU
+    call prepare_fields_noise_device(Mesh_pointer, NSPEC_AB, NGLOB_AB, &
+                                  free_surface_ispec,free_surface_ijk,num_free_surface_faces,size(free_surface_ijk), &
+                                  SIMULATION_TYPE,NOISE_TOMOGRAPHY, &
+                                  NSTEP,noise_sourcearray, &
+                                  normal_x_noise,normal_y_noise,normal_z_noise, &
+                                  mask_noise,free_surface_jacobian2Dw, &
+                                  Sigma_kl)
 
-    call prepare_adjoint_constants_device(Mesh_pointer, &
-             !ispec_selected_rec,islice_selected_rec,nrec,size(islice_selected_rec),&
-             noise_sourcearray, NSTEP,&
-             epsilondev_xx,&
-             epsilondev_yy,&
-             epsilondev_xy,&
-             epsilondev_xz,&
-             epsilondev_yz,&
-             NSPEC_STRAIN_ONLY)
-
-    if(NOISE_TOMOGRAPHY > 1) &             
-      call prepare_and_transfer_noise_backward_constants(Mesh_pointer,&
-             normal_x_noise,&
-             normal_y_noise,&
-             normal_z_noise,&
-             mask_noise,&
-             free_surface_jacobian2Dw,&
-             nfaces_surface_ext_mesh)
-    
-    if( SIMULATION_TYPE == 3) then 
-      ! now have backward fields in addition to standard fields
-      call prepare_and_transfer_noise_backward_fields(Mesh_pointer, NDIM*NGLOB_AB, &
-                b_displ, b_veloc, b_accel,&
-                b_epsilondev_xx, b_epsilondev_yy, b_epsilondev_xy,&
-                b_epsilondev_xz, b_epsilondev_yz, &
-                NSPEC_STRAIN_ONLY)
-                
-      call prepare_sensitivity_kernels(Mesh_pointer,&
-                rho_kl,mu_kl,kappa_kl,&
-                epsilon_trace_over_3,b_epsilon_trace_over_3,&
-                Sigma_kl,NSPEC_AB)
-
-
-    endif
   endif ! NOISE_TOMOGRAPHY
 
   ! sends initial data to device
@@ -839,9 +816,6 @@
       call transfer_b_fields_to_device(NDIM*NGLOB_AB,b_displ,b_veloc, b_accel,Mesh_pointer)
   endif
 
-  ! outputs GPU usage to files for all processes
-  call output_free_device_memory(myrank)
-
   ! outputs usage 
   if( myrank == 0 ) then
     call get_free_device_memory(free_mb,used_mb,total_mb)
@@ -850,5 +824,8 @@
     write(IMAIN,*)"             total =",total_mb," MB"
     write(IMAIN,*)
   endif    
+
+  ! outputs GPU usage to files for all processes
+  call output_free_device_memory(myrank)
   
   end subroutine prepare_timerun_GPU