[cig-commits] r19137 - in seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src: cuda meshfem3D shared specfem3D

Mon Oct 31 14:45:36 PDT 2011

Author: danielpeter
Date: 2011-10-31 14:45:35 -0700 (Mon, 31 Oct 2011)
New Revision: 19137

Modified:
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/meshfem3D/check_mesh_quality.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/check_mesh_resolution.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/write_c_binary.c
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_acoustic.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_elastic.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/noise_tomography.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/read_mesh_databases.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/write_seismograms.f90
Log:
updates preparation for acoustic & elastic adjoint simulations

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu
===================================================================

--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu	2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu	2011-10-31 21:45:35 UTC (rev 19137)
@@ -58,8 +58,8 @@
                                            int* islice_selected_source,
                                            int* ispec_selected_source,
                                            int* ispec_is_elastic,
-                                           int NSOURCES,
-                                           float* d_debug) {
+                                           int NSOURCES //,float* d_debug
+                                           ) {
   int i = threadIdx.x;
   int j = threadIdx.y;
   int k = threadIdx.z;
@@ -82,10 +82,12 @@
         //if(i==0 && j==0 && k==0) printf("add sources kernel: stf = %e\n",stf);
 
         iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+
         atomicAdd(&accel[iglob*3],
                   sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf);
         atomicAdd(&accel[iglob*3+1],
                   sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 1, i,j,k)]*stf);
+
   // if((iglob*3+2 == 304598)) {
   //   atomicAdd(&d_debug[0],1.0f);
   //   d_debug[1] = accel[iglob*3+2];
@@ -93,6 +95,7 @@
   //   d_debug[3] = stf;
   // }
   // d_debug[4] = 42.0f;
+
         atomicAdd(&accel[iglob*3+2],
                   sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 2, i,j,k)]*stf);
       }
@@ -137,7 +140,6 @@
   //int USE_FORCE_POINT_SOURCE = *USE_FORCE_POINT_SOURCEf;
   int myrank = *myrankf;
 
-  float* d_debug;
 
   int num_blocks_x = NSOURCES;
   int num_blocks_y = 1;
@@ -152,16 +154,15 @@
 
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(5,5,5);
+
+  //float* d_debug;
   // (float* accel, int* ibool, int* ispec_is_inner, int phase_is_inner,
   // float* sourcearrays, double* stf_pre_compute,int myrank,
   // int* islice_selected_source, int* ispec_selected_source,
   // int* ispec_is_elastic, int NSOURCES)
-
-  //daniel
   //printf("add sources : nsources_local = %d\n",mp->nsources_local);
   //printf("add sources : stf = %e\n",h_stf_pre_compute[0]);
 
-
   compute_add_sources_kernel<<<grid,threads>>>(mp->d_accel,
                                                mp->d_ibool,
                                                mp->d_ispec_is_inner,
@@ -172,8 +173,8 @@
                                                mp->d_islice_selected_source,
                                                mp->d_ispec_selected_source,
                                                mp->d_ispec_is_elastic,
-                                               NSOURCES,
-                                               d_debug);
+                                               NSOURCES //,d_debug
+                                               );
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("compute_add_sources_kernel");
@@ -218,7 +219,7 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(5,5,5);
 
-  float* d_debug;
+  //float* d_debug;
   // float* h_debug = (float*)calloc(128,sizeof(float));
   // cudaMalloc((void**)&d_debug,128*sizeof(float));
   // cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
@@ -230,8 +231,8 @@
                                                *myrank,
                                                mp->d_islice_selected_source,mp->d_ispec_selected_source,
                                                mp->d_ispec_is_elastic,
-                                               NSOURCES,
-                                               d_debug);
+                                               NSOURCES //,d_debug
+                                               );
 
   // cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
   // for(int i=0;i<10;i++) {
@@ -249,7 +250,8 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-__global__ void add_source_master_rec_noise_cuda_kernel(int* ibool, int* ispec_selected_rec,
+__global__ void add_source_master_rec_noise_cuda_kernel(int* ibool,
+                                                        int* ispec_selected_rec,
                                                         int irec_master_noise,
                                                         realw* accel,
                                                         realw* noise_sourcearray,
@@ -285,12 +287,17 @@
   int it = *it_f-1; // -1 for Fortran -> C indexing differences
   int irec_master_noise = *irec_master_noise_f;
   int myrank = *myrank_f;
+
   dim3 grid(1,1,1);
   dim3 threads(125,1,1);
+
   if(myrank == islice_selected_rec[irec_master_noise-1]) {
-    add_source_master_rec_noise_cuda_kernel<<<grid,threads>>>(mp->d_ibool, mp->d_ispec_selected_rec,
-                    irec_master_noise, mp->d_accel,
-                    mp->d_noise_sourcearray, it);
+    add_source_master_rec_noise_cuda_kernel<<<grid,threads>>>(mp->d_ibool,
+                                                              mp->d_ispec_selected_rec,
+                                                              irec_master_noise,
+                                                              mp->d_accel,
+                                                              mp->d_noise_sourcearray,
+                                                              it);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("add_source_master_rec_noise_cuda_kernel");
@@ -304,48 +311,52 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-__global__ void add_sources_SIM_TYPE_2_OR_3_kernel(float* accel, int nrec,
-                                                   float* adj_sourcearrays,
-                                                   int* ibool,
-                                                   int* ispec_is_inner,
-                                                   int* ispec_selected_rec,
-                                                   int phase_is_inner,
-                                                   int* islice_selected_rec,
-                                                   int* pre_computed_irec,
-                                                   int nadj_rec_local,
-                                                   int NTSTEP_BETWEEN_ADJSRC,
-                                                   int myrank,
-                                                   int* debugi,
-                                                   float* debugf) {
+__global__ void add_sources_el_SIM_TYPE_2_OR_3_kernel(float* accel,
+                                                     int nrec,
+                                                     float* adj_sourcearrays,
+                                                     int* ibool,
+                                                     int* ispec_is_inner,
+                                                     int* ispec_is_elastic,
+                                                     int* ispec_selected_rec,
+                                                     int phase_is_inner,
+                                                     int* islice_selected_rec,
+                                                     int* pre_computed_irec,
+                                                     int nadj_rec_local //,int myrank //,int* debugi,float* debugf
+                                                     ) {
+
   int irec_local = blockIdx.x + gridDim.x*blockIdx.y;
-  if(irec_local<nadj_rec_local) { // when nrec > 65535, but mod(nspec_top,2) > 0, we end up with an extra block.
 
+  if(irec_local < nadj_rec_local) { // when nrec > 65535, but mod(nspec_top,2) > 0, we end up with an extra block.
+
     int irec = pre_computed_irec[irec_local];
 
-    int ispec_selected = ispec_selected_rec[irec]-1;
-    if(ispec_is_inner[ispec_selected] == phase_is_inner) {
-      int i = threadIdx.x;
-      int j = threadIdx.y;
-      int k = threadIdx.z;
-      int iglob = ibool[i+5*(j+5*(k+5*ispec_selected))]-1;
+    int ispec = ispec_selected_rec[irec]-1;
+    if( ispec_is_elastic[ispec] ){
 
-      // atomic operations are absolutely necessary for correctness!
-      atomicAdd(&(accel[0+3*iglob]),adj_sourcearrays[INDEX5(5,5,5,3,
-                  i,j,k,
-                  0,
-                  irec_local)]);
+      if(ispec_is_inner[ispec] == phase_is_inner) {
+        int i = threadIdx.x;
+        int j = threadIdx.y;
+        int k = threadIdx.z;
+        //int iglob = ibool[i+5*(j+5*(k+5*ispec))]-1;
+        int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
 
-      atomicAdd(&accel[1+3*iglob], adj_sourcearrays[INDEX5(5,5,5,3,
-                 i,j,k,
-                 1,
-                 irec_local)]);
+        // atomic operations are absolutely necessary for correctness!
+        atomicAdd(&(accel[0+3*iglob]),adj_sourcearrays[INDEX5(5,5,5,3,
+                                                              i,j,k,
+                                                              0,
+                                                              irec_local)]);
 
-      atomicAdd(&accel[2+3*iglob],adj_sourcearrays[INDEX5(5,5,5,3,
-                i,j,k,
-                2,
-                irec_local)]);
-    }
+        atomicAdd(&accel[1+3*iglob], adj_sourcearrays[INDEX5(5,5,5,3,
+                                                             i,j,k,
+                                                             1,
+                                                             irec_local)]);
 
+        atomicAdd(&accel[2+3*iglob],adj_sourcearrays[INDEX5(5,5,5,3,
+                                                            i,j,k,
+                                                            2,
+                                                            irec_local)]);
+      }
+    } // ispec_is_elastic
   }
 
 }
@@ -353,55 +364,72 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
-void FC_FUNC_(add_sources_sim_type_2_or_3,
-              ADD_SOURCES_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
-                                           float* h_adj_sourcearrays,
-                                           int* size_adj_sourcearrays, int* ispec_is_inner,
-                                           int* phase_is_inner, int* ispec_selected_rec,
-                                           int* ibool,
-                                           int* myrank, int* nrec, int* time_index,
-                                           int* h_islice_selected_rec,int* nadj_rec_local,
-                                           int* NTSTEP_BETWEEN_READ_ADJSRC) {
+void FC_FUNC_(add_sources_el_sim_type_2_or_3,
+              ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
+                                               float* h_adj_sourcearrays,
+                                               int* phase_is_inner,
+                                               int* h_ispec_is_inner,
+                                               int* h_ispec_is_elastic,
+                                               int* h_ispec_selected_rec,
+                                               int* myrank,
+                                               int* nrec,
+                                               int* time_index,
+                                               int* h_islice_selected_rec,
+                                               int* nadj_rec_local,
+                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {
 
-TRACE("add_sources_sim_type_2_or_3");
+TRACE("add_sources_el_sim_type_2_or_3");
 
-  if(*nadj_rec_local > 0) {
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
-    Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
-    int rank;
-    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
+  // checks
+  if( *nadj_rec_local != mp->nadj_rec_local) exit_on_error("add_sources_el_sim_type_2_or_3: nadj_rec_local not equal\n");
 
-    // make sure grid dimension is less than 65535 in x dimension
-    int num_blocks_x = *nadj_rec_local;
-    int num_blocks_y = 1;
-    while(num_blocks_x > 65535) {
-      num_blocks_x = ceil(num_blocks_x/2.0);
-      num_blocks_y = num_blocks_y*2;
-    }
-    dim3 grid(num_blocks_x,num_blocks_y,1);
-    dim3 threads(5,5,5);
+  //int rank;
+  //MPI_Comm_rank(MPI_COMM_WORLD,&rank);
 
-    float* d_adj_sourcearrays;
-    print_CUDA_error_if_any(cudaMalloc((void**)&d_adj_sourcearrays,
-                                       (*nadj_rec_local)*3*125*sizeof(float)),1);
-    float* h_adj_sourcearrays_slice = (float*)malloc((*nadj_rec_local)*3*125*sizeof(float));
+  // make sure grid dimension is less than 65535 in x dimension
+  int num_blocks_x = mp->nadj_rec_local;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_y = num_blocks_y*2;
+  }
 
-    int* h_pre_computed_irec = new int[*nadj_rec_local];
-    int* d_pre_computed_irec;
-    cudaMalloc((void**)&d_pre_computed_irec,(*nadj_rec_local)*sizeof(int));
+  dim3 grid(num_blocks_x,num_blocks_y,1);
+  dim3 threads(5,5,5);
 
-    // build slice of adj_sourcearrays because full array is *very* large.
-    int irec_local = 0;
-    for(int irec = 0;irec<*nrec;irec++) {
-      if(*myrank == h_islice_selected_rec[irec]) {
-        irec_local++;
-        h_pre_computed_irec[irec_local-1] = irec;
-        if(ispec_is_inner[ispec_selected_rec[irec]-1] == *phase_is_inner) {
-          for(int k=0;k<5;k++) {
-            for(int j=0;j<5;j++) {
-              for(int i=0;i<5;i++) {
+  //float* d_adj_sourcearrays;
+  //print_CUDA_error_if_any(cudaMalloc((void**)&d_adj_sourcearrays,
+  //                                   (*nadj_rec_local)*3*125*sizeof(float)),1);
 
-                h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
+  //float* h_adj_sourcearrays_slice = (float*)malloc((*nadj_rec_local)*3*125*sizeof(float));
+
+  //int* h_pre_computed_irec = new int[*nadj_rec_local];
+
+  //int* d_pre_computed_irec;
+  //cudaMalloc((void**)&d_pre_computed_irec,(*nadj_rec_local)*sizeof(int));
+
+  // build slice of adj_sourcearrays because full array is *very* large.
+  // note: this extracts array values for local adjoint sources at given time step "time_index"
+  //          from large adj_sourcearrays array into h_adj_sourcearrays_slice
+  int ispec,i,j,k;
+  int irec_local = 0;
+  for(int irec = 0; irec < *nrec; irec++) {
+    if(*myrank == h_islice_selected_rec[irec]) {
+      irec_local++;
+      //h_pre_computed_irec[irec_local-1] = irec;
+
+      // takes only elastic sources
+      ispec = h_ispec_selected_rec[irec]-1;
+      if( h_ispec_is_elastic[ispec] ){
+
+        if( h_ispec_is_inner[ispec] == *phase_is_inner) {
+          for(k=0;k<5;k++) {
+            for(j=0;j<5;j++) {
+              for(i=0;i<5;i++) {
+
+                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
                                                 i,j,k,0,
                                                 irec_local-1)]
                         = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
@@ -411,7 +439,7 @@
                                                     *time_index-1,
                                                     0,i,j,k)];
 
-                h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
+                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
                                                 i,j,k,1,
                                                 irec_local-1)]
                         = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
@@ -421,7 +449,7 @@
                                                     *time_index-1,
                                                     1,i,j,k)];
 
-                h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
+                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
                                                 i,j,k,2,
                                                 irec_local-1)]
                         = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
@@ -433,100 +461,107 @@
               }
             }
           }
-        }
-      }
+        } // phase_is_inner
+      } // h_ispec_is_elastic
     }
-    // printf("irec_local vs. *nadj_rec_local -> %d vs. %d\n",irec_local,*nadj_rec_local);
-    // for(int ispec=0;ispec<(*nadj_rec_local);ispec++) {
-    //   for(int i=0;i<5;i++)
-    //     for(int j=0;j<5;j++)
-    //  for(int k=0;k<5;k++) {
-    //    h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,0,ispec)] =
-    //      h_adj_sourcearrays[INDEX6(*nadj_rec_local,*NTSTEP_BETWEEN_READ_ADJSRC,3,5,5,
-    //              ispec,
-    //              *time_index-1,
-    //              0,
-    //              i,j,k)];
-    //    h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,1,ispec)] =
-    //      h_adj_sourcearrays[INDEX6(*nadj_rec_local,*NTSTEP_BETWEEN_READ_ADJSRC,3,5,5,
-    //              ispec,
-    //              *time_index-1,
-    //              1,
-    //              i,j,k)];
-    //    h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,2,ispec)] =
-    //      h_adj_sourcearrays[INDEX6(*nadj_rec_local,*NTSTEP_BETWEEN_ADJSRC,3,5,5,
-    //              ispec,
-    //              *time_index-1,
-    //              2,
-    //              i,j,k)];
-    //  }
+  }
+  // check all local sources were added
+  if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
 
-    // }
+  // printf("irec_local vs. *nadj_rec_local -> %d vs. %d\n",irec_local,*nadj_rec_local);
+  // for(int ispec=0;ispec<(*nadj_rec_local);ispec++) {
+  //   for(int i=0;i<5;i++)
+  //     for(int j=0;j<5;j++)
+  //  for(int k=0;k<5;k++) {
+  //    h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,0,ispec)] =
+  //      h_adj_sourcearrays[INDEX6(*nadj_rec_local,*NTSTEP_BETWEEN_READ_ADJSRC,3,5,5,
+  //              ispec,
+  //              *time_index-1,
+  //              0,
+  //              i,j,k)];
+  //    h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,1,ispec)] =
+  //      h_adj_sourcearrays[INDEX6(*nadj_rec_local,*NTSTEP_BETWEEN_READ_ADJSRC,3,5,5,
+  //              ispec,
+  //              *time_index-1,
+  //              1,
+  //              i,j,k)];
+  //    h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,2,ispec)] =
+  //      h_adj_sourcearrays[INDEX6(*nadj_rec_local,*NTSTEP_BETWEEN_ADJSRC,3,5,5,
+  //              ispec,
+  //              *time_index-1,
+  //              2,
+  //              i,j,k)];
+  //  }
 
-    cudaMemcpy(d_adj_sourcearrays, h_adj_sourcearrays_slice,(*nadj_rec_local)*3*125*sizeof(float),
-         cudaMemcpyHostToDevice);
+  // }
 
+  // copies extracted array values onto GPU
+  cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
+             (mp->nadj_rec_local)*3*125*sizeof(float),cudaMemcpyHostToDevice);
 
-    // the irec_local variable needs to be precomputed (as
-    // h_pre_comp..), because normally it is in the loop updating accel,
-    // and due to how it's incremented, it cannot be parallelized
 
-    // int irec_local=0;
-    // for(int irec=0;irec<*nrec;irec++) {
-    //   if(*myrank == h_islice_selected_rec[irec]) {
-    //     h_pre_computed_irec_local_index[irec] = irec_local;
-    //     irec_local++;
-    //     if(irec_local==1) {
-    //  // printf("%d:first useful irec==%d\n",rank,irec);
-    //     }
-    //   }
-    //   else h_pre_computed_irec_local_index[irec] = 0;
-    // }
-    cudaMemcpy(d_pre_computed_irec,h_pre_computed_irec,
-         (*nadj_rec_local)*sizeof(int),cudaMemcpyHostToDevice);
-    // pause_for_debugger(1);
-    int* d_debugi, *h_debugi;
-    float* d_debugf, *h_debugf;
-    h_debugi = (int*)calloc(num_blocks_x,sizeof(int));
-    cudaMalloc((void**)&d_debugi,num_blocks_x*sizeof(int));
-    cudaMemcpy(d_debugi,h_debugi,num_blocks_x*sizeof(int),cudaMemcpyHostToDevice);
-    h_debugf = (float*)calloc(num_blocks_x,sizeof(float));
-    cudaMalloc((void**)&d_debugf,num_blocks_x*sizeof(float));
-    cudaMemcpy(d_debugf,h_debugf,num_blocks_x*sizeof(float),cudaMemcpyHostToDevice);
+  // the irec_local variable needs to be precomputed (as
+  // h_pre_comp..), because normally it is in the loop updating accel,
+  // and due to how it's incremented, it cannot be parallelized
 
-    add_sources_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_accel, *nrec,
-               d_adj_sourcearrays, mp->d_ibool,
-               mp->d_ispec_is_inner,
-               mp->d_ispec_selected_rec,
-               *phase_is_inner,
-               mp->d_islice_selected_rec,
-               d_pre_computed_irec,
-               *nadj_rec_local,
-               *NTSTEP_BETWEEN_READ_ADJSRC,
-               *myrank,
-               d_debugi,d_debugf);
+  // int irec_local=0;
+  // for(int irec=0;irec<*nrec;irec++) {
+  //   if(*myrank == h_islice_selected_rec[irec]) {
+  //     h_pre_computed_irec_local_index[irec] = irec_local;
+  //     irec_local++;
+  //     if(irec_local==1) {
+  //  // printf("%d:first useful irec==%d\n",rank,irec);
+  //     }
+  //   }
+  //   else h_pre_computed_irec_local_index[irec] = 0;
+  // }
+  //cudaMemcpy(mp->d_pre_computed_irec,mp->h_pre_computed_irec,
+  //           (mp->nadj_rec_local)*sizeof(int),cudaMemcpyHostToDevice);
 
-    cudaMemcpy(h_debugi,d_debugi,num_blocks_x*sizeof(int),cudaMemcpyDeviceToHost);
-    cudaMemcpy(h_debugf,d_debugf,num_blocks_x*sizeof(float),cudaMemcpyDeviceToHost);
+  // pause_for_debugger(1);
+  //int* d_debugi, *h_debugi;
+  //float* d_debugf, *h_debugf;
+  //h_debugi = (int*)calloc(num_blocks_x,sizeof(int));
+  //cudaMalloc((void**)&d_debugi,num_blocks_x*sizeof(int));
+  //cudaMemcpy(d_debugi,h_debugi,num_blocks_x*sizeof(int),cudaMemcpyHostToDevice);
+  //h_debugf = (float*)calloc(num_blocks_x,sizeof(float));
+  //cudaMalloc((void**)&d_debugf,num_blocks_x*sizeof(float));
+  //cudaMemcpy(d_debugf,h_debugf,num_blocks_x*sizeof(float),cudaMemcpyHostToDevice);
 
-    // printf("%d: pre_com0:%d\n",rank,h_pre_computed_irec_local_index[0]);
-    // printf("%d: pre_com1:%d\n",rank,h_pre_computed_irec_local_index[1]);
-    // printf("%d: pre_com2:%d\n",rank,h_pre_computed_irec_local_index[2]);
-    // for(int i=156;i<(156+30);i++) {
-    //   if(rank==0) printf("%d:debug[%d] = i/f = %d / %e\n",rank,i,h_debugi[i],h_debugf[i]);
-    // }
+  add_sources_el_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_accel,
+                                                         *nrec,
+                                                         mp->d_adj_sourcearrays,
+                                                         mp->d_ibool,
+                                                         mp->d_ispec_is_inner,
+                                                         mp->d_ispec_is_elastic,
+                                                         mp->d_ispec_selected_rec,
+                                                         *phase_is_inner,
+                                                         mp->d_islice_selected_rec,
+                                                         mp->d_pre_computed_irec,
+                                                         mp->nadj_rec_local //,*myrank //,d_debugi,d_debugf
+                                                         );
 
+  //cudaMemcpy(h_debugi,d_debugi,num_blocks_x*sizeof(int),cudaMemcpyDeviceToHost);
+  //cudaMemcpy(h_debugf,d_debugf,num_blocks_x*sizeof(float),cudaMemcpyDeviceToHost);
+
+  // printf("%d: pre_com0:%d\n",rank,h_pre_computed_irec_local_index[0]);
+  // printf("%d: pre_com1:%d\n",rank,h_pre_computed_irec_local_index[1]);
+  // printf("%d: pre_com2:%d\n",rank,h_pre_computed_irec_local_index[2]);
+  // for(int i=156;i<(156+30);i++) {
+  //   if(rank==0) printf("%d:debug[%d] = i/f = %d / %e\n",rank,i,h_debugi[i],h_debugf[i]);
+  // }
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-    // MPI_Barrier(MPI_COMM_WORLD);
-    exit_on_cuda_error("add_sources_SIM_TYPE_2_OR_3_kernel");
+  // MPI_Barrier(MPI_COMM_WORLD);
+  exit_on_cuda_error("add_sources_SIM_TYPE_2_OR_3_kernel");
 
-    // printf("Proc %d exiting with successful kernel\n",rank);
-    // exit(1);
+  // printf("Proc %d exiting with successful kernel\n",rank);
+  // exit(1);
 #endif
-    delete h_pre_computed_irec;
-    cudaFree(d_adj_sourcearrays);
-    cudaFree(d_pre_computed_irec);
-  }
+  //cudaFree(d_adj_sourcearrays);
+  //cudaFree(d_pre_computed_irec);
+  //free(h_adj_sourcearrays_slice);
+  //delete h_pre_computed_irec;
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -711,65 +746,61 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-__global__ void add_sources_acoustic_SIM_TYPE_2_OR_3_kernel(float* potential_dot_dot_acoustic,
-                                                            int nrec,
-                                                            int pre_computed_index,
-                                                            float* adj_sourcearrays,
-                                                            int* ibool,
-                                                            int* ispec_is_inner,
-                                                            int* ispec_is_acoustic,
-                                                            float* kappastore,
-                                                            int* ispec_selected_rec,
-                                                            int phase_is_inner,
-                                                            int* islice_selected_rec,
-                                                            int* pre_computed_irec_local_index,
-                                                            int nadj_rec_local,
-                                                            int NTSTEP_BETWEEN_ADJSRC,
-                                                            int myrank) {
+__global__ void add_sources_ac_SIM_TYPE_2_OR_3_kernel(float* potential_dot_dot_acoustic,
+                                                      int nrec,
+                                                      float* adj_sourcearrays,
+                                                      int* ibool,
+                                                      int* ispec_is_inner,
+                                                      int* ispec_is_acoustic,
+                                                      int* ispec_selected_rec,
+                                                      int phase_is_inner,
+                                                      int* islice_selected_rec,
+                                                      int* pre_computed_irec,
+                                                      int nadj_rec_local,
+                                                      float* kappastore) {
 
-  int irec = blockIdx.x + gridDim.x*blockIdx.y;
+  int irec_local = blockIdx.x + gridDim.x*blockIdx.y;
 
-  //float kappal;
-  int i,j,k,iglob,ispec;
+  // because of grid shape, irec_local can be too big
+  if(irec_local < nadj_rec_local) {
 
-  // because of grid shape, irec can be too big
-  if(irec<nrec) {
+    int irec = pre_computed_irec[irec_local];
 
-    // adds source only if this proc carries the sources
-    if( myrank == islice_selected_rec[irec] ){
+    int ispec = ispec_selected_rec[irec]-1;
+    if( ispec_is_acoustic[ispec] ){
 
-      // adds acoustic source
-      ispec = ispec_selected_rec[irec]-1;
-      if( ispec_is_acoustic[ispec] ){
+      // checks if element is in phase_is_inner run
+      if(ispec_is_inner[ispec] == phase_is_inner) {
+        int i = threadIdx.x;
+        int j = threadIdx.y;
+        int k = threadIdx.z;
+        int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
 
-        // checks if element is in phase_is_inner run
-        if(ispec_is_inner[ispec] == phase_is_inner) {
-          i = threadIdx.x;
-          j = threadIdx.y;
-          k = threadIdx.z;
-          iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+        //kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
 
-          //kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
+        //potential_dot_dot_acoustic[iglob] += adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5,
+        //                                            pre_computed_irec_local_index[irec],
+        //                                            pre_computed_index,
+        //                                            0,
+        //                                            i,j,k)]/kappal;
 
-          //potential_dot_dot_acoustic[iglob] += adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5,
-          //                                            pre_computed_irec_local_index[irec],
-          //                                            pre_computed_index,
-          //                                            0,
-          //                                            i,j,k)]/kappal;
+        // beware, for acoustic medium, a pressure source would be taking the negative
+        // and divide by Kappa of the fluid;
+        // this would have to be done when constructing the adjoint source.
+        //
+        // note: we take the first component of the adj_sourcearrays
+        //          the idea is to have e.g. a pressure source, where all 3 components would be the same
 
-          // beware, for acoustic medium, a pressure source would be taking the negative
-          // and divide by Kappa of the fluid;
-          // this would have to be done when constructing the adjoint source.
-          //
-          // note: we take the first component of the adj_sourcearrays
-          //          the idea is to have e.g. a pressure source, where all 3 components would be the same
+        atomicAdd(&potential_dot_dot_acoustic[iglob],adj_sourcearrays[INDEX5(5,5,5,3,
+                                                                             i,j,k,
+                                                                             0,
+                                                                             irec_local)] // / kappal
+                                                                             );
 
-          atomicAdd(&potential_dot_dot_acoustic[iglob],
-                    +adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5,
-                                             pre_computed_irec_local_index[irec],pre_computed_index-1,
-                                             0,i,j,k)] // / kappal
-                                             );
-        }
+                  //+adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5,
+                  //                         pre_computed_irec_local_index[irec],pre_computed_index-1,
+                  //                         0,i,j,k)] // / kappal
+                  //                         );
       }
     }
   }
@@ -781,85 +812,111 @@
 extern "C"
 void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
               ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer,
-                                                         float* h_adj_sourcearrays,
-                                                         int* size_adj_sourcearrays,
-                                                         int* phase_is_inner,
-                                                         int* myrank,
-                                                         int* nrec,
-                                                         int* pre_computed_index,
-                                                         int* h_islice_selected_rec,
-                                                         int* nadj_rec_local,
-                                                         int* NTSTEP_BETWEEN_ADJSRC) {
+                                               float* h_adj_sourcearrays,
+                                               int* phase_is_inner,
+                                               int* h_ispec_is_inner,
+                                               int* h_ispec_is_acoustic,
+                                               int* h_ispec_selected_rec,
+                                               int* myrank,
+                                               int* nrec,
+                                               int* time_index,
+                                               int* h_islice_selected_rec,
+                                               int* nadj_rec_local,
+                                               int* NTSTEP_BETWEEN_READ_ADJSRC) {
 
 TRACE("add_sources_ac_sim_2_or_3_cuda");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
+  // checks
+  if( *nadj_rec_local != mp->nadj_rec_local) exit_on_cuda_error("add_sources_ac_sim_type_2_or_3: nadj_rec_local not equal\n");
+
   // make sure grid dimension is less than 65535 in x dimension
-  int num_blocks_x = *nrec;
+  int num_blocks_x = mp->nadj_rec_local;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
     num_blocks_x = ceil(num_blocks_x/2.0);
     num_blocks_y = num_blocks_y*2;
   }
+
   dim3 grid(num_blocks_x,num_blocks_y,1);
   dim3 threads(5,5,5);
 
-  // copies source arrays
-  // daniel: todo workaround -- adj_sourcearrays can be very big, but here only at
-  //             specific time it (pre_computed_irec_local_index) is actually needed...
-  float* d_adj_sourcearrays;
-  print_CUDA_error_if_any(cudaMalloc((void**)&d_adj_sourcearrays,
-                                     (*size_adj_sourcearrays)*sizeof(float)),731);
-  print_CUDA_error_if_any(cudaMemcpy(d_adj_sourcearrays, h_adj_sourcearrays,
-                                     (*size_adj_sourcearrays)*sizeof(float),cudaMemcpyHostToDevice),732);
+  // build slice of adj_sourcearrays because full array is *very* large.
+  // note: this extracts array values for local adjoint sources at given time step "time_index"
+  //          from large adj_sourcearrays array into h_adj_sourcearrays_slice
+  int ispec,i,j,k;
+  int irec_local = 0;
+  for(int irec = 0; irec < *nrec; irec++) {
+    if(*myrank == h_islice_selected_rec[irec]) {
+      irec_local++;
 
-  //int* h_pre_computed_irec_local_index = new int[*nadj_rec_local];
-  int* h_pre_computed_irec_local_index = (int*) calloc(*nrec,sizeof(int));
+      // takes only acoustic sources
+      ispec = h_ispec_selected_rec[irec]-1;
+      if( h_ispec_is_acoustic[ispec] ){
 
-  int* d_pre_computed_irec_local_index;
-  print_CUDA_error_if_any(cudaMalloc((void**)&d_pre_computed_irec_local_index,
-                                     (*nrec)*sizeof(int)),741);
+        if( h_ispec_is_inner[ispec] == *phase_is_inner) {
+          for(k=0;k<5;k++) {
+            for(j=0;j<5;j++) {
+              for(i=0;i<5;i++) {
 
-  // the irec_local variable needs to be precomputed (as
-  // h_pre_comp..), because normally it is in the loop updating accel,
-  // and due to how it's incremented, it cannot be parallized
-  int irec_local=0;
-  for(int irec=0;irec<*nrec;irec++) {
-    if(*myrank == h_islice_selected_rec[irec]) {
-      h_pre_computed_irec_local_index[irec] = irec_local;
-      irec_local++;
+                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
+                                                    i,j,k,0,
+                                                    irec_local-1)]
+                = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
+                                            *NTSTEP_BETWEEN_READ_ADJSRC,
+                                            3,5,5,
+                                            irec_local-1,
+                                            *time_index-1,
+                                            0,i,j,k)];
+
+                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
+                                                    i,j,k,1,
+                                                    irec_local-1)]
+                = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
+                                            *NTSTEP_BETWEEN_READ_ADJSRC,
+                                            3,5,5,
+                                            irec_local-1,
+                                            *time_index-1,
+                                            1,i,j,k)];
+
+                mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
+                                                    i,j,k,2,
+                                                    irec_local-1)]
+                = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
+                                            *NTSTEP_BETWEEN_READ_ADJSRC,
+                                            3,5,5,
+                                            irec_local-1,
+                                            *time_index-1,
+                                            2,i,j,k)];
+              }
+            }
+          }
+        } // phase_is_inner
+      } // h_ispec_is_acoustic
     }
-    else h_pre_computed_irec_local_index[irec] = 0;
   }
+  // check all local sources were added
+  if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
 
-  //daniel
-  //printf("irec local: rank=%d irec_local=%d nadj_rec_local=%d nrec=%d \n",*myrank,irec_local,*nadj_rec_local,*nrec);
+  // copies extracted array values onto GPU
+  cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
+             (mp->nadj_rec_local)*3*125*sizeof(float),cudaMemcpyHostToDevice);
 
-  print_CUDA_error_if_any(cudaMemcpy(d_pre_computed_irec_local_index,h_pre_computed_irec_local_index,
-                                     (*nrec)*sizeof(int),cudaMemcpyHostToDevice),742);
+  // launches cuda kernel for acoustic adjoint sources
+  add_sources_ac_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
+                                                          *nrec,
+                                                          mp->d_adj_sourcearrays,
+                                                          mp->d_ibool,
+                                                          mp->d_ispec_is_inner,
+                                                          mp->d_ispec_is_acoustic,
+                                                          mp->d_ispec_selected_rec,
+                                                          *phase_is_inner,
+                                                          mp->d_islice_selected_rec,
+                                                          mp->d_pre_computed_irec,
+                                                          mp->nadj_rec_local,
+                                                          mp->d_kappastore);
 
-  add_sources_acoustic_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
-                                                                *nrec,
-                                                                *pre_computed_index,
-                                                                d_adj_sourcearrays,
-                                                                mp->d_ibool,
-                                                                mp->d_ispec_is_inner,
-                                                                mp->d_ispec_is_acoustic,
-                                                                mp->d_kappastore,
-                                                                mp->d_ispec_selected_rec,
-                                                                *phase_is_inner,
-                                                                mp->d_islice_selected_rec,
-                                                                d_pre_computed_irec_local_index,
-                                                                *nadj_rec_local,
-                                                                *NTSTEP_BETWEEN_ADJSRC,
-                                                                *myrank);
-
-  //delete h_pre_computed_irec_local_index;
-  free(h_pre_computed_irec_local_index);
-  cudaFree(d_adj_sourcearrays);
-  cudaFree(d_pre_computed_irec_local_index);
-
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("add_sources_acoustic_SIM_TYPE_2_OR_3_kernel");
 #endif

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu	2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu	2011-10-31 21:45:35 UTC (rev 19137)
@@ -93,14 +93,9 @@
       // (normal points outwards of acoustic element)
       displ_n = displ_x*nx + displ_y*ny + displ_z*nz;
 
-
       // gets associated, weighted jacobian
       jacobianw = coupling_ac_el_jacobian2Dw[INDEX2(NGLL2,igll,iface)];
 
-      //daniel
-      //if( igll == 0 ) printf("gpu: %i %i %i %i %i %e \n",i,j,k,ispec,iglob,jacobianw);
-
-
       // continuity of pressure and normal displacement on global point
 
       // note: newark time scheme together with definition of scalar potential:
@@ -147,10 +142,7 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-//daniel
-// printf("gpu: %i %i %i \n",num_coupling_ac_el_faces,SIMULATION_TYPE,phase_is_inner);
-
-
+  // launches GPU kernel
   compute_coupling_acoustic_el_kernel<<<grid,threads>>>(mp->d_displ,
                                                        mp->d_potential_dot_dot_acoustic,
                                                        num_coupling_ac_el_faces,
@@ -237,10 +229,6 @@
       // gets associated, weighted jacobian
       jacobianw = coupling_ac_el_jacobian2Dw[INDEX2(NGLL2,igll,iface)];
 
-      //daniel
-      //if( igll == 0 ) printf("gpu: %i %i %i %i %i %e \n",i,j,k,ispec,iglob,jacobianw);
-
-
       // continuity of displacement and pressure on global point
       //
       // note: newark time scheme together with definition of scalar potential:
@@ -288,10 +276,7 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-  //daniel
-  // printf("gpu: %i %i %i \n",num_coupling_ac_el_faces,SIMULATION_TYPE,phase_is_inner);
-
-
+  // launches GPU kernel
   compute_coupling_elastic_ac_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
                                                        mp->d_accel,
                                                        num_coupling_ac_el_faces,

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu	2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu	2011-10-31 21:45:35 UTC (rev 19137)
@@ -467,7 +467,7 @@
             offset1 = K*NGLL2+J*NGLLX+l;
             temp1l += s_dummy_loc[offset1]*hp1;
 
-            // daniel: not more assumes that hprime_xx = hprime_yy = hprime_zz
+            //no more assumes that hprime_xx = hprime_yy = hprime_zz
             hp2 = hprime_yy[l*NGLLX+J];
             offset2 = K*NGLL2+l*NGLLX+I;
             temp2l += s_dummy_loc[offset2]*hp2;
@@ -547,7 +547,7 @@
             offset1 = K*NGLL2+J*NGLLX+l;
             temp1l += s_temp1[offset1]*fac1;
 
-            //daniel: not more assumes hprimewgll_xx = hprimewgll_yy = hprimewgll_zz
+            //no more assumes hprimewgll_xx = hprimewgll_yy = hprimewgll_zz
             fac2 = hprimewgll_yy[J*NGLLX+l];
             offset2 = K*NGLL2+l*NGLLX+I;
             temp2l += s_temp2[offset2]*fac2;

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu	2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu	2011-10-31 21:45:35 UTC (rev 19137)
@@ -41,11 +41,11 @@
 
 //  cuda constant arrays
 __constant__ float d_hprime_xx[NGLL2];
-__constant__ float d_hprime_yy[NGLL2];  // daniel: remove only if certain: check if hprime_yy == hprime_xx
-__constant__ float d_hprime_zz[NGLL2];  // daniel: check if hprime_zz == hprime_xx
+__constant__ float d_hprime_yy[NGLL2];
+__constant__ float d_hprime_zz[NGLL2];
 __constant__ float d_hprimewgll_xx[NGLL2];
-__constant__ float d_hprimewgll_yy[NGLL2]; // daniel: check if hprimewgll_yy == hprimewgll_xx
-__constant__ float d_hprimewgll_zz[NGLL2]; // daniel: remove only if certain...
+__constant__ float d_hprimewgll_yy[NGLL2];
+__constant__ float d_hprimewgll_zz[NGLL2];
 __constant__ float d_wgllwgll_xy[NGLL2];
 __constant__ float d_wgllwgll_xz[NGLL2];
 __constant__ float d_wgllwgll_yz[NGLL2];
@@ -64,7 +64,7 @@
                               float* d_etax, float* d_etay, float* d_etaz,
                               float* d_gammax, float* d_gammay, float* d_gammaz,
                               float* d_kappav, float* d_muv,
-                              float* d_debug,
+                              //float* d_debug,
                               int COMPUTE_AND_STORE_STRAIN,
                               float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,
                               float* epsilondev_xz,float* epsilondev_yz,float* epsilon_trace_over_3,
@@ -471,7 +471,7 @@
 
   // debugging
   //printf("Starting with grid %dx%d for %d blocks\n",num_blocks_x,num_blocks_y,nb_blocks_to_compute);
-  float* d_debug;
+//  float* d_debug;
 //    float* h_debug;
 //    h_debug = (float*)calloc(128,sizeof(float));
 //    cudaMalloc((void**)&d_debug,128*sizeof(float));
@@ -494,7 +494,7 @@
                                                mp->d_etax, mp->d_etay, mp->d_etaz,
                                                mp->d_gammax, mp->d_gammay, mp->d_gammaz,
                                                mp->d_kappav, mp->d_muv,
-                                               d_debug,
+                                               //d_debug,
                                                COMPUTE_AND_STORE_STRAIN,
                                                mp->d_epsilondev_xx,
                                                mp->d_epsilondev_yy,
@@ -535,7 +535,7 @@
                                                  mp->d_etax, mp->d_etay, mp->d_etaz,
                                                  mp->d_gammax, mp->d_gammay, mp->d_gammaz,
                                                  mp->d_kappav, mp->d_muv,
-                                                 d_debug,
+                                                 //d_debug,
                                                  COMPUTE_AND_STORE_STRAIN,
                                                  mp->d_b_epsilondev_xx,
                                                  mp->d_b_epsilondev_yy,
@@ -611,7 +611,7 @@
                               float* d_etax, float* d_etay, float* d_etaz,
                               float* d_gammax, float* d_gammay, float* d_gammaz,
                               float* d_kappav, float* d_muv,
-                              float* d_debug,
+                              //float* d_debug,
                               int COMPUTE_AND_STORE_STRAIN,
                               float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,
                               float* epsilondev_xz,float* epsilondev_yz,
@@ -1450,12 +1450,11 @@
     //if(igll == 0 ) printf("igll %d %d %d %d\n",igll,i,j,k,iglob);
 
     // only update this global point once
-    // daniel: todo - workaround to not use the temporary update array
-    //            atomicExch returns the old value, i.e. 0 indicates that we still have to do this point
 
+    // daniel: TODO - there might be better ways to implement a mutex like below,
+    //            and find a workaround to not use the temporary update array.
+    //            atomicExch: returns the old value, i.e. 0 indicates that we still have to do this point
 
-    //if(igll == 0 ) printf("updated_dof %d %d\n",iglob,updated_dof_ocean_load[iglob]);
-
     if( atomicExch(&updated_dof_ocean_load[iglob],1) == 0){
 
       // get normal
@@ -1470,7 +1469,7 @@
 
       additional_term = (rmass_ocean_load[iglob] - rmass[iglob]) * force_normal_comp;
 
-      // daniel: probably wouldn't need atomicAdd anymore, but just to be sure...
+      // probably wouldn't need atomicAdd anymore, but just to be sure...
       atomicAdd(&accel[iglob*3], + additional_term * nx);
       atomicAdd(&accel[iglob*3+1], + additional_term * ny);
       atomicAdd(&accel[iglob*3+2], + additional_term * nz);

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu	2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu	2011-10-31 21:45:35 UTC (rev 19137)
@@ -65,8 +65,8 @@
                                            float* kappa_kl,
                                            float* epsilon_trace_over_3,
                                            float* b_epsilon_trace_over_3,
-                                           int NSPEC_AB,
-                                           float* d_debug) {
+                                           int NSPEC_AB //,float* d_debug
+                                           ) {
 
   int ispec = blockIdx.x + blockIdx.y*gridDim.x;
 
@@ -147,13 +147,11 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-  float* d_debug;
-  /*
-  float* h_debug;
-  h_debug = (float*)calloc(128,sizeof(float));
-  cudaMalloc((void**)&d_debug,128*sizeof(float));
-  cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
-  */
+  //float* d_debug;
+  //float* h_debug;
+  //h_debug = (float*)calloc(128,sizeof(float));
+  //cudaMalloc((void**)&d_debug,128*sizeof(float));
+  //cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
 
   compute_kernels_cudakernel<<<grid,threads>>>(mp->d_ispec_is_elastic,mp->d_ibool,
                                                mp->d_accel, mp->d_b_displ,
@@ -173,8 +171,8 @@
                                                mp->d_kappa_kl,
                                                mp->d_epsilon_trace_over_3,
                                                mp->d_b_epsilon_trace_over_3,
-                                               mp->NSPEC_AB,
-                                               d_debug);
+                                               mp->NSPEC_AB //,d_debug
+                                               );
   /*
   cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
   cudaFree(d_debug);
@@ -221,8 +219,8 @@
                                                            float* normal_z_noise,
                                                            float* Sigma_kl,
                                                            float deltat,
-                                                           int num_free_surface_faces,
-                                                           float* d_debug) {
+                                                           int num_free_surface_faces //,float* d_debug
+                                                           ) {
   int iface = blockIdx.x + blockIdx.y*gridDim.x;
 
   if(iface < num_free_surface_faces) {
@@ -264,18 +262,17 @@
 void FC_FUNC_(compute_kernels_strgth_noise_cu,
               COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer,
                                                     float* h_noise_surface_movie,
-                                                    int* num_free_surface_faces_f,
                                                     float* deltat) {
 
 TRACE("compute_kernels_strgth_noise_cu");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
-  int num_free_surface_faces = *num_free_surface_faces_f;
 
-  cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie,3*25*num_free_surface_faces*sizeof(float),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie,
+             3*25*(mp->num_free_surface_faces)*sizeof(float),cudaMemcpyHostToDevice);
 
 
-  int num_blocks_x = num_free_surface_faces;
+  int num_blocks_x = mp->num_free_surface_faces;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
     num_blocks_x = ceil(num_blocks_x/2.0);
@@ -286,7 +283,7 @@
   dim3 threads(25,1,1);
 
   // float* h_debug = (float*)calloc(128,sizeof(float));
-  float* d_debug;
+  //float* d_debug;
   // cudaMalloc((void**)&d_debug,128*sizeof(float));
   // cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
 
@@ -299,8 +296,8 @@
                                                                mp->d_normal_y_noise,
                                                                mp->d_normal_z_noise,
                                                                mp->d_Sigma_kl,*deltat,
-                                                               num_free_surface_faces,
-                                                               d_debug);
+                                                               mp->num_free_surface_faces //,d_debug
+                                                               );
 
   // cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
   // for(int i=0;i<8;i++) {
@@ -310,7 +307,6 @@
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("compute_kernels_strength_noise_cuda_kernel");
 #endif
-
 }
 
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu	2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu	2011-10-31 21:45:35 UTC (rev 19137)
@@ -92,9 +92,6 @@
       // gets associated, weighted jacobian
       jacobianw = abs_boundary_jacobian2Dw[INDEX2(NGLL2,igll,iface)];
 
-//daniel
-//if( igll == 0 ) printf("gpu: %i %i %i %i %i %e %e %e\n",i,j,k,ispec,iglob,rhol,kappal,jacobianw);
-
       // Sommerfeld condition
       atomicAdd(&potential_dot_dot_acoustic[iglob],-potential_dot_acoustic[iglob]*jacobianw/cpl/rhol);
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2011-10-31 21:45:35 UTC (rev 19137)
@@ -89,7 +89,7 @@
 
 #define INDEX4_PADDED(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*128
 
-//daniel: TODO check speed of alternatives
+//daniel: TODO -- check speed of these alternative definitions
 //#define INDEX2(xsize,x,y) x + (y)*xsize
 //#define INDEX3(xsize,ysize,x,y,z) x + xsize*(y + ysize*z)
 //#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*(z + zsize*i))
@@ -107,6 +107,8 @@
 
 void exit_on_cuda_error(char* kernel_name);
 
+void exit_on_error(char* info);
+
 /* ----------------------------------------------------------------------------------------------- */
 
 // cuda constant arrays
@@ -207,6 +209,11 @@
   float* d_station_seismo_field;
   float* h_station_seismo_field;
 
+  int nadj_rec_local;
+  float* d_adj_sourcearrays;
+  float* h_adj_sourcearrays_slice;
+  int* d_pre_computed_irec;
+
   // surface elements (to save for noise tomography and acoustic simulations)
   int* d_free_surface_ispec;
   int* d_free_surface_ijk;

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu	2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu	2011-10-31 21:45:35 UTC (rev 19137)
@@ -40,31 +40,6 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-__global__ void transfer_surface_to_host_kernel(int* free_surface_ispec,int* free_surface_ijk,
-                                                int num_free_surface_faces, int* ibool,
-                                                realw* displ, realw* noise_surface_movie) {
-  int igll = threadIdx.x;
-  int iface = blockIdx.x + blockIdx.y*gridDim.x;
-
-  // int id = tx + blockIdx.x*blockDim.x + blockIdx.y*blockDim.x*gridDim.x;
-
-  if(iface < num_free_surface_faces) {
-    int ispec = free_surface_ispec[iface]-1; //-1 for C-based indexing
-
-    int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
-    int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
-    int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
-
-    int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
-
-    noise_surface_movie[INDEX3(NDIM,NGLL2,0,igll,iface)] = displ[iglob*3];
-    noise_surface_movie[INDEX3(NDIM,NGLL2,1,igll,iface)] = displ[iglob*3+1];
-    noise_surface_movie[INDEX3(NDIM,NGLL2,2,igll,iface)] = displ[iglob*3+2];
-  }
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
 extern "C"
 void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){
 TRACE("fortranflush");
@@ -124,15 +99,43 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+__global__ void transfer_surface_to_host_kernel(int* free_surface_ispec,
+                                                int* free_surface_ijk,
+                                                int num_free_surface_faces,
+                                                int* ibool,
+                                                realw* displ,
+                                                realw* noise_surface_movie) {
+  int igll = threadIdx.x;
+  int iface = blockIdx.x + blockIdx.y*gridDim.x;
+
+  // int id = tx + blockIdx.x*blockDim.x + blockIdx.y*blockDim.x*gridDim.x;
+
+  if(iface < num_free_surface_faces) {
+    int ispec = free_surface_ispec[iface]-1; //-1 for C-based indexing
+
+    int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
+    int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
+    int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
+
+    int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+
+    noise_surface_movie[INDEX3(NDIM,NGLL2,0,igll,iface)] = displ[iglob*3];
+    noise_surface_movie[INDEX3(NDIM,NGLL2,1,igll,iface)] = displ[iglob*3+1];
+    noise_surface_movie[INDEX3(NDIM,NGLL2,2,igll,iface)] = displ[iglob*3+2];
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
 extern "C"
 void FC_FUNC_(transfer_surface_to_host,
               TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer_f,
-                                        realw* h_noise_surface_movie,
-                                        int* num_free_surface_faces) {
+                                        realw* h_noise_surface_movie) {
 TRACE("transfer_surface_to_host");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
-  int num_blocks_x = *num_free_surface_faces;
+
+  int num_blocks_x = mp->num_free_surface_faces;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
     num_blocks_x = ceil(num_blocks_x/2.0);
@@ -143,13 +146,13 @@
 
   transfer_surface_to_host_kernel<<<grid,threads>>>(mp->d_free_surface_ispec,
                                                     mp->d_free_surface_ijk,
-                                                    *num_free_surface_faces,
+                                                    mp->num_free_surface_faces,
                                                     mp->d_ibool,
                                                     mp->d_displ,
                                                     mp->d_noise_surface_movie);
 
   cudaMemcpy(h_noise_surface_movie,mp->d_noise_surface_movie,
-             3*25*(*num_free_surface_faces)*sizeof(realw),cudaMemcpyDeviceToHost);
+             3*25*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyDeviceToHost);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("transfer_surface_to_host");
@@ -167,9 +170,8 @@
                                                          realw* normal_y_noise,
                                                          realw* normal_z_noise,
                                                          realw* mask_noise,
-                                                         realw* free_surface_jacobian2Dw,
-                                                         realw* wgllwgll_xy,
-                                                         float* d_debug) {
+                                                         realw* free_surface_jacobian2Dw //,float* d_debug
+                                                         ) {
 
   int iface = blockIdx.x + gridDim.x*blockIdx.y; // surface element id
 
@@ -234,22 +236,24 @@
 extern "C"
 void FC_FUNC_(noise_read_add_surface_movie_cu,
               NOISE_READ_ADD_SURFACE_MOVIE_CU)(long* Mesh_pointer_f,
-                                                 realw* h_noise_surface_movie,
-                                                 int* num_free_surface_faces_f,
-                                                 int* NOISE_TOMOGRAPHYf) {
+                                               realw* h_noise_surface_movie,
+                                               int* NOISE_TOMOGRAPHYf) {
 TRACE("noise_read_add_surface_movie_cu");
 
   // EPIK_TRACER("noise_read_add_surface_movie_cu");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-  int num_free_surface_faces = *num_free_surface_faces_f;
   int NOISE_TOMOGRAPHY = *NOISE_TOMOGRAPHYf;
-  float* d_noise_surface_movie;
-  cudaMalloc((void**)&d_noise_surface_movie,3*25*num_free_surface_faces*sizeof(float));
-  cudaMemcpy(d_noise_surface_movie, h_noise_surface_movie,
-             3*25*num_free_surface_faces*sizeof(realw),cudaMemcpyHostToDevice);
 
-  int num_blocks_x = num_free_surface_faces;
+  //float* d_noise_surface_movie;
+  //cudaMalloc((void**)&d_noise_surface_movie,3*25*num_free_surface_faces*sizeof(float));
+  //cudaMemcpy(d_noise_surface_movie, h_noise_surface_movie,
+  //           3*25*num_free_surface_faces*sizeof(realw),cudaMemcpyHostToDevice);
+
+  cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie,
+             3*25*(mp->num_free_surface_faces)*sizeof(float),cudaMemcpyHostToDevice);
+
+  int num_blocks_x = mp->num_free_surface_faces;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
     num_blocks_x = ceil(num_blocks_x/2.0);
@@ -259,50 +263,46 @@
   dim3 threads(25,1,1);
 
   // float* h_debug = (float*)calloc(128,sizeof(float));
-  float* d_debug;
+  //float* d_debug;
   // cudaMalloc((void**)&d_debug,128*sizeof(float));
   // cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
 
   if(NOISE_TOMOGRAPHY == 2) { // add surface source to forward field
     noise_read_add_surface_movie_cuda_kernel<<<grid,threads>>>(mp->d_accel,
-                     mp->d_ibool,
-                     mp->d_free_surface_ispec,
-                     mp->d_free_surface_ijk,
-                     num_free_surface_faces,
-                     d_noise_surface_movie,
-                     mp->d_normal_x_noise,
-                     mp->d_normal_y_noise,
-                     mp->d_normal_z_noise,
-                     mp->d_mask_noise,
-                     mp->d_free_surface_jacobian2Dw,
-                     mp->d_wgllwgll_xy,
-                     d_debug);
+                                                               mp->d_ibool,
+                                                               mp->d_free_surface_ispec,
+                                                               mp->d_free_surface_ijk,
+                                                               mp->num_free_surface_faces,
+                                                               mp->d_noise_surface_movie,
+                                                               mp->d_normal_x_noise,
+                                                               mp->d_normal_y_noise,
+                                                               mp->d_normal_z_noise,
+                                                               mp->d_mask_noise,
+                                                               mp->d_free_surface_jacobian2Dw //,d_debug
+                                                               );
   }
   else if(NOISE_TOMOGRAPHY == 3) { // add surface source to adjoint (backward) field
     noise_read_add_surface_movie_cuda_kernel<<<grid,threads>>>(mp->d_b_accel,
-                     mp->d_ibool,
-                     mp->d_free_surface_ispec,
-                     mp->d_free_surface_ijk,
-                     num_free_surface_faces,
-                     d_noise_surface_movie,
-                     mp->d_normal_x_noise,
-                     mp->d_normal_y_noise,
-                     mp->d_normal_z_noise,
-                     mp->d_mask_noise,
-                     mp->d_free_surface_jacobian2Dw,
-                     mp->d_wgllwgll_xy,
-                     d_debug);
+                                                               mp->d_ibool,
+                                                               mp->d_free_surface_ispec,
+                                                               mp->d_free_surface_ijk,
+                                                               mp->num_free_surface_faces,
+                                                               mp->d_noise_surface_movie,
+                                                               mp->d_normal_x_noise,
+                                                               mp->d_normal_y_noise,
+                                                               mp->d_normal_z_noise,
+                                                               mp->d_mask_noise,
+                                                               mp->d_free_surface_jacobian2Dw //,d_debug
+                                                               );
   }
 
-
   // cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
   // for(int i=0;i<8;i++) {
   // printf("debug[%d]= %e\n",i,h_debug[i]);
   // }
   // MPI_Abort(MPI_COMM_WORLD,1);
+  //cudaFree(d_noise_surface_movie);
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("noise_read_add_surface_movie_cuda_kernel");
 #endif
-
-  cudaFree(d_noise_surface_movie);
 }

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2011-10-31 21:45:35 UTC (rev 19137)
@@ -221,7 +221,7 @@
 
 
 /* ----------------------------------------------------------------------------------------------- */
-//daniel
+//daniel: helper function
 /*
 __global__ void check_phase_ispec_kernel(int num_phase_ispec,
                                          int* phase_ispec,
@@ -290,7 +290,7 @@
 */
 
 /* ----------------------------------------------------------------------------------------------- */
-//daniel
+//daniel: helper function
 /*
 __global__ void check_ispec_is_kernel(int NSPEC_AB,
                                       int* ispec_is,
@@ -353,7 +353,7 @@
 }
 */
 /* ----------------------------------------------------------------------------------------------- */
-//daniel
+//daniel: helper function
 /*
 __global__ void check_array_ispec_kernel(int num_array_ispec,
                                          int* array_ispec,
@@ -481,7 +481,7 @@
   }
 
   // allocates mesh parameter structure
-  Mesh* mp = (Mesh*)malloc(sizeof(Mesh));
+  Mesh* mp = (Mesh*) malloc( sizeof(Mesh) );
   if (mp == NULL) exit_on_error("error allocating mesh pointer");
   *Mesh_pointer = (long)mp;
 
@@ -664,7 +664,10 @@
               PREPARE_SIM2_OR_3_CONST_DEVICE)(
                                               long* Mesh_pointer_f,
                                               int* islice_selected_rec,
-                                              int* islice_selected_rec_size) {
+                                              int* islice_selected_rec_size,
+                                              int* nadj_rec_local,
+                                              int* nrec,
+                                              int* myrank) {
 
 TRACE("prepare_sim2_or_3_const_device");
 
@@ -673,11 +676,44 @@
   // allocates arrays for receivers
   print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_islice_selected_rec,
                                      *islice_selected_rec_size*sizeof(int)),7001);
-
   // copies arrays to GPU device
   print_CUDA_error_if_any(cudaMemcpy(mp->d_islice_selected_rec,islice_selected_rec,
                                      *islice_selected_rec_size*sizeof(int),cudaMemcpyHostToDevice),7002);
 
+  // adjoint source arrays
+  mp->nadj_rec_local = *nadj_rec_local;
+  if( mp->nadj_rec_local > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_adj_sourcearrays,
+                                       (mp->nadj_rec_local)*3*125*sizeof(float)),7003);
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_pre_computed_irec,
+                                       (mp->nadj_rec_local)*sizeof(int)),7004);
+
+    // prepares local irec array:
+    // the irec_local variable needs to be precomputed (as
+    // h_pre_comp..), because normally it is in the loop updating accel,
+    // and due to how it's incremented, it cannot be parallelized
+    int* h_pre_computed_irec = (int*) malloc( (mp->nadj_rec_local)*sizeof(int) );
+    if( h_pre_computed_irec == NULL ) exit_on_error("prepare_sim2_or_3_const_device: h_pre_computed_irec not allocated\n");
+
+    int irec_local = 0;
+    for(int irec = 0; irec < *nrec; irec++) {
+      if(*myrank == islice_selected_rec[irec]) {
+        irec_local++;
+        h_pre_computed_irec[irec_local-1] = irec;
+      }
+    }
+    if( irec_local != mp->nadj_rec_local ) exit_on_error("prepare_sim2_or_3_const_device: irec_local not equal\n");
+    // copies values onto GPU
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_pre_computed_irec,h_pre_computed_irec,
+                                       (mp->nadj_rec_local)*sizeof(int),cudaMemcpyHostToDevice),7010);
+    free(h_pre_computed_irec);
+
+    // temporary array to prepare extracted source array values
+    mp->h_adj_sourcearrays_slice = (float*) malloc( (mp->nadj_rec_local)*3*125*sizeof(float) );
+    if( mp->h_adj_sourcearrays_slice == NULL ) exit_on_error("h_adj_sourcearrays_slice not allocated\n");
+
+  }
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("prepare_sim2_or_3_const_device");
 #endif
@@ -785,7 +821,7 @@
   if( mp->nrec_local > 0 ){
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_potential),
                                        mp->nrec_local*125*sizeof(float)),9107);
-    mp->h_station_seismo_potential = (float*)malloc(mp->nrec_local*125*sizeof(float));
+    mp->h_station_seismo_potential = (float*) malloc( mp->nrec_local*125*sizeof(float) );
     if( mp->h_station_seismo_potential == NULL) exit_on_error("error allocating h_station_seismo_potential");
   }
 
@@ -848,7 +884,7 @@
 
   // initializes kernel values to zero
   print_CUDA_error_if_any(cudaMemset(mp->d_rho_ac_kl,0,
-                                     125*mp->NSPEC_AB*sizeof(float)),9019);  
+                                     125*mp->NSPEC_AB*sizeof(float)),9019);
   print_CUDA_error_if_any(cudaMemset(mp->d_kappa_ac_kl,0,
                                      125*mp->NSPEC_AB*sizeof(float)),9020);
 
@@ -945,7 +981,8 @@
   if( mp->nrec_local > 0 ){
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_field),
                                      3*125*(mp->nrec_local)*sizeof(float)),8015);
-    mp->h_station_seismo_field = (float*)malloc(3*125*(mp->nrec_local)*sizeof(float));
+    mp->h_station_seismo_field = (float*) malloc( 3*125*(mp->nrec_local)*sizeof(float) );
+    if( mp->h_station_seismo_field == NULL) exit_on_error("h_station_seismo_field not allocated \n");
   }
 
   // absorbing conditions
@@ -1055,7 +1092,7 @@
 
   }
 
-  
+
   if( *OCEANS ){
     // oceans needs a free surface
     mp->num_free_surface_faces = *num_free_surface_faces;
@@ -1131,12 +1168,12 @@
 
   // initializes kernel values to zero
   print_CUDA_error_if_any(cudaMemset(mp->d_rho_kl,0,
-                                     125*mp->NSPEC_AB*sizeof(float)),8207);  
+                                     125*mp->NSPEC_AB*sizeof(float)),8207);
   print_CUDA_error_if_any(cudaMemset(mp->d_mu_kl,0,
-                                     125*mp->NSPEC_AB*sizeof(float)),8208);  
+                                     125*mp->NSPEC_AB*sizeof(float)),8208);
   print_CUDA_error_if_any(cudaMemset(mp->d_kappa_kl,0,
-                                     125*mp->NSPEC_AB*sizeof(float)),8209);  
-  
+                                     125*mp->NSPEC_AB*sizeof(float)),8209);
+
   // strains used for attenuation and kernel simulations
   if( *COMPUTE_AND_STORE_STRAIN ){
     // strains
@@ -1319,8 +1356,8 @@
                                        125*(mp->NSPEC_AB)*sizeof(float)),4401);
     // initializes kernel values to zero
     print_CUDA_error_if_any(cudaMemset(mp->d_Sigma_kl,0,
-                                       125*mp->NSPEC_AB*sizeof(float)),4403);  
-                                       
+                                       125*mp->NSPEC_AB*sizeof(float)),4403);
+
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -1520,7 +1557,14 @@
   } // ELASTIC_SIMULATION
 
   // purely adjoint & kernel array
-  if( *SIMULATION_TYPE == 2 || *SIMULATION_TYPE == 3 ) cudaFree(mp->d_islice_selected_rec);
+  if( *SIMULATION_TYPE == 2 || *SIMULATION_TYPE == 3 ){
+    cudaFree(mp->d_islice_selected_rec);
+    if(mp->nadj_rec_local > 0 ){
+      cudaFree(mp->d_adj_sourcearrays);
+      cudaFree(mp->d_pre_computed_irec);
+      free(mp->h_adj_sourcearrays_slice);
+    }
+  }
 
   // NOISE arrays
   if( *NOISE_TOMOGRAPHY > 0 ){

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c	2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c	2011-10-31 21:45:35 UTC (rev 19137)
@@ -86,10 +86,10 @@
   FILE* fp; int it;
   printf("Opening %s for analysis\n",filename);
   fp = fopen(filename,"rb");
-  char* errorstr;
+  //char* errorstr;
   if(fp == 0) {
-    errorstr = strerror(errno);
-    printf("FILE ERROR:%s\n",errorstr);
+    //errorstr = (char*) strerror(errno);
+    printf("FILE ERROR:%s\n",strerror(errno));
     perror("file error\n");
     exit(1);
   }  
@@ -165,18 +165,20 @@
 
   FILE* fp_cpu;
   fp_cpu = fopen(cpu_file,"rb");
-  char* errorstr;
+  //char* errorstr;
   if(fp_cpu == 0) {
-    errorstr = strerror(errno);
-    printf("CPU FILE ERROR:%s\n",errorstr);
+    //errorstr = (char*) strerror(errno);
+    //printf("CPU FILE ERROR:%s\n",errorstr);
+    printf("CPU FILE ERROR:%s\n",strerror(errno));    
     perror("cpu file error\n");
   }
   FILE* fp_gpu;
   fp_gpu = fopen(gpu_file,"rb");
 
   if(fp_gpu == NULL) {
-    errorstr = strerror(errno);
-    printf("GPU FILE ERROR:%s\n",errorstr);
+    //errorstr = (char*) strerror(errno);
+    //printf("GPU FILE ERROR:%s\n",errorstr);
+    printf("GPU FILE ERROR:%s\n",strerror(errno));    
     perror("gpu file error\n");
   }
   

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2011-10-31 21:45:35 UTC (rev 19137)
@@ -70,14 +70,18 @@
                                                 int* irec_master_noise_f, 
                                                 int* islice_selected_rec){}
 
-void FC_FUNC_(add_sources_sim_type_2_or_3,
-              ADD_SOURCES_SIM_TYPE_2_OR_3)(long* Mesh_pointer, 
+void FC_FUNC_(add_sources_el_sim_type_2_or_3,
+              ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer, 
                                            float* h_adj_sourcearrays,
-                                           int* size_adj_sourcearrays, int* ispec_is_inner,
-                                           int* phase_is_inner, int* ispec_selected_rec,
-                                           int* ibool,
-                                           int* myrank, int* nrec, int* time_index,
-                                           int* h_islice_selected_rec,int* nadj_rec_local,
+                                           int* phase_is_inner,
+                                           int* h_ispec_is_inner,
+                                           int* h_ispec_is_elastic,                                            
+                                           int* h_ispec_selected_rec,
+                                           int* myrank, 
+                                           int* nrec, 
+                                           int* time_index,
+                                           int* h_islice_selected_rec,
+                                           int* nadj_rec_local,
                                            int* NTSTEP_BETWEEN_READ_ADJSRC){}
 
 void FC_FUNC_(compute_add_sources_ac_cuda,
@@ -100,15 +104,17 @@
 
 void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
               ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer, 
-                                                         float* h_adj_sourcearrays,
-                                                         int* size_adj_sourcearrays, 
-                                                         int* phase_is_inner,
-                                                         int* myrank, 
-                                                         int* nrec, 
-                                                         int* pre_computed_index,
-                                                         int* h_islice_selected_rec,
-                                                         int* nadj_rec_local,
-                                                         int* NTSTEP_BETWEEN_ADJSRC){}
+                                               float* h_adj_sourcearrays,
+                                               int* phase_is_inner,
+                                               int* h_ispec_is_inner,
+                                               int* h_ispec_is_acoustic,
+                                               int* h_ispec_selected_rec,
+                                               int* myrank,
+                                               int* nrec,
+                                               int* time_index,
+                                               int* h_islice_selected_rec,
+                                               int* nadj_rec_local,
+                                               int* NTSTEP_BETWEEN_ADJSRC){}
 
 /* from compute_coupling_cuda.cu */
 							
@@ -232,9 +238,8 @@
 
 void FC_FUNC_(compute_kernels_strgth_noise_cu,
               COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer, 
-                                                    float* h_noise_surface_movie,
-                                                    int* num_free_surface_faces_f,
-						   float* deltat){}
+                                               float* h_noise_surface_movie,
+                                               float* deltat){}
 
 void FC_FUNC_(compute_kernels_acoustic_cuda,
               COMPUTE_KERNELS_ACOUSTIC_CUDA)(
@@ -301,13 +306,11 @@
 
 void FC_FUNC_(transfer_surface_to_host,
               TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer_f,
-                                        realw* h_noise_surface_movie,
-                                        int* num_free_surface_faces){}
+                                        realw* h_noise_surface_movie){}
 
 void FC_FUNC_(noise_read_add_surface_movie_cu,
               NOISE_READ_ADD_SURFACE_MOVIE_CU)(long* Mesh_pointer_f, 
                                                realw* h_noise_surface_movie, 
-                                               int* num_free_surface_faces_f,
                                                int* NOISE_TOMOGRAPHYf){}
 
 						
@@ -363,7 +366,10 @@
               PREPARE_SIM2_OR_3_CONST_DEVICE)(
                                               long* Mesh_pointer_f,
                                               int* islice_selected_rec,
-                                              int* islice_selected_rec_size){}
+                                              int* islice_selected_rec_size,
+                                              int* nadj_rec_local,
+                                              int* nrec,
+                                              int* myrank){}
 
 void FC_FUNC_(prepare_fields_acoustic_device,
               PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f, 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu	2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu	2011-10-31 21:45:35 UTC (rev 19137)
@@ -48,7 +48,8 @@
                                                             int* ibool,
                                                             float* station_seismo_field,
                                                             float* desired_field,
-                                                            int nrec_local,int* debug_index) {
+                                                            int nrec_local //,int* debug_index
+                                                            ) {
   int blockID = blockIdx.x + blockIdx.y*gridDim.x;
   if(blockID<nrec_local) {
     //int nodeID = threadIdx.x + blockID*blockDim.x;
@@ -98,7 +99,7 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-  int* d_debug_index;
+  //int* d_debug_index;
   //int* h_debug_index;
   //cudaMalloc((void**)&d_debug_index,125*sizeof(int));
   //h_debug_index = (int*)calloc(125,sizeof(int));
@@ -107,11 +108,12 @@
 
   // prepare field transfer array on device
   transfer_stations_fields_from_device_kernel<<<grid,threads>>>(mp->d_number_receiver_global,
-                d_ispec_selected,
-                mp->d_ibool,
-                mp->d_station_seismo_field,
-                d_field,
-                mp->nrec_local,d_debug_index);
+                                                                d_ispec_selected,
+                                                                mp->d_ibool,
+                                                                mp->d_station_seismo_field,
+                                                                d_field,
+                                                                mp->nrec_local //,d_debug_index
+                                                                );
 
   //cudaMemcpy(h_debug_index,d_debug_index,125*sizeof(int),cudaMemcpyDeviceToHost);
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/meshfem3D/check_mesh_quality.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/meshfem3D/check_mesh_quality.f90	2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/meshfem3D/check_mesh_quality.f90	2011-10-31 21:45:35 UTC (rev 19137)
@@ -90,6 +90,7 @@
   !logical :: USE_OPENDX
 
   !character(len=256):: line
+  integer,dimension(1) :: tmp_ispec_max_skewness,tmp_ispec_max_skewness_MPI
 
 
   if (myrank == 0) then
@@ -167,7 +168,8 @@
 
 
   if((myrank == skewness_max_rank) .and. (myrank /= 0)) then
-     call send_i_t(ispec_max_skewness,1,0)
+     tmp_ispec_max_skewness(1) = ispec_max_skewness
+     call send_i_t(tmp_ispec_max_skewness,1,0)
   end if
 
 
@@ -175,7 +177,8 @@
 
 
      if(skewness_max_rank /= myrank) then
-        call recv_i_t(ispec_max_skewness_MPI,1,skewness_max_rank)
+        call recv_i_t(tmp_ispec_max_skewness_MPI,1,skewness_max_rank)
+        ispec_max_skewness_MPI = tmp_ispec_max_skewness_MPI(1)
      else
         ispec_max_skewness_MPI = ispec_max_skewness
      end if

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/check_mesh_resolution.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/check_mesh_resolution.f90	2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/check_mesh_resolution.f90	2011-10-31 21:45:35 UTC (rev 19137)
@@ -71,6 +71,7 @@
 
 
   logical :: has_vs_zero
+  real(kind=CUSTOM_REAL),dimension(1) :: tmp_val
 
   ! initializations
   if( DT <= 0.0d0) then
@@ -296,13 +297,16 @@
       model_speed_max = vsmax_glob
     endif
   endif
-  call bcast_all_cr(model_speed_max,1)
+  tmp_val(1) = model_speed_max
+  call bcast_all_cr(tmp_val,1)
+  model_speed_max = tmp_val(1)
 
   ! returns minimum period
   if( myrank == 0 ) min_resolved_period = pmax_glob
-  call bcast_all_cr(min_resolved_period,1)
+  tmp_val(1) = min_resolved_period
+  call bcast_all_cr(tmp_val,1)
+  min_resolved_period = tmp_val(1)
 
-
   end subroutine check_mesh_resolution
 
 !

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/write_c_binary.c
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/write_c_binary.c	2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/write_c_binary.c	2011-10-31 21:45:35 UTC (rev 19137)
@@ -137,7 +137,7 @@
   }
 
 /*
-//daniel: debug checks file size
+//debug checks file size
 // see: 
 //https://www.securecoding.cert.org/confluence/display/seccode/FIO19-C.+Do+not+use+fseek()+and+ftell()+to+compute+the+size+of+a+file
   printf("file size: %lld \n",*filesize);
@@ -253,7 +253,7 @@
   ret = 0;
 
 /*
-//daniel: debug
+//debug
   float dat[*length/4];
   memcpy(dat,buffer,*length);
   printf("buffer length: %d %d\n",*length,*index);
@@ -295,10 +295,8 @@
     }
   }
 
-//daniel: debug
-//  printf("buffer done length: %d %d\n",donelen,*length);
-  
-
+  //debug
+  //  printf("buffer done length: %d %d\n",donelen,*length);
 }
 
 //void
@@ -355,7 +353,7 @@
   }
 
 /*
-//daniel: debug
+// debug
   printf("position: %lld %d %d \n",pos,*length,*index);
   printf("buffer done length: %d %d\n",donelen,*length);
   float dat[*length/4];
@@ -367,7 +365,6 @@
     printf("return buffer: %d %e \n",i,dat[i]);
   }
 */
-  
 }
 
 
@@ -428,7 +425,7 @@
   }
 
   /*
-   // daniel: debug check filesize below or above 2 GB
+   // debug check filesize below or above 2 GB
    //            filesize gives bytes needed; 4-byte integer limited to +- 2,147,483,648 bytes ~ 2 GB
    float s = *filesize / 1024. / 1024. / 1024.;
    if( s > 2.0 ){

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_acoustic.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_acoustic.f90	2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_acoustic.f90	2011-10-31 21:45:35 UTC (rev 19137)
@@ -339,12 +339,7 @@
 
       if( it < NSTEP ) then
         ! receivers act as sources
-        if( GPU_MODE) then
-          call add_sources_ac_sim_2_or_3_cuda(Mesh_pointer, adj_sourcearrays, &
-                 size(adj_sourcearrays), phase_is_inner, myrank, nrec, &
-                 NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC),&
-                 islice_selected_rec, nadj_rec_local, NTSTEP_BETWEEN_READ_ADJSRC)
-        else
+        if( .NOT. GPU_MODE ) then
           irec_local = 0
           do irec = 1,nrec
             ! add the source (only if this proc carries the source)
@@ -378,6 +373,15 @@
               endif
             endif
           enddo ! nrec
+        else
+          ! on GPU
+          call add_sources_ac_sim_2_or_3_cuda(Mesh_pointer,adj_sourcearrays,phase_is_inner, &
+                                             ispec_is_inner,ispec_is_acoustic, &
+                                             ispec_selected_rec,myrank,nrec, &
+                                             NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), &
+                                             islice_selected_rec,nadj_rec_local, &
+                                             NTSTEP_BETWEEN_READ_ADJSRC)
+
         endif ! GPU_MODE
       endif ! it
     endif ! nadj_rec_local > 0

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_elastic.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_elastic.f90	2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_elastic.f90	2011-10-31 21:45:35 UTC (rev 19137)
@@ -238,29 +238,32 @@
 ! adjoint simulations
   if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then
 
-     ! read in adjoint sources block by block (for memory consideration)
-     ! e.g., in exploration experiments, both the number of receivers (nrec) and
-     ! the number of time steps (NSTEP) are huge,
-     ! which may cause problems since we have a large array:
-     !   adj_sourcearrays(nadj_rec_local,NSTEP,NDIM,NGLLX,NGLLY,NGLLZ)
+    ! adds adjoint source in this partitions
+    if( nadj_rec_local > 0 ) then
 
-     ! figure out if we need to read in a chunk of the adjoint source at this timestep
-     it_sub_adj = ceiling( dble(it)/dble(NTSTEP_BETWEEN_READ_ADJSRC) )   !chunk_number
-     ibool_read_adj_arrays = (((mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC) == 0)) .and. (nadj_rec_local > 0))
+      ! read in adjoint sources block by block (for memory consideration)
+      ! e.g., in exploration experiments, both the number of receivers (nrec) and
+      ! the number of time steps (NSTEP) are huge,
+      ! which may cause problems since we have a large array:
+      !   adj_sourcearrays(nadj_rec_local,NSTEP,NDIM,NGLLX,NGLLY,NGLLZ)
 
-     ! needs to read in a new chunk/block of the adjoint source
-     ! note that for each partition, we divide it into two parts --- boundaries and interior --- indicated by 'phase_is_inner'
-     ! we first do calculations for the boudaries, and then start communication
-     ! with other partitions while calculate for the inner part
-     ! this must be done carefully, otherwise the adjoint sources may be added twice
-     if (ibool_read_adj_arrays .and. (.not. phase_is_inner)) then
+      ! figure out if we need to read in a chunk of the adjoint source at this timestep
+      it_sub_adj = ceiling( dble(it)/dble(NTSTEP_BETWEEN_READ_ADJSRC) )   !chunk_number
+      ibool_read_adj_arrays = (((mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC) == 0)) .and. (nadj_rec_local > 0))
 
+      ! needs to read in a new chunk/block of the adjoint source
+      ! note that for each partition, we divide it into two parts --- boundaries and interior --- indicated by 'phase_is_inner'
+      ! we first do calculations for the boudaries, and then start communication
+      ! with other partitions while calculate for the inner part
+      ! this must be done carefully, otherwise the adjoint sources may be added twice
+      if (ibool_read_adj_arrays .and. (.not. phase_is_inner)) then
+
         ! allocates temporary source array
         allocate(adj_sourcearray(NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLY,NGLLZ),stat=ier)
         if( ier /= 0 ) stop 'error allocating array adj_sourcearray'
 
         if (.not. SU_FORMAT) then
-!!! read ascii adjoint sources
+      !!! read ascii adjoint sources
            irec_local = 0
            do irec = 1, nrec
               ! compute source arrays
@@ -279,7 +282,7 @@
               endif
            enddo
         else
-!!! read SU adjoint sources
+      !!! read SU adjoint sources
            ! range of the block we need to read
            it_start = NSTEP - it_sub_adj*NTSTEP_BETWEEN_READ_ADJSRC + 1
            it_end   = it_start + NTSTEP_BETWEEN_READ_ADJSRC - 1
@@ -322,49 +325,53 @@
 
         deallocate(adj_sourcearray)
 
-     endif ! if(ibool_read_adj_arrays)
+      endif ! if(ibool_read_adj_arrays)
 
-     if( it < NSTEP ) then
+      if( it < NSTEP ) then
 
         if(.NOT. GPU_MODE) then
 
-           ! receivers act as sources
-           irec_local = 0
-           do irec = 1,nrec
+          ! receivers act as sources
+          irec_local = 0
+          do irec = 1,nrec
 
-              ! add the source (only if this proc carries the source)
-              if (myrank == islice_selected_rec(irec)) then
-                 irec_local = irec_local + 1
+            ! add the source (only if this proc carries the source)
+            if (myrank == islice_selected_rec(irec)) then
+              irec_local = irec_local + 1
 
-                 ! checks if element is in phase_is_inner run
-                 if (ispec_is_inner(ispec_selected_rec(irec)) .eqv. phase_is_inner) then
+              ispec = ispec_selected_rec(irec)
+              if( ispec_is_elastic(ispec) ) then
 
-                    ! adds source array
-                    do k = 1,NGLLZ
-                       do j = 1,NGLLY
-                          do i = 1,NGLLX
-                             iglob = ibool(i,j,k,ispec_selected_rec(irec))
+                ! checks if element is in phase_is_inner run
+                if (ispec_is_inner(ispec_selected_rec(irec)) .eqv. phase_is_inner) then
 
-                             accel(:,iglob) = accel(:,iglob)  &
+                  ! adds source array
+                  do k = 1,NGLLZ
+                    do j = 1,NGLLY
+                      do i = 1,NGLLX
+                        iglob = ibool(i,j,k,ispec_selected_rec(irec))
+
+                        accel(:,iglob) = accel(:,iglob)  &
                                   + adj_sourcearrays(irec_local, &
                                   NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), &
                                   :,i,j,k)
-                          enddo
-                       enddo
+                      enddo
                     enddo
-
-                 endif ! phase_is_inner
-              endif
-           enddo ! nrec
+                  enddo
+                endif ! phase_is_inner
+              endif ! ispec_is_elastic
+            endif
+          enddo ! nrec
         else ! GPU_MODE == .true.
-           call add_sources_sim_type_2_or_3(Mesh_pointer, adj_sourcearrays, &
-                                          size(adj_sourcearrays), ispec_is_inner,&
-                                          phase_is_inner, ispec_selected_rec,ibool,myrank, nrec, &
-                                          NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC),&
-                                          islice_selected_rec, nadj_rec_local, NTSTEP_BETWEEN_READ_ADJSRC)
+           call add_sources_el_sim_type_2_or_3(Mesh_pointer,adj_sourcearrays,phase_is_inner, &
+                                            ispec_is_inner,ispec_is_elastic, &
+                                            ispec_selected_rec,myrank,nrec, &
+                                            NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), &
+                                            islice_selected_rec,nadj_rec_local, &
+                                            NTSTEP_BETWEEN_READ_ADJSRC)
         endif ! GPU_MODE
-     endif ! it
-
+      endif ! it
+    endif ! nadj_rec_local
   endif !adjoint
 
 ! note:  b_displ() is read in after Newark time scheme, thus

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90	2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90	2011-10-31 21:45:35 UTC (rev 19137)
@@ -443,9 +443,7 @@
 
 ! distributes routines according to chosen NGLLX in constants.h
 
-!daniel
-! note:
-! i put it here rather than in compute_forces_elastic_Dev.f90 because compiler complains that:
+!daniel: note -- i put it here rather than in compute_forces_elastic_Dev.f90 because compiler complains that:
 ! " The storage extent of the dummy argument exceeds that of the actual argument. "
 
 subroutine compute_forces_elastic_Dev_sim1(iphase)

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/noise_tomography.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/noise_tomography.f90	2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/noise_tomography.f90	2011-10-31 21:45:35 UTC (rev 19137)
@@ -45,14 +45,14 @@
 
 module user_noise_distribution
 
-!daniel: TODO -- this default value will produce errors
-!            when using with the default example/noise_tomography
+!daniel: TODO -- setting USE_PIERO_DISTRIBUTION = .true. will produce errors
+!            when using with the default example in "example/noise_tomography/"
 !            i left it here so that Max can run his example without changing this every time...
   logical,parameter :: USE_PIERO_DISTRIBUTION = .true.
 
 contains
 
-! wrapper function  
+! wrapper function
 ! this subroutine must be modified by USERS for their own noise distribution
 
   subroutine noise_distribution_direction(xcoord_in,ycoord_in,zcoord_in, &
@@ -116,7 +116,7 @@
   ! dummy assign to avoid compiler warnings
   ldummy = xcoord_in
   ldummy = ycoord_in
-  ldummy = zcoord_in  
+  ldummy = zcoord_in
 
   end subroutine noise_distribution_direction_d
 
@@ -208,7 +208,7 @@
 
   end subroutine noise_distribution_dir_non_uni
 
-  
+
 end module user_noise_distribution
 
 
@@ -250,7 +250,7 @@
 
   logical, dimension(NSPEC_AB_VAL) :: ispec_is_acoustic
 
-!daniel: from global code...
+  !from global code...
   !integer, dimension(NSPEC2D_TOP_VAL) :: ibelm_top ! equals free_surface_ispec
   !integer :: NSPEC2D_TOP_VAL ! equals num_free_surface_faces
   !integer :: nspec_top ! equals num_free_surface_faces
@@ -297,7 +297,7 @@
   ! noise distribution and noise direction
   ipoin = 0
 
-  !daniel: from global code, carefull: ngllz must not be face on top...
+  !from global code, carefull: ngllz must not be face on top...
   !  do ispec2D = 1, nspec_top
   !    ispec = ibelm_top(ispec2D)
   !    k = NGLLZ
@@ -321,13 +321,13 @@
 
         ipoin = ipoin + 1
         iglob = ibool(i,j,k,ispec)
-        
+
         ! this subroutine must be modified by USERS in module user_noise_distribution
         call noise_distribution_direction(xstore(iglob), &
                                           ystore(iglob),zstore(iglob), &
                                           normal_x_noise_out,normal_y_noise_out,normal_z_noise_out, &
-                                          mask_noise_out)        
-        
+                                          mask_noise_out)
+
         normal_x_noise(ipoin) = normal_x_noise_out
         normal_y_noise(ipoin) = normal_y_noise_out
         normal_z_noise(ipoin) = normal_z_noise_out
@@ -653,7 +653,7 @@
   real(kind=CUSTOM_REAL),dimension(NDIM,NGLLSQUARE,num_free_surface_faces) :: noise_surface_movie
   integer(kind=8) :: Mesh_pointer
   logical :: GPU_MODE
-  
+
   ! writes out wavefield at surface
   if( num_free_surface_faces > 0 ) then
 
@@ -675,7 +675,7 @@
        enddo
     ! TODO: Check if transfer_surface_to_hose is compatible with newer version above
     else ! GPU_MODE == 1
-       call transfer_surface_to_host(Mesh_pointer, noise_surface_movie, num_free_surface_faces)
+       call transfer_surface_to_host(Mesh_pointer,noise_surface_movie)
     endif
 
     ! save surface motion to disk
@@ -743,8 +743,7 @@
     call read_abs(2,noise_surface_movie,CUSTOM_REAL*NDIM*NGLLSQUARE*num_free_surface_faces,it)
 
     if(GPU_MODE) then
-       call noise_read_add_surface_movie_cu(Mesh_pointer, noise_surface_movie,&
-                                            num_free_surface_faces,NOISE_TOMOGRAPHY)
+       call noise_read_add_surface_movie_cu(Mesh_pointer, noise_surface_movie,NOISE_TOMOGRAPHY)
     else ! GPU_MODE==0
 
        ! get coordinates of surface mesh and surface displacement
@@ -780,7 +779,7 @@
     endif ! GPU_MODE
 
   endif
-  
+
   end subroutine noise_read_add_surface_movie
 
 
@@ -870,7 +869,7 @@
        enddo
 
     else ! GPU_MODE==1
-       call compute_kernels_strgth_noise_cu(Mesh_pointer, noise_surface_movie,num_free_surface_faces,deltat)
+       call compute_kernels_strgth_noise_cu(Mesh_pointer,noise_surface_movie,deltat)
     endif ! GPU_MODE
 
   endif

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2011-10-31 21:45:35 UTC (rev 19137)
@@ -603,7 +603,7 @@
         filesize = b_reclen_potential
         filesize = filesize*NSTEP
 
-        ! daniel: debug check size limit
+        ! debug check size limit
         !if( NSTEP > 2147483647 / b_reclen_potential ) then
         !  print *,'file size needed exceeds integer 4-byte limit: ',b_reclen_potential,NSTEP
         !  print *,'  ',CUSTOM_REAL, NGLLSQUARE, num_abs_boundary_faces,NSTEP
@@ -842,7 +842,8 @@
   ! prepares needed receiver array for adjoint runs
   if( SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3 ) &
     call prepare_sim2_or_3_const_device(Mesh_pointer, &
-                                  islice_selected_rec,size(islice_selected_rec))
+                                       islice_selected_rec,size(islice_selected_rec), &
+                                       nadj_rec_local,nrec,myrank)
 
   ! prepares fields on GPU for noise simulations
   if ( NOISE_TOMOGRAPHY > 0 ) then

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/read_mesh_databases.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/read_mesh_databases.f90	2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/read_mesh_databases.f90	2011-10-31 21:45:35 UTC (rev 19137)
@@ -323,7 +323,7 @@
 ! locate inner and outer elements
   call rmd_setup_inner_outer_elemnts()
 
-!daniel: todo mesh coloring
+!daniel: TODO -- mesh coloring
 !  call rmd_setup_color_perm()
 
 ! gets model dimensions
@@ -513,7 +513,7 @@
 !-------------------------------------------------------------------------------------------------
 !
 
-!daniel: todo mesh coloring
+!daniel: TODO -- mesh coloring
   subroutine rmd_setup_color_perm()
 
   use specfem_par
@@ -697,7 +697,7 @@
 
   ! sorts array according to permutation
   ! SORT_MESH_INNER_OUTER
-!daniel
+!daniel: TODO -- mesh permutation
   if( .false. ) then
 
     ! permutation of ibool

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/write_seismograms.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/write_seismograms.f90	2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/write_seismograms.f90	2011-10-31 21:45:35 UTC (rev 19137)
@@ -804,7 +804,7 @@
  implicit none
 
  character(len=256) procname,final_LOCAL_PATH
- integer :: irec_local,irec 
+ integer :: irec_local,irec
 
  ! headers
  integer,parameter :: nheader=240      ! 240 bytes