[cig-commits] r20365 - in seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src: cuda decompose_mesh_SCOTCH generate_databases shared specfem3D

rietmann at geodynamics.org rietmann at geodynamics.org
Wed Jun 13 05:54:51 PDT 2012


Author: rietmann
Date: 2012-06-13 05:54:51 -0700 (Wed, 13 Jun 2012)
New Revision: 20365

Modified:
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/decompose_mesh_SCOTCH/decompose_mesh_SCOTCH.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/get_model.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/constants.h.in
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.F90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/write_seismograms.f90
Log:

This commit merges several optimization changes back into the working
svn sunflower release including:

(1) Texture memory for accel and displ field
(2) __device__ instead of __constant__ memory
(3) asyncMemcopy to further hide communications, which is paired with similar MPI routines.
(4) improvements to Kernel_2_impl

There seem to be several functional regressions in the trunk, which
are commented out in this commit. The CPU version (i.e.,
GPU_MODE=.false.) does not seem to run.

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu	2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu	2012-06-13 12:54:51 UTC (rev 20365)
@@ -125,7 +125,7 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(5,5,5);
 
-  compute_add_sources_kernel<<<grid,threads>>>(mp->d_accel,
+  compute_add_sources_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,
                                                mp->d_ibool,
                                                mp->d_ispec_is_inner,
                                                phase_is_inner,
@@ -175,7 +175,7 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(5,5,5);
 
-  compute_add_sources_kernel<<<grid,threads>>>(mp->d_b_accel,mp->d_ibool,
+  compute_add_sources_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel,mp->d_ibool,
                                                mp->d_ispec_is_inner, *phase_is_inner,
                                                mp->d_sourcearrays,
                                                mp->d_stf_pre_compute,
@@ -237,7 +237,7 @@
   dim3 threads(NGLL3,1,1);
 
   if(myrank == islice_selected_rec[irec_master_noise-1]) {
-    add_source_master_rec_noise_cuda_kernel<<<grid,threads>>>(mp->d_ibool,
+    add_source_master_rec_noise_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_ibool,
                                                               mp->d_ispec_selected_rec,
                                                               irec_master_noise,
                                                               mp->d_accel,
@@ -404,7 +404,7 @@
   // h_pre_comp..), because normally it is in the loop updating accel,
   // and due to how it's incremented, it cannot be parallelized
 
-  add_sources_el_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_accel,
+  add_sources_el_SIM_TYPE_2_OR_3_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,
                                                          *nrec,
                                                          mp->d_adj_sourcearrays,
                                                          mp->d_ibool,

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu	2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu	2012-06-13 12:54:51 UTC (rev 20365)
@@ -39,19 +39,22 @@
 
 
 //  cuda constant arrays
-__constant__ realw d_hprime_xx[NGLL2];
-__constant__ realw d_hprime_yy[NGLL2];
-__constant__ realw d_hprime_zz[NGLL2];
-__constant__ realw d_hprimewgll_xx[NGLL2];
-__constant__ realw d_hprimewgll_yy[NGLL2];
-__constant__ realw d_hprimewgll_zz[NGLL2];
-__constant__ realw d_wgllwgll_xy[NGLL2];
-__constant__ realw d_wgllwgll_xz[NGLL2];
-__constant__ realw d_wgllwgll_yz[NGLL2];
+__device__ realw d_hprime_xx[NGLL2];
 
+// only needed if NGLLX != NGLLY != NGLLZ
+// __device__ realw d_hprime_yy[NGLL2];
+// __device__ realw d_hprime_zz[NGLL2];
+__device__ realw d_hprimewgll_xx[NGLL2];
+__device__ realw d_hprimewgll_yy[NGLL2];
+__device__ realw d_hprimewgll_zz[NGLL2];
+__device__ realw d_wgllwgll_xy[NGLL2];
+__device__ realw d_wgllwgll_xz[NGLL2];
+__device__ realw d_wgllwgll_yz[NGLL2];
+
 __constant__ realw d_wgll_cube[NGLL3]; // needed only for gravity case
 
-
+// prototype for the fortran function to do non-blocking mpi send
+extern "C" void assemble_mpi_vector_send_cuda_(void*,void*,void*,void*,void*,void*,void*,void*,void*);
 /* ----------------------------------------------------------------------------------------------- */
 
 // prepares a device array with with all inter-element edge-nodes -- this
@@ -116,14 +119,14 @@
   // cudaEventCreate(&stop);
   // cudaEventRecord( start, 0 );
   if(*FORWARD_OR_ADJOINT == 1) {
-    prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel,mp->d_send_accel_buffer,
+    prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,mp->d_send_accel_buffer,
                                                        mp->num_interfaces_ext_mesh,
                                                        mp->max_nibool_interfaces_ext_mesh,
                                                        mp->d_nibool_interfaces_ext_mesh,
                                                        mp->d_ibool_interfaces_ext_mesh);
   }
   else if(*FORWARD_OR_ADJOINT == 3) {
-    prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel,mp->d_send_accel_buffer,
+    prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel,mp->d_send_accel_buffer,
                                                        mp->num_interfaces_ext_mesh,
                                                        mp->max_nibool_interfaces_ext_mesh,
                                                        mp->d_nibool_interfaces_ext_mesh,
@@ -148,6 +151,41 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+extern "C" void FC_FUNC_(transfer_boundary_from_device_asynchronously,TRANSFER_BOUNDARY_FROM_DEVICE_ASYNCHRONOUSLY)(long* Mesh_pointer,int* nspec_outer_elastic) {
+
+  TRACE("transfer_boundary_from_device_asynchronously");
+  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+
+  int num_blocks_x = *nspec_outer_elastic;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  int blocksize = NGLL3_PADDED;
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,mp->d_send_accel_buffer,
+                                                                          mp->num_interfaces_ext_mesh,
+                                                                          mp->max_nibool_interfaces_ext_mesh,
+                                                                          mp->d_nibool_interfaces_ext_mesh,
+                                                                          mp->d_ibool_interfaces_ext_mesh);
+  // wait until kernel is finished before starting async memcpy
+  cudaDeviceSynchronize();
+
+  cudaMemcpyAsync(mp->h_send_accel_buffer,mp->d_send_accel_buffer,
+                  3* mp->max_nibool_interfaces_ext_mesh* mp->num_interfaces_ext_mesh*sizeof(realw),
+                  cudaMemcpyDeviceToHost,mp->copy_stream);
+  // cudaMemcpyAsync(mp->h_send_accel_buffer,mp->d_send_accel_buffer,
+  // 3* mp->max_nibool_interfaces_ext_mesh* mp->num_interfaces_ext_mesh*sizeof(realw),
+  // cudaMemcpyDeviceToHost,mp->compute_stream);
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
 __global__ void assemble_boundary_accel_on_device(realw* d_accel, realw* d_send_accel_buffer,
                                                   int num_interfaces_ext_mesh,
                                                   int max_nibool_interfaces_ext_mesh,
@@ -190,6 +228,97 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+extern "C"
+void FC_FUNC_(transfer_boundary_to_device_asynchronously,TRANSFER_BOUNDARY_TO_DEVICE_ASYNCHRONOUSLY)(long* Mesh_pointer,
+                                                                           realw* buffer_recv_vector_ext_mesh,
+                                                                           int* num_interfaces_ext_mesh,
+                                                                           int* max_nibool_interfaces_ext_mesh) {
+
+  TRACE("transfer_boundary_to_device_asynchronously");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  memcpy(mp->h_recv_accel_buffer,buffer_recv_vector_ext_mesh,mp->size_mpi_recv_buffer*sizeof(realw));
+
+  // cudaMemcpyAsync(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
+  // 3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw),
+  // cudaMemcpyHostToDevice,mp->compute_stream);
+  printf("xfer to device\n");
+  cudaMemcpyAsync(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
+                  3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw),
+                  cudaMemcpyHostToDevice,mp->copy_stream);
+
+
+
+
+}
+
+
+extern "C"
+void FC_FUNC_(assemble_accel_on_device,
+              ASSEMBLE_ACCEL_on_DEVICE)(long* Mesh_pointer, realw* accel,
+                                              realw* buffer_recv_vector_ext_mesh,
+                                              int* num_interfaces_ext_mesh,
+                                              int* max_nibool_interfaces_ext_mesh,
+                                              int* nibool_interfaces_ext_mesh,
+                                              int* ibool_interfaces_ext_mesh,
+                                              int* FORWARD_OR_ADJOINT) {
+  TRACE("assemble_accel_on_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  int blocksize = BLOCKSIZE_TRANSFER;
+  int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
+  int num_blocks_x = size_padded/blocksize;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  //double start_time = get_time();
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+  // cudaEvent_t start, stop;
+  // realw time;
+  // cudaEventCreate(&start);
+  // cudaEventCreate(&stop);
+  // cudaEventRecord( start, 0 );
+
+
+  // ***************************************************************************
+  // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
+  cudaStreamSynchronize(mp->copy_stream);
+
+  // Assembling on the copy_stream breaks the solution and it "blows up"
+  if(*FORWARD_OR_ADJOINT == 1) { //assemble forward accel
+    assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel, mp->d_send_accel_buffer,
+                                                                             mp->num_interfaces_ext_mesh,
+                                                                             mp->max_nibool_interfaces_ext_mesh,
+                                                                             mp->d_nibool_interfaces_ext_mesh,
+                                                                             mp->d_ibool_interfaces_ext_mesh);
+  }
+  else if(*FORWARD_OR_ADJOINT == 3) { //assemble adjoint accel
+    assemble_boundary_accel_on_device<<<grid,threads,0,mp->copy_stream>>>(mp->d_b_accel, mp->d_send_accel_buffer,
+                                                        mp->num_interfaces_ext_mesh,
+                                                        mp->max_nibool_interfaces_ext_mesh,
+                                                        mp->d_nibool_interfaces_ext_mesh,
+                                                        mp->d_ibool_interfaces_ext_mesh);
+  }
+
+  // cudaEventRecord( stop, 0 );
+  // cudaEventSynchronize( stop );
+  // cudaEventElapsedTime( &time, start, stop );
+  // cudaEventDestroy( start );
+  // cudaEventDestroy( stop );
+  // printf("Boundary Assemble Kernel Execution Time: %f ms\n",time);
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  exit_on_cuda_error("transfer_asmbl_accel_to_device");
+#endif
+}
+
 // FORWARD_OR_ADJOINT == 1 for accel, and == 3 for b_accel
 extern "C"
 void FC_FUNC_(transfer_asmbl_accel_to_device,
@@ -225,14 +354,14 @@
   // cudaEventCreate(&stop);
   // cudaEventRecord( start, 0 );
   if(*FORWARD_OR_ADJOINT == 1) { //assemble forward accel
-    assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel, mp->d_send_accel_buffer,
+    assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel, mp->d_send_accel_buffer,
                                                         mp->num_interfaces_ext_mesh,
                                                         mp->max_nibool_interfaces_ext_mesh,
                                                         mp->d_nibool_interfaces_ext_mesh,
                                                         mp->d_ibool_interfaces_ext_mesh);
   }
   else if(*FORWARD_OR_ADJOINT == 3) { //assemble adjoint accel
-    assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel, mp->d_send_accel_buffer,
+    assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel, mp->d_send_accel_buffer,
                                                         mp->num_interfaces_ext_mesh,
                                                         mp->max_nibool_interfaces_ext_mesh,
                                                         mp->d_nibool_interfaces_ext_mesh,
@@ -497,7 +626,15 @@
 // double precision temporary variables leads to 10% performance
 // decrease in Kernel_2_impl (not very much..)
 //typedef realw reald;
+#ifdef USE_TEXTURES_FIELDS
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_displ_tex;
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_accel_tex;
+#endif
 
+#ifdef USE_TEXTURES_CONSTANTS
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_hprime_xx_tex;
+#endif
+
 __global__ void Kernel_2_impl(int nb_blocks_to_compute,
                               int NGLOB,
                               int* d_ibool,
@@ -599,6 +736,8 @@
     __shared__ reald s_tempz2[NGLL3];
     __shared__ reald s_tempz3[NGLL3];
 
+    __shared__ reald sh_hprime_xx[NGLL2];
+
 // use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads,
 // because we used memory padding from NGLL^3 = 125 to 128 to get coalescent memory accesses
     active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0;
@@ -622,10 +761,10 @@
       // iglob = d_ibool[working_element*NGLL3_ALIGN + tx]-1;
       iglob = d_ibool[working_element*NGLL3 + tx]-1;
 
-#ifdef USE_TEXTURES
-      s_dummyx_loc[tx] = tex1Dfetch(tex_displ, iglob);
-      s_dummyy_loc[tx] = tex1Dfetch(tex_displ, iglob + NGLOB);
-      s_dummyz_loc[tx] = tex1Dfetch(tex_displ, iglob + 2*NGLOB);
+#ifdef USE_TEXTURES_FIELDS
+      s_dummyx_loc[tx] = tex1Dfetch(d_displ_tex, iglob*3);
+      s_dummyy_loc[tx] = tex1Dfetch(d_displ_tex, iglob*3 + 1);
+      s_dummyz_loc[tx] = tex1Dfetch(d_displ_tex, iglob*3 + 2);
 #else
       // changing iglob indexing to match fortran row changes fast style
       s_dummyx_loc[tx] = d_displ[iglob*3];
@@ -634,13 +773,19 @@
 #endif
     }
 
+    if (tx < NGLL2) {
+      #ifdef USE_TEXTURES_CONSTANTS
+      sh_hprime_xx[tx] = tex1Dfetch(d_hprime_xx_tex,tx);
+      #else
+      sh_hprime_xx[tx] = d_hprime_xx[tx];
+      #endif
+    }
+
 // synchronize all the threads (one thread for each of the NGLL grid points of the
 // current spectral element) because we need the whole element to be ready in order
 // to be able to compute the matrix products along cut planes of the 3D element below
     __syncthreads();
 
-#ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
-
     if (active) {
 
 #ifndef MANUALLY_UNROLLED_LOOPS
@@ -658,19 +803,19 @@
       tempz3l = 0.f;
 
       for (l=0;l<NGLLX;l++) {
-          hp1 = d_hprime_xx[l*NGLLX+I];
+          hp1 = sh_hprime_xx[l*NGLLX+I];
           offset = K*NGLL2+J*NGLLX+l;
           tempx1l += s_dummyx_loc[offset]*hp1;
           tempy1l += s_dummyy_loc[offset]*hp1;
           tempz1l += s_dummyz_loc[offset]*hp1;
 
-          hp2 = d_hprime_xx[l*NGLLX+J];
+          hp2 = sh_hprime_xx[l*NGLLX+J];
           offset = K*NGLL2+l*NGLLX+I;
           tempx2l += s_dummyx_loc[offset]*hp2;
           tempy2l += s_dummyy_loc[offset]*hp2;
           tempz2l += s_dummyz_loc[offset]*hp2;
 
-          hp3 = d_hprime_xx[l*NGLLX+K];
+          hp3 = sh_hprime_xx[l*NGLLX+K];
           offset = l*NGLL2+J*NGLLX+I;
           tempx3l += s_dummyx_loc[offset]*hp3;
           tempy3l += s_dummyy_loc[offset]*hp3;
@@ -1008,29 +1153,40 @@
         sum_terms3 += rho_s_H3;
       }
 
-#ifdef USE_TEXTURES
-      d_accel[iglob] = tex1Dfetch(tex_accel, iglob) + sum_terms1);
-      d_accel[iglob + NGLOB] = tex1Dfetch(tex_accel, iglob + NGLOB) + sum_terms2);
-      d_accel[iglob + 2*NGLOB] = tex1Dfetch(tex_accel, iglob + 2*NGLOB) + sum_terms3);
-#else
-  /* OLD/To be implemented version that uses coloring to get around race condition. About 1.6x faster */
 
-
 #ifdef USE_MESH_COLORING_GPU
       // no atomic operation needed, colors don't share global points between elements
+
+#ifdef USE_TEXTURES_FIELDS
+      d_accel[iglob*3]     = tex1Dfetch(d_accel_tex, iglob*3) + sum_terms1;
+      d_accel[iglob*3 + 1] = tex1Dfetch(d_accel_tex, iglob*3 + 1) + sum_terms2;
+      d_accel[iglob*3 + 2] = tex1Dfetch(d_accel_tex, iglob*3 + 2) + sum_terms3;
+#else
       d_accel[iglob*3]     += sum_terms1;
       d_accel[iglob*3 + 1] += sum_terms2;
       d_accel[iglob*3 + 2] += sum_terms3;
-#else
+#endif // USE_TEXTURES_FIELDS
+
+#else // MESH_COLORING
       //mesh coloring
       if( use_mesh_coloring_gpu ){
 
-       // no atomic operation needed, colors don't share global points between elements
+        // no atomic operation needed, colors don't share global points between elements
+        // d_accel[iglob*3]     += sum_terms1;
+        // d_accel[iglob*3 + 1] += sum_terms2;
+        // d_accel[iglob*3 + 2] += sum_terms3;
+#ifdef USE_TEXTURES_FIELDS
+        d_accel[iglob*3]     = tex1Dfetch(d_accel_tex, iglob*3) + sum_terms1;
+        d_accel[iglob*3 + 1] = tex1Dfetch(d_accel_tex, iglob*3 + 1) + sum_terms2;
+        d_accel[iglob*3 + 2] = tex1Dfetch(d_accel_tex, iglob*3 + 2) + sum_terms3;
+#else
         d_accel[iglob*3]     += sum_terms1;
         d_accel[iglob*3 + 1] += sum_terms2;
         d_accel[iglob*3 + 2] += sum_terms3;
+#endif // USE_TEXTURES_FIELDS
 
-      }else{
+      }
+      else {
 
         // for testing purposes only: w/out atomic updates
         //d_accel[iglob*3] -= (0.00000001f*tempx1l + 0.00000001f*tempx2l + 0.00000001f*tempx3l);
@@ -1041,11 +1197,11 @@
         atomicAdd(&d_accel[iglob*3+1], sum_terms2);
         atomicAdd(&d_accel[iglob*3+2], sum_terms3);
 
-      }
-#endif
+      } // if(use_mesh_coloring_gpu)
 
-#endif
+#endif // MESH_COLORING
 
+
       // update memory variables based upon the Runge-Kutta scheme
       if( ATTENUATION ){
         compute_element_att_memory(tx,working_element,NSPEC,
@@ -1068,14 +1224,9 @@
         epsilondev_yz[ijk_ispec] = epsilondev_yz_loc;
       }
 
-    }
+    } // if(active)
 
-#else  // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
-    d_accel[iglob] -= 0.00000001f;
-    d_accel[iglob + NGLOB] -= 0.00000001f;
-    d_accel[iglob + 2*NGLOB] -= 0.00000001f;
-#endif // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
-}
+} // kernel_2_impl()
 
 /* ----------------------------------------------------------------------------------------------- */
 
@@ -1167,7 +1318,7 @@
   // cudaEventCreate(&stop);
   // cudaEventRecord( start, 0 );
 
-  Kernel_2_impl<<<grid,threads>>>(nb_blocks_to_compute,
+  Kernel_2_impl<<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
                                   mp->NGLOB_AB,
                                   d_ibool,
                                   mp->d_phase_ispec_inner_elastic,
@@ -1222,7 +1373,7 @@
 
 
   if(SIMULATION_TYPE == 3) {
-    Kernel_2_impl<<< grid,threads>>>(nb_blocks_to_compute,
+    Kernel_2_impl<<< grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
                                      mp->NGLOB_AB,
                                      d_ibool,
                                      mp->d_phase_ispec_inner_elastic,
@@ -1500,6 +1651,33 @@
              mp->d_c66store,
              mp->d_rhostore);
   }
+
+  // Wait until async-memcpy of outer elements is finished and start MPI.
+  if(*iphase==2) {
+    cudaStreamSynchronize(mp->copy_stream);
+
+    // There have been problems using the pinned-memory with MPI, so
+    // we copy the buffer into a non-pinned region.
+    memcpy(mp->send_buffer,mp->h_send_accel_buffer,
+           mp->size_mpi_send_buffer*sizeof(float));
+
+    // memory copy is now finished, so non-blocking MPI send can proceed
+    // MPI based halo exchange
+
+    assemble_mpi_vector_send_cuda_(&(mp->NPROCS),
+                                   mp->send_buffer, /* "regular" memory */
+                                   // mp->h_send_accel_buffer, /* pinned memory **CRASH** */
+                                   mp->buffer_recv_vector_ext_mesh,
+                                   &mp->num_interfaces_ext_mesh,
+                                   &mp->max_nibool_interfaces_ext_mesh,
+                                   mp->nibool_interfaces_ext_mesh,
+                                   mp->my_neighbours_ext_mesh,
+                                   mp->request_send_vector_ext_mesh,
+                                   mp->request_recv_vector_ext_mesh);
+
+    // Decided to keep launching kernels and to wait for MPI & do memcpy while other kernels launch.
+    // cudaDeviceSynchronize();
+  }
 }
 
 
@@ -1519,6 +1697,10 @@
   /* because of block and grid sizing problems, there is a small */
   /* amount of buffer at the end of the calculation */
   if(id < size) {
+    realw new_accel = accel[id] * rmass[id / 3];
+    veloc[id] += deltatover2 * new_accel;
+    accel[id] = new_accel;
+/*
     accel[3*id] = accel[3*id]*rmass[id];
     accel[3*id+1] = accel[3*id+1]*rmass[id];
     accel[3*id+2] = accel[3*id+2]*rmass[id];
@@ -1526,6 +1708,7 @@
     veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
     veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1];
     veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2];
+*/
   }
 }
 
@@ -1539,9 +1722,12 @@
   /* because of block and grid sizing problems, there is a small */
   /* amount of buffer at the end of the calculation */
   if(id < size) {
+    accel[id] *= rmass[id / 3];
+/*
     accel[3*id] = accel[3*id]*rmass[id];
     accel[3*id+1] = accel[3*id+1]*rmass[id];
     accel[3*id+2] = accel[3*id+2]*rmass[id];
+*/
   }
 }
 
@@ -1575,7 +1761,8 @@
 TRACE("kernel_3_a_cuda");
 
    Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
-   int size = *size_F;
+   //int size = *size_F;
+   int size = *size_F * 3;
    int SIMULATION_TYPE = *SIMULATION_TYPE_f;
    realw deltatover2 = *deltatover2_F;
    realw b_deltatover2 = *b_deltatover2_F;
@@ -1596,17 +1783,17 @@
    // check whether we can update accel and veloc, or only accel at this point
    if( *OCEANS == 0 ){
      // updates both, accel and veloc
-     kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc, mp->d_accel, size, deltatover2, mp->d_rmass);
+     kernel_3_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_veloc, mp->d_accel, size, deltatover2, mp->d_rmass);
 
      if(SIMULATION_TYPE == 3) {
-       kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc, mp->d_b_accel, size, b_deltatover2,mp->d_rmass);
+       kernel_3_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_veloc, mp->d_b_accel, size, b_deltatover2,mp->d_rmass);
      }
    }else{
      // updates only accel
-     kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_accel, size, mp->d_rmass);
+     kernel_3_accel_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_accel, size, mp->d_rmass);
 
      if(SIMULATION_TYPE == 3) {
-       kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_b_accel, size, mp->d_rmass);
+       kernel_3_accel_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_accel, size, mp->d_rmass);
      }
    }
 
@@ -1647,10 +1834,10 @@
   dim3 threads(blocksize,1,1);
 
   // updates only veloc at this point
-  kernel_3_veloc_cuda_device<<< grid, threads>>>(mp->d_veloc,mp->d_accel,size,deltatover2);
+  kernel_3_veloc_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_veloc,mp->d_accel,size,deltatover2);
 
   if(SIMULATION_TYPE == 3) {
-    kernel_3_veloc_cuda_device<<< grid, threads>>>(mp->d_b_veloc,mp->d_b_accel,size,b_deltatover2);
+    kernel_3_veloc_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_veloc,mp->d_b_accel,size,b_deltatover2);
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -1760,7 +1947,7 @@
   exit_on_cuda_error("before kernel elastic_ocean_load_cuda");
 #endif
 
-  elastic_ocean_load_cuda_kernel<<<grid,threads>>>(mp->d_accel,
+  elastic_ocean_load_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,
                                                    mp->d_rmass,
                                                    mp->d_rmass_ocean_load,
                                                    mp->num_free_surface_faces,
@@ -1775,7 +1962,7 @@
     print_CUDA_error_if_any(cudaMemset(mp->d_updated_dof_ocean_load,0,
                                        sizeof(int)*mp->NGLOB_AB),88502);
 
-    elastic_ocean_load_cuda_kernel<<<grid,threads>>>(mp->d_b_accel,
+    elastic_ocean_load_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel,
                                                      mp->d_rmass,
                                                      mp->d_rmass_ocean_load,
                                                      mp->num_free_surface_faces,
@@ -1837,41 +2024,41 @@
   }
 }
 
-void setConst_hprime_yy(realw* array,Mesh* mp)
-{
+// void setConst_hprime_yy(realw* array,Mesh* mp)
+// {
 
-  cudaError_t err = cudaMemcpyToSymbol(d_hprime_yy, array, NGLL2*sizeof(realw));
-  if (err != cudaSuccess)
-  {
-    fprintf(stderr, "Error in setConst_hprime_yy: %s\n", cudaGetErrorString(err));
-    fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
-    exit(1);
-  }
+//   cudaError_t err = cudaMemcpyToSymbol(d_hprime_yy, array, NGLL2*sizeof(realw));
+//   if (err != cudaSuccess)
+//   {
+//     fprintf(stderr, "Error in setConst_hprime_yy: %s\n", cudaGetErrorString(err));
+//     fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
+//     exit(1);
+//   }
 
-  err = cudaGetSymbolAddress((void**)&(mp->d_hprime_yy),"d_hprime_yy");
-  if(err != cudaSuccess) {
-    fprintf(stderr, "Error with d_hprime_yy: %s\n", cudaGetErrorString(err));
-    exit(1);
-  }
-}
+//   err = cudaGetSymbolAddress((void**)&(mp->d_hprime_yy),"d_hprime_yy");
+//   if(err != cudaSuccess) {
+//     fprintf(stderr, "Error with d_hprime_yy: %s\n", cudaGetErrorString(err));
+//     exit(1);
+//   }
+// }
 
-void setConst_hprime_zz(realw* array,Mesh* mp)
-{
+// void setConst_hprime_zz(realw* array,Mesh* mp)
+// {
 
-  cudaError_t err = cudaMemcpyToSymbol(d_hprime_zz, array, NGLL2*sizeof(realw));
-  if (err != cudaSuccess)
-  {
-    fprintf(stderr, "Error in setConst_hprime_zz: %s\n", cudaGetErrorString(err));
-    fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
-    exit(1);
-  }
+//   cudaError_t err = cudaMemcpyToSymbol(d_hprime_zz, array, NGLL2*sizeof(realw));
+//   if (err != cudaSuccess)
+//   {
+//     fprintf(stderr, "Error in setConst_hprime_zz: %s\n", cudaGetErrorString(err));
+//     fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
+//     exit(1);
+//   }
 
-  err = cudaGetSymbolAddress((void**)&(mp->d_hprime_zz),"d_hprime_zz");
-  if(err != cudaSuccess) {
-    fprintf(stderr, "Error with d_hprime_zz: %s\n", cudaGetErrorString(err));
-    exit(1);
-  }
-}
+//   err = cudaGetSymbolAddress((void**)&(mp->d_hprime_zz),"d_hprime_zz");
+//   if(err != cudaSuccess) {
+//     fprintf(stderr, "Error with d_hprime_zz: %s\n", cudaGetErrorString(err));
+//     exit(1);
+//   }
+// }
 
 
 void setConst_hprimewgll_xx(realw* array,Mesh* mp)
@@ -1989,4 +2176,3 @@
   }
 
 }
-

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu	2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu	2012-06-13 12:54:51 UTC (rev 20365)
@@ -111,7 +111,7 @@
 //#endif
 
   //launch kernel
-  UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ,mp->d_veloc,mp->d_accel,
+  UpdateDispVeloc_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ,mp->d_veloc,mp->d_accel,
                                            size,deltat,deltatsqover2,deltatover2);
 
   //cudaThreadSynchronize();
@@ -124,7 +124,7 @@
   // kernel for backward fields
   if(*SIMULATION_TYPE == 3) {
 
-    UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_b_displ,mp->d_b_veloc,mp->d_b_accel,
+    UpdateDispVeloc_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ,mp->d_b_veloc,mp->d_b_accel,
                                              size,b_deltat,b_deltatsqover2,b_deltatover2);
 
 //#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -208,13 +208,13 @@
   dim3 threads(blocksize,1,1);
 
   //launch kernel
-  UpdatePotential_kernel<<<grid,threads>>>(mp->d_potential_acoustic,
+  UpdatePotential_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_potential_acoustic,
                                            mp->d_potential_dot_acoustic,
                                            mp->d_potential_dot_dot_acoustic,
                                            size,deltat,deltatsqover2,deltatover2);
 
   if(*SIMULATION_TYPE == 3) {
-    UpdatePotential_kernel<<<grid,threads>>>(mp->d_b_potential_acoustic,
+    UpdatePotential_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_potential_acoustic,
                                              mp->d_b_potential_dot_acoustic,
                                              mp->d_b_potential_dot_dot_acoustic,
                                              size,b_deltat,b_deltatsqover2,b_deltatover2);

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2012-06-13 12:54:51 UTC (rev 20365)
@@ -71,7 +71,7 @@
 #endif
 
 // error checking after cuda function calls
-#define ENABLE_VERY_SLOW_ERROR_CHECKING
+/* #define ENABLE_VERY_SLOW_ERROR_CHECKING */
 
 #define MAX(x,y)                    (((x) < (y)) ? (y) : (x))
 
@@ -85,6 +85,8 @@
 
 void exit_on_error(char* info);
 
+void get_blocks_xy(int num_blocks,int* num_blocks_x,int* num_blocks_y);
+
 /* ----------------------------------------------------------------------------------------------- */
 
 // cuda constant arrays
@@ -118,6 +120,15 @@
 // leads up to ~ 5% performance increase
 //#define USE_MESH_COLORING_GPU
 
+// use textures for d_displ and d_accel -- 10% performance boost
+#define USE_TEXTURES_FIELDS
+
+// Using texture memory for the hprime-style constants is slower on
+// Fermi generation hardware, but *may* be faster on Kepler
+// generation.
+/* Use textures for hprime_xx */
+/* #define USE_TEXTURES_CONSTANTS */
+
 // (optional) unrolling loops
 // leads up to ~1% performance increase
 //#define MANUALLY_UNROLLED_LOOPS
@@ -160,6 +171,9 @@
   int NSPEC_AB;
   int NGLOB_AB;
 
+  int myrank;
+  int NPROCS;
+
   // interpolators
   realw* d_xix; realw* d_xiy; realw* d_xiz;
   realw* d_etax; realw* d_etay; realw* d_etaz;
@@ -183,10 +197,36 @@
   realw* d_wgllwgll_xy; realw* d_wgllwgll_xz; realw* d_wgllwgll_yz;
   realw* d_wgll_cube;
 
-  // mpi buffers
+  // A buffer for mpi-send/recv, which is duplicated in fortran but is
+  // allocated with pinned memory to facilitate asynchronous device <->
+  // host memory transfers
+  float* h_send_accel_buffer;
+  float* h_send_b_accel_buffer;
+  float* send_buffer;
+  float* h_recv_accel_buffer;
+  float* h_recv_b_accel_buffer;
+  float* recv_buffer;
+  int size_mpi_send_buffer;
+  int size_mpi_recv_buffer;
+
+
+
+  // buffers and constants for the MPI-send required for async-memcpy
+  // + non-blocking MPI
+  float* buffer_recv_vector_ext_mesh;
   int num_interfaces_ext_mesh;
   int max_nibool_interfaces_ext_mesh;
+  int* nibool_interfaces_ext_mesh;
+  int* my_neighbours_ext_mesh;
+  int* request_send_vector_ext_mesh;
+  int* request_recv_vector_ext_mesh;
 
+
+  // overlapped memcpy streams
+  cudaStream_t compute_stream;
+  cudaStream_t copy_stream;
+  cudaStream_t b_copy_stream;
+
   // ------------------------------------------------------------------ //
   // elastic wavefield parameters
   // ------------------------------------------------------------------ //
@@ -196,6 +236,10 @@
   // backward/reconstructed elastic wavefield
   realw* d_b_displ; realw* d_b_veloc; realw* d_b_accel;
 
+  // Texture references for fast non-coalesced scattered access
+  const textureReference* d_displ_tex_ref_ptr;
+  const textureReference* d_accel_tex_ref_ptr;
+
   // elastic elements
   int* d_ispec_is_elastic;
 
@@ -244,6 +288,17 @@
   realw* d_station_seismo_field;
   realw* h_station_seismo_field;
 
+  double* d_hxir, *d_hetar, *d_hgammar;
+  double* d_dxd, *d_dyd, *d_dzd;
+  double* d_vxd, *d_vyd, *d_vzd;
+  double* d_axd, *d_ayd, *d_azd;
+  realw* d_seismograms_d, *d_seismograms_v, *d_seismograms_a;
+  double* d_nu;
+
+  realw* h_seismograms_d_it;
+  realw* h_seismograms_v_it;
+  realw* h_seismograms_a_it;
+
   // adjoint receivers/sources
   int nadj_rec_local;
   realw* d_adj_sourcearrays;

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h	2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h	2012-06-13 12:54:51 UTC (rev 20365)
@@ -54,44 +54,15 @@
 
 /* CUDA specific things from specfem3D_kernels.cu */
 
+// older TEXTURE usage. For now just acoustic simulations. See usage
+// of USE_TEXTURES_FIELDS elsewhere in code for elastic case
 #ifdef USE_TEXTURES
-  // declaration of textures
-  texture<realw, 1, cudaReadModeElementType> tex_displ;
-  texture<realw, 1, cudaReadModeElementType> tex_accel;
 
-  texture<realw, 1, cudaReadModeElementType> tex_potential_acoustic;
-  texture<realw, 1, cudaReadModeElementType> tex_potential_dot_dot_acoustic;
+// declaration of textures
+texture<realw, 1, cudaReadModeElementType> tex_potential_acoustic;
+texture<realw, 1, cudaReadModeElementType> tex_potential_dot_dot_acoustic;
 
-  // for binding the textures
 
-  void bindTexturesDispl(realw* d_displ)
-  {
-    cudaError_t err;
-
-    cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
-
-    err = cudaBindTexture(NULL,tex_displ, d_displ, channelDescFloat, NDIM*NGLOB*sizeof(realw));
-    if (err != cudaSuccess)
-    {
-      fprintf(stderr, "Error in bindTexturesDispl for displ: %s\n", cudaGetErrorString(err));
-      exit(1);
-    }
-  }
-
-  void bindTexturesAccel(realw* d_accel)
-  {
-    cudaError_t err;
-
-    cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
-
-    err = cudaBindTexture(NULL,tex_accel, d_accel, channelDescFloat, NDIM*NGLOB*sizeof(realw));
-    if (err != cudaSuccess)
-    {
-      fprintf(stderr, "Error in bindTexturesAccel for accel: %s\n", cudaGetErrorString(err));
-      exit(1);
-    }
-  }
-
   void bindTexturesPotential(realw* d_potential_acoustic)
   {
     cudaError_t err;

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2012-06-13 12:54:51 UTC (rev 20365)
@@ -608,7 +608,12 @@
                                         int* nrec_local_f,
                                         int* SIMULATION_TYPE,
                                         int* USE_MESH_COLORING_GPU_f,
-                                        int* nspec_acoustic,int* nspec_elastic) {
+                                        int* nspec_acoustic,int* nspec_elastic,
+                                        int* my_neighbours_ext_mesh,
+                                        int* request_send_vector_ext_mesh,
+                                        int* request_recv_vector_ext_mesh,
+                                        realw* buffer_recv_vector_ext_mesh
+                                        ) {
 
 TRACE("prepare_constants_device");
 
@@ -622,14 +627,25 @@
     exit_on_error("NGLLX must be 5 for CUDA devices");
   }
 
+
+#ifdef WITH_MPI
+  int nproc;
+  MPI_Comm_size(MPI_COMM_WORLD,&nproc);
+  mp->NPROCS=nproc;
+#else
+  mp->NPROCS = 1;
+#endif
+
+
   // sets global parameters
   mp->NSPEC_AB = *NSPEC_AB;
   mp->NGLOB_AB = *NGLOB_AB;
 
   // sets constant arrays
   setConst_hprime_xx(h_hprime_xx,mp);
-  setConst_hprime_yy(h_hprime_yy,mp);
-  setConst_hprime_zz(h_hprime_zz,mp);
+  // only needed if NGLLX != NGLLY != NGLLZ
+  // setConst_hprime_yy(h_hprime_yy,mp);
+  // setConst_hprime_zz(h_hprime_zz,mp);
   setConst_hprimewgll_xx(h_hprimewgll_xx,mp);
   setConst_hprimewgll_yy(h_hprimewgll_yy,mp);
   setConst_hprimewgll_zz(h_hprimewgll_zz,mp);
@@ -637,9 +653,51 @@
   setConst_wgllwgll_xz(h_wgllwgll_xz,mp);
   setConst_wgllwgll_yz(h_wgllwgll_yz,mp);
 
+  // Using texture memory for the hprime-style constants is slower on
+  // Fermi generation hardware, but *may* be faster on Kepler
+  // generation. We will reevaluate this again, so might as well leave
+  // in the code with USE_TEXTURES_CONSTANTS left undefined.
+  #ifdef USE_TEXTURES_CONSTANTS
+  {
+    const textureReference* d_hprime_xx_tex_ptr;
+    print_CUDA_error_if_any(cudaGetTextureReference(&d_hprime_xx_tex_ptr, "d_hprime_xx_tex"), 4101);
+    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+    print_CUDA_error_if_any(cudaBindTexture(0, d_hprime_xx_tex_ptr, mp->d_hprime_xx, &channelDesc, sizeof(realw)*(NGLL2)), 4001);
+  }
+  #endif
+
+
+  // Allocate pinned mpi-buffers.
+  // MPI buffers use pinned memory allocated by cudaMallocHost, which
+  // enables the use of asynchronous memory copies from host <->
+  // device
+  int size_mpi_buffer = 3 * (*num_interfaces_ext_mesh) * (*max_nibool_interfaces_ext_mesh);
+  print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_accel_buffer),sizeof(float)*(size_mpi_buffer)),8004);
+  mp->send_buffer = (float*)malloc((size_mpi_buffer)*sizeof(float));
+  mp->size_mpi_send_buffer = size_mpi_buffer;
+
+  print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_recv_accel_buffer),sizeof(float)*(size_mpi_buffer)),8004);
+  mp->recv_buffer = (float*)malloc((size_mpi_buffer)*sizeof(float));
+  mp->size_mpi_recv_buffer = size_mpi_buffer;
+
+  print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_b_accel_buffer),sizeof(float)*(size_mpi_buffer)),8004);
+  // mp->b_send_buffer = (float*)malloc((size_mpi_buffer)*sizeof(float));
+
+  mp->num_interfaces_ext_mesh = *num_interfaces_ext_mesh;
+  mp->max_nibool_interfaces_ext_mesh = *max_nibool_interfaces_ext_mesh;
+  mp->nibool_interfaces_ext_mesh = h_nibool_interfaces_ext_mesh;
+  mp->my_neighbours_ext_mesh = my_neighbours_ext_mesh;
+  mp->request_send_vector_ext_mesh = request_send_vector_ext_mesh;
+  mp->request_recv_vector_ext_mesh = request_recv_vector_ext_mesh;
+  mp->buffer_recv_vector_ext_mesh = buffer_recv_vector_ext_mesh;
+
+  // setup two streams, one for compute and one for host<->device memory copies
+  cudaStreamCreate(&mp->compute_stream);
+  cudaStreamCreate(&mp->copy_stream);
+  cudaStreamCreate(&mp->b_copy_stream);
+
   /* Assuming NGLLX=5. Padded is then 128 (5^3+3) */
   int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
-  //int size_nonpadded = NGLL3 * (mp->NSPEC_AB);
 
   // mesh
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xix, size_padded*sizeof(realw)),1001);
@@ -1068,6 +1126,20 @@
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(realw)*(*size)),4002);
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel),sizeof(realw)*(*size)),4003);
 
+  #ifdef USE_TEXTURES_FIELDS
+  {
+    print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_displ_tex_ref_ptr, "d_displ_tex"), 4001);
+    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+    print_CUDA_error_if_any(cudaBindTexture(0, mp->d_displ_tex_ref_ptr, mp->d_displ, &channelDesc, sizeof(realw)*(*size)), 4001);
+  }
+
+  {
+    print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_accel_tex_ref_ptr, "d_accel_tex"), 4003);
+    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+    print_CUDA_error_if_any(cudaBindTexture(0, mp->d_accel_tex_ref_ptr, mp->d_accel, &channelDesc, sizeof(realw)*(*size)), 4003);
+  }
+  #endif
+
   // mpi buffer
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),
                         3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),4004);
@@ -1690,6 +1762,32 @@
 
 }
 
+extern "C"
+void FC_FUNC_(prepare_seismogram_fields,
+              PREPARE_SEISMOGRAM_FIELDS)(long* Mesh_pointer,int* nrec_local, double* nu, double* hxir, double* hetar, double* hgammar) {
+
+  TRACE("prepare_constants_device");
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
+
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_nu),3*3* *nrec_local*sizeof(double)),8100);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hxir),5* *nrec_local*sizeof(double)),8100);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hetar),5* *nrec_local*sizeof(double)),8100);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hgammar),5* *nrec_local*sizeof(double)),8100);
+
+  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_seismograms_d,3**nrec_local*sizeof(realw)),8101);
+  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_seismograms_v,3**nrec_local*sizeof(realw)),8101);
+  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_seismograms_a,3**nrec_local*sizeof(realw)),8101);
+
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_nu,nu,3*3* *nrec_local*sizeof(double),cudaMemcpyHostToDevice),8101);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_hxir,hxir,5* *nrec_local*sizeof(double),cudaMemcpyHostToDevice),8101);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_hetar,hetar,5* *nrec_local*sizeof(double),cudaMemcpyHostToDevice),8101);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_hgammar,hgammar,5* *nrec_local*sizeof(double),cudaMemcpyHostToDevice),8101);
+
+  cudaMallocHost((void**)&mp->h_seismograms_d_it,3**nrec_local*sizeof(realw));
+  cudaMallocHost((void**)&mp->h_seismograms_v_it,3**nrec_local*sizeof(realw));
+  cudaMallocHost((void**)&mp->h_seismograms_a_it,3**nrec_local*sizeof(realw));
+}
+
 /* ----------------------------------------------------------------------------------------------- */
 
 // cleanup
@@ -1933,4 +2031,3 @@
   // mesh pointer - not needed anymore
   free(mp);
 }
-

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu	2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu	2012-06-13 12:54:51 UTC (rev 20365)
@@ -43,6 +43,220 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+/*
+      ! gets global number of that receiver
+    irec = number_receiver_global(irec_local)
+
+    ! gets local receiver interpolators
+    ! (1-D Lagrange interpolators)
+    hxir(:) = hxir_store(irec_local,:)
+    hetar(:) = hetar_store(irec_local,:)
+    hgammar(:) = hgammar_store(irec_local,:)
+
+*/
+
+
+// Initially sets the blocks_x to be the num_blocks, and adds rows as
+// needed. If an additional row is added, the row length is cut in
+// half. If the block count is odd, there will be 1 too many blocks,
+// which must be managed at runtime with an if statement. 
+void get_blocks_xy(int num_blocks,int* num_blocks_x,int* num_blocks_y) {
+  *num_blocks_x = num_blocks;
+  *num_blocks_y = 1;
+  while(*num_blocks_x > 65535) {
+    *num_blocks_x = (int) ceil(*num_blocks_x*0.5f);
+    *num_blocks_y = *num_blocks_y*2;
+  }
+  return;
+}
+
+__device__ double atomicAdd(double* address, double val)
+{
+    unsigned long long int* address_as_ull =
+                             (unsigned long long int*)address;
+    unsigned long long int old = *address_as_ull, assumed;
+    do {
+        assumed = old;
+old = atomicCAS(address_as_ull, assumed,
+                        __double_as_longlong(val +
+                               __longlong_as_double(assumed)));
+    } while (assumed != old);
+    return __longlong_as_double(old);
+}
+
+__global__ void compute_interpolated_dva_plus_seismogram(int nrec_local,
+							 realw* displ, realw* veloc, realw* accel,
+							 int* ibool,
+							 double* hxir, double* hetar, double* hgammar,
+							 realw* seismograms_d, realw* seismograms_v, realw* seismograms_a,
+							 double* nu,
+							 int* number_receiver_global,
+							 int* ispec_selected_rec) {
+  int irec_local = blockIdx.x + blockIdx.y*gridDim.x;
+  int i = threadIdx.x;
+  int j = threadIdx.y;
+  int k = threadIdx.z;
+  int ijk = i+5*(j+5*(k));
+  
+  // we do the **d variable reduction in shared memory, because the
+  // atomicAdd() should be faster on the shared memory registers
+  // according to
+  // http://supercomputingblog.com/cuda/cuda-tutorial-4-atomic-operations/
+  __shared__ double sh_dxd[NGLL3];
+  __shared__ double sh_dyd[NGLL3];
+  __shared__ double sh_dzd[NGLL3];
+  __shared__ double sh_vxd[NGLL3];
+  __shared__ double sh_vyd[NGLL3];
+  __shared__ double sh_vzd[NGLL3];
+  __shared__ double sh_axd[NGLL3];
+  __shared__ double sh_ayd[NGLL3];
+  __shared__ double sh_azd[NGLL3];
+  
+  if(irec_local < nrec_local) {
+    int irec = number_receiver_global[irec_local]-1;
+    int ispec = ispec_selected_rec[irec]-1;
+    int iglob = ibool[ijk+125*ispec]-1;
+    double hlagrange = hxir[irec_local + nrec_local*i]*hetar[irec_local + nrec_local*j]*hgammar[irec_local + nrec_local*k];
+    sh_dxd[ijk] = hlagrange*displ[0+3*iglob];
+    sh_dyd[ijk] = hlagrange*displ[1+3*iglob];
+    sh_dzd[ijk] = hlagrange*displ[2+3*iglob];
+
+    sh_vxd[ijk] = hlagrange*veloc[0+3*iglob];
+    sh_vyd[ijk] = hlagrange*veloc[1+3*iglob];
+    sh_vzd[ijk] = hlagrange*veloc[2+3*iglob];
+
+    sh_axd[ijk] = hlagrange*accel[0+3*iglob];
+    sh_ayd[ijk] = hlagrange*accel[1+3*iglob];
+    sh_azd[ijk] = hlagrange*accel[2+3*iglob];        
+
+    // the reduction has to skip the first element (we don't need to
+    // add element 0 to itself) This reduction serializes the code,
+    // but it should be fast enough --- it can be made faster with a
+    // proper reduction algorithm.
+    __syncthreads();
+    
+    // if(ijk>0) {
+    // reduction needs to be done atomically to avoid race conditions
+      // atomicAdd(&sh_dxd[0],sh_dxd[ijk]);
+      // atomicAdd(&sh_dyd[0],sh_dyd[ijk]);
+      // atomicAdd(&sh_dzd[0],sh_dzd[ijk]);
+
+      // atomicAdd(&sh_vxd[0],sh_vxd[ijk]);
+      // atomicAdd(&sh_vyd[0],sh_vyd[ijk]);
+      // atomicAdd(&sh_vzd[0],sh_vzd[ijk]);
+
+      // atomicAdd(&sh_axd[0],sh_axd[ijk]);
+      // atomicAdd(&sh_ayd[0],sh_ayd[ijk]);
+      // atomicAdd(&sh_azd[0],sh_azd[ijk]);
+    // }
+    // __syncthreads();
+    if(ijk==0) {
+      // a loop in thread 0 is 4 times faster than atomic operations
+      for(int i=1;i<125;i++) {
+      	sh_dxd[0] += sh_dxd[i];
+      	sh_dyd[0] += sh_dyd[i];
+      	sh_dzd[0] += sh_dzd[i];
+		  
+      	sh_vxd[0] += sh_vxd[i];
+      	sh_vyd[0] += sh_vyd[i];
+      	sh_vzd[0] += sh_vzd[i];
+		  
+      	sh_axd[0] += sh_axd[i];
+      	sh_ayd[0] += sh_ayd[i];
+      	sh_azd[0] += sh_azd[i];
+	
+      }
+      
+      seismograms_d[0+3*irec_local] = nu[0+3*(0+3*irec)]*sh_dxd[0] + nu[0+3*(1+3*irec)]*sh_dyd[0] + nu[0+3*(2+3*irec)]*sh_dzd[0];
+      seismograms_d[1+3*irec_local] = nu[1+3*(0+3*irec)]*sh_dxd[0] + nu[1+3*(1+3*irec)]*sh_dyd[0] + nu[1+3*(2+3*irec)]*sh_dzd[0];
+      seismograms_d[2+3*irec_local] = nu[2+3*(0+3*irec)]*sh_dxd[0] + nu[2+3*(1+3*irec)]*sh_dyd[0] + nu[2+3*(2+3*irec)]*sh_dzd[0];
+            
+      seismograms_v[0+3*irec_local] = nu[0+3*(0+3*irec)]*sh_vxd[0] + nu[0+3*(1+3*irec)]*sh_vyd[0] + nu[0+3*(2+3*irec)]*sh_vzd[0];
+      seismograms_v[1+3*irec_local] = nu[1+3*(0+3*irec)]*sh_vxd[0] + nu[1+3*(1+3*irec)]*sh_vyd[0] + nu[1+3*(2+3*irec)]*sh_vzd[0];
+      seismograms_v[2+3*irec_local] = nu[2+3*(0+3*irec)]*sh_vxd[0] + nu[2+3*(1+3*irec)]*sh_vyd[0] + nu[2+3*(2+3*irec)]*sh_vzd[0];
+      
+      seismograms_a[0+3*irec_local] = nu[0+3*(0+3*irec)]*sh_axd[0] + nu[0+3*(1+3*irec)]*sh_ayd[0] + nu[0+3*(2+3*irec)]*sh_azd[0];
+      seismograms_a[1+3*irec_local] = nu[1+3*(0+3*irec)]*sh_axd[0] + nu[1+3*(1+3*irec)]*sh_ayd[0] + nu[1+3*(2+3*irec)]*sh_azd[0];
+      seismograms_a[2+3*irec_local] = nu[2+3*(0+3*irec)]*sh_axd[0] + nu[2+3*(1+3*irec)]*sh_ayd[0] + nu[2+3*(2+3*irec)]*sh_azd[0];      
+      
+    }
+  }    
+}
+							 
+
+
+extern "C"
+void FC_FUNC_(transfer_seismograms_el_from_device,
+              TRANSFER_SEISMOGRAMS_EL_FROM_DEVICE)(int* nrec_local,
+                                                   long* Mesh_pointer_f,int* SIMULATION_TYPEf,
+						   realw* seismograms_d,
+						   realw* seismograms_v,
+						   realw* seismograms_a,
+						   int* it) {
+  
+
+
+  TRACE("transfer_seismograms_el_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+  
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(*nrec_local,&num_blocks_x,&num_blocks_y);
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(5,5,5);
+
+  // double h_debug[125]; for(int i=0;i<125;i++) h_debug[i] = 0;
+  // double* d_debug; cudaMalloc((void**)&d_debug,125*sizeof(double));
+  // cudaMemcpy(d_debug,h_debug,125*sizeof(double),cudaMemcpyHostToDevice);
+  // Cuda timing
+  // cudaEvent_t start, stop;
+  // realw time;
+  // cudaEventCreate(&start);
+  // cudaEventCreate(&stop);
+  // cudaEventRecord( start, 0 );
+  
+  compute_interpolated_dva_plus_seismogram<<<grid,threads,0,mp->compute_stream>>>(*nrec_local,
+										  mp->d_displ,mp->d_veloc,mp->d_accel,
+										  mp->d_ibool,
+										  mp->d_hxir, mp->d_hetar, mp->d_hgammar,
+										  mp->d_seismograms_d,
+										  mp->d_seismograms_v,
+										  mp->d_seismograms_a,
+										  mp->d_nu,
+										  mp->d_number_receiver_global,
+										  mp->d_ispec_selected_rec
+										  );
+
+  // cudaMemcpy(h_debug,d_debug,125*sizeof(double),cudaMemcpyDeviceToHost);
+  
+  cudaMemcpy(mp->h_seismograms_d_it,mp->d_seismograms_d,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost);
+  cudaMemcpy(mp->h_seismograms_v_it,mp->d_seismograms_v,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost);
+  cudaMemcpy(mp->h_seismograms_a_it,mp->d_seismograms_a,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost);
+
+  // cudaEventRecord( stop, 0 );
+  // cudaEventSynchronize( stop );
+  // cudaEventElapsedTime( &time, start, stop );
+  // cudaEventDestroy( start );
+  // cudaEventDestroy( stop );
+  // printf("seismogram Execution Time: %f ms\n",time);
+  
+  // if(abs(mp->h_seismograms_d_it[0]) < 1e-25) printf("seismo1_x=%e\n",mp->h_seismograms_d_it[0]);
+  // if(abs(mp->h_seismograms_d_it[1]) < 1e-25) printf("seismo1_y=%e\n",mp->h_seismograms_d_it[1]);
+  
+  // if(abs(mp->h_seismograms_d_it[2]) < 1e-25) {
+  
+  // printf("%d:seismo1_z=%e\n",*it,mp->h_seismograms_d_it[2]);
+  
+  // }
+  
+  
+  memcpy(&seismograms_d[3**nrec_local*(*it-1)],mp->h_seismograms_d_it,3* *nrec_local*sizeof(realw));
+  memcpy(&seismograms_v[3**nrec_local*(*it-1)],mp->h_seismograms_v_it,3* *nrec_local*sizeof(realw));
+  memcpy(&seismograms_a[3**nrec_local*(*it-1)],mp->h_seismograms_a_it,3* *nrec_local*sizeof(realw));  
+
+}
+
+
 __global__ void transfer_stations_fields_from_device_kernel(int* number_receiver_global,
                                                             int* ispec_selected_rec,
                                                             int* ibool,
@@ -87,7 +301,7 @@
   dim3 threads(blocksize,1,1);
 
   // prepare field transfer array on device
-  transfer_stations_fields_from_device_kernel<<<grid,threads>>>(mp->d_number_receiver_global,
+  transfer_stations_fields_from_device_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_number_receiver_global,
                                                                 d_ispec_selected,
                                                                 mp->d_ibool,
                                                                 mp->d_station_seismo_field,

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/decompose_mesh_SCOTCH/decompose_mesh_SCOTCH.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/decompose_mesh_SCOTCH/decompose_mesh_SCOTCH.f90	2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/decompose_mesh_SCOTCH/decompose_mesh_SCOTCH.f90	2012-06-13 12:54:51 UTC (rev 20365)
@@ -814,32 +814,37 @@
   ! re-partitioning puts poroelastic-elastic coupled elements into same partition
   !  integer  :: nfaces_coupled
   !  integer, dimension(:,:), pointer  :: faces_coupled
-    call poro_elastic_repartitioning (nspec, nnodes, elmnts, &
-                     count_def_mat, mat(1,:) , mat_prop, &
-                     sup_neighbour, nsize, &
-                     nparts, part)
-                     !nparts, part, nfaces_coupled, faces_coupled)
 
-  ! re-partitioning puts moho-surface coupled elements into same partition
-    call moho_surface_repartitioning (nspec, nnodes, elmnts, &
-                     sup_neighbour, nsize, nparts, part, &
-                     nspec2D_moho,ibelm_moho,nodes_ibelm_moho )
+    ! TODO: supposed to rebalance, but currently broken
+    ! call poro_elastic_repartitioning (nspec, nnodes, elmnts, &
+    ! count_def_mat, mat(1,:) , mat_prop, &
+    ! sup_neighbour, nsize, &
+    ! nparts, part)
 
+    !nparts, part, nfaces_coupled, faces_coupled)
 
+    ! re-partitioning puts moho-surface coupled elements into same partition
+    ! call moho_surface_repartitioning (nspec, nnodes, elmnts, &
+    ! sup_neighbour, nsize, nparts, part, &
+    ! nspec2D_moho,ibelm_moho,nodes_ibelm_moho )
+
+
   ! local number of each element for each partition
     call build_glob2loc_elmnts(nspec, part, glob2loc_elmnts,nparts)
 
-  ! local number of each node for each partition
+    ! local number of each node for each partition
     call build_glob2loc_nodes(nspec, nnodes,nsize, nnodes_elmnts, nodes_elmnts, part, &
          glob2loc_nodes_nparts, glob2loc_nodes_parts, glob2loc_nodes, nparts)
 
-  ! mpi interfaces
+    ! mpi interfaces
     ! acoustic/elastic/poroelastic boundaries will be split into different MPI partitions
     call build_interfaces(nspec, sup_neighbour, part, elmnts, &
                              xadj, adjncy, tab_interfaces, &
                              tab_size_interfaces, ninterfaces, &
                              nparts)
 
+
+
     !or: uncomment if you want acoustic/elastic boundaries NOT to be separated into different MPI partitions
     !call build_interfaces_no_ac_el_sep(nspec, sup_neighbour, part, elmnts, &
     !                          xadj, adjncy, tab_interfaces, &
@@ -877,19 +882,23 @@
        endif
 
        ! gets number of nodes
+
        call write_glob2loc_nodes_database(IIN_database, ipart, nnodes_loc, nodes_coords, &
                                   glob2loc_nodes_nparts, glob2loc_nodes_parts, &
                                   glob2loc_nodes, nnodes, 1)
 
-       ! gets number of spectral elements
+
+
        call write_partition_database(IIN_database, ipart, nspec_local, nspec, elmnts, &
                                   glob2loc_elmnts, glob2loc_nodes_nparts, &
                                   glob2loc_nodes_parts, glob2loc_nodes, part, mat, ngnod, 1)
 
+       print*, ipart,": nspec_local=",nspec_local, " nnodes_local=", nnodes_loc
        ! writes out node coordinate locations
        !write(IIN_database,*) nnodes_loc
        write(IIN_database) nnodes_loc
 
+
        call write_glob2loc_nodes_database(IIN_database, ipart, nnodes_loc, nodes_coords,&
                                   glob2loc_nodes_nparts, glob2loc_nodes_parts, &
                                   glob2loc_nodes, nnodes, 2)
@@ -956,4 +965,3 @@
 !end program pre_meshfem3D
 
 end module decompose_mesh_SCOTCH
-

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/get_model.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/get_model.f90	2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/get_model.f90	2012-06-13 12:54:51 UTC (rev 20365)
@@ -78,10 +78,13 @@
   ispec_is_elastic(:) = .false.
   ispec_is_poroelastic(:) = .false.
 
-  ! prepares tomography model if needed for elements with undefined material definitions
-  if( nundefMat_ext_mesh > 0 .or. IMODEL == IMODEL_TOMO ) then
-    call model_tomography_broadcast(myrank)
-  endif
+  print*,"nundefMat_ext_mesh:",nundefMat_ext_mesh
+! prepares tomography model if needed for elements with undefined material definitions
+  ! TODO: Max -- somehow this code is breaking when I try to run
+  ! Piero's PREM
+  ! if( nundefMat_ext_mesh > 0 .or. IMODEL == IMODEL_TOMO ) then
+  ! call model_tomography_broadcast(myrank)
+  ! endif
 
   ! prepares external model values if needed
   select case( IMODEL )

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/constants.h.in
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/constants.h.in	2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/constants.h.in	2012-06-13 12:54:51 UTC (rev 20365)
@@ -266,7 +266,7 @@
   integer, parameter :: MAX_NB_TRIES_OF_DROUX_1993 = 15
 !
 ! postprocess the colors to balance them if Droux (1993) algorithm is not used
-  logical, parameter :: BALANCE_COLORS_SIMPLE_ALGO = .true.
+  logical, parameter :: BALANCE_COLORS_SIMPLE_ALGO = .false.
 
 !------------------------------------------------------
 !----------- do not modify anything below -------------

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90	2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90	2012-06-13 12:54:51 UTC (rev 20365)
@@ -415,6 +415,114 @@
 
 ! with cuda functions...
 
+  subroutine transfer_boundary_to_device(NPROC, Mesh_pointer, &
+                                            buffer_recv_vector_ext_mesh, &
+                                            num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh,&
+                                            request_recv_vector_ext_mesh)
+
+  implicit none
+
+  include "constants.h"
+
+  integer :: NPROC
+  integer(kind=8) :: Mesh_pointer
+! array to assemble
+
+  integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
+
+  real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
+       buffer_recv_vector_ext_mesh
+
+  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
+  integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
+  integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
+
+  integer iinterface ! ipoin
+  integer FORWARD_OR_ADJOINT
+
+! here we have to assemble all the contributions between partitions using MPI
+
+! assemble only if more than one partition
+  if(NPROC > 1) then
+
+! wait for communications completion (recv)
+
+  write(IMAIN,*) "sending MPI_wait"
+  do iinterface = 1, num_interfaces_ext_mesh
+    call wait_req(request_recv_vector_ext_mesh(iinterface))
+  enddo
+
+! send contributions to GPU
+  call transfer_boundary_to_device_asynchronously(Mesh_pointer, buffer_recv_vector_ext_mesh, &
+       num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh)
+  endif
+  ! This step is done via previous function transfer_and_assemble...
+  ! do iinterface = 1, num_interfaces_ext_mesh
+  !   do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
+  !     array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
+  !          array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_vector_ext_mesh(:,ipoin,iinterface)
+  !   enddo
+  ! enddo
+
+end subroutine transfer_boundary_to_device
+
+subroutine assemble_MPI_vector_write_cuda_no_transfer(NPROC,NGLOB_AB,array_val, Mesh_pointer, &
+                                            buffer_recv_vector_ext_mesh, &
+                                            num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                                            nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                                            request_send_vector_ext_mesh,request_recv_vector_ext_mesh, &
+                                            FORWARD_OR_ADJOINT )
+
+implicit none
+
+  include "constants.h"
+
+  integer :: NPROC
+  integer :: NGLOB_AB
+  integer(kind=8) :: Mesh_pointer
+! array to assemble
+  real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: array_val
+
+  integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
+
+  real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
+       buffer_recv_vector_ext_mesh
+
+  integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
+  integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
+  integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
+
+  integer iinterface ! ipoin
+  integer FORWARD_OR_ADJOINT
+
+! here we have to assemble all the contributions between partitions using MPI
+
+  ! assemble only if more than one partition
+  if(NPROC > 1) then
+
+     ! adding contributions of neighbours
+     call assemble_accel_on_device(Mesh_pointer, array_val, buffer_recv_vector_ext_mesh, &
+          num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, &
+          nibool_interfaces_ext_mesh,&
+          ibool_interfaces_ext_mesh,FORWARD_OR_ADJOINT)
+
+     ! This step is done via previous function transfer_and_assemble...
+     ! do iinterface = 1, num_interfaces_ext_mesh
+     !   do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
+     !     array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
+     !          array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_vector_ext_mesh(:,ipoin,iinterface)
+     !   enddo
+     ! enddo
+
+     ! wait for communications completion (send)
+     do iinterface = 1, num_interfaces_ext_mesh
+        call wait_req(request_send_vector_ext_mesh(iinterface))
+     enddo
+  endif
+
+  end subroutine assemble_MPI_vector_write_cuda_no_transfer
+
+
   subroutine assemble_MPI_vector_send_cuda(NPROC, &
                                           buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, &
                                           num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
@@ -665,4 +773,3 @@
   endif
 
   end subroutine assemble_MPI_scalar_write_cuda
-

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90	2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90	2012-06-13 12:54:51 UTC (rev 20365)
@@ -126,9 +126,19 @@
                                       nspec_inner_elastic, &
                                       SIMULATION_TYPE, &
                                       COMPUTE_AND_STORE_STRAIN,ATTENUATION,ANISOTROPY)
-    endif ! GPU_MODE
 
+      if(phase_is_inner .eqv. .true.) then
+         ! while Inner elements compute "Kernel_2", we wait for MPI to
+         ! finish and transfer the boundary terms to the device
+         ! asynchronously
+         call transfer_boundary_to_device(NPROC,Mesh_pointer,buffer_recv_vector_ext_mesh,&
+              num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh,&
+              request_recv_vector_ext_mesh)
+      endif ! inner elements
+      
+   endif ! GPU_MODE
 
+
 ! adds elastic absorbing boundary term to acceleration (Stacey conditions)
     if(ABSORBING_CONDITIONS) &
       call compute_stacey_elastic(NSPEC_AB,NGLOB_AB,accel, &
@@ -232,16 +242,13 @@
                my_neighbours_ext_mesh, &
                request_send_vector_ext_mesh,request_recv_vector_ext_mesh)
        else ! GPU_MODE==1
-          call transfer_boun_accel_from_device(NGLOB_AB*NDIM, Mesh_pointer, accel,&
-                        buffer_send_vector_ext_mesh,&
-                        num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh,&
-                        nibool_interfaces_ext_mesh, ibool_interfaces_ext_mesh,1) ! <-- 1 == fwd accel
-          call assemble_MPI_vector_send_cuda(NPROC, &
-               buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, &
-               num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-               nibool_interfaces_ext_mesh,&
-               my_neighbours_ext_mesh, &
-               request_send_vector_ext_mesh,request_recv_vector_ext_mesh)
+
+          ! transfers boundary region to host asynchronously. The
+          ! MPI-send is done from within compute_forces_elastic_cuda,
+          ! once the inner element kernels are launched, and the
+          ! memcpy has finished. see compute_forces_elastic_cuda:1655
+          call transfer_boundary_from_device_asynchronously(Mesh_pointer,nspec_outer_elastic)
+                    
        endif ! GPU_MODE
 
        ! adjoint simulations

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.F90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.F90	2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.F90	2012-06-13 12:54:51 UTC (rev 20365)
@@ -884,7 +884,7 @@
   endif
 
   ! initializes GPU and outputs info to files for all processes
-  call prepare_cuda_device(myrank,ncuda_devices)
+  call prepare_cuda_device(myrank,NPROC,ncuda_devices)
 
   ! collects min/max of local devices found for statistics
   call sync_all()
@@ -913,7 +913,11 @@
                                   nrec, nrec_local, &
                                   SIMULATION_TYPE, &
                                   USE_MESH_COLORING_GPU, &
-                                  nspec_acoustic,nspec_elastic)
+                                  nspec_acoustic,nspec_elastic,&
+                                  my_neighbours_ext_mesh,&
+                                  request_send_vector_ext_mesh,&
+                                  request_recv_vector_ext_mesh,&
+                                  buffer_recv_vector_ext_mesh)
 
 
   ! prepares fields on GPU for acoustic simulations

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/write_seismograms.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/write_seismograms.f90	2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/write_seismograms.f90	2012-06-13 12:54:51 UTC (rev 20365)
@@ -45,6 +45,9 @@
   real(kind=CUSTOM_REAL):: stf_deltat
   double precision :: stf
 
+  ! TODO: Test and Fix CUDA seismograms code.
+  logical,parameter :: USE_CUDA_SEISMOGRAMS = .false.
+
   ! gets resulting array values onto CPU
   if(GPU_MODE) then
     if( nrec_local > 0 ) then
@@ -66,19 +69,31 @@
 
       ! this transfers fields only in elements with stations for efficiency
       if( ELASTIC_SIMULATION ) then
-        call transfer_station_el_from_device( &
+         if(USE_CUDA_SEISMOGRAMS) then
+            call transfer_seismograms_el_from_device(&
+                nrec_local,&
+                Mesh_pointer, &
+                SIMULATION_TYPE,&
+                seismograms_d,&
+                seismograms_v,&
+                seismograms_a,&
+                it)
+         else
+            call transfer_station_el_from_device( &
                     displ,veloc,accel, &
                     b_displ,b_veloc,b_accel, &
                     Mesh_pointer,number_receiver_global, &
                     ispec_selected_rec,ispec_selected_source, &
                     ibool,SIMULATION_TYPE)
-
+         endif
         ! alternative: transfers whole fields
         !  call transfer_fields_el_from_device(NDIM*NGLOB_AB,displ,veloc, accel, Mesh_pointer)
       endif
     endif
   endif
 
+if(.not. GPU_MODE .or. (GPU_MODE .and. (.not. USE_CUDA_SEISMOGRAMS))) then
+
   do irec_local = 1,nrec_local
 
     ! gets global number of that receiver
@@ -277,6 +292,7 @@
     if (SIMULATION_TYPE == 2) seismograms_eps(:,:,irec_local,it) = eps_s(:,:)
 
   enddo ! nrec_local
+endif
 
 ! write the current or final seismograms
   if((mod(it,NTSTEP_BETWEEN_OUTPUT_SEISMOS) == 0 .or. it == NSTEP) .and. (.not.SU_FORMAT)) then



More information about the CIG-COMMITS mailing list