[cig-commits] r20512 - in seismo/3D/SPECFEM3D/trunk/src: cuda specfem3D

Tue Jul 10 11:22:43 PDT 2012

Author: danielpeter
Date: 2012-07-10 11:22:43 -0700 (Tue, 10 Jul 2012)
New Revision: 20512

Modified:
   seismo/3D/SPECFEM3D/trunk/src/cuda/check_fields_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_acoustic_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_elastic_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/cuda/it_update_displacement_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h
   seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/assemble_MPI_vector.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic.F90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/iterate_time.f90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/prepare_timerun.F90
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/setup_sources_receivers.f90
Log:
bug fix in stability check for large meshes on GPUs; updates GPU handling for single mpi-process runs; outcomments unused GPU mesh constants

Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/check_fields_cuda.cu
===================================================================

--- seismo/3D/SPECFEM3D/trunk/src/cuda/check_fields_cuda.cu	2012-07-10 10:09:17 UTC (rev 20511)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/check_fields_cuda.cu	2012-07-10 18:22:43 UTC (rev 20512)
@@ -303,12 +303,13 @@
    */
 
   // reduction example:
-  __shared__ realw sdata[256] ;
+  __shared__ realw sdata[BLOCKSIZE_TRANSFER] ;
 
   // load shared mem
   unsigned int tid = threadIdx.x;
-  unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
-
+  unsigned int bx = blockIdx.y*gridDim.x+blockIdx.x;
+  unsigned int i = tid + bx*blockDim.x;  
+  
   // loads absolute values into shared memory
   sdata[tid] = (i < size) ? fabs(array[i]) : 0.0 ;
 
@@ -327,7 +328,7 @@
   }
 
   // write result for this block to global mem
-  if (tid == 0) d_max[blockIdx.x] = sdata[0];
+  if (tid == 0) d_max[bx] = sdata[0];
 
 }
 
@@ -381,15 +382,21 @@
   // way 2 b: timing Elapsed time: 1.236916e-03
   // launch simple reduction kernel
   realw* h_max;
-  int blocksize = 256;
-
-  int num_blocks_x = (int) ceil(mp->NGLOB_AB/blocksize);
+  int blocksize = BLOCKSIZE_TRANSFER;
+  int size_padded = ((int)ceil(((double)mp->NGLOB_AB)/((double)blocksize)))*blocksize;
+  int num_blocks_x = size_padded/blocksize;  
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+  
   //printf("num_blocks_x %i \n",num_blocks_x);
 
-  h_max = (realw*) calloc(num_blocks_x,sizeof(realw));
-  cudaMalloc((void**)&d_max,num_blocks_x*sizeof(realw));
+  h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
+  cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
 
-  dim3 grid(num_blocks_x,1);
+  dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
   if(*SIMULATION_TYPE == 1 ){
@@ -404,11 +411,12 @@
                                          d_max);
   }
 
-  print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(realw),cudaMemcpyDeviceToHost),222);
+  print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
+                                     cudaMemcpyDeviceToHost),222);
 
   // determines max for all blocks
   max = h_max[0];
-  for(int i=1;i<num_blocks_x;i++) {
+  for(int i=1;i<num_blocks_x*num_blocks_y;i++) {
     if( max < h_max[i]) max = h_max[i];
   }
 
@@ -457,7 +465,7 @@
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   //double end_time = get_time();
   //printf("Elapsed time: %e\n",end_time-start_time);
-  exit_on_cuda_error("after get_norm_acoustic_from_device");
+  exit_on_cuda_error("get_norm_acoustic_from_device");
 #endif
 }
 
@@ -470,11 +478,12 @@
 __global__ void get_maximum_vector_kernel(realw* array, int size, realw* d_max){
 
   // reduction example:
-  __shared__ realw sdata[256] ;
+  __shared__ realw sdata[BLOCKSIZE_TRANSFER] ;
 
   // load shared mem
   unsigned int tid = threadIdx.x;
-  unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
+  unsigned int bx = blockIdx.y*gridDim.x+blockIdx.x;
+  unsigned int i = tid + bx*blockDim.x;  
 
   // loads values into shared memory: assume array is a vector array
   sdata[tid] = (i < size) ? sqrt(array[i*3]*array[i*3]
@@ -496,7 +505,7 @@
   }
 
   // write result for this block to global mem
-  if (tid == 0) d_max[blockIdx.x] = sdata[0];
+  if (tid == 0) d_max[bx] = sdata[0];
 
 }
 
@@ -505,8 +514,8 @@
 extern "C"
 void FC_FUNC_(get_norm_elastic_from_device,
               GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
-                                                 long* Mesh_pointer_f,
-                                                 int* SIMULATION_TYPE) {
+                                            long* Mesh_pointer_f,
+                                            int* SIMULATION_TYPE) {
 
   TRACE("get_norm_elastic_from_device");
   //double start_time = get_time();
@@ -515,19 +524,24 @@
   realw max;
   realw *d_max;
 
-  max = 0;
+  max = 0.0;
 
   // launch simple reduction kernel
   realw* h_max;
-  int blocksize = 256;
-
-  int num_blocks_x = (int) ceil(mp->NGLOB_AB/blocksize);
+  int blocksize = BLOCKSIZE_TRANSFER;
+  int size_padded = ((int)ceil(((double)mp->NGLOB_AB)/((double)blocksize)))*blocksize;
+  int num_blocks_x = size_padded/blocksize;  
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+  
   //printf("num_blocks_x %i \n",num_blocks_x);
+  h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
+  cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
 
-  h_max = (realw*) calloc(num_blocks_x,sizeof(realw));
-  cudaMalloc((void**)&d_max,num_blocks_x*sizeof(realw));
-
-  dim3 grid(num_blocks_x,1);
+  dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
   if(*SIMULATION_TYPE == 1 ){
@@ -542,11 +556,18 @@
                                                 d_max);
   }
 
-  print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(realw),cudaMemcpyDeviceToHost),222);
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  exit_on_cuda_error("kernel get_norm_elastic_from_device");
+#endif
+  
+  print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
+                                     cudaMemcpyDeviceToHost),222);
 
   // determines max for all blocks
   max = h_max[0];
-  for(int i=1;i<num_blocks_x;i++) {
+  for(int i=1;i<num_blocks_x*num_blocks_y;i++) {
     if( max < h_max[i]) max = h_max[i];
   }
 
@@ -559,7 +580,7 @@
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   //double end_time = get_time();
   //printf("Elapsed time: %e\n",end_time-start_time);
-  exit_on_cuda_error("after get_norm_elastic_from_device");
+  exit_on_cuda_error("get_norm_elastic_from_device");
 #endif
 }
 

Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_acoustic_cuda.cu	2012-07-10 10:09:17 UTC (rev 20511)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_acoustic_cuda.cu	2012-07-10 18:22:43 UTC (rev 20512)
@@ -185,6 +185,8 @@
   // cudaEventCreate(&start);
   // cudaEventCreate(&stop);
   // cudaEventRecord( start, 0 );
+  
+  if( *num_interfaces_ext_mesh == 0 ) return;
 
   // copies buffer onto GPU
   cudaMemcpy(mp->d_send_potential_dot_dot_buffer, buffer_recv_scalar_ext_mesh,

Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_elastic_cuda.cu	2012-07-10 10:09:17 UTC (rev 20511)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_elastic_cuda.cu	2012-07-10 18:22:43 UTC (rev 20512)
@@ -104,51 +104,53 @@
 
   if( *num_interfaces_ext_mesh == 0 ) return;
 
-  int blocksize = BLOCKSIZE_TRANSFER;
-  int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
-  int num_blocks_x = size_padded/blocksize;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  if( mp->size_mpi_buffer > 0 ){
+    int blocksize = BLOCKSIZE_TRANSFER;
+    int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
+    int num_blocks_x = size_padded/blocksize;
+    int num_blocks_y = 1;
+    while(num_blocks_x > 65535) {
+      num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+      num_blocks_y = num_blocks_y*2;
+    }
 
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(blocksize,1,1);
+    dim3 grid(num_blocks_x,num_blocks_y);
+    dim3 threads(blocksize,1,1);
 
-  //timing for memory xfer
-  // cudaEvent_t start, stop;
-  // realw time;
-  // cudaEventCreate(&start);
-  // cudaEventCreate(&stop);
-  // cudaEventRecord( start, 0 );
-  if(*FORWARD_OR_ADJOINT == 1) {
-    prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,mp->d_send_accel_buffer,
-                                                       mp->num_interfaces_ext_mesh,
-                                                       mp->max_nibool_interfaces_ext_mesh,
-                                                       mp->d_nibool_interfaces_ext_mesh,
-                                                       mp->d_ibool_interfaces_ext_mesh);
-  }
-  else if(*FORWARD_OR_ADJOINT == 3) {
-    prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel,mp->d_send_accel_buffer,
-                                                       mp->num_interfaces_ext_mesh,
-                                                       mp->max_nibool_interfaces_ext_mesh,
-                                                       mp->d_nibool_interfaces_ext_mesh,
-                                                       mp->d_ibool_interfaces_ext_mesh);
-  }
+    //timing for memory xfer
+    // cudaEvent_t start, stop;
+    // realw time;
+    // cudaEventCreate(&start);
+    // cudaEventCreate(&stop);
+    // cudaEventRecord( start, 0 );
+    if(*FORWARD_OR_ADJOINT == 1) {
+      prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,mp->d_send_accel_buffer,
+                                                                              mp->num_interfaces_ext_mesh,
+                                                                              mp->max_nibool_interfaces_ext_mesh,
+                                                                              mp->d_nibool_interfaces_ext_mesh,
+                                                                              mp->d_ibool_interfaces_ext_mesh);
+    }
+    else if(*FORWARD_OR_ADJOINT == 3) {
+      prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel,mp->d_send_accel_buffer,
+                                                                              mp->num_interfaces_ext_mesh,
+                                                                              mp->max_nibool_interfaces_ext_mesh,
+                                                                              mp->d_nibool_interfaces_ext_mesh,
+                                                                              mp->d_ibool_interfaces_ext_mesh);
+    }
 
+    // copies buffer from GPU to CPU host
+    cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer,
+               mp->size_mpi_buffer*sizeof(realw),cudaMemcpyDeviceToHost);
 
-  cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer,
-             3*mp->max_nibool_interfaces_ext_mesh*mp->num_interfaces_ext_mesh*sizeof(realw),
-             cudaMemcpyDeviceToHost);
-
-  // finish timing of kernel+memcpy
-  // cudaEventRecord( stop, 0 );
-  // cudaEventSynchronize( stop );
-  // cudaEventElapsedTime( &time, start, stop );
-  // cudaEventDestroy( start );
-  // cudaEventDestroy( stop );
-  // printf("boundary xfer d->h Time: %f ms\n",time);
+    // finish timing of kernel+memcpy
+    // cudaEventRecord( stop, 0 );
+    // cudaEventSynchronize( stop );
+    // cudaEventElapsedTime( &time, start, stop );
+    // cudaEventDestroy( start );
+    // cudaEventDestroy( stop );
+    // printf("boundary xfer d->h Time: %f ms\n",time);
+  }
+  
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("transfer_boun_accel_from_device");
 #endif
@@ -167,17 +169,19 @@
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
 
+  if( mp->size_mpi_buffer > 0 ){
+  
 //daniel: todo - check below with this...
- int blocksize = BLOCKSIZE_TRANSFER;
- int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(blocksize,1,1);
+    int blocksize = BLOCKSIZE_TRANSFER;
+    int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
+    int num_blocks_x = size_padded/blocksize;
+    int num_blocks_y = 1;
+    while(num_blocks_x > 65535) {
+      num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+      num_blocks_y = num_blocks_y*2;
+    }
+    dim3 grid(num_blocks_x,num_blocks_y);
+    dim3 threads(blocksize,1,1);
 
 /*
 //daniel: todo - check originally used...
@@ -193,25 +197,24 @@
   dim3 threads(blocksize,1,1);
 */
 
-  prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,mp->d_send_accel_buffer,
+    prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,mp->d_send_accel_buffer,
                                                                           mp->num_interfaces_ext_mesh,
                                                                           mp->max_nibool_interfaces_ext_mesh,
                                                                           mp->d_nibool_interfaces_ext_mesh,
                                                                           mp->d_ibool_interfaces_ext_mesh);
   // wait until kernel is finished before starting async memcpy
 #if CUDA_VERSION >= 4000
-  cudaDeviceSynchronize();
+    cudaDeviceSynchronize();
 #else
-  cudaThreadSynchronize();
+    cudaThreadSynchronize();
 #endif
 
-  cudaMemcpyAsync(mp->h_send_accel_buffer,mp->d_send_accel_buffer,
-                  3* mp->max_nibool_interfaces_ext_mesh* mp->num_interfaces_ext_mesh*sizeof(realw),
-                  cudaMemcpyDeviceToHost,mp->copy_stream);
-  // cudaMemcpyAsync(mp->h_send_accel_buffer,mp->d_send_accel_buffer,
-  // 3* mp->max_nibool_interfaces_ext_mesh* mp->num_interfaces_ext_mesh*sizeof(realw),
-  // cudaMemcpyDeviceToHost,mp->compute_stream);
-
+    cudaMemcpyAsync(mp->h_send_accel_buffer,mp->d_send_accel_buffer,
+                  mp->size_mpi_buffer*sizeof(realw),cudaMemcpyDeviceToHost,mp->copy_stream);
+    // cudaMemcpyAsync(mp->h_send_accel_buffer,mp->d_send_accel_buffer,
+    // 3* mp->max_nibool_interfaces_ext_mesh* mp->num_interfaces_ext_mesh*sizeof(realw),
+    // cudaMemcpyDeviceToHost,mp->compute_stream);
+  }
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -271,19 +274,16 @@
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
-  memcpy(mp->h_recv_accel_buffer,buffer_recv_vector_ext_mesh,mp->size_mpi_recv_buffer*sizeof(realw));
-
-  // cudaMemcpyAsync(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
-  // 3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw),
-  // cudaMemcpyHostToDevice,mp->compute_stream);
-  //printf("xfer to device\n");
-  cudaMemcpyAsync(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
-                  3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw),
-                  cudaMemcpyHostToDevice,mp->copy_stream);
-
-
-
-
+  if( mp->size_mpi_buffer > 0 ){
+    // copy on host memory
+    memcpy(mp->h_recv_accel_buffer,buffer_recv_vector_ext_mesh,mp->size_mpi_buffer*sizeof(realw));
+  
+    // cudaMemcpyAsync(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
+    //        mp->size_mpi_buffer*sizeof(realw),cudaMemcpyHostToDevice,mp->compute_stream);
+    //printf("xfer to device\n");
+    cudaMemcpyAsync(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
+                    mp->size_mpi_buffer*sizeof(realw),cudaMemcpyHostToDevice,mp->copy_stream);
+  }
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -371,55 +371,58 @@
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
-  //daniel: todo - check if this copy is only needed for adjoint simulation, otherwise it is called asynchronously?
-  if(*FORWARD_OR_ADJOINT == 1 ){
-    // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
-    cudaStreamSynchronize(mp->copy_stream);
-  }
-  else if(*FORWARD_OR_ADJOINT == 3 ){
-    cudaMemcpy(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
-             3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw),
-             cudaMemcpyHostToDevice);
-  }
+  if( mp->size_mpi_buffer > 0 ){
+  
+    //daniel: todo - check if this copy is only needed for adjoint simulation, otherwise it is called asynchronously?
+    if(*FORWARD_OR_ADJOINT == 1 ){
+      // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
+      cudaStreamSynchronize(mp->copy_stream);
+    }
+    else if(*FORWARD_OR_ADJOINT == 3 ){
+      cudaMemcpy(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
+                 mp->size_mpi_buffer*sizeof(realw),cudaMemcpyHostToDevice);
+    }
 
-  int blocksize = BLOCKSIZE_TRANSFER;
-  int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
-  int num_blocks_x = size_padded/blocksize;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+    int blocksize = BLOCKSIZE_TRANSFER;
+    int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
+    int num_blocks_x = size_padded/blocksize;
+    int num_blocks_y = 1;
+    while(num_blocks_x > 65535) {
+      num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+      num_blocks_y = num_blocks_y*2;
+    }
 
-  //double start_time = get_time();
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(blocksize,1,1);
-  // cudaEvent_t start, stop;
-  // realw time;
-  // cudaEventCreate(&start);
-  // cudaEventCreate(&stop);
-  // cudaEventRecord( start, 0 );
-  if(*FORWARD_OR_ADJOINT == 1) { //assemble forward accel
-    assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel, mp->d_send_accel_buffer,
-                                                        mp->num_interfaces_ext_mesh,
-                                                        mp->max_nibool_interfaces_ext_mesh,
-                                                        mp->d_nibool_interfaces_ext_mesh,
-                                                        mp->d_ibool_interfaces_ext_mesh);
+    //double start_time = get_time();
+    dim3 grid(num_blocks_x,num_blocks_y);
+    dim3 threads(blocksize,1,1);
+    // cudaEvent_t start, stop;
+    // realw time;
+    // cudaEventCreate(&start);
+    // cudaEventCreate(&stop);
+    // cudaEventRecord( start, 0 );
+    if(*FORWARD_OR_ADJOINT == 1) { //assemble forward accel
+      assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel, mp->d_send_accel_buffer,
+                                                                               mp->num_interfaces_ext_mesh,
+                                                                               mp->max_nibool_interfaces_ext_mesh,
+                                                                               mp->d_nibool_interfaces_ext_mesh,
+                                                                               mp->d_ibool_interfaces_ext_mesh);
+    }
+    else if(*FORWARD_OR_ADJOINT == 3) { //assemble adjoint accel
+      assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel, mp->d_send_accel_buffer,
+                                                                               mp->num_interfaces_ext_mesh,
+                                                                               mp->max_nibool_interfaces_ext_mesh,
+                                                                               mp->d_nibool_interfaces_ext_mesh,
+                                                                               mp->d_ibool_interfaces_ext_mesh);
+    }
+
+    // cudaEventRecord( stop, 0 );
+    // cudaEventSynchronize( stop );
+    // cudaEventElapsedTime( &time, start, stop );
+    // cudaEventDestroy( start );
+    // cudaEventDestroy( stop );
+    // printf("Boundary Assemble Kernel Execution Time: %f ms\n",time);
   }
-  else if(*FORWARD_OR_ADJOINT == 3) { //assemble adjoint accel
-    assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel, mp->d_send_accel_buffer,
-                                                        mp->num_interfaces_ext_mesh,
-                                                        mp->max_nibool_interfaces_ext_mesh,
-                                                        mp->d_nibool_interfaces_ext_mesh,
-                                                        mp->d_ibool_interfaces_ext_mesh);
-  }
-
-  // cudaEventRecord( stop, 0 );
-  // cudaEventSynchronize( stop );
-  // cudaEventElapsedTime( &time, start, stop );
-  // cudaEventDestroy( start );
-  // cudaEventDestroy( stop );
-  // printf("Boundary Assemble Kernel Execution Time: %f ms\n",time);
+  
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   //double end_time = get_time();
   //printf("Elapsed time: %e\n",end_time-start_time);
@@ -1629,6 +1632,14 @@
       color_offset_nonpadded += nb_blocks_to_compute * NGLL3;
       // for factor_common array
       color_offset_nonpadded_att2 += nb_blocks_to_compute * NGLL3 * N_SLS;
+
+      //daniel: we use the same stream, so kernels are executed one after the other
+      // synchronizes in case we run on only 1 process to avoid race-conditions
+      //if( mp->NPROC == 1 ){
+      //  // Wait until previous compute stream finishes.         
+      //  cudaStreamSynchronize(mp->compute_stream);        
+      //}
+      
     }
 
   }else{
@@ -1706,7 +1717,7 @@
 //    // There have been problems using the pinned-memory with MPI, so
 //    // we copy the buffer into a non-pinned region.
 //    memcpy(mp->send_buffer,mp->h_send_accel_buffer,
-//           mp->size_mpi_send_buffer*sizeof(float));
+//           mp->size_mpi_buffer*sizeof(float));
 //
 //    // memory copy is now finished, so non-blocking MPI send can proceed
 //    // MPI based halo exchange
@@ -1744,19 +1755,16 @@
   // Wait until async-memcpy of outer elements is finished and start MPI.
   if( *iphase != 2 ){ exit_on_cuda_error("sync_copy_from_device must be called for iphase == 2"); }
 
-  //if(*iphase==2) {
+  if( mp->size_mpi_buffer > 0 ){
+    // waits for asynchronous copy to finish
+    cudaStreamSynchronize(mp->copy_stream);
 
-  // waits for asynchronous copy to finish
-  cudaStreamSynchronize(mp->copy_stream);
-
-  // There have been problems using the pinned-memory with MPI, so
-  // we copy the buffer into a non-pinned region.
-  memcpy(send_buffer,mp->h_send_accel_buffer,
-         mp->size_mpi_send_buffer*sizeof(float));
-
+    // There have been problems using the pinned-memory with MPI, so
+    // we copy the buffer into a non-pinned region.
+    memcpy(send_buffer,mp->h_send_accel_buffer,
+           mp->size_mpi_buffer*sizeof(float));
+  }
   // memory copy is now finished, so non-blocking MPI send can proceed
-
-  //}
 }
 
 /* ----------------------------------------------------------------------------------------------- */

Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/it_update_displacement_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/it_update_displacement_cuda.cu	2012-07-10 10:09:17 UTC (rev 20511)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/it_update_displacement_cuda.cu	2012-07-10 18:22:43 UTC (rev 20512)
@@ -28,15 +28,15 @@
 
 #include <stdio.h>
 #include <cuda.h>
-#include <cublas.h>
+//#include <cublas.h>
 
 #include "config.h"
 #include "mesh_constants_cuda.h"
 
 
-#define CUBLAS_ERROR(s,n)  if (s != CUBLAS_STATUS_SUCCESS) {  \
-fprintf (stderr, "CUBLAS Memory Write Error @ %d\n",n); \
-exit(EXIT_FAILURE); }
+//#define CUBLAS_ERROR(s,n)  if (s != CUBLAS_STATUS_SUCCESS) {  \
+//fprintf (stderr, "CUBLAS Memory Write Error @ %d\n",n); \
+//exit(EXIT_FAILURE); }
 
 /* ----------------------------------------------------------------------------------------------- */
 
@@ -84,12 +84,6 @@
   //int i,device;
 
   int size = *size_F;
-  realw deltat = *deltat_F;
-  realw deltatsqover2 = *deltatsqover2_F;
-  realw deltatover2 = *deltatover2_F;
-  realw b_deltat = *b_deltat_F;
-  realw b_deltatsqover2 = *b_deltatsqover2_F;
-  realw b_deltatover2 = *b_deltatover2_F;
   //cublasStatus status;
 
   int blocksize = BLOCKSIZE_KERNEL1;
@@ -110,7 +104,11 @@
 //  exit_on_cuda_error("Before UpdateDispVeloc_kernel");
 //#endif
 
-  //launch kernel
+  realw deltat = *deltat_F;
+  realw deltatsqover2 = *deltatsqover2_F;
+  realw deltatover2 = *deltatover2_F;
+  
+  //launch kernel  
   UpdateDispVeloc_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ,mp->d_veloc,mp->d_accel,
                                            size,deltat,deltatsqover2,deltatover2);
 
@@ -123,7 +121,10 @@
 
   // kernel for backward fields
   if(*SIMULATION_TYPE == 3) {
-
+    realw b_deltat = *b_deltat_F;
+    realw b_deltatsqover2 = *b_deltatsqover2_F;
+    realw b_deltatover2 = *b_deltatover2_F;
+    
     UpdateDispVeloc_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ,mp->d_b_veloc,mp->d_b_accel,
                                              size,b_deltat,b_deltatsqover2,b_deltatover2);
 
@@ -186,12 +187,6 @@
 
   //int i,device;
   int size = *size_F;
-  realw deltat = *deltat_F;
-  realw deltatsqover2 = *deltatsqover2_F;
-  realw deltatover2 = *deltatover2_F;
-  realw b_deltat = *b_deltat_F;
-  realw b_deltatsqover2 = *b_deltatsqover2_F;
-  realw b_deltatover2 = *b_deltatover2_F;
   //cublasStatus status;
 
   int blocksize = BLOCKSIZE_KERNEL1;
@@ -207,6 +202,10 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
+  realw deltat = *deltat_F;
+  realw deltatsqover2 = *deltatsqover2_F;
+  realw deltatover2 = *deltatover2_F;
+  
   //launch kernel
   UpdatePotential_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_potential_acoustic,
                                            mp->d_potential_dot_acoustic,
@@ -214,6 +213,10 @@
                                            size,deltat,deltatsqover2,deltatover2);
 
   if(*SIMULATION_TYPE == 3) {
+    realw b_deltat = *b_deltat_F;
+    realw b_deltatsqover2 = *b_deltatsqover2_F;
+    realw b_deltatover2 = *b_deltatover2_F;
+    
     UpdatePotential_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_potential_acoustic,
                                              mp->d_b_potential_dot_acoustic,
                                              mp->d_b_potential_dot_dot_acoustic,

Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h	2012-07-10 10:09:17 UTC (rev 20511)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h	2012-07-10 18:22:43 UTC (rev 20512)
@@ -71,8 +71,9 @@
 #endif
 
 // error checking after cuda function calls
-/* #define ENABLE_VERY_SLOW_ERROR_CHECKING */
+//#define ENABLE_VERY_SLOW_ERROR_CHECKING
 
+// helper functions
 #define MAX(x,y)                    (((x) < (y)) ? (y) : (x))
 
 double get_time();
@@ -124,7 +125,7 @@
 // requires CUDA version >= 4.0, see check below
 // Use textures for d_displ and d_accel -- 10% performance boost
 #define USE_TEXTURES_FIELDS
-//
+
 // Using texture memory for the hprime-style constants is slower on
 // Fermi generation hardware, but *may* be faster on Kepler
 // generation.
@@ -181,7 +182,7 @@
   int NGLOB_AB;
 
   int myrank;
-  int NPROCS;
+  int NPROC;
 
   // interpolators
   realw* d_xix; realw* d_xiy; realw* d_xiz;
@@ -215,26 +216,23 @@
   float* h_recv_accel_buffer;
   float* h_recv_b_accel_buffer;
   float* recv_buffer;
-  int size_mpi_send_buffer;
-  int size_mpi_recv_buffer;
+  int size_mpi_buffer;
 
-
-
   // buffers and constants for the MPI-send required for async-memcpy
   // + non-blocking MPI
-  float* buffer_recv_vector_ext_mesh;
+  //daniel: check if needed
+  //float* buffer_recv_vector_ext_mesh;
   int num_interfaces_ext_mesh;
   int max_nibool_interfaces_ext_mesh;
-  int* nibool_interfaces_ext_mesh;
-  int* my_neighbours_ext_mesh;
-  int* request_send_vector_ext_mesh;
-  int* request_recv_vector_ext_mesh;
+  //int* nibool_interfaces_ext_mesh;
+  //int* my_neighbours_ext_mesh;
+  //int* request_send_vector_ext_mesh;
+  //int* request_recv_vector_ext_mesh;
 
-
   // overlapped memcpy streams
   cudaStream_t compute_stream;
   cudaStream_t copy_stream;
-  cudaStream_t b_copy_stream;
+  //cudaStream_t b_copy_stream;
 
   // ------------------------------------------------------------------ //
   // elastic wavefield parameters

Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu	2012-07-10 10:09:17 UTC (rev 20511)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu	2012-07-10 18:22:43 UTC (rev 20512)
@@ -80,7 +80,7 @@
     char hostname[256];
     gethostname(hostname, sizeof(hostname));
     printf("PID %d on %s:%d ready for attach\n", getpid(), hostname,myrank);
-    FILE *file = fopen("/scratch/eiger/rietmann/attach_gdb.txt","w+");
+    FILE *file = fopen("./attach_gdb.txt","w+");
     if (file != NULL){
       fprintf(file,"PID %d on %s:%d ready for attach\n", getpid(), hostname,myrank);
       fclose(file);
@@ -97,16 +97,35 @@
   // sync and check to catch errors from previous async operations
   cudaThreadSynchronize();
   cudaError_t err = cudaGetLastError();
-  if (err != cudaSuccess)
-    {
-      fprintf(stderr,"Error after %s: %s\n", kernel_name, cudaGetErrorString(err));
-      pause_for_debugger(0);
-      //free(kernel_name);
+  if (err != cudaSuccess){
+    fprintf(stderr,"Error after %s: %s\n", kernel_name, cudaGetErrorString(err));
+
+    //debugging
+    //pause_for_debugger(0);
+
+    // outputs error file
+    FILE* fp;
+    int myrank;
+    char filename[BUFSIZ];  
 #ifdef WITH_MPI
-      MPI_Abort(MPI_COMM_WORLD,1);
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+#else
+    myrank = 0;
+#endif  
+    sprintf(filename,"../in_out_files/OUTPUT_FILES/error_message_%06d.txt",myrank);
+    fp = fopen(filename,"a+");
+    if (fp != NULL){
+      fprintf(fp,"Error after %s: %s\n", kernel_name, cudaGetErrorString(err));
+      fclose(fp);
+    }
+    
+    // stops program
+    //free(kernel_name);
+#ifdef WITH_MPI
+    MPI_Abort(MPI_COMM_WORLD,1);
 #endif
-      exit(EXIT_FAILURE);
-    }
+    exit(EXIT_FAILURE);
+  }
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -115,7 +134,25 @@
 {
   printf("\nERROR: %s\n",info);
   fflush(stdout);
+  
+  // outputs error file
+  FILE* fp;
+  int myrank;
+  char filename[BUFSIZ];  
 #ifdef WITH_MPI
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+#else
+  myrank = 0;
+#endif  
+  sprintf(filename,"../in_out_files/OUTPUT_FILES/error_message_%06d.txt",myrank);
+  fp = fopen(filename,"a+");
+  if (fp != NULL){
+    fprintf(fp,"ERROR: %s\n",info);
+    fclose(fp);
+  }
+  
+  // stops program
+#ifdef WITH_MPI
   MPI_Abort(MPI_COMM_WORLD,1);
 #endif
   //free(info);
@@ -131,7 +168,25 @@
   {
     printf("\nCUDA error !!!!! <%s> !!!!! \nat CUDA call error code: # %d\n",cudaGetErrorString(err),num);
     fflush(stdout);
+
+    // outputs error file
+    FILE* fp;
+    int myrank;
+    char filename[BUFSIZ];  
 #ifdef WITH_MPI
+    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+#else
+    myrank = 0;
+#endif  
+    sprintf(filename,"../in_out_files/OUTPUT_FILES/error_message_%06d.txt",myrank);
+    fp = fopen(filename,"a+");
+    if (fp != NULL){
+      fprintf(fp,"\nCUDA error !!!!! <%s> !!!!! \nat CUDA call error code: # %d\n",cudaGetErrorString(err),num);
+      fclose(fp);
+    }
+
+    // stops program
+#ifdef WITH_MPI
     MPI_Abort(MPI_COMM_WORLD,1);
 #endif
     exit(EXIT_FAILURE);
@@ -491,9 +546,8 @@
   if (device_count == 0) exit_on_error("CUDA runtime error: there is no device supporting CUDA\n");
   *ncuda_devices = device_count;
 
-
   // Sets the active device
-  if(device_count > 1) {
+  if(device_count >= 1) {
     // generalized for more GPUs per node
     // note: without previous context release, cudaSetDevice will complain with the cuda error
     //         "setting the device when a process is active is not allowed"
@@ -631,16 +685,15 @@
     exit_on_error("NGLLX must be 5 for CUDA devices");
   }
 
-
+  // sets number of processes
 #ifdef WITH_MPI
   int nproc;
   MPI_Comm_size(MPI_COMM_WORLD,&nproc);
-  mp->NPROCS=nproc;
+  mp->NPROC = nproc;
 #else
-  mp->NPROCS = 1;
+  mp->NPROC = 1;
 #endif
 
-
   // sets global parameters
   mp->NSPEC_AB = *NSPEC_AB;
   mp->NGLOB_AB = *NGLOB_AB;
@@ -670,36 +723,6 @@
   }
   #endif
 
-
-  // Allocate pinned mpi-buffers.
-  // MPI buffers use pinned memory allocated by cudaMallocHost, which
-  // enables the use of asynchronous memory copies from host <->
-  // device
-  int size_mpi_buffer = 3 * (*num_interfaces_ext_mesh) * (*max_nibool_interfaces_ext_mesh);
-  print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_accel_buffer),sizeof(float)*(size_mpi_buffer)),8004);
-  mp->send_buffer = (float*)malloc((size_mpi_buffer)*sizeof(float));
-  mp->size_mpi_send_buffer = size_mpi_buffer;
-
-  print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_recv_accel_buffer),sizeof(float)*(size_mpi_buffer)),8004);
-  mp->recv_buffer = (float*)malloc((size_mpi_buffer)*sizeof(float));
-  mp->size_mpi_recv_buffer = size_mpi_buffer;
-
-  print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_b_accel_buffer),sizeof(float)*(size_mpi_buffer)),8004);
-  // mp->b_send_buffer = (float*)malloc((size_mpi_buffer)*sizeof(float));
-
-  mp->num_interfaces_ext_mesh = *num_interfaces_ext_mesh;
-  mp->max_nibool_interfaces_ext_mesh = *max_nibool_interfaces_ext_mesh;
-  mp->nibool_interfaces_ext_mesh = h_nibool_interfaces_ext_mesh;
-  mp->my_neighbours_ext_mesh = my_neighbours_ext_mesh;
-  mp->request_send_vector_ext_mesh = request_send_vector_ext_mesh;
-  mp->request_recv_vector_ext_mesh = request_recv_vector_ext_mesh;
-  mp->buffer_recv_vector_ext_mesh = buffer_recv_vector_ext_mesh;
-
-  // setup two streams, one for compute and one for host<->device memory copies
-  cudaStreamCreate(&mp->compute_stream);
-  cudaStreamCreate(&mp->copy_stream);
-  cudaStreamCreate(&mp->b_copy_stream);
-
   /* Assuming NGLLX=5. Padded is then 128 (5^3+3) */
   int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
 
@@ -764,6 +787,41 @@
                                        cudaMemcpyHostToDevice),1204);
   }
 
+  // Allocate pinned mpi-buffers.
+  // MPI buffers use pinned memory allocated by cudaMallocHost, which
+  // enables the use of asynchronous memory copies from host <->
+  // device
+  int size_mpi_buffer = 3 * (mp->num_interfaces_ext_mesh) * (mp->max_nibool_interfaces_ext_mesh);
+  // send buffer
+  mp->size_mpi_buffer = size_mpi_buffer;  
+  if( mp->size_mpi_buffer > 0 ){
+    print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_accel_buffer),sizeof(float)*(mp->size_mpi_buffer)),8004);
+    mp->send_buffer = (float*)malloc((mp->size_mpi_buffer)*sizeof(float));
+    // adjoint
+    print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_b_accel_buffer),sizeof(float)*(size_mpi_buffer)),8004);
+    // mp->b_send_buffer = (float*)malloc((size_mpi_buffer)*sizeof(float));
+
+    // receive buffer
+    print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_recv_accel_buffer),sizeof(float)*(mp->size_mpi_buffer)),8004);
+    mp->recv_buffer = (float*)malloc((mp->size_mpi_buffer)*sizeof(float));  
+  }
+
+  //daniel: check if needed
+  //mp->nibool_interfaces_ext_mesh = h_nibool_interfaces_ext_mesh;
+  //mp->my_neighbours_ext_mesh = my_neighbours_ext_mesh;
+  //mp->request_send_vector_ext_mesh = request_send_vector_ext_mesh;
+  //mp->request_recv_vector_ext_mesh = request_recv_vector_ext_mesh;
+  //mp->buffer_recv_vector_ext_mesh = buffer_recv_vector_ext_mesh;
+  
+  // setup two streams, one for compute and one for host<->device memory copies
+  // compute stream
+  cudaStreamCreate(&mp->compute_stream);
+  // copy stream (needed to transfer mpi buffers)
+  if( mp->size_mpi_buffer > 0 ){
+    cudaStreamCreate(&mp->copy_stream);
+    //cudaStreamCreate(&mp->b_copy_stream);  
+  }
+  
   // inner elements
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ispec_is_inner,mp->NSPEC_AB*sizeof(int)),1205);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_inner, h_ispec_is_inner,
@@ -906,9 +964,12 @@
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_dot_acoustic),sizeof(realw)*size_glob),2003);
 
   // mpi buffer
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_potential_dot_dot_buffer),
+  if( mp->num_interfaces_ext_mesh > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_potential_dot_dot_buffer),
                       (mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),2004);
-
+  }
+  
+  // mass matrix
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_acoustic),sizeof(realw)*size_glob),2005);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_acoustic,rmass_acoustic,
                                      sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
@@ -1145,9 +1206,11 @@
   #endif
 
   // mpi buffer
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),
-                        3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),4004);
-
+  if( mp->size_mpi_buffer > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),
+                                      mp->size_mpi_buffer*sizeof(realw)),4004);
+  }
+  
   // mass matrix
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass),sizeof(realw)*mp->NGLOB_AB),4005);
   // transfer element data

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/assemble_MPI_vector.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/assemble_MPI_vector.f90	2012-07-10 10:09:17 UTC (rev 20511)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/assemble_MPI_vector.f90	2012-07-10 18:22:43 UTC (rev 20512)
@@ -446,15 +446,15 @@
   ! assemble only if more than one partition
   if(NPROC > 1) then
 
-  ! wait for communications completion (recv)
-  !write(IMAIN,*) "sending MPI_wait"
-  do iinterface = 1, num_interfaces_ext_mesh
-    call wait_req(request_recv_vector_ext_mesh(iinterface))
-  enddo
+    ! wait for communications completion (recv)
+    !write(IMAIN,*) "sending MPI_wait"
+    do iinterface = 1, num_interfaces_ext_mesh
+      call wait_req(request_recv_vector_ext_mesh(iinterface))
+    enddo
 
-  ! send contributions to GPU
-  call transfer_boundary_to_device_a(Mesh_pointer, buffer_recv_vector_ext_mesh, &
-                                    num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh)
+    ! send contributions to GPU
+    call transfer_boundary_to_device_a(Mesh_pointer, buffer_recv_vector_ext_mesh, &
+                                      num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh)
   endif
 
   ! This step is done via previous function transfer_and_assemble...

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic.F90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic.F90	2012-07-10 10:09:17 UTC (rev 20511)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic.F90	2012-07-10 18:22:43 UTC (rev 20512)
@@ -255,13 +255,11 @@
                my_neighbours_ext_mesh, &
                request_send_vector_ext_mesh,request_recv_vector_ext_mesh)
        else ! GPU_MODE==1
-
           ! transfers boundary region to host asynchronously. The
           ! MPI-send is done from within compute_forces_elastic_cuda,
           ! once the inner element kernels are launched, and the
           ! memcpy has finished. see compute_forces_elastic_cuda:1655
           call transfer_boundary_from_device_a(Mesh_pointer,nspec_outer_elastic)
-
        endif ! GPU_MODE
 
        ! adjoint simulations
@@ -284,7 +282,6 @@
                   nibool_interfaces_ext_mesh,&
                   my_neighbours_ext_mesh, &
                   b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh)
-
           endif ! GPU
        endif !adjoint
 

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/iterate_time.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/iterate_time.f90	2012-07-10 10:09:17 UTC (rev 20511)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/iterate_time.f90	2012-07-10 18:22:43 UTC (rev 20512)
@@ -136,31 +136,6 @@
 
 !=====================================================================
 
-  subroutine it_print_elapsed_time()
-    use specfem_par
-    use specfem_par_elastic
-    use specfem_par_acoustic
-    implicit none
-
-    ! local parameters
-    double precision :: tCPU
-    integer :: ihours,iminutes,iseconds,int_tCPU
-
-    if(myrank == 0) then
-       ! elapsed time since beginning of the simulation
-       tCPU = wtime() - time_start
-       int_tCPU = int(tCPU)
-       ihours = int_tCPU / 3600
-       iminutes = (int_tCPU - 3600*ihours) / 60
-       iseconds = int_tCPU - 3600*ihours - 60*iminutes
-       write(IMAIN,*) 'Time-Loop Complete. Timing info:'
-       write(IMAIN,*) 'Total elapsed time in seconds = ',tCPU
-       write(IMAIN,"(' Total elapsed time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds
-    endif
-  end subroutine it_print_elapsed_time
-
-!=====================================================================
-
   subroutine it_check_stability()
 
 ! computes the maximum of the norm of the displacement
@@ -209,8 +184,8 @@
     ! check stability of the code, exit if unstable
     ! negative values can occur with some compilers when the unstable value is greater
     ! than the greatest possible floating-point number of the machine
-    if(Usolidnorm > STABILITY_THRESHOLD .or. Usolidnorm < 0) &
-      call exit_MPI(myrank,'forward simulation became unstable and blew up')
+    !if(Usolidnorm > STABILITY_THRESHOLD .or. Usolidnorm < 0.0_CUSTOM_REAL) &
+    !  call exit_MPI(myrank,'single forward simulation became unstable and blew up')
 
     ! compute the maximum of the maxima for all the slices using an MPI reduction
     call max_all_cr(Usolidnorm,Usolidnorm_all)
@@ -262,8 +237,8 @@
     ! check stability of the code, exit if unstable
     ! negative values can occur with some compilers when the unstable value is greater
     ! than the greatest possible floating-point number of the machine
-    if(b_Usolidnorm > STABILITY_THRESHOLD .or. b_Usolidnorm < 0) &
-      call exit_MPI(myrank,'backward simulation became unstable and blew up')
+    !if(b_Usolidnorm > STABILITY_THRESHOLD .or. b_Usolidnorm < 0.0_CUSTOM_REAL) &
+    !  call exit_MPI(myrank,'single backward simulation became unstable and blew up')
 
     ! compute max of all slices
     call max_all_cr(b_Usolidnorm,b_Usolidnorm_all)
@@ -365,10 +340,10 @@
     ! check stability of the code, exit if unstable
     ! negative values can occur with some compilers when the unstable value is greater
     ! than the greatest possible floating-point number of the machine
-    if(Usolidnorm_all > STABILITY_THRESHOLD .or. Usolidnorm_all < 0.0 &
-     .or. Usolidnormp_all > STABILITY_THRESHOLD .or. Usolidnormp_all < 0.0 &
-     .or. Usolidnorms_all > STABILITY_THRESHOLD .or. Usolidnorms_all < 0.0 &
-     .or. Usolidnormw_all > STABILITY_THRESHOLD .or. Usolidnormw_all < 0.0) &
+    if(Usolidnorm_all > STABILITY_THRESHOLD .or. Usolidnorm_all < 0.0_CUSTOM_REAL &
+     .or. Usolidnormp_all > STABILITY_THRESHOLD .or. Usolidnormp_all < 0.0_CUSTOM_REAL &
+     .or. Usolidnorms_all > STABILITY_THRESHOLD .or. Usolidnorms_all < 0.0_CUSTOM_REAL &
+     .or. Usolidnormw_all > STABILITY_THRESHOLD .or. Usolidnormw_all < 0.0_CUSTOM_REAL) &
         call exit_MPI(myrank,'forward simulation became unstable and blew up')
     ! adjoint simulations
     if(SIMULATION_TYPE == 3 .and. (b_Usolidnorm_all > STABILITY_THRESHOLD &
@@ -745,7 +720,33 @@
 
   end subroutine it_store_attenuation_arrays
 
+!=====================================================================
 
+  subroutine it_print_elapsed_time()
+  
+    use specfem_par
+    use specfem_par_elastic
+    use specfem_par_acoustic
+    implicit none
+
+    ! local parameters
+    double precision :: tCPU
+    integer :: ihours,iminutes,iseconds,int_tCPU
+
+    if(myrank == 0) then
+       ! elapsed time since beginning of the simulation
+       tCPU = wtime() - time_start
+       int_tCPU = int(tCPU)
+       ihours = int_tCPU / 3600
+       iminutes = (int_tCPU - 3600*ihours) / 60
+       iseconds = int_tCPU - 3600*ihours - 60*iminutes
+       write(IMAIN,*) 'Time-Loop Complete. Timing info:'
+       write(IMAIN,*) 'Total elapsed time in seconds = ',tCPU
+       write(IMAIN,"(' Total elapsed time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds
+    endif
+
+  end subroutine it_print_elapsed_time
+
 !=====================================================================
 
   subroutine it_transfer_from_GPU()

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/prepare_timerun.F90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/prepare_timerun.F90	2012-07-10 10:09:17 UTC (rev 20511)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/prepare_timerun.F90	2012-07-10 18:22:43 UTC (rev 20512)
@@ -33,10 +33,92 @@
   use specfem_par_elastic
   use specfem_par_poroelastic
   use specfem_par_movie
+  implicit none  
+  ! local parameters
+  double precision :: tCPU
 
+  ! get MPI starting time
+  time_start = wtime()
+
+  ! user output infos
+  call prepare_timerun_user_output()
+
+  ! sets up mass matrices
+  call prepare_timerun_mass_matrices()
+
+  ! initializes arrays
+  call prepare_timerun_init_wavefield()
+
+  ! sets up time increments
+  call prepare_timerun_constants()
+
+  ! prepares attenuation arrays
+  call prepare_timerun_attenuation()
+
+  ! prepares gravity arrays
+  call prepare_timerun_gravity()
+
+  ! initializes PML arrays
+  if( ABSORBING_CONDITIONS  ) then
+    if (SIMULATION_TYPE /= 1 .and. ABSORB_USE_PML )  then
+      write(IMAIN,*) 'NOTE: adjoint simulations and PML not supported yet...'
+    else
+      if( ABSORB_USE_PML ) then
+        call PML_initialize()
+      endif
+    endif
+  endif
+
+  ! prepares ADJOINT simulations
+  call prepare_timerun_adjoint()
+
+  ! prepares noise simulations
+  call prepare_timerun_noise()
+
+  ! prepares GPU arrays
+  if(GPU_MODE) call prepare_timerun_GPU()
+
+#ifdef OPENMP_MODE
+  ! prepares arrays for OpenMP
+  call prepare_timerun_OpenMP()
+#endif
+
+  ! synchronize all the processes
+  call sync_all()
+
+  ! elapsed time since beginning of preparation
+  if(myrank == 0) then
+    tCPU = wtime() - time_start
+    write(IMAIN,*)
+    write(IMAIN,*) 'Elapsed time for preparing timerun in seconds = ',tCPU
+    write(IMAIN,*)
+    write(IMAIN,*) 'time loop:'
+    write(IMAIN,*)
+    write(IMAIN,*) '           time step: ',sngl(DT),' s'
+    write(IMAIN,*) 'number of time steps: ',NSTEP
+    write(IMAIN,*) 'total simulated time: ',sngl(NSTEP*DT),' seconds'
+    write(IMAIN,*) 'start time:',sngl(-t0),' seconds'
+    write(IMAIN,*)
+
+    !daniel debug: time estimation
+    ! elastic elements: time per element t_per_element = 1.40789368e-05 s
+    ! total time = nspec * nstep * t_per_element
+  endif  
+  
+  end subroutine prepare_timerun
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine prepare_timerun_user_output()
+
+  use specfem_par
+  use specfem_par_acoustic
+  use specfem_par_elastic
+  use specfem_par_poroelastic
+  use specfem_par_movie  
   implicit none
-  character(len=256) :: plot_file
-  integer :: ier
 
   ! flag for any movie simulation
   if( EXTERNAL_MESH_MOVIE_SURFACE .or. EXTERNAL_MESH_CREATE_SHAKEMAP .or. &
@@ -102,7 +184,6 @@
     else
       write(IMAIN,*) 'no poroelastic simulation'
     endif
-    write(IMAIN,*)
 
     write(IMAIN,*)
     if(MOVIE_SIMULATION) then
@@ -110,17 +191,98 @@
     else
       write(IMAIN,*) 'no movie simulation'
     endif
+
     write(IMAIN,*)
+    write(IMAIN,*)
 
   endif
 
+  end subroutine prepare_timerun_user_output
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine prepare_timerun_mass_matrices()
+
+  use specfem_par
+  use specfem_par_acoustic
+  use specfem_par_elastic
+  use specfem_par_poroelastic
+  implicit none
+
   ! synchronize all the processes before assembling the mass matrix
   ! to make sure all the nodes have finished to read their databases
   call sync_all()
 
-  ! sets up mass matrices
-  call prepare_timerun_mass_matrices()
+  ! the mass matrix needs to be assembled with MPI here once and for all
+  if(ACOUSTIC_SIMULATION) then
+    call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_acoustic, &
+                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
+                        my_neighbours_ext_mesh)
 
+    ! fill mass matrix with fictitious non-zero values to make sure it can be inverted globally
+    where(rmass_acoustic <= 0._CUSTOM_REAL) rmass_acoustic = 1._CUSTOM_REAL
+    rmass_acoustic(:) = 1._CUSTOM_REAL / rmass_acoustic(:)
+
+  endif
+
+  if(ELASTIC_SIMULATION) then
+    call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass, &
+                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                        my_neighbours_ext_mesh)
+
+    ! fill mass matrix with fictitious non-zero values to make sure it can be inverted globally
+    where(rmass <= 0._CUSTOM_REAL) rmass = 1._CUSTOM_REAL
+    rmass(:) = 1._CUSTOM_REAL / rmass(:)
+
+    if(OCEANS ) then
+      if( minval(rmass_ocean_load(:)) <= 0._CUSTOM_REAL) &
+        call exit_MPI(myrank,'negative ocean load mass matrix term')
+      rmass_ocean_load(:) = 1. / rmass_ocean_load(:)
+    endif
+
+  endif
+
+  if(POROELASTIC_SIMULATION) then
+    call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_solid_poroelastic, &
+                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                        my_neighbours_ext_mesh)
+
+    call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_fluid_poroelastic, &
+                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+                        my_neighbours_ext_mesh)
+
+    ! fills mass matrix with fictitious non-zero values to make sure it can be inverted globally
+    where(rmass_solid_poroelastic <= 0._CUSTOM_REAL) rmass_solid_poroelastic = 1._CUSTOM_REAL
+    where(rmass_fluid_poroelastic <= 0._CUSTOM_REAL) rmass_fluid_poroelastic = 1._CUSTOM_REAL
+    rmass_solid_poroelastic(:) = 1._CUSTOM_REAL / rmass_solid_poroelastic(:)
+    rmass_fluid_poroelastic(:) = 1._CUSTOM_REAL / rmass_fluid_poroelastic(:)
+
+  endif
+
+  !debug
+  !if(myrank == 0) write(IMAIN,*) 'end assembling MPI mass matrix'
+
+  end subroutine prepare_timerun_mass_matrices
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine prepare_timerun_init_wavefield()
+
+! initializes arrays
+
+  use specfem_par
+  use specfem_par_acoustic
+  use specfem_par_elastic
+  use specfem_par_poroelastic
+  implicit none
+
   ! initialize acoustic arrays to zero
   if( ACOUSTIC_SIMULATION ) then
     potential_acoustic(:) = 0._CUSTOM_REAL
@@ -152,6 +314,23 @@
     if(FIX_UNDERFLOW_PROBLEM) displw_poroelastic(:,:) = VERYSMALLVAL
   endif
 
+  end subroutine prepare_timerun_init_wavefield
+
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+  subroutine prepare_timerun_constants()
+
+! precomputes constants for time integration
+
+  use specfem_par
+  implicit none
+  ! local parameters
+  character(len=256) :: plot_file
+  integer :: ier
+
   ! distinguish between single and double precision for reals
   if(CUSTOM_REAL == SIZE_REAL) then
     deltat = sngl(DT)
@@ -161,6 +340,19 @@
   deltatover2 = deltat/2._CUSTOM_REAL
   deltatsqover2 = deltat*deltat/2._CUSTOM_REAL
 
+  ! adjoint runs: timing
+  if (SIMULATION_TYPE == 3) then
+    ! backward/reconstructed wavefields: time stepping is in time-reversed sense
+    ! (negative time increments)
+    if(CUSTOM_REAL == SIZE_REAL) then
+      b_deltat = - sngl(DT)
+    else
+      b_deltat = - DT
+    endif
+    b_deltatover2 = b_deltat/2._CUSTOM_REAL
+    b_deltatsqover2 = b_deltat*b_deltat/2._CUSTOM_REAL
+  endif
+
   ! seismograms
   if (nrec_local > 0) then
     ! allocate seismogram array
@@ -177,26 +369,6 @@
     seismograms_a(:,:,:) = 0._CUSTOM_REAL
   endif
 
-  ! synchronize all the processes
-  call sync_all()
-
-  ! prepares attenuation arrays
-  call prepare_timerun_attenuation()
-
-  ! prepares gravity arrays
-  call prepare_timerun_gravity()
-
-  ! initializes PML arrays
-  if( ABSORBING_CONDITIONS  ) then
-    if (SIMULATION_TYPE /= 1 .and. ABSORB_USE_PML )  then
-      write(IMAIN,*) 'NOTE: adjoint simulations and PML not supported yet...'
-    else
-      if( ABSORB_USE_PML ) then
-        call PML_initialize()
-      endif
-    endif
-  endif
-
   ! opens source time function file
   if(PRINT_SOURCE_TIME_FUNCTION .and. myrank == 0) then
     ! print the source-time function
@@ -209,111 +381,16 @@
         write(plot_file,"('/plot_source_time_function',i2,'.txt')") NSOURCES
       endif
     endif
-    open(unit=IOSTF,file=trim(OUTPUT_FILES)//plot_file,status='unknown')
+    open(unit=IOSTF,file=trim(OUTPUT_FILES)//plot_file,status='unknown',iostat=ier)
+    if( ier /= 0 ) call exit_mpi(myrank,'error opening plot_source_time_function file')
   endif
 
-  ! user output
-  if(myrank == 0) then
-    write(IMAIN,*)
-    write(IMAIN,*) '           time step: ',sngl(DT),' s'
-    write(IMAIN,*) 'number of time steps: ',NSTEP
-    write(IMAIN,*) 'total simulated time: ',sngl(NSTEP*DT),' seconds'
-    write(IMAIN,*) 'start time:',sngl(-t0),' seconds'
-    write(IMAIN,*)
+  end subroutine prepare_timerun_constants
 
-    !daniel debug: time estimation
-    ! elastic elements: time per element t_per_element = 1.40789368e-05 s
-    ! total time = nspec * nstep * t_per_element
-
-  endif
-
-  ! prepares ADJOINT simulations
-  call prepare_timerun_adjoint()
-
-  ! prepares noise simulations
-  call prepare_timerun_noise()
-
-  ! prepares GPU arrays
-  if(GPU_MODE) call prepare_timerun_GPU()
-
-#ifdef OPENMP_MODE
-  ! prepares arrays for OpenMP
-  call prepare_timerun_OpenMP()
-#endif
-
-  end subroutine prepare_timerun
-
 !
 !-------------------------------------------------------------------------------------------------
 !
 
-  subroutine prepare_timerun_mass_matrices()
-
-  use specfem_par
-  use specfem_par_acoustic
-  use specfem_par_elastic
-  use specfem_par_poroelastic
-  implicit none
-
-! the mass matrix needs to be assembled with MPI here once and for all
-  if(ACOUSTIC_SIMULATION) then
-    call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_acoustic, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
-                        my_neighbours_ext_mesh)
-
-    ! fill mass matrix with fictitious non-zero values to make sure it can be inverted globally
-    where(rmass_acoustic <= 0._CUSTOM_REAL) rmass_acoustic = 1._CUSTOM_REAL
-    rmass_acoustic(:) = 1._CUSTOM_REAL / rmass_acoustic(:)
-
-  endif ! ACOUSTIC_SIMULATION
-
-  if(ELASTIC_SIMULATION) then
-    call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                        my_neighbours_ext_mesh)
-
-    ! fill mass matrix with fictitious non-zero values to make sure it can be inverted globally
-    where(rmass <= 0._CUSTOM_REAL) rmass = 1._CUSTOM_REAL
-    rmass(:) = 1._CUSTOM_REAL / rmass(:)
-
-    if(OCEANS ) then
-      if( minval(rmass_ocean_load(:)) <= 0._CUSTOM_REAL) &
-        call exit_MPI(myrank,'negative ocean load mass matrix term')
-      rmass_ocean_load(:) = 1. / rmass_ocean_load(:)
-    endif
-
-  endif ! ELASTIC_SIMULATION
-
-  if(POROELASTIC_SIMULATION) then
-    call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_solid_poroelastic, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                        my_neighbours_ext_mesh)
-
-    call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_fluid_poroelastic, &
-                        num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-                        nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-                        my_neighbours_ext_mesh)
-
-    ! fills mass matrix with fictitious non-zero values to make sure it can be inverted globally
-    where(rmass_solid_poroelastic <= 0._CUSTOM_REAL) rmass_solid_poroelastic = 1._CUSTOM_REAL
-    where(rmass_fluid_poroelastic <= 0._CUSTOM_REAL) rmass_fluid_poroelastic = 1._CUSTOM_REAL
-    rmass_solid_poroelastic(:) = 1._CUSTOM_REAL / rmass_solid_poroelastic(:)
-    rmass_fluid_poroelastic(:) = 1._CUSTOM_REAL / rmass_fluid_poroelastic(:)
-
-  endif ! POROELASTIC_SIMULATION
-
-  if(myrank == 0) write(IMAIN,*) 'end assembling MPI mass matrix'
-
-
-  end subroutine prepare_timerun_mass_matrices
-
-!
-!-------------------------------------------------------------------------------------------------
-!
-
   subroutine prepare_timerun_attenuation()
 
   use specfem_par
@@ -568,21 +645,6 @@
     seismograms_eps(:,:,:,:) = 0._CUSTOM_REAL
   endif
 
-! timing
-  if (SIMULATION_TYPE == 3) then
-
-    ! backward/reconstructed wavefields: time stepping is in time-reversed sense
-    ! (negative time increments)
-    if(CUSTOM_REAL == SIZE_REAL) then
-      b_deltat = - sngl(DT)
-    else
-      b_deltat = - DT
-    endif
-    b_deltatover2 = b_deltat/2._CUSTOM_REAL
-    b_deltatsqover2 = b_deltat*b_deltat/2._CUSTOM_REAL
-
-  endif
-
 ! attenuation backward memories
   if( ATTENUATION .and. SIMULATION_TYPE == 3 ) then
 
@@ -879,7 +941,6 @@
   if(myrank == 0 ) then
     write(IMAIN,*)
     write(IMAIN,*) "GPU Preparing Fields and Constants on Device."
-    write(IMAIN,*)
   endif
 
   ! prepares general fields on GPU

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/setup_sources_receivers.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/setup_sources_receivers.f90	2012-07-10 10:09:17 UTC (rev 20511)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/setup_sources_receivers.f90	2012-07-10 18:22:43 UTC (rev 20512)
@@ -51,9 +51,9 @@
     write(IMAIN,*)
     write(IMAIN,*) 'Total number of samples for seismograms = ',NSTEP
     write(IMAIN,*)
-    write(IMAIN,*)
     write(IMAIN,*) 'found a total of ',nrec_tot_found,' receivers in all the slices'
     if(NSOURCES > 1) write(IMAIN,*) 'Using ',NSOURCES,' point sources'
+    write(IMAIN,*)    
   endif
 
   end subroutine setup_sources_receivers