[cig-commits] r19659 - in seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src: cuda shared specfem3D

Tue Feb 21 20:38:33 PST 2012

Author: danielpeter
Date: 2012-02-21 20:38:32 -0800 (Tue, 21 Feb 2012)
New Revision: 19659

Modified:
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_scalar_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_vector_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/exit_mpi.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_scalar.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/check_simulation_stability.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_add_sources.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_acoustic.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_kernels.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_crust_mantle.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/finalize_simulation.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/iterate_time.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/locate_receivers.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/noise_tomography.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_forward_arrays.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_topography_bathymetry.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/save_forward_arrays.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/setup_sources_receivers.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D_par.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_movie_output.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_seismograms.f90
Log:
updates time loop and writing seismograms

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_scalar_cuda.cu
===================================================================

--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_scalar_cuda.cu	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_scalar_cuda.cu	2012-02-22 04:38:32 UTC (rev 19659)
@@ -156,7 +156,7 @@
 
   // copies scalar buffer onto GPU
   cudaMemcpy(mp->d_send_accel_buffer_outer_core, buffer_recv_scalar,
-             (mp->max_nibool_interfaces_outer_core)*(mp->num_interfaces_outer_core)*sizeof(realw), 
+             (mp->max_nibool_interfaces_outer_core)*(mp->num_interfaces_outer_core)*sizeof(realw),
              cudaMemcpyHostToDevice);
 
   // assembles on GPU

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_vector_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_vector_cuda.cu	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_vector_cuda.cu	2012-02-22 04:38:32 UTC (rev 19659)
@@ -74,7 +74,7 @@
                                                   int* FORWARD_OR_ADJOINT){
   TRACE("transfer_boun_accel_from_device");
   int blocksize,size_padded,num_blocks_x,num_blocks_y;
-  
+
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
   // crust/mantle region
@@ -113,14 +113,14 @@
       cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer_crust_mantle,
                  3*mp->max_nibool_interfaces_crust_mantle*mp->num_interfaces_crust_mantle*sizeof(realw),
                  cudaMemcpyDeviceToHost);
-               
+
     }
   }
-  
+
   // inner core region
   if( *IREGION == IREGION_INNER_CORE ){
     if( mp->num_interfaces_inner_core > 0 ){
-      
+
       blocksize = BLOCKSIZE_TRANSFER;
       size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_inner_core)/((double)blocksize)))*blocksize;
       num_blocks_x = size_padded/blocksize;
@@ -128,10 +128,10 @@
       while(num_blocks_x > 65535) {
         num_blocks_x = (int) ceil(num_blocks_x*0.5f);
         num_blocks_y = num_blocks_y*2;
-      }    
+      }
       dim3 grid(num_blocks_x,num_blocks_y);
       dim3 threads(blocksize,1,1);
-      
+
       if(*FORWARD_OR_ADJOINT == 1) {
         prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel_inner_core,
                                                            mp->d_send_accel_buffer_inner_core,
@@ -148,12 +148,12 @@
                                                            mp->d_nibool_interfaces_inner_core,
                                                            mp->d_ibool_interfaces_inner_core);
       }
-      
+
       // copies buffer to CPU
       cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer_inner_core,
                  3*mp->max_nibool_interfaces_inner_core*mp->num_interfaces_inner_core*sizeof(realw),
                  cudaMemcpyDeviceToHost);
-      
+
     }
   }
 
@@ -172,7 +172,7 @@
 
   int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
   int iinterface=0;
-  
+
   for( iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
     if(id < d_nibool_interfaces_ext_mesh[iinterface]) {
       atomicAdd(&d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)],
@@ -222,7 +222,7 @@
       dim3 threads(blocksize,1,1);
 
       if(*FORWARD_OR_ADJOINT == 1) { //assemble forward accel
-        assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel_crust_mantle, 
+        assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel_crust_mantle,
                                                             mp->d_send_accel_buffer_crust_mantle,
                                                             mp->num_interfaces_crust_mantle,
                                                             mp->max_nibool_interfaces_crust_mantle,
@@ -230,7 +230,7 @@
                                                             mp->d_ibool_interfaces_crust_mantle);
       }
       else if(*FORWARD_OR_ADJOINT == 3) { //assemble adjoint accel
-        assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel_crust_mantle, 
+        assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
                                                             mp->d_send_accel_buffer_crust_mantle,
                                                             mp->num_interfaces_crust_mantle,
                                                             mp->max_nibool_interfaces_crust_mantle,
@@ -247,7 +247,7 @@
       cudaMemcpy(mp->d_send_accel_buffer_inner_core, buffer_recv_vector,
                  3*(mp->max_nibool_interfaces_inner_core)*(mp->num_interfaces_inner_core)*sizeof(realw),
                  cudaMemcpyHostToDevice);
-      
+
       // assembles values
       blocksize = BLOCKSIZE_TRANSFER;
       size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_inner_core)/((double)blocksize)))*blocksize;
@@ -257,12 +257,12 @@
         num_blocks_x = (int) ceil(num_blocks_x*0.5f);
         num_blocks_y = num_blocks_y*2;
       }
-      
+
       dim3 grid(num_blocks_x,num_blocks_y);
       dim3 threads(blocksize,1,1);
-      
+
       if(*FORWARD_OR_ADJOINT == 1) { //assemble forward accel
-        assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel_inner_core, 
+        assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel_inner_core,
                                                             mp->d_send_accel_buffer_inner_core,
                                                             mp->num_interfaces_inner_core,
                                                             mp->max_nibool_interfaces_inner_core,
@@ -270,7 +270,7 @@
                                                             mp->d_ibool_interfaces_inner_core);
       }
       else if(*FORWARD_OR_ADJOINT == 3) { //assemble adjoint accel
-        assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel_inner_core, 
+        assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel_inner_core,
                                                             mp->d_send_accel_buffer_inner_core,
                                                             mp->num_interfaces_inner_core,
                                                             mp->max_nibool_interfaces_inner_core,
@@ -279,8 +279,8 @@
       }
     }
   }
-  
-  
+
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("transfer_asmbl_accel_to_device");
 #endif

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu	2012-02-22 04:38:32 UTC (rev 19659)
@@ -409,7 +409,7 @@
 // Check functions
 
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 extern "C"
 void FC_FUNC_(check_max_norm_displ_gpu,
               CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {
@@ -426,9 +426,9 @@
   }
   printf("%d: maxnorm of forward displ = %e\n",*announceID,maxnorm);
 }
-
+*/
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 extern "C"
 void FC_FUNC_(check_max_norm_vector,
               CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {
@@ -451,9 +451,9 @@
   }
   printf("%d:maxnorm of vector %d [%d] = %e\n",procid,*announceID,maxloc,maxnorm);
 }
-
+*/
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 extern "C"
 void FC_FUNC_(check_max_norm_displ,
               CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {
@@ -467,9 +467,9 @@
   }
   printf("%d: maxnorm of forward displ = %e\n",*announceID,maxnorm);
 }
-
+*/
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 extern "C"
 void FC_FUNC_(check_max_norm_b_displ_gpu,
               CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {
@@ -494,9 +494,9 @@
   printf("%d: maxnorm of backward displ = %e\n",*announceID,maxnorm);
   printf("%d: maxnorm of backward accel = %e\n",*announceID,maxnorm_accel);
 }
-
+*/
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 extern "C"
 void FC_FUNC_(check_max_norm_b_accel_gpu,
               CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {
@@ -514,9 +514,9 @@
   }
   printf("%d: maxnorm of backward accel = %e\n",*announceID,maxnorm);
 }
-
+*/
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 extern "C"
 void FC_FUNC_(check_max_norm_b_veloc_gpu,
               CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {
@@ -534,9 +534,9 @@
   }
   printf("%d: maxnorm of backward veloc = %e\n",*announceID,maxnorm);
 }
-
+*/
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 extern "C"
 void FC_FUNC_(check_max_norm_b_displ,
               CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {
@@ -550,9 +550,9 @@
   }
   printf("%d:maxnorm of backward displ = %e\n",*announceID,maxnorm);
 }
-
+*/
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 extern "C"
 void FC_FUNC_(check_max_norm_b_accel,
               CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {
@@ -566,9 +566,9 @@
   }
   printf("%d:maxnorm of backward accel = %e\n",*announceID,maxnorm);
 }
-
+*/
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 extern "C"
 void FC_FUNC_(check_error_vectors,
               CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {
@@ -607,8 +607,8 @@
   }
 
 }
+*/
 
-
 /* ----------------------------------------------------------------------------------------------- */
 
 // Auxiliary functions
@@ -617,7 +617,7 @@
 
 
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 extern "C"
 void FC_FUNC_(get_max_accel,
               GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {
@@ -642,14 +642,14 @@
   printf("%d/%d: max=%e\n",it,procid,maxval);
   free(accel_cpy);
 }
-
+*/
 /* ----------------------------------------------------------------------------------------------- */
 
 // ACOUSTIC simulations
 
 /* ----------------------------------------------------------------------------------------------- */
 
-__global__ void get_maximum_kernel(realw* array, int size, realw* d_max){
+__global__ void get_maximum_scalar_kernel(realw* array, int size, realw* d_max){
 
   /* simplest version: uses only 1 thread
    realw max;
@@ -696,12 +696,12 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
-void FC_FUNC_(get_norm_acoustic_from_device,
-              GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
+void FC_FUNC_(check_norm_acoustic_from_device,
+              CHECK_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
                                                   long* Mesh_pointer_f,
                                                   int* SIMULATION_TYPE) {
 
-TRACE("get_norm_acoustic_from_device");
+TRACE("check_norm_acoustic_from_device");
   //double start_time = get_time();
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
@@ -745,27 +745,21 @@
   realw* h_max;
   int blocksize = 256;
 
-  int num_blocks_x = (int) ceil(mp->NGLOB_AB/blocksize);
-  //printf("num_blocks_x %i \n",num_blocks_x);
+  // outer core
+  int size = mp->NGLOB_OUTER_CORE;
+  int num_blocks_x = (int) ceil(size/blocksize);
 
   h_max = (realw*) calloc(num_blocks_x,sizeof(realw));
   cudaMalloc((void**)&d_max,num_blocks_x*sizeof(realw));
-
   dim3 grid(num_blocks_x,1);
   dim3 threads(blocksize,1,1);
 
   if(*SIMULATION_TYPE == 1 ){
-    get_maximum_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
-                                         mp->NGLOB_AB,
-                                         d_max);
+    get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_displ_outer_core,size,d_max);
+  }else if(*SIMULATION_TYPE == 3 ){
+    get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_b_displ_outer_core,size,d_max);
   }
 
-  if(*SIMULATION_TYPE == 3 ){
-    get_maximum_kernel<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
-                                         mp->NGLOB_AB,
-                                         d_max);
-  }
-
   print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(realw),cudaMemcpyDeviceToHost),222);
 
   // determines max for all blocks
@@ -819,7 +813,7 @@
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   //double end_time = get_time();
   //printf("Elapsed time: %e\n",end_time-start_time);
-  exit_on_cuda_error("after get_norm_acoustic_from_device");
+  exit_on_cuda_error("after check_norm_acoustic_from_device");
 #endif
 }
 
@@ -865,45 +859,71 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
-void FC_FUNC_(get_norm_elastic_from_device,
-              GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
-                                                 long* Mesh_pointer_f,
-                                                 int* SIMULATION_TYPE) {
+void FC_FUNC_(check_norm_elastic_from_device,
+              CHECK_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
+                                              long* Mesh_pointer_f,
+                                              int* SIMULATION_TYPE) {
 
-  TRACE("get_norm_elastic_from_device");
+  TRACE("check_norm_elastic_from_device");
   //double start_time = get_time();
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-  realw max;
+
+  realw max,max_crust_mantle,max_inner_core;
   realw *d_max;
+  int num_blocks_x,size;
 
-  max = 0;
 
   // launch simple reduction kernel
   realw* h_max;
   int blocksize = 256;
 
-  int num_blocks_x = (int) ceil(mp->NGLOB_AB/blocksize);
-  //printf("num_blocks_x %i \n",num_blocks_x);
+  // crust_mantle
+  max = 0;
+  size = mp->NGLOB_CRUST_MANTLE;
 
+  num_blocks_x = (int) ceil(size/blocksize);
   h_max = (realw*) calloc(num_blocks_x,sizeof(realw));
   cudaMalloc((void**)&d_max,num_blocks_x*sizeof(realw));
 
-  dim3 grid(num_blocks_x,1);
-  dim3 threads(blocksize,1,1);
-
+  dim3 grid1(num_blocks_x,1);
+  dim3 threads1(blocksize,1,1);
   if(*SIMULATION_TYPE == 1 ){
-    get_maximum_vector_kernel<<<grid,threads>>>(mp->d_displ,
-                                                mp->NGLOB_AB,
-                                                d_max);
+    get_maximum_vector_kernel<<<grid1,threads1>>>(mp->d_displ_crust_mantle,size,d_max);
+  }else if(*SIMULATION_TYPE == 3 ){
+    get_maximum_vector_kernel<<<grid1,threads1>>>(mp->d_b_displ_crust_mantle,size,d_max);
   }
 
-  if(*SIMULATION_TYPE == 3 ){
-    get_maximum_vector_kernel<<<grid,threads>>>(mp->d_b_displ,
-                                                mp->NGLOB_AB,
-                                                d_max);
+  // copies to CPU
+  print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(realw),cudaMemcpyDeviceToHost),222);
+
+  // determines max for all blocks
+  max = h_max[0];
+  for(int i=1;i<num_blocks_x;i++) {
+    if( max < h_max[i]) max = h_max[i];
   }
+  max_crust_mantle = max;
 
+  cudaFree(d_max);
+  free(h_max);
+
+  // inner_core
+  max = 0;
+  size = mp->NGLOB_INNER_CORE;
+
+  num_blocks_x = (int) ceil(size/blocksize);
+  h_max = (realw*) calloc(num_blocks_x,sizeof(realw));
+  cudaMalloc((void**)&d_max,num_blocks_x*sizeof(realw));
+
+  dim3 grid2(num_blocks_x,1);
+  dim3 threads2(blocksize,1,1);
+  if(*SIMULATION_TYPE == 1 ){
+    get_maximum_vector_kernel<<<grid2,threads2>>>(mp->d_displ_inner_core,size,d_max);
+  }else if(*SIMULATION_TYPE == 3 ){
+    get_maximum_vector_kernel<<<grid2,threads2>>>(mp->d_b_displ_inner_core,size,d_max);
+  }
+
+  // copies to CPU
   print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(realw),cudaMemcpyDeviceToHost),222);
 
   // determines max for all blocks
@@ -911,18 +931,124 @@
   for(int i=1;i<num_blocks_x;i++) {
     if( max < h_max[i]) max = h_max[i];
   }
+  max_inner_core = max;
 
   cudaFree(d_max);
   free(h_max);
 
   // return result
+  max = MAX(max_inner_core,max_crust_mantle);
   *norm = max;
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   //double end_time = get_time();
   //printf("Elapsed time: %e\n",end_time-start_time);
-  exit_on_cuda_error("after get_norm_elastic_from_device");
+  exit_on_cuda_error("after check_norm_elastic_from_device");
 #endif
 }
 
+/* ----------------------------------------------------------------------------------------------- */
 
+extern "C"
+void FC_FUNC_(check_norm_strain_from_device,
+              CHECK_NORM_STRAIN_FROM_DEVICE)(realw* norm_strain,
+                                             realw* norm_strain2,
+                                             long* Mesh_pointer_f) {
+
+  TRACE("check_norm_strain_from_device");
+  //double start_time = get_time();
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  realw max,max_eps;
+  realw *d_max;
+  int num_blocks_x,size;
+
+
+  // launch simple reduction kernel
+  realw* h_max;
+  int blocksize = 256;
+
+  // crust_mantle strain arrays
+  size = NGLL3*(mp->NSPEC_CRUST_MANTLE);
+
+  num_blocks_x = (int) ceil(size/blocksize);
+  h_max = (realw*) calloc(num_blocks_x,sizeof(realw));
+  cudaMalloc((void**)&d_max,num_blocks_x*sizeof(realw));
+
+  dim3 grid(num_blocks_x,1);
+  dim3 threads(blocksize,1,1);
+
+  // determines max for: eps_trace_over_3_crust_mantle
+  get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_eps_trace_over_3_crust_mantle,size,d_max);
+  print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(realw),cudaMemcpyDeviceToHost),221);
+  max = h_max[0];
+  for(int i=1;i<num_blocks_x;i++) {
+    if( max < h_max[i]) max = h_max[i];
+  }
+  // strain trace maximum
+  *norm_strain = max;
+
+  // initializes
+  max_eps = 0.0f;
+
+  // determines max for: epsilondev_xx_crust_mantle
+  get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_epsilondev_xx_crust_mantle,size,d_max);
+  print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(realw),cudaMemcpyDeviceToHost),222);
+  max = h_max[0];
+  for(int i=1;i<num_blocks_x;i++) {
+    if( max < h_max[i]) max = h_max[i];
+  }
+  max_eps = MAX(max_eps,max);
+
+  // determines max for: epsilondev_yy_crust_mantle
+  get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_epsilondev_yy_crust_mantle,size,d_max);
+  print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(realw),cudaMemcpyDeviceToHost),223);
+  max = h_max[0];
+  for(int i=1;i<num_blocks_x;i++) {
+    if( max < h_max[i]) max = h_max[i];
+  }
+  max_eps = MAX(max_eps,max);
+
+  // determines max for: epsilondev_xy_crust_mantle
+  get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_epsilondev_xy_crust_mantle,size,d_max);
+  print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(realw),cudaMemcpyDeviceToHost),224);
+  max = h_max[0];
+  for(int i=1;i<num_blocks_x;i++) {
+    if( max < h_max[i]) max = h_max[i];
+  }
+  max_eps = MAX(max_eps,max);
+
+  // determines max for: epsilondev_xz_crust_mantle
+  get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_epsilondev_xz_crust_mantle,size,d_max);
+  print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(realw),cudaMemcpyDeviceToHost),225);
+  max = h_max[0];
+  for(int i=1;i<num_blocks_x;i++) {
+    if( max < h_max[i]) max = h_max[i];
+  }
+  max_eps = MAX(max_eps,max);
+
+  // determines max for: epsilondev_yz_crust_mantle
+  get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_epsilondev_yz_crust_mantle,size,d_max);
+  print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(realw),cudaMemcpyDeviceToHost),226);
+  max = h_max[0];
+  for(int i=1;i<num_blocks_x;i++) {
+    if( max < h_max[i]) max = h_max[i];
+  }
+  max_eps = MAX(max_eps,max);
+
+  // strain maximum
+  *norm_strain2 = max_eps;
+
+  // frees arrays
+  cudaFree(d_max);
+  free(h_max);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  exit_on_cuda_error("after check_norm_strain_from_device");
+#endif
+}
+
+

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu	2012-02-22 04:38:32 UTC (rev 19659)
@@ -71,7 +71,7 @@
       stf = (realw) stf_pre_compute[isource];
       iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
 
-      // note: for global version, sourcearrays has dimensions 
+      // note: for global version, sourcearrays has dimensions
       //            sourcearrays(NDIM,NGLLX,NGLLY,NGLLZ,NSOURCES)
       atomicAdd(&accel[iglob*3],
                 sourcearrays[INDEX5(3,5,5,5, 0,i,j,k,isource)]*stf);
@@ -161,7 +161,7 @@
   // copies source time function buffer values to GPU
   print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
                                      NSOURCES*sizeof(double),cudaMemcpyHostToDevice),19);
-  
+
   compute_add_sources_kernel<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
                                                mp->d_ibool_crust_mantle,
                                                mp->d_sourcearrays,
@@ -193,7 +193,7 @@
 
   int ispec,iglob;
   int irec,i,j,k;
-  
+
   int irec_local = blockIdx.x + gridDim.x*blockIdx.y;
 
   if(irec_local < nadj_rec_local) { // when nrec > 65535, but mod(nspec_top,2) > 0, we end up with an extra block.

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu	2012-02-22 04:38:32 UTC (rev 19659)
@@ -60,33 +60,33 @@
 /* note:
  constant arrays when used in other compute_forces_***_cuda.cu routines stay zero,
  constant declaration and cudaMemcpyToSymbol would have to be in the same file...
- 
+
  extern keyword doesn't work for __constant__ declarations.
- 
+
  also:
  cudaMemcpyToSymbol("deviceCaseParams", caseParams, sizeof(CaseParams));
  ..
  and compile with -arch=sm_20
- 
+
  see also: http://stackoverflow.com/questions/4008031/how-to-use-cuda-constant-memory-in-a-programmer-pleasant-way
  doesn't seem to work.
- 
+
  we could keep arrays separated for acoustic and elastic routines...
- 
+
  workaround:
- 
+
     for now, we store pointers with cudaGetSymbolAddress() function calls.
     we pass those pointers in all other compute_forces_..() routines
-    
+
     in this file, we can use the above constant array declarations without need of the pointers.
-    
+
  */
 
 // constant arrays
 
 void setConst_hprime_xx(realw* array,Mesh* mp)
 {
-  
+
   cudaError_t err = cudaMemcpyToSymbol(d_hprime_xx, array, NGLL2*sizeof(realw));
   if (err != cudaSuccess)
   {
@@ -94,7 +94,7 @@
     fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
     exit(1);
   }
-  
+
   err = cudaGetSymbolAddress((void**)&(mp->d_hprime_xx),"d_hprime_xx");
   if(err != cudaSuccess) {
     fprintf(stderr, "Error with d_hprime_xx: %s\n", cudaGetErrorString(err));
@@ -104,7 +104,7 @@
 
 void setConst_hprime_yy(realw* array,Mesh* mp)
 {
-  
+
   cudaError_t err = cudaMemcpyToSymbol(d_hprime_yy, array, NGLL2*sizeof(realw));
   if (err != cudaSuccess)
   {
@@ -112,7 +112,7 @@
     fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
     exit(1);
   }
-  
+
   err = cudaGetSymbolAddress((void**)&(mp->d_hprime_yy),"d_hprime_yy");
   if(err != cudaSuccess) {
     fprintf(stderr, "Error with d_hprime_yy: %s\n", cudaGetErrorString(err));
@@ -122,7 +122,7 @@
 
 void setConst_hprime_zz(realw* array,Mesh* mp)
 {
-  
+
   cudaError_t err = cudaMemcpyToSymbol(d_hprime_zz, array, NGLL2*sizeof(realw));
   if (err != cudaSuccess)
   {
@@ -130,7 +130,7 @@
     fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
     exit(1);
   }
-  
+
   err = cudaGetSymbolAddress((void**)&(mp->d_hprime_zz),"d_hprime_zz");
   if(err != cudaSuccess) {
     fprintf(stderr, "Error with d_hprime_zz: %s\n", cudaGetErrorString(err));
@@ -147,7 +147,7 @@
     fprintf(stderr, "Error in setConst_hprimewgll_xx: %s\n", cudaGetErrorString(err));
     exit(1);
   }
-  
+
   err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_xx),"d_hprimewgll_xx");
   if(err != cudaSuccess) {
     fprintf(stderr, "Error with d_hprimewgll_xx: %s\n", cudaGetErrorString(err));
@@ -163,7 +163,7 @@
     fprintf(stderr, "Error in setConst_hprimewgll_yy: %s\n", cudaGetErrorString(err));
     exit(1);
   }
-  
+
   err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_yy),"d_hprimewgll_yy");
   if(err != cudaSuccess) {
     fprintf(stderr, "Error with d_hprimewgll_yy: %s\n", cudaGetErrorString(err));
@@ -179,7 +179,7 @@
     fprintf(stderr, "Error in setConst_hprimewgll_zz: %s\n", cudaGetErrorString(err));
     exit(1);
   }
-  
+
   err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_zz),"d_hprimewgll_zz");
   if(err != cudaSuccess) {
     fprintf(stderr, "Error with d_hprimewgll_zz: %s\n", cudaGetErrorString(err));
@@ -201,7 +201,7 @@
     fprintf(stderr, "Error with d_wgllwgll_xy: %s\n", cudaGetErrorString(err));
     exit(1);
   }
-  
+
 }
 
 void setConst_wgllwgll_xz(realw* array,Mesh* mp)
@@ -218,7 +218,7 @@
     fprintf(stderr, "Error with d_wgllwgll_xz: %s\n", cudaGetErrorString(err));
     exit(1);
   }
-  
+
 }
 
 void setConst_wgllwgll_yz(realw* array,Mesh* mp)
@@ -235,7 +235,7 @@
     fprintf(stderr, "Error with d_wgllwgll_yz: %s\n", cudaGetErrorString(err));
     exit(1);
   }
-  
+
 }
 
 void setConst_wgll_cube(realw* array,Mesh* mp)
@@ -252,7 +252,7 @@
     fprintf(stderr, "Error with d_wgll_cube: %s\n", cudaGetErrorString(err));
     exit(1);
   }
-  
+
 }
 
 
@@ -413,7 +413,7 @@
   reald cos_theta,sin_theta,cos_phi,sin_phi;
   reald minus_g,minus_dg;
   reald rho;
-  reald gxl,gyl,gzl; 
+  reald gxl,gyl,gzl;
   reald minus_g_over_radius,minus_dg_plus_g_over_radius;
   reald cos_theta_sq,sin_theta_sq,cos_phi_sq,sin_phi_sq;
   reald Hxxl,Hyyl,Hzzl,Hxyl,Hxzl,Hyzl;
@@ -424,50 +424,50 @@
   const reald R_EARTH_KM = 6371.0f;
   // uncomment line below for PREM with oceans
   //const reald R_EARTH_KM = 6368.0f;
-  
+
   // compute non-symmetric terms for gravity
 
   // use mesh coordinates to get theta and phi
   // x y z contain r theta phi
   int iglob = d_ibool[working_element*NGLL3 + tx]-1;
-  
+
   radius = d_xstore[iglob];
   theta = d_ystore[iglob];
   phi = d_zstore[iglob];
-  
+
   cos_theta = cos(theta);
   sin_theta = sin(theta);
   cos_phi = cos(phi);
   sin_phi = sin(phi);
-  
+
   // for efficiency replace with lookup table every 100 m in radial direction
   // note: radius in crust mantle should never be zero,
   //          and arrays in C start from 0, thus we need to subtract -1
   int int_radius = rint(radius * R_EARTH_KM * 10.0f ) - 1;
-  
+
   // get g, rho and dg/dr=dg
   // spherical components of the gravitational acceleration
   // for efficiency replace with lookup table every 100 m in radial direction
   minus_g = d_minus_gravity_table[int_radius];
   minus_dg = d_minus_deriv_gravity_table[int_radius];
   rho = d_density_table[int_radius];
-    
+
   // Cartesian components of the gravitational acceleration
   gxl = minus_g*sin_theta*cos_phi;
   gyl = minus_g*sin_theta*sin_phi;
   gzl = minus_g*cos_theta;
-  
+
   // Cartesian components of gradient of gravitational acceleration
   // obtained from spherical components
-  
+
   minus_g_over_radius = minus_g / radius;
   minus_dg_plus_g_over_radius = minus_dg - minus_g_over_radius;
-  
+
   cos_theta_sq = cos_theta*cos_theta;
   sin_theta_sq = sin_theta*sin_theta;
   cos_phi_sq = cos_phi*cos_phi;
   sin_phi_sq = sin_phi*sin_phi;
-  
+
   Hxxl = minus_g_over_radius*(cos_phi_sq*cos_theta_sq + sin_phi_sq) + cos_phi_sq*minus_dg*sin_theta_sq;
   Hyyl = minus_g_over_radius*(cos_phi_sq + cos_theta_sq*sin_phi_sq) + minus_dg*sin_phi_sq*sin_theta_sq;
   Hzzl = cos_theta_sq*minus_dg + minus_g_over_radius*sin_theta_sq;
@@ -479,21 +479,21 @@
   sx_l = rho * s_dummyx_loc[tx];
   sy_l = rho * s_dummyy_loc[tx];
   sz_l = rho * s_dummyz_loc[tx];
-  
+
   // compute G tensor from s . g and add to sigma (not symmetric)
   *sigma_xx = *sigma_xx + sy_l*gyl + sz_l*gzl;
   *sigma_yy = *sigma_yy + sx_l*gxl + sz_l*gzl;
   *sigma_zz = *sigma_zz + sx_l*gxl + sy_l*gyl;
-  
+
   *sigma_xy = *sigma_xy - sx_l * gyl;
   *sigma_yx = *sigma_yx - sy_l * gxl;
-  
+
   *sigma_xz = *sigma_xz - sx_l * gzl;
   *sigma_zx = *sigma_zx - sz_l * gxl;
-  
+
   *sigma_yz = *sigma_yz - sy_l * gzl;
   *sigma_zy = *sigma_zy - sz_l * gyl;
-  
+
   // precompute vector
   factor = jacobianl * wgll_cube[tx];
   *rho_s_H1 = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl);
@@ -635,7 +635,7 @@
                                         realw* d_ystore, realw* d_zstore,
                                         reald* sigma_xx,reald* sigma_yy,reald* sigma_zz,
                                         reald* sigma_xy,reald* sigma_xz,reald* sigma_yz){
-                                        
+
   reald kappavl,muvl,kappahl,muhl;
   reald rhovpvsq,rhovphsq,rhovsvsq,rhovshsq,eta_aniso;
   reald costheta,sintheta,cosphi,sinphi;
@@ -649,8 +649,8 @@
   reald c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66;
 
   // cosine and sine function in CUDA only supported for float
-  reald theta,phi; 
-  
+  reald theta,phi;
+
   // use Kappa and mu from transversely isotropic model
   kappavl = d_kappavstore[offset];
   muvl = d_muvstore[offset];
@@ -676,7 +676,7 @@
   // use mesh coordinates to get theta and phi
   //ystore and zstore contain theta and phi
   theta = d_ystore[iglob];
-  phi = d_zstore[iglob];    
+  phi = d_zstore[iglob];
 
   if( sizeof( theta ) == sizeof( float ) ){
     // float operations
@@ -685,9 +685,9 @@
     // or: sincosf(theta, &sintheta, &costheta);
     // or with loss of accuracy:  __sincosf(theta, &sintheta, &costheta);
     // or compile with: -use_fast_math
-    
+
     costheta = cosf(theta);
-    sintheta = sinf(theta);   
+    sintheta = sinf(theta);
 
     cosphi = cosf(phi);
     sinphi = sinf(phi);
@@ -697,14 +697,14 @@
     costwophi = cosf(2.0f * phi);
     sintwophi = sinf(2.0f * phi);
     cosfourtheta = cosf(4.0f * theta);
-    cosfourphi = cosf(4.0f * phi);    
+    cosfourphi = cosf(4.0f * phi);
   }else{
     // double operations
     costheta = cos(theta);
-    sintheta = sin(theta);   
+    sintheta = sin(theta);
 
     cosphi = cos(phi);
-    sinphi = sin(phi);      
+    sinphi = sin(phi);
 
     costwotheta = cos(2.0f * theta);
     sintwotheta = sin(2.0f * theta);
@@ -712,9 +712,9 @@
     sintwophi = sin(2.0f * phi);
 
     cosfourtheta = cos(4.0f * theta);
-    cosfourphi = cos(4.0f * phi);    
+    cosfourphi = cos(4.0f * phi);
   }
-  
+
   costhetasq = costheta * costheta;
   sinthetasq = sintheta * sintheta;
   cosphisq = cosphi * cosphi;
@@ -1023,7 +1023,7 @@
 // synchronize all the threads (one thread for each of the NGLL grid points of the
 // current spectral element) because we need the whole element to be ready in order
 // to be able to compute the matrix products along cut planes of the 3D element below
-    __syncthreads();  
+    __syncthreads();
 
 #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
 
@@ -1175,7 +1175,7 @@
         // use unrelaxed parameters if attenuation
         one_minus_sum_beta_use = one_minus_sum_beta[tx+working_element*NGLL3]; // (i,j,k,ispec)
         minus_sum_beta = one_minus_sum_beta_use - 1.0f;
-      }      
+      }
 
       // computes stresses
       if(ANISOTROPY){
@@ -1204,7 +1204,7 @@
                               duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl,
                               &sigma_xx,&sigma_yy,&sigma_zz,
                               &sigma_xy,&sigma_xz,&sigma_yz);
-        }else{        
+        }else{
           // transverse isotropy
           compute_element_cm_tiso(offset,
                                 d_kappavstore,d_muvstore,
@@ -1222,8 +1222,8 @@
                                 &sigma_xy,&sigma_xz,&sigma_yz);
         }
       } // ! end of test whether isotropic or anisotropic element
-      
 
+
       if(ATTENUATION && (! USE_ATTENUATION_MIMIC ) ){
         // subtracts memory variables if attenuation
         compute_element_cm_att_stress(tx,working_element,
@@ -1249,7 +1249,7 @@
                                    wgll_cube,jacobianl,
                                    s_dummyx_loc,s_dummyy_loc,s_dummyz_loc,
                                    &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_yx,
-                                   &sigma_xz,&sigma_zx,&sigma_yz,&sigma_zy,                                   
+                                   &sigma_xz,&sigma_zx,&sigma_yz,&sigma_zy,
                                    &rho_s_H1,&rho_s_H2,&rho_s_H3);
       }
 
@@ -1822,5 +1822,5 @@
   //double end_time = get_time();
   //printf("Elapsed time: %e\n",end_time-start_time);
   exit_on_cuda_error("compute_forces_crust_mantle_cuda");
-#endif  
+#endif
 }

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu	2012-02-22 04:38:32 UTC (rev 19659)
@@ -184,101 +184,101 @@
   reald cos_theta,sin_theta,cos_phi,sin_phi;
   reald minus_g,minus_dg;
   reald rho;
-  reald gxl,gyl,gzl; 
+  reald gxl,gyl,gzl;
   reald minus_g_over_radius,minus_dg_plus_g_over_radius;
   reald cos_theta_sq,sin_theta_sq,cos_phi_sq,sin_phi_sq;
   reald Hxxl,Hyyl,Hzzl,Hxyl,Hxzl,Hyzl;
   reald sx_l,sy_l,sz_l;
   reald factor;
-  
+
   // R_EARTH_KM is the radius of the bottom of the oceans
   const reald R_EARTH = 6371000.0f; // in m
   const reald R_EARTH_KM = 6371.0f; // in km
   // uncomment line below for PREM with oceans
   //const reald R_EARTH = 6368000.0f;
   //const reald R_EARTH_KM = 6368.0f;
-  
+
   // compute non-symmetric terms for gravity
-  
+
   // use mesh coordinates to get theta and phi
   // x y z contain r theta phi
   int iglob = d_ibool[working_element*NGLL3 + tx]-1;
-  
-  radius = d_xstore[iglob];  
+
+  radius = d_xstore[iglob];
   // make sure radius is never zero even for points at center of cube
   // because we later divide by radius
   if(radius < 100.f / R_EARTH){ radius = 100.f / R_EARTH; }
-      
+
   theta = d_ystore[iglob];
   phi = d_zstore[iglob];
-  
+
   cos_theta = cos(theta);
   sin_theta = sin(theta);
   cos_phi = cos(phi);
   sin_phi = sin(phi);
-  
+
   // for efficiency replace with lookup table every 100 m in radial direction
   // note: radius in crust mantle should never be zero,
   //          and arrays in C start from 0, thus we need to subtract -1
   int int_radius = rint(radius * R_EARTH_KM * 10.0f ) - 1;
   //make sure we never use below zero for point exactly at the center of the Earth
   if( int_radius < 0 ){int_radius = 0;}
-  
+
   // get g, rho and dg/dr=dg
   // spherical components of the gravitational acceleration
   // for efficiency replace with lookup table every 100 m in radial direction
   minus_g = d_minus_gravity_table[int_radius];
   minus_dg = d_minus_deriv_gravity_table[int_radius];
   rho = d_density_table[int_radius];
-  
+
   // Cartesian components of the gravitational acceleration
   gxl = minus_g*sin_theta*cos_phi;
   gyl = minus_g*sin_theta*sin_phi;
   gzl = minus_g*cos_theta;
-  
+
   // Cartesian components of gradient of gravitational acceleration
   // obtained from spherical components
-  
+
   minus_g_over_radius = minus_g / radius;
   minus_dg_plus_g_over_radius = minus_dg - minus_g_over_radius;
-  
+
   cos_theta_sq = cos_theta*cos_theta;
   sin_theta_sq = sin_theta*sin_theta;
   cos_phi_sq = cos_phi*cos_phi;
   sin_phi_sq = sin_phi*sin_phi;
-  
+
   Hxxl = minus_g_over_radius*(cos_phi_sq*cos_theta_sq + sin_phi_sq) + cos_phi_sq*minus_dg*sin_theta_sq;
   Hyyl = minus_g_over_radius*(cos_phi_sq + cos_theta_sq*sin_phi_sq) + minus_dg*sin_phi_sq*sin_theta_sq;
   Hzzl = cos_theta_sq*minus_dg + minus_g_over_radius*sin_theta_sq;
   Hxyl = cos_phi*minus_dg_plus_g_over_radius*sin_phi*sin_theta_sq;
   Hxzl = cos_phi*cos_theta*minus_dg_plus_g_over_radius*sin_theta;
   Hyzl = cos_theta*minus_dg_plus_g_over_radius*sin_phi*sin_theta;
-  
+
   // get displacement and multiply by density to compute G tensor
   sx_l = rho * s_dummyx_loc[tx];
   sy_l = rho * s_dummyy_loc[tx];
   sz_l = rho * s_dummyz_loc[tx];
-  
+
   // compute G tensor from s . g and add to sigma (not symmetric)
   *sigma_xx = *sigma_xx + sy_l*gyl + sz_l*gzl;
   *sigma_yy = *sigma_yy + sx_l*gxl + sz_l*gzl;
   *sigma_zz = *sigma_zz + sx_l*gxl + sy_l*gyl;
-  
+
   *sigma_xy = *sigma_xy - sx_l * gyl;
   *sigma_yx = *sigma_yx - sy_l * gxl;
-  
+
   *sigma_xz = *sigma_xz - sx_l * gzl;
   *sigma_zx = *sigma_zx - sz_l * gxl;
-  
+
   *sigma_yz = *sigma_yz - sy_l * gzl;
   *sigma_zy = *sigma_zy - sz_l * gyl;
-  
+
   // precompute vector
   factor = jacobianl * wgll_cube[tx];
   *rho_s_H1 = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl);
   *rho_s_H2 = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl);
   *rho_s_H3 = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl);
-  
+
   return;
 }
 
@@ -299,7 +299,7 @@
                                         int num_phase_ispec,
                                         int d_iphase,
                                         int use_mesh_coloring_gpu,
-                                        realw* d_displ, 
+                                        realw* d_displ,
                                         realw* d_accel,
                                         realw* d_xix, realw* d_xiy, realw* d_xiz,
                                         realw* d_etax, realw* d_etay, realw* d_etaz,
@@ -671,7 +671,7 @@
                                    wgll_cube,jacobianl,
                                    s_dummyx_loc,s_dummyy_loc,s_dummyz_loc,
                                    &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_yx,
-                                   &sigma_xz,&sigma_zx,&sigma_yz,&sigma_zy,                                   
+                                   &sigma_xz,&sigma_zx,&sigma_yz,&sigma_zy,
                                    &rho_s_H1,&rho_s_H2,&rho_s_H3);
       }
 
@@ -1212,6 +1212,6 @@
   //double end_time = get_time();
   //printf("Elapsed time: %e\n",end_time-start_time);
   exit_on_cuda_error("compute_forces_inner_core_cuda");
-#endif    
+#endif
 }
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu	2012-02-22 04:38:32 UTC (rev 19659)
@@ -663,11 +663,11 @@
                         );
 
   }
-  
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   //double end_time = get_time();
   //printf("Elapsed time: %e\n",end_time-start_time);
   exit_on_cuda_error("compute_forces_outer_core_cuda");
-#endif    
+#endif
 }
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu	2012-02-22 04:38:32 UTC (rev 19659)
@@ -473,120 +473,35 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-__global__ void compute_kernels_hess_el_cudakernel(int* ispec_is_elastic,
-                                                   int* ibool,
-                                                   realw* accel,
-                                                   realw* b_accel,
-                                                   realw* hess_kl,
-                                                   realw deltat,
-                                                   int NSPEC_AB) {
+__global__ void compute_kernels_hess_cudakernel(int* ibool,
+                                                realw* accel,
+                                                realw* b_accel,
+                                                realw* hess_kl,
+                                                realw deltat,
+                                                int NSPEC_AB) {
 
   int ispec = blockIdx.x + blockIdx.y*gridDim.x;
 
   // handles case when there is 1 extra block (due to rectangular grid)
   if(ispec < NSPEC_AB) {
 
-    // elastic elements only
-    if( ispec_is_elastic[ispec] ) {
+    int ijk = threadIdx.x;
+    int ijk_ispec = ijk + NGLL3*ispec;
+    int iglob = ibool[ijk_ispec] - 1 ;
 
-      int ijk = threadIdx.x;
-      int ijk_ispec = ijk + NGLL3*ispec;
-      int iglob = ibool[ijk_ispec] - 1 ;
-
-      // approximate hessian
-      hess_kl[ijk_ispec] += deltat * (accel[3*iglob]*b_accel[3*iglob]+
-                                      accel[3*iglob+1]*b_accel[3*iglob+1]+
-                                      accel[3*iglob+2]*b_accel[3*iglob+2]);
-    }
+    // approximate hessian
+    hess_kl[ijk_ispec] += deltat * (accel[3*iglob]*b_accel[3*iglob]+
+                                    accel[3*iglob+1]*b_accel[3*iglob+1]+
+                                    accel[3*iglob+2]*b_accel[3*iglob+2]);
   }
 }
 
 /* ----------------------------------------------------------------------------------------------- */
 
-__global__ void compute_kernels_hess_ac_cudakernel(int* ispec_is_acoustic,
-                                                   int* ibool,
-                                                   realw* potential_dot_dot_acoustic,
-                                                   realw* b_potential_dot_dot_acoustic,
-                                                   realw* rhostore,
-                                                   realw* hprime_xx,
-                                                   realw* hprime_yy,
-                                                   realw* hprime_zz,
-                                                   realw* d_xix,
-                                                   realw* d_xiy,
-                                                   realw* d_xiz,
-                                                   realw* d_etax,
-                                                   realw* d_etay,
-                                                   realw* d_etaz,
-                                                   realw* d_gammax,
-                                                   realw* d_gammay,
-                                                   realw* d_gammaz,
-                                                   realw* hess_kl,
-                                                   realw deltat,
-                                                   int NSPEC_AB,
-                                                   int gravity) {
-
-  int ispec = blockIdx.x + blockIdx.y*gridDim.x;
-
-  // handles case when there is 1 extra block (due to rectangular grid)
-  if(ispec < NSPEC_AB) {
-
-    // acoustic elements only
-    if( ispec_is_acoustic[ispec] ){
-
-      // local and global indices
-      int ijk = threadIdx.x;
-      int ijk_ispec = ijk + NGLL3*ispec;
-      int iglob = ibool[ijk_ispec] - 1 ;
-
-      int ijk_ispec_padded = ijk + NGLL3_PADDED*ispec;
-
-      realw accel_elm[3];
-      realw b_accel_elm[3];
-      realw rhol;
-
-      // shared memory between all threads within this block
-      __shared__ realw scalar_field_accel[NGLL3];
-      __shared__ realw scalar_field_b_accel[NGLL3];
-
-      // copy field values
-      scalar_field_accel[ijk] = potential_dot_dot_acoustic[iglob];
-      scalar_field_b_accel[ijk] = b_potential_dot_dot_acoustic[iglob];
-      __syncthreads();
-
-      // gets material parameter
-      rhol = rhostore[ijk_ispec_padded];
-
-      // acceleration vector
-      compute_gradient_kernel(ijk,ispec,
-                              scalar_field_accel,accel_elm,
-                              hprime_xx,hprime_yy,hprime_zz,
-                              d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
-                              rhol,gravity);
-
-      // acceleration vector from backward field
-      compute_gradient_kernel(ijk,ispec,
-                              scalar_field_b_accel,b_accel_elm,
-                              hprime_xx,hprime_yy,hprime_zz,
-                              d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
-                              rhol,gravity);
-      // approximates hessian
-      hess_kl[ijk_ispec] += deltat * (accel_elm[0]*b_accel_elm[0] +
-                                      accel_elm[1]*b_accel_elm[1] +
-                                      accel_elm[2]*b_accel_elm[2]);
-
-    } // ispec_is_acoustic
-
-  }
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
 extern "C"
 void FC_FUNC_(compute_kernels_hess_cuda,
               COMPUTE_KERNELS_HESS_CUDA)(long* Mesh_pointer,
-                                         realw* deltat_f,
-                                         int* ELASTIC_SIMULATION,
-                                         int* ACOUSTIC_SIMULATION) {
+                                         realw* deltat_f) {
   TRACE("compute_kernels_hess_cuda");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
@@ -594,7 +509,7 @@
   int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ
   realw deltat = *deltat_f;
 
-  int num_blocks_x = mp->NSPEC_AB;
+  int num_blocks_x = mp->NSPEC_CRUST_MANTLE;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
@@ -604,41 +519,13 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-  if( *ELASTIC_SIMULATION ) {
-    compute_kernels_hess_el_cudakernel<<<grid,threads>>>(mp->d_ispec_is_elastic,
-                                                         mp->d_ibool,
-                                                         mp->d_accel,
-                                                         mp->d_b_accel,
-                                                         mp->d_hess_el_kl,
-                                                         deltat,
-                                                         mp->NSPEC_AB);
-  }
+  compute_kernels_hess_cudakernel<<<grid,threads>>>(mp->d_ibool_crust_mantle,
+                                                       mp->d_accel_crust_mantle,
+                                                       mp->d_b_accel_crust_mantle,
+                                                       mp->d_hess_kl_crust_mantle,
+                                                       deltat,
+                                                       mp->NSPEC_CRUST_MANTLE);
 
-  if( *ACOUSTIC_SIMULATION ) {
-    compute_kernels_hess_ac_cudakernel<<<grid,threads>>>(mp->d_ispec_is_acoustic,
-                                                         mp->d_ibool,
-                                                         mp->d_potential_dot_dot_acoustic,
-                                                         mp->d_b_potential_dot_dot_acoustic,
-                                                         mp->d_rhostore,
-                                                         mp->d_hprime_xx,
-                                                         mp->d_hprime_yy,
-                                                         mp->d_hprime_zz,
-                                                         mp->d_xix,
-                                                         mp->d_xiy,
-                                                         mp->d_xiz,
-                                                         mp->d_etax,
-                                                         mp->d_etay,
-                                                         mp->d_etaz,
-                                                         mp->d_gammax,
-                                                         mp->d_gammay,
-                                                         mp->d_gammaz,
-                                                         mp->d_hess_ac_kl,
-                                                         deltat,
-                                                         mp->NSPEC_AB,
-                                                         mp->gravity);
-  }
-
-
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("compute_kernels_hess_cuda");
 #endif

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu	2012-02-22 04:38:32 UTC (rev 19659)
@@ -46,12 +46,12 @@
                                                int* abs_boundary_ispec,
                                                int* nkmin_xi, int* nkmin_eta,
                                                int* njmin, int* njmax,
-                                               int* nimin, int* nimax,                                               
+                                               int* nimin, int* nimax,
                                                realw* abs_boundary_jacobian2D,
                                                realw* wgllwgll,
                                                int* ibool,
                                                realw* vpstore,
-                                               int SIMULATION_TYPE, 
+                                               int SIMULATION_TYPE,
                                                int SAVE_FORWARD,
                                                realw* b_potential_dot_dot_acoustic,
                                                realw* b_absorb_potential) {
@@ -77,73 +77,73 @@
       case 4:
         // xmin
         if( nkmin_xi[INDEX2(2,0,iface)] == 0 || njmin[INDEX2(2,0,iface)] == 0 ) return;
-        
+
         i = 0; // index -1
         k = (igll/NGLLX);
         j = (igll-k*NGLLX);
-        
+
         if( k < nkmin_xi[INDEX2(2,0,iface)]-1 || k > NGLLX-1 ) return;
         if( j < njmin[INDEX2(2,0,iface)]-1 || j > njmax[INDEX2(2,0,iface)]-1 ) return;
-        
+
         fac1 = wgllwgll[k*NGLLX+j];
         break;
-        
+
       case 5:
         // xmax
         if( nkmin_xi[INDEX2(2,1,iface)] == 0 || njmin[INDEX2(2,1,iface)] == 0 ) return;
-        
+
         i = NGLLX-1;
         k = (igll/NGLLX);
         j = (igll-k*NGLLX);
-        
+
         if( k < nkmin_xi[INDEX2(2,1,iface)]-1 || k > NGLLX-1 ) return;
         if( j < njmin[INDEX2(2,1,iface)]-1 || j > njmax[INDEX2(2,1,iface)]-1 ) return;
-        
+
         fac1 = wgllwgll[k*NGLLX+j];
         break;
-        
+
       case 6:
         // ymin
         if( nkmin_eta[INDEX2(2,0,iface)] == 0 || nimin[INDEX2(2,0,iface)] == 0 ) return;
-        
+
         j = 0;
-        k = (igll/NGLLX);        
+        k = (igll/NGLLX);
         i = (igll-k*NGLLX);
-        
+
         if( k < nkmin_eta[INDEX2(2,0,iface)]-1 || k > NGLLX-1 ) return;
         if( i < nimin[INDEX2(2,0,iface)]-1 || i > nimax[INDEX2(2,0,iface)]-1 ) return;
-        
+
         fac1 = wgllwgll[k*NGLLX+i];
         break;
-        
+
       case 7:
         // ymax
         if( nkmin_eta[INDEX2(2,1,iface)] == 0 || nimin[INDEX2(2,1,iface)] == 0 ) return;
-        
+
         j = NGLLX-1;
-        k = (igll/NGLLX);        
+        k = (igll/NGLLX);
         i = (igll-k*NGLLX);
-        
+
         if( k < nkmin_eta[INDEX2(2,1,iface)]-1 || k > NGLLX-1 ) return;
         if( i < nimin[INDEX2(2,1,iface)]-1 || i > nimax[INDEX2(2,1,iface)]-1 ) return;
-        
+
         fac1 = wgllwgll[k*NGLLX+i];
         break;
 
       case 8:
-        // zmin        
+        // zmin
         k = 0;
-        j = (igll/NGLLX);        
+        j = (igll/NGLLX);
         i = (igll-j*NGLLX);
-        
+
         if( j < 0 || j > NGLLX-1 ) return;
         if( i < 0 || i > NGLLX-1 ) return;
-        
+
         fac1 = wgllwgll[j*NGLLX+i];
         break;
-        
+
     }
-    
+
     iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
 
     // determines bulk sound speed
@@ -183,9 +183,9 @@
   realw* d_abs_boundary_jacobian2D;
   realw* d_wgllwgll;
   realw* d_b_absorb_potential;
-  
+
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-  
+
   // absorbing boundary type
   int interface_type = *itype;
   switch( interface_type ){
@@ -197,7 +197,7 @@
       d_b_absorb_potential = mp->d_absorb_xmin_outer_core;
       d_wgllwgll = mp->d_wgllwgll_yz;
       break;
-      
+
     case 5:
       // xmax
       num_abs_boundary_faces = mp->nspec2D_xmax_outer_core;
@@ -206,22 +206,22 @@
       d_b_absorb_potential = mp->d_absorb_xmax_outer_core;
       d_wgllwgll = mp->d_wgllwgll_yz;
       break;
-      
+
     case 6:
       // ymin
       num_abs_boundary_faces = mp->nspec2D_ymin_outer_core;
       d_abs_boundary_ispec = mp->d_ibelm_ymin_outer_core;
       d_abs_boundary_jacobian2D = mp->d_jacobian2D_ymin_outer_core;
-      d_b_absorb_potential = mp->d_absorb_ymin_outer_core; 
+      d_b_absorb_potential = mp->d_absorb_ymin_outer_core;
       d_wgllwgll = mp->d_wgllwgll_xz;
       break;
-      
+
     case 7:
       // ymax
       num_abs_boundary_faces = mp->nspec2D_ymax_outer_core;
       d_abs_boundary_ispec = mp->d_ibelm_ymax_outer_core;
       d_abs_boundary_jacobian2D = mp->d_jacobian2D_ymax_outer_core;
-      d_b_absorb_potential = mp->d_absorb_ymax_outer_core; 
+      d_b_absorb_potential = mp->d_absorb_ymax_outer_core;
       d_wgllwgll = mp->d_wgllwgll_xz;
       break;
 
@@ -230,18 +230,18 @@
       num_abs_boundary_faces = mp->nspec2D_zmin_outer_core;
       d_abs_boundary_ispec = mp->d_ibelm_zmin_outer_core;
       d_abs_boundary_jacobian2D = mp->d_jacobian2D_zmin_outer_core;
-      d_b_absorb_potential = mp->d_absorb_zmin_outer_core; 
+      d_b_absorb_potential = mp->d_absorb_zmin_outer_core;
       d_wgllwgll = mp->d_wgllwgll_xy;
       break;
-      
+
     default:
       exit_on_cuda_error("compute_stacey_acoustic_cuda: unknown interface type");
       break;
   }
 
-  // checks if anything to do  
+  // checks if anything to do
   if( num_abs_boundary_faces == 0 ) return;
-  
+
   // way 1: Elapsed time: 4.385948e-03
   // > NGLLSQUARE==NGLL2==25, but we handle this inside kernel
   //  int blocksize = 32;
@@ -272,11 +272,11 @@
                                                    num_abs_boundary_faces,
                                                    d_abs_boundary_ispec,
                                                    mp->d_nkmin_xi_outer_core,
-                                                   mp->d_nkmin_eta_outer_core,                                                  
+                                                   mp->d_nkmin_eta_outer_core,
                                                    mp->d_njmin_outer_core,
                                                    mp->d_njmax_outer_core,
                                                    mp->d_nimin_outer_core,
-                                                   mp->d_nimax_outer_core,                                                  
+                                                   mp->d_nimax_outer_core,
                                                    d_abs_boundary_jacobian2D,
                                                    d_wgllwgll,
                                                    mp->d_ibool_outer_core,

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu	2012-02-22 04:38:32 UTC (rev 19659)
@@ -73,71 +73,71 @@
   // and don't compute points outside NGLLSQUARE==NGLL2==25
   //if(igll < NGLL2 && iface < num_abs_boundary_faces) {
 
-  // way 2: only check face, no further check needed since blocksize = 25  
+  // way 2: only check face, no further check needed since blocksize = 25
   if( iface < num_abs_boundary_faces){
 
     // "-1" from index values to convert from Fortran-> C indexing
     ispec = abs_boundary_ispec[iface]-1;
-    
+
     // determines indices i,j,k depending on absorbing boundary type
     switch( interface_type ){
       case 0:
         // xmin
         if( nkmin_xi[INDEX2(2,0,iface)] == 0 || njmin[INDEX2(2,0,iface)] == 0 ) return;
-        
+
         i = 0; // index -1
         k = (igll/NGLLX);
         j = (igll-k*NGLLX);
-        
+
         if( k < nkmin_xi[INDEX2(2,0,iface)]-1 || k > NGLLX-1 ) return;
         if( j < njmin[INDEX2(2,0,iface)]-1 || j > NGLLX-1 ) return;
-        
+
         fac1 = wgllwgll[k*NGLLX+j];
         break;
-        
+
       case 1:
         // xmax
         if( nkmin_xi[INDEX2(2,1,iface)] == 0 || njmin[INDEX2(2,1,iface)] == 0 ) return;
-        
+
         i = NGLLX-1;
         k = (igll/NGLLX);
         j = (igll-k*NGLLX);
-        
+
         if( k < nkmin_xi[INDEX2(2,1,iface)]-1 || k > NGLLX-1 ) return;
         if( j < njmin[INDEX2(2,1,iface)]-1 || j > njmax[INDEX2(2,1,iface)]-1 ) return;
 
         fac1 = wgllwgll[k*NGLLX+j];
         break;
-        
+
       case 2:
         // ymin
         if( nkmin_eta[INDEX2(2,0,iface)] == 0 || nimin[INDEX2(2,0,iface)] == 0 ) return;
-        
+
         j = 0;
-        k = (igll/NGLLX);        
+        k = (igll/NGLLX);
         i = (igll-k*NGLLX);
-        
+
         if( k < nkmin_eta[INDEX2(2,0,iface)]-1 || k > NGLLX-1 ) return;
         if( i < nimin[INDEX2(2,0,iface)]-1 || i > nimax[INDEX2(2,0,iface)]-1 ) return;
-        
+
         fac1 = wgllwgll[k*NGLLX+i];
         break;
 
       case 3:
         // ymax
         if( nkmin_eta[INDEX2(2,1,iface)] == 0 || nimin[INDEX2(2,1,iface)] == 0 ) return;
-        
+
         j = NGLLX-1;
-        k = (igll/NGLLX);        
+        k = (igll/NGLLX);
         i = (igll-k*NGLLX);
-        
+
         if( k < nkmin_eta[INDEX2(2,1,iface)]-1 || k > NGLLX-1 ) return;
         if( i < nimin[INDEX2(2,1,iface)]-1 || i > nimax[INDEX2(2,1,iface)]-1 ) return;
-        
+
         fac1 = wgllwgll[k*NGLLX+i];
         break;
     }
-    
+
     iglob = ibool[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)]-1;
 
     // gets associated velocity
@@ -176,7 +176,7 @@
       b_absorb_field[INDEX3(NDIM,NGLL2,1,igll,iface)] = ty*jacobianw;
       b_absorb_field[INDEX3(NDIM,NGLL2,2,igll,iface)] = tz*jacobianw;
     } // SIMULATION_TYPE
-    
+
   } // num_abs_boundary_faces
 }
 
@@ -197,7 +197,7 @@
   realw* d_abs_boundary_jacobian2D;
   realw* d_wgllwgll;
   realw* d_b_absorb_field;
-  
+
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
   // absorbing boundary type
@@ -212,7 +212,7 @@
       d_b_absorb_field = mp->d_absorb_xmin_crust_mantle;
       d_wgllwgll = mp->d_wgllwgll_yz;
       break;
-      
+
     case 1:
       // xmax
       num_abs_boundary_faces = mp->nspec2D_xmax_crust_mantle;
@@ -222,33 +222,33 @@
       d_b_absorb_field = mp->d_absorb_xmax_crust_mantle;
       d_wgllwgll = mp->d_wgllwgll_yz;
       break;
-      
+
     case 2:
       // ymin
       num_abs_boundary_faces = mp->nspec2D_ymin_crust_mantle;
       d_abs_boundary_ispec = mp->d_ibelm_ymin_crust_mantle;
       d_abs_boundary_normal = mp->d_normal_ymin_crust_mantle;
       d_abs_boundary_jacobian2D = mp->d_jacobian2D_ymin_crust_mantle;
-      d_b_absorb_field = mp->d_absorb_ymin_crust_mantle; 
+      d_b_absorb_field = mp->d_absorb_ymin_crust_mantle;
       d_wgllwgll = mp->d_wgllwgll_xz;
       break;
-      
+
     case 3:
       // ymax
       num_abs_boundary_faces = mp->nspec2D_ymax_crust_mantle;
       d_abs_boundary_ispec = mp->d_ibelm_ymax_crust_mantle;
       d_abs_boundary_normal = mp->d_normal_ymax_crust_mantle;
       d_abs_boundary_jacobian2D = mp->d_jacobian2D_ymax_crust_mantle;
-      d_b_absorb_field = mp->d_absorb_ymax_crust_mantle; 
+      d_b_absorb_field = mp->d_absorb_ymax_crust_mantle;
       d_wgllwgll = mp->d_wgllwgll_xz;
       break;
-      
+
     default:
       exit_on_cuda_error("compute_stacey_elastic_cuda: unknown interface type");
       break;
   }
-  
-  // checks if anything to do  
+
+  // checks if anything to do
   if( num_abs_boundary_faces == 0 ) return;
 
   // way 1
@@ -258,7 +258,7 @@
   // way 2: seems sligthly faster
   // > NGLLSQUARE==NGLL2==25, no further check inside kernel
   int blocksize = NGLL2;
-  
+
   int num_blocks_x = num_abs_boundary_faces;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
@@ -270,7 +270,7 @@
 
   // adjoint simulations: needs absorbing boundary buffer
   if(mp->simulation_type == 3 && num_abs_boundary_faces > 0) {
-    // copies array to GPU  
+    // copies array to GPU
     print_CUDA_error_if_any(cudaMemcpy(d_b_absorb_field,absorb_field,
                             NDIM*NGLL2*num_abs_boundary_faces*sizeof(realw),cudaMemcpyHostToDevice),7700);
   }
@@ -279,14 +279,14 @@
   compute_stacey_elastic_kernel<<<grid,threads>>>(mp->d_veloc_crust_mantle,
                                                   mp->d_accel_crust_mantle,
                                                   interface_type,
-                                                  num_abs_boundary_faces,                                                  
+                                                  num_abs_boundary_faces,
                                                   d_abs_boundary_ispec,
                                                   mp->d_nkmin_xi_crust_mantle,
-                                                  mp->d_nkmin_eta_crust_mantle,                                                  
+                                                  mp->d_nkmin_eta_crust_mantle,
                                                   mp->d_njmin_crust_mantle,
                                                   mp->d_njmax_crust_mantle,
                                                   mp->d_nimin_crust_mantle,
-                                                  mp->d_nimax_crust_mantle,                                                  
+                                                  mp->d_nimax_crust_mantle,
                                                   d_abs_boundary_normal,
                                                   d_abs_boundary_jacobian2D,
                                                   d_wgllwgll,
@@ -301,7 +301,7 @@
 
   // adjoint simulations: stores absorbed wavefield part
   if(mp->simulation_type == 1 && mp->save_forward && num_abs_boundary_faces > 0 ) {
-    // copies array to CPU  
+    // copies array to CPU
     print_CUDA_error_if_any(cudaMemcpy(absorb_field,d_b_absorb_field,
                             NDIM*NGLL2*num_abs_boundary_faces*sizeof(realw),cudaMemcpyDeviceToHost),7701);
   }

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu	2012-02-22 04:38:32 UTC (rev 19659)
@@ -28,16 +28,18 @@
 
 #include <stdio.h>
 #include <cuda.h>
-#include <cublas.h>
 
+//#include <cublas.h>
+
 #include "config.h"
 #include "mesh_constants_cuda.h"
 
 
-#define CUBLAS_ERROR(s,n)  if (s != CUBLAS_STATUS_SUCCESS) {  \
-fprintf (stderr, "CUBLAS Memory Write Error @ %d\n",n); \
-exit(EXIT_FAILURE); }
+//#define CUBLAS_ERROR(s,n)  if (s != CUBLAS_STATUS_SUCCESS) {  \
+//fprintf (stderr, "CUBLAS Memory Write Error @ %d\n",n); \
+//exit(EXIT_FAILURE); }
 
+
 /* ----------------------------------------------------------------------------------------------- */
 
 // elastic wavefield
@@ -66,32 +68,30 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+// KERNEL 1
+// inner core
+
+/* ----------------------------------------------------------------------------------------------- */
+
 extern "C"
-void FC_FUNC_(it_update_displacement_cuda,
-              IT_UPDATE_DISPLACMENT_CUDA)(long* Mesh_pointer_f,
-                                                 int* size_F,
-                                                 realw* deltat_F,
-                                                 realw* deltatsqover2_F,
-                                                 realw* deltatover2_F,
-                                                 int* SIMULATION_TYPE,
-                                                 realw* b_deltat_F,
-                                                 realw* b_deltatsqover2_F,
-                                                 realw* b_deltatover2_F) {
+void FC_FUNC_(it_update_displacement_ic_cuda,
+              IT_UPDATE_DISPLACMENT_IC_CUDA)(long* Mesh_pointer_f,
+                                             realw* deltat_F,
+                                             realw* deltatsqover2_F,
+                                             realw* deltatover2_F,
+                                             realw* b_deltat_F,
+                                             realw* b_deltatsqover2_F,
+                                             realw* b_deltatover2_F) {
 
-TRACE("it_update_displacement_cuda");
+TRACE("it_update_displacement_ic_cuda");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
 
-  //int i,device;
+  int size = NDIM * mp->NGLOB_INNER_CORE;
 
-  int size = *size_F;
   realw deltat = *deltat_F;
   realw deltatsqover2 = *deltatsqover2_F;
   realw deltatover2 = *deltatover2_F;
-  realw b_deltat = *b_deltat_F;
-  realw b_deltatsqover2 = *b_deltatsqover2_F;
-  realw b_deltatover2 = *b_deltatover2_F;
-  //cublasStatus status;
 
   int blocksize = BLOCKSIZE_KERNEL1;
   int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
@@ -106,39 +106,93 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
+  //launch kernel
+  UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ_inner_core,
+                                           mp->d_veloc_inner_core,
+                                           mp->d_accel_inner_core,
+                                           size,deltat,deltatsqover2,deltatover2);
 
-//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-//  exit_on_cuda_error("Before UpdateDispVeloc_kernel");
-//#endif
+  // kernel for backward fields
+  if(mp->simulation_type == 3) {
+    realw b_deltat = *b_deltat_F;
+    realw b_deltatsqover2 = *b_deltatsqover2_F;
+    realw b_deltatover2 = *b_deltatover2_F;
 
+    UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_b_displ_inner_core,
+                                             mp->d_b_veloc_inner_core,
+                                             mp->d_b_accel_inner_core,
+                                             size,b_deltat,b_deltatsqover2,b_deltatover2);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("it_update_displacement_ic_cuda");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// KERNEL 1
+// crust/mantle
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(it_update_displacement_cm_cuda,
+              IT_UPDATE_DISPLACMENT_CM_CUDA)(long* Mesh_pointer_f,
+                                             realw* deltat_F,
+                                             realw* deltatsqover2_F,
+                                             realw* deltatover2_F,
+                                             realw* b_deltat_F,
+                                             realw* b_deltatsqover2_F,
+                                             realw* b_deltatover2_F) {
+
+  TRACE("it_update_displacement_cm_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
+  int size = NDIM * mp->NGLOB_CRUST_MANTLE;
+
+  realw deltat = *deltat_F;
+  realw deltatsqover2 = *deltatsqover2_F;
+  realw deltatover2 = *deltatover2_F;
+
+  int blocksize = BLOCKSIZE_KERNEL1;
+  int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+
+  int num_blocks_x = size_padded/blocksize;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
   //launch kernel
-  UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ,mp->d_veloc,mp->d_accel,
+  UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
+                                           mp->d_veloc_crust_mantle,
+                                           mp->d_accel_crust_mantle,
                                            size,deltat,deltatsqover2,deltatover2);
 
-  //cudaThreadSynchronize();
-//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-//  //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
-//  // sync and check to catch errors from previous async operations
-//  exit_on_cuda_error("UpdateDispVeloc_kernel");
-//#endif
-
   // kernel for backward fields
-  if(*SIMULATION_TYPE == 3) {
+  if(mp->simulation_type == 3) {
+    realw b_deltat = *b_deltat_F;
+    realw b_deltatsqover2 = *b_deltatsqover2_F;
+    realw b_deltatover2 = *b_deltatover2_F;
 
-    UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_b_displ,mp->d_b_veloc,mp->d_b_accel,
+    UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,
+                                             mp->d_b_veloc_crust_mantle,
+                                             mp->d_b_accel_crust_mantle,
                                              size,b_deltat,b_deltatsqover2,b_deltatover2);
-
-//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-//    //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
-//    exit_on_cuda_error("after SIM_TYPE==3 UpdateDispVeloc_kernel");
-//#endif
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("it_update_displacement_cuda");
+  exit_on_cuda_error("it_update_displacement_cm_cuda");
 #endif
 }
 
+
 /* ----------------------------------------------------------------------------------------------- */
 
 // acoustic wavefield
@@ -171,29 +225,30 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+// KERNEL 1
+// outer core
+
+/* ----------------------------------------------------------------------------------------------- */
+
 extern "C"
-void FC_FUNC_(it_update_displacement_ac_cuda,
-              it_update_displacement_ac_cuda)(long* Mesh_pointer_f,
-                                               int* size_F,
+void FC_FUNC_(it_update_displacement_oc_cuda,
+              IT_UPDATE_DISPLACEMENT_OC_cuda)(long* Mesh_pointer_f,
                                                realw* deltat_F,
                                                realw* deltatsqover2_F,
                                                realw* deltatover2_F,
-                                               int* SIMULATION_TYPE,
                                                realw* b_deltat_F,
                                                realw* b_deltatsqover2_F,
                                                realw* b_deltatover2_F) {
-TRACE("it_update_displacement_ac_cuda");
+
+  TRACE("it_update_displacement_oc_cuda");
+
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
 
-  //int i,device;
-  int size = *size_F;
+  int size = mp->NGLOB_OUTER_CORE;
+
   realw deltat = *deltat_F;
   realw deltatsqover2 = *deltatsqover2_F;
   realw deltatover2 = *deltatover2_F;
-  realw b_deltat = *b_deltat_F;
-  realw b_deltatsqover2 = *b_deltatsqover2_F;
-  realw b_deltatover2 = *b_deltatover2_F;
-  //cublasStatus status;
 
   int blocksize = BLOCKSIZE_KERNEL1;
   int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
@@ -209,22 +264,24 @@
   dim3 threads(blocksize,1,1);
 
   //launch kernel
-  UpdatePotential_kernel<<<grid,threads>>>(mp->d_potential_acoustic,
-                                           mp->d_potential_dot_acoustic,
-                                           mp->d_potential_dot_dot_acoustic,
+  UpdatePotential_kernel<<<grid,threads>>>(mp->d_displ_outer_core,
+                                           mp->d_veloc_outer_core,
+                                           mp->d_accel_outer_core,
                                            size,deltat,deltatsqover2,deltatover2);
 
-  if(*SIMULATION_TYPE == 3) {
-    UpdatePotential_kernel<<<grid,threads>>>(mp->d_b_potential_acoustic,
-                                             mp->d_b_potential_dot_acoustic,
-                                             mp->d_b_potential_dot_dot_acoustic,
+  if(mp->simulation_type == 3) {
+    realw b_deltat = *b_deltat_F;
+    realw b_deltatsqover2 = *b_deltatsqover2_F;
+    realw b_deltatover2 = *b_deltatover2_F;
+
+    UpdatePotential_kernel<<<grid,threads>>>(mp->d_b_displ_outer_core,
+                                             mp->d_b_veloc_outer_core,
+                                             mp->d_b_accel_outer_core,
                                              size,b_deltat,b_deltatsqover2,b_deltatover2);
   }
 
-  //cudaThreadSynchronize();
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
-  exit_on_cuda_error("it_update_displacement_ac_cuda");
+  exit_on_cuda_error("it_update_displacement_oc_cuda");
 #endif
 }
 
@@ -301,7 +358,7 @@
   TRACE("kernel_3_a_cuda");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
-  
+
   int SIMULATION_TYPE = *SIMULATION_TYPE_f;
   realw deltatover2 = *deltatover2_F;
   realw b_deltatover2 = *b_deltatover2_F;
@@ -323,26 +380,26 @@
   // check whether we can update accel and veloc, or only accel at this point
   if( *OCEANS == 0 ){
     // updates both, accel and veloc
-    kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_crust_mantle, 
-                                             mp->d_accel_crust_mantle, 
-                                             mp->NGLOB_CRUST_MANTLE, 
+    kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_crust_mantle,
+                                             mp->d_accel_crust_mantle,
+                                             mp->NGLOB_CRUST_MANTLE,
                                              deltatover2, mp->d_rmass_crust_mantle);
 
     if(SIMULATION_TYPE == 3) {
-      kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_crust_mantle, 
-                                               mp->d_b_accel_crust_mantle, 
-                                               mp->NGLOB_CRUST_MANTLE, 
+      kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_crust_mantle,
+                                               mp->d_b_accel_crust_mantle,
+                                               mp->NGLOB_CRUST_MANTLE,
                                                b_deltatover2,mp->d_rmass_crust_mantle);
     }
   }else{
     // updates only accel
-    kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_accel_crust_mantle, 
-                                                   mp->NGLOB_CRUST_MANTLE, 
+    kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_accel_crust_mantle,
+                                                   mp->NGLOB_CRUST_MANTLE,
                                                    mp->d_rmass_crust_mantle);
 
     if(SIMULATION_TYPE == 3) {
-      kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_b_accel_crust_mantle, 
-                                                     mp->NGLOB_CRUST_MANTLE, 
+      kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_b_accel_crust_mantle,
+                                                     mp->NGLOB_CRUST_MANTLE,
                                                      mp->d_rmass_crust_mantle);
     }
   }
@@ -372,7 +429,7 @@
   realw b_deltatover2 = *b_deltatover2_F;
 
   int blocksize = BLOCKSIZE_KERNEL3;
-  
+
   // crust/mantle region
   // in case of ocean loads, we still have to update the velocity for crust/mantle region
   if( *OCEANS ){
@@ -383,24 +440,24 @@
       num_blocks_x = (int) ceil(num_blocks_x*0.5f);
       num_blocks_y = num_blocks_y*2;
     }
-    dim3 grid(num_blocks_x,num_blocks_y);
-    dim3 threads(blocksize,1,1);
-    
+    dim3 grid1(num_blocks_x,num_blocks_y);
+    dim3 threads1(blocksize,1,1);
+
     // updates only veloc at this point
-    kernel_3_veloc_cuda_device<<< grid, threads>>>(mp->d_veloc_crust_mantle,
+    kernel_3_veloc_cuda_device<<< grid1, threads1>>>(mp->d_veloc_crust_mantle,
                                                    mp->d_accel_crust_mantle,
                                                    mp->NGLOB_CRUST_MANTLE,
                                                    deltatover2);
 
     if(SIMULATION_TYPE == 3) {
-      kernel_3_veloc_cuda_device<<< grid, threads>>>(mp->d_b_veloc_crust_mantle,
+      kernel_3_veloc_cuda_device<<< grid1, threads1>>>(mp->d_b_veloc_crust_mantle,
                                                      mp->d_b_accel_crust_mantle,
                                                      mp->NGLOB_CRUST_MANTLE,
                                                      b_deltatover2);
     }
   }
-  
-  // inner core  
+
+  // inner core
   size_padded = ((int)ceil(((double)mp->NGLOB_INNER_CORE)/((double)blocksize)))*blocksize;
   num_blocks_x = size_padded/blocksize;
   num_blocks_y = 1;
@@ -410,19 +467,19 @@
   }
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
-  
+
   // updates both, accel and veloc
-  kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_inner_core, 
-                                           mp->d_accel_inner_core, 
-                                           mp->NGLOB_INNER_CORE, 
+  kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_inner_core,
+                                           mp->d_accel_inner_core,
+                                           mp->NGLOB_INNER_CORE,
                                            deltatover2, mp->d_rmass_inner_core);
-  
+
   if(SIMULATION_TYPE == 3) {
-    kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_inner_core, 
-                                             mp->d_b_accel_inner_core, 
-                                             mp->NGLOB_INNER_CORE, 
+    kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_inner_core,
+                                             mp->d_b_accel_inner_core,
+                                             mp->NGLOB_INNER_CORE,
                                              b_deltatover2,mp->d_rmass_inner_core);
-  }  
+  }
 
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -451,9 +508,9 @@
   if(id < size) {
     // multiplies pressure with the inverse of the mass matrix
     accel[id] = accel[id]*rmass[id];
-    
+
     // Newmark time scheme: corrector term
-    veloc[id] = veloc[id] + deltatover2*accel[id];    
+    veloc[id] = veloc[id] + deltatover2*accel[id];
   }
 }
 
@@ -474,7 +531,7 @@
   int SIMULATION_TYPE = *SIMULATION_TYPE_f;
   realw deltatover2 = *deltatover2_F;
   realw b_deltatover2 = *b_deltatover2_F;
-  
+
   int blocksize = BLOCKSIZE_KERNEL3;
   int size_padded = ((int)ceil(((double)mp->NGLOB_OUTER_CORE)/((double)blocksize)))*blocksize;
   int num_blocks_x = size_padded/blocksize;

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2012-02-22 04:38:32 UTC (rev 19659)
@@ -249,6 +249,9 @@
   realw* d_eps_trace_over_3_crust_mantle;
   realw* d_b_eps_trace_over_3_crust_mantle;
 
+  // kernels
+  realw* d_hess_kl_crust_mantle;
+
   // inner / outer elements
   int* d_phase_ispec_inner_crust_mantle;
   int num_phase_ispec_crust_mantle;
@@ -402,7 +405,7 @@
 
   // simulation type: 1 = forward, 2 = adjoint, 3 = kernel
   int simulation_type;
-  
+
   // mesh coloring flag
   int use_mesh_coloring_gpu;
 
@@ -417,7 +420,8 @@
   int rotation;
   int anisotropic_inner_core;
   int save_boundary_mesh;
-  
+  int approximate_hess_kl;
+
   // ------------------------------------------------------------------ //
   // gravity
   // ------------------------------------------------------------------ //
@@ -490,60 +494,60 @@
   int* d_nibool_interfaces_outer_core;
   int* d_ibool_interfaces_outer_core;
   realw* d_send_accel_buffer_outer_core;
-  
+
   // ------------------------------------------------------------------ //
   // absorbing boundaries
   // ------------------------------------------------------------------ //
-  
+
   int nspec2D_xmin_crust_mantle,nspec2D_xmax_crust_mantle;
-  int nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle;      
+  int nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle;
 
   int* d_nimin_crust_mantle, *d_nimax_crust_mantle;
   int* d_njmin_crust_mantle, *d_njmax_crust_mantle;
   int* d_nkmin_xi_crust_mantle, *d_nkmin_eta_crust_mantle;
 
   int* d_ibelm_xmin_crust_mantle, *d_ibelm_xmax_crust_mantle;
-  int* d_ibelm_ymin_crust_mantle, *d_ibelm_ymax_crust_mantle;  
-  
+  int* d_ibelm_ymin_crust_mantle, *d_ibelm_ymax_crust_mantle;
+
   realw* d_normal_xmin_crust_mantle, *d_normal_xmax_crust_mantle;
   realw* d_normal_ymin_crust_mantle, *d_normal_ymax_crust_mantle;
-  
+
   realw* d_jacobian2D_xmin_crust_mantle, *d_jacobian2D_xmax_crust_mantle;
   realw* d_jacobian2D_ymin_crust_mantle, *d_jacobian2D_ymax_crust_mantle;
-      
+
   realw* d_absorb_xmin_crust_mantle, *d_absorb_xmax_crust_mantle;
   realw* d_absorb_ymin_crust_mantle, *d_absorb_ymax_crust_mantle;
-  
+
   realw* d_rho_vp_crust_mantle;
   realw* d_rho_vs_crust_mantle;
 
   int nspec2D_xmin_outer_core,nspec2D_xmax_outer_core;
-  int nspec2D_ymin_outer_core,nspec2D_ymax_outer_core;      
+  int nspec2D_ymin_outer_core,nspec2D_ymax_outer_core;
   int nspec2D_zmin_outer_core;
-  
+
   int* d_nimin_outer_core, *d_nimax_outer_core;
   int* d_njmin_outer_core, *d_njmax_outer_core;
   int* d_nkmin_xi_outer_core, *d_nkmin_eta_outer_core;
-  
+
   int* d_ibelm_xmin_outer_core, *d_ibelm_xmax_outer_core;
-  int* d_ibelm_ymin_outer_core, *d_ibelm_ymax_outer_core;  
+  int* d_ibelm_ymin_outer_core, *d_ibelm_ymax_outer_core;
   int* d_ibelm_zmin_outer_core;
-  
+
   realw* d_jacobian2D_xmin_outer_core, *d_jacobian2D_xmax_outer_core;
   realw* d_jacobian2D_ymin_outer_core, *d_jacobian2D_ymax_outer_core;
   realw* d_jacobian2D_zmin_outer_core;
-  
+
   realw* d_absorb_xmin_outer_core, *d_absorb_xmax_outer_core;
   realw* d_absorb_ymin_outer_core, *d_absorb_ymax_outer_core;
   realw* d_absorb_zmin_outer_core;
-  
+
   realw* d_vp_outer_core;
-  
+
   // ------------------------------------------------------------------ //
   // noise tomography
   // ------------------------------------------------------------------ //
   int noise_tomography;
-  
+
   int nspec_top;
   int* d_ibelm_top_crust_mantle;
 
@@ -558,7 +562,7 @@
   // noise sensitivity kernel
   realw* d_Sigma_kl;
 
-  
+
 // ------------------------------------------------------------------ //
 //daniel: TODO - former code...
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu	2012-02-22 04:38:32 UTC (rev 19659)
@@ -193,10 +193,10 @@
   int tx = threadIdx.x;
   int ispec = ispec_selected_rec[irec_master_noise]-1;
   int iglob = ibool[tx + NGLL3*ispec]-1;
-  
+
   atomicAdd(&accel[iglob*3  ],noise_sourcearray[  3*tx + 3*NGLL3*it]);
   atomicAdd(&accel[iglob*3+1],noise_sourcearray[1+3*tx + 3*NGLL3*it]);
-  atomicAdd(&accel[iglob*3+2],noise_sourcearray[2+3*tx + 3*NGLL3*it]);  
+  atomicAdd(&accel[iglob*3+2],noise_sourcearray[2+3*tx + 3*NGLL3*it]);
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -207,18 +207,18 @@
                                               int* it_f,
                                               int* irec_master_noise_f,
                                               int* islice_selected_rec) {
-  
+
   TRACE("noise_add_source_master_rec_cu");
-  
+
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-  
+
   int it = *it_f - 1; // -1 for Fortran -> C indexing differences
   int irec_master_noise = *irec_master_noise_f-1;
-  
+
   dim3 grid(1,1,1);
   dim3 threads(NGLL3,1,1);
 
-  // adds noise source at master location  
+  // adds noise source at master location
   if(mp->myrank == islice_selected_rec[irec_master_noise]) {
     noise_add_source_master_rec_cuda_kernel<<<grid,threads>>>(mp->d_ibool_crust_mantle,
                                                               mp->d_ispec_selected_rec,
@@ -227,10 +227,10 @@
                                                               mp->d_noise_sourcearray,
                                                               it);
   }
-  
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("noise_add_source_master_rec_cuda_kernel");
-#endif  
+#endif
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -239,7 +239,7 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-__global__ void noise_add_surface_movie_cuda_kernel(realw* accel, 
+__global__ void noise_add_surface_movie_cuda_kernel(realw* accel,
                                                     int* ibool,
                                                     int* ibelm_top,
                                                     int nspec_top,
@@ -257,8 +257,8 @@
   // when nspec_top > 65535, but mod(nspec_top,2) > 0, we end up with an extra block.
   if(iface < nspec_top) {
 
-    int ispec = ibelm_top[iface]-1;    
-    
+    int ispec = ibelm_top[iface]-1;
+
     int k = NGLLX - 1;
     int j = (igll/NGLLX);
     int i = (igll-j*NGLLX);
@@ -276,7 +276,7 @@
 
     // weighted jacobian
     realw jacobianw = wgllwgll[k*NGLLX+i]*jacobian2D[igll+NGLL2*iface];
-    
+
     // note: check error from cuda-memcheck and ddt seems "incorrect", because we
     //          are passing a __constant__ variable pointer around like it was
     //          made using cudaMalloc, which *may* be "incorrect", but produces
@@ -298,7 +298,7 @@
 void FC_FUNC_(noise_add_surface_movie_cuda,
               NOISE_ADD_SURFACE_MOVIE_CUDA)(long* Mesh_pointer_f,
                                             realw* h_noise_surface_movie) {
-                                            
+
   TRACE("noise_add_surface_movie_cuda");
 
 
@@ -318,7 +318,7 @@
              NDIM*NGLL2*(mp->nspec_top)*sizeof(realw),cudaMemcpyHostToDevice);
 
   switch(mp->noise_tomography) {
-  case 2: 
+  case 2:
     // adds surface source to forward field
     noise_add_surface_movie_cuda_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
                                                           mp->d_ibool_crust_mantle,
@@ -332,8 +332,8 @@
                                                           mp->d_jacobian2D_top_crust_mantle,
                                                           mp->d_wgllwgll_xy);
     break;
-    
-  case 3: 
+
+  case 3:
     // adds surface source to adjoint (backward) field
     noise_add_surface_movie_cuda_kernel<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
                                                           mp->d_ibool_crust_mantle,
@@ -343,7 +343,7 @@
                                                           mp->d_normal_x_noise,
                                                           mp->d_normal_y_noise,
                                                           mp->d_normal_z_noise,
-                                                          mp->d_mask_noise,                                                          
+                                                          mp->d_mask_noise,
                                                           mp->d_jacobian2D_top_crust_mantle,
                                                           mp->d_wgllwgll_xy);
     break;

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2012-02-22 04:38:32 UTC (rev 19659)
@@ -183,7 +183,7 @@
 
 extern "C"
 void FC_FUNC_(prepare_constants_device,
-              PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer, 
+              PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
                                         int* myrank_f,
                                         int* h_NGLLX,
                                         realw* h_hprime_xx,realw* h_hprime_yy,realw* h_hprime_zz,
@@ -210,7 +210,8 @@
                                         int* ANISOTROPIC_3D_MANTLE_f,
                                         int* ANISOTROPIC_INNER_CORE_f,
                                         int* SAVE_BOUNDARY_MESH_f,
-                                        int* USE_MESH_COLORING_GPU_f) {
+                                        int* USE_MESH_COLORING_GPU_f,
+                                        int* APPROXIMATE_HESS_KL_f) {
 
 TRACE("prepare_constants_device");
 
@@ -246,7 +247,7 @@
   // simulation type
   mp->simulation_type = *SIMULATION_TYPE;
   mp->noise_tomography = *NOISE_TOMOGRAPHY;
-  
+
   // simulation flags initialization
   mp->save_forward = *SAVE_FORWARD_f;
   mp->absorbing_conditions = *ABSORBING_CONDITIONS_f;
@@ -258,7 +259,9 @@
   mp->anisotropic_3D_mantle = *ANISOTROPIC_3D_MANTLE_f;
   mp->anisotropic_inner_core = *ANISOTROPIC_INNER_CORE_f;
   mp->save_boundary_mesh = *SAVE_BOUNDARY_MESH_f;
+  mp->approximate_hess_kl = *APPROXIMATE_HESS_KL_f;
 
+  // mpi process rank
   mp->myrank = *myrank_f;
 
   // mesh coloring flag
@@ -305,28 +308,36 @@
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_number_receiver_global),mp->nrec_local*sizeof(int)),1);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_number_receiver_global,h_number_receiver_global,
                                      mp->nrec_local*sizeof(int),cudaMemcpyHostToDevice),1512);
+
+    // for seismograms
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_field),
+                                       3*NGLL3*(mp->nrec_local)*sizeof(realw)),4015);
+
+    mp->h_station_seismo_field = (realw*) malloc( 3*NGLL3*(mp->nrec_local)*sizeof(realw) );
+    if( mp->h_station_seismo_field == NULL) exit_on_error("h_station_seismo_field not allocated \n");
+
   }
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_selected_rec),(*nrec)*sizeof(int)),1513);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_selected_rec,h_ispec_selected_rec,
                                      (*nrec)*sizeof(int),cudaMemcpyHostToDevice),1514);
-                                     
+
   // receiver adjoint source arrays only used for noise and adjoint simulations
   // adjoint source arrays
   mp->nadj_rec_local = *nadj_rec_local;
   if( mp->nadj_rec_local > 0 ){
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_adj_sourcearrays,
                                        (mp->nadj_rec_local)*3*NGLL3*sizeof(realw)),6003);
-    
+
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_pre_computed_irec,
                                        (mp->nadj_rec_local)*sizeof(int)),6004);
-    
+
     // prepares local irec array:
     // the irec_local variable needs to be precomputed (as
     // h_pre_comp..), because normally it is in the loop updating accel,
     // and due to how it's incremented, it cannot be parallelized
     int* h_pre_computed_irec = (int*) malloc( (mp->nadj_rec_local)*sizeof(int) );
     if( h_pre_computed_irec == NULL ) exit_on_error("h_pre_computed_irec not allocated\n");
-    
+
     int irec_local = 0;
     for(int irec = 0; irec < *nrec; irec++) {
       if(mp->myrank == h_islice_selected_rec[irec]) {
@@ -339,12 +350,12 @@
     print_CUDA_error_if_any(cudaMemcpy(mp->d_pre_computed_irec,h_pre_computed_irec,
                                        (mp->nadj_rec_local)*sizeof(int),cudaMemcpyHostToDevice),6010);
     free(h_pre_computed_irec);
-    
+
     // temporary array to prepare extracted source array values
     mp->h_adj_sourcearrays_slice = (realw*) malloc( (mp->nadj_rec_local)*3*NGLL3*sizeof(realw) );
-    if( mp->h_adj_sourcearrays_slice == NULL ) exit_on_error("h_adj_sourcearrays_slice not allocated\n");    
+    if( mp->h_adj_sourcearrays_slice == NULL ) exit_on_error("h_adj_sourcearrays_slice not allocated\n");
   }
-  
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("prepare_constants_device");
 #endif
@@ -390,7 +401,7 @@
                                      NGLL3*(*NSPEC_OUTER_CORE_ROTATION)*sizeof(realw)),9002);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_B_array_rotation, B_array_rotation,
                                      NGLL3*(*NSPEC_OUTER_CORE_ROTATION)*sizeof(realw),cudaMemcpyHostToDevice),9003);
-    
+
   // backward/reconstructed fields
   if( mp->simulation_type == 3 ){
     mp->d_b_two_omega_earth = *b_two_omega_earth;
@@ -659,7 +670,7 @@
                                             ) {
 
   TRACE("prepare_fields_strain_device");
-  int R_size,R_size2;
+  int R_size;
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f);
 
@@ -668,8 +679,6 @@
 
   // crust_mantle
   R_size = NGLL3*mp->NSPEC_CRUST_MANTLE;
-  R_size2 = NGLL3*mp->NSPEC_CRUST_MANTLE;
-
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_epsilondev_xx_crust_mantle,
                                      R_size*sizeof(realw)),4432);
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_epsilondev_yy_crust_mantle,
@@ -693,9 +702,9 @@
                                      R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
 
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_eps_trace_over_3_crust_mantle,
-                                       R_size2*sizeof(realw)),4401);
+                                       R_size*sizeof(realw)),4401);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_eps_trace_over_3_crust_mantle,eps_trace_over_3_crust_mantle,
-                                       R_size2*sizeof(realw),cudaMemcpyHostToDevice),4402);
+                                       R_size*sizeof(realw),cudaMemcpyHostToDevice),4402);
   // backward/reconstructed fields
   if( mp->simulation_type == 3 ){
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_epsilondev_xx_crust_mantle,
@@ -722,15 +731,13 @@
 
 
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_eps_trace_over_3_crust_mantle,
-                                         R_size2*sizeof(realw)),4401);
+                                         R_size*sizeof(realw)),4401);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_eps_trace_over_3_crust_mantle,b_eps_trace_over_3_crust_mantle,
-                                         R_size2*sizeof(realw),cudaMemcpyHostToDevice),4402);
+                                         R_size*sizeof(realw),cudaMemcpyHostToDevice),4402);
   }
 
   // inner_core
   R_size = NGLL3*mp->NSPEC_INNER_CORE;
-  R_size2 = NGLL3*mp->NSPEC_INNER_CORE;
-
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_epsilondev_xx_inner_core,
                                      R_size*sizeof(realw)),4432);
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_epsilondev_yy_inner_core,
@@ -755,9 +762,9 @@
 
 
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_eps_trace_over_3_inner_core,
-                                     R_size2*sizeof(realw)),4401);
+                                     R_size*sizeof(realw)),4401);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_eps_trace_over_3_inner_core,eps_trace_over_3_inner_core,
-                                     R_size2*sizeof(realw),cudaMemcpyHostToDevice),4402);
+                                     R_size*sizeof(realw),cudaMemcpyHostToDevice),4402);
   // backward/reconstructed fields
   if( mp->simulation_type == 3 ){
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_epsilondev_xx_inner_core,
@@ -784,9 +791,9 @@
 
 
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_eps_trace_over_3_inner_core,
-                                       R_size2*sizeof(realw)),4401);
+                                       R_size*sizeof(realw)),4401);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_eps_trace_over_3_inner_core,b_eps_trace_over_3_inner_core,
-                                       R_size2*sizeof(realw),cudaMemcpyHostToDevice),4402);
+                                       R_size*sizeof(realw),cudaMemcpyHostToDevice),4402);
   }
 }
 
@@ -804,11 +811,11 @@
                                             int* NSPEC2DMAX_XMIN_XMAX_CM,int* NSPEC2DMAX_YMIN_YMAX_CM,
                                             int* nimin_crust_mantle,int* nimax_crust_mantle,
                                             int* njmin_crust_mantle,int* njmax_crust_mantle,
-                                            int* nkmin_xi_crust_mantle,int* nkmin_eta_crust_mantle,                                     
+                                            int* nkmin_xi_crust_mantle,int* nkmin_eta_crust_mantle,
                                             int* ibelm_xmin_crust_mantle,int* ibelm_xmax_crust_mantle,
-                                            int* ibelm_ymin_crust_mantle,int* ibelm_ymax_crust_mantle,                                       
+                                            int* ibelm_ymin_crust_mantle,int* ibelm_ymax_crust_mantle,
                                             realw* normal_xmin_crust_mantle,realw* normal_xmax_crust_mantle,
-                                            realw* normal_ymin_crust_mantle,realw* normal_ymax_crust_mantle,                                     
+                                            realw* normal_ymin_crust_mantle,realw* normal_ymax_crust_mantle,
                                             realw* jacobian2D_xmin_crust_mantle, realw* jacobian2D_xmax_crust_mantle,
                                             realw* jacobian2D_ymin_crust_mantle, realw* jacobian2D_ymax_crust_mantle,
                                             realw* rho_vp_crust_mantle,
@@ -819,31 +826,31 @@
                                             int* NSPEC2DMAX_XMIN_XMAX_OC,int* NSPEC2DMAX_YMIN_YMAX_OC,
                                             int* nimin_outer_core,int* nimax_outer_core,
                                             int* njmin_outer_core,int* njmax_outer_core,
-                                            int* nkmin_xi_outer_core,int* nkmin_eta_outer_core,                                     
+                                            int* nkmin_xi_outer_core,int* nkmin_eta_outer_core,
                                             int* ibelm_xmin_outer_core,int* ibelm_xmax_outer_core,
                                             int* ibelm_ymin_outer_core,int* ibelm_ymax_outer_core,
                                             int* ibelm_bottom_outer_core,
                                             realw* jacobian2D_xmin_outer_core, realw* jacobian2D_xmax_outer_core,
                                             realw* jacobian2D_ymin_outer_core, realw* jacobian2D_ymax_outer_core,
-                                            realw* jacobian2D_bottom_outer_core,                                            
+                                            realw* jacobian2D_bottom_outer_core,
                                             realw* vp_outer_core
                                             ) {
-  
+
   TRACE("prepare_fields_absorb_device");
   int size;
-  
+
   Mesh* mp = (Mesh*)(*Mesh_pointer_f);
-  
+
   // checks flag
   if( ! mp->absorbing_conditions ){ exit_on_cuda_error("prepare_fields_absorb_device absorbing_conditions not properly initialized"); }
-  
+
   // crust_mantle
   mp->nspec2D_xmin_crust_mantle = *nspec2D_xmin_crust_mantle;
   mp->nspec2D_xmax_crust_mantle = *nspec2D_xmax_crust_mantle;
   mp->nspec2D_ymin_crust_mantle = *nspec2D_ymin_crust_mantle;
   mp->nspec2D_ymax_crust_mantle = *nspec2D_ymax_crust_mantle;
 
-  // vp & vs 
+  // vp & vs
   size = NGLL3*(mp->NSPEC_CRUST_MANTLE);
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_rho_vp_crust_mantle,
                                      size*sizeof(realw)),2201);
@@ -865,12 +872,12 @@
                                      2*(*NSPEC2DMAX_YMIN_YMAX_CM)*sizeof(int)),1201);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_nkmin_eta_crust_mantle,nkmin_eta_crust_mantle,
                                      2*(*NSPEC2DMAX_YMIN_YMAX_CM)*sizeof(int),cudaMemcpyHostToDevice),1202);
-  
+
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_njmin_crust_mantle,
                                      2*(*NSPEC2DMAX_XMIN_XMAX_CM)*sizeof(int)),1201);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_njmin_crust_mantle,njmin_crust_mantle,
                                      2*(*NSPEC2DMAX_XMIN_XMAX_CM)*sizeof(int),cudaMemcpyHostToDevice),1202);
-  
+
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_njmax_crust_mantle,
                                      2*(*NSPEC2DMAX_XMIN_XMAX_CM)*sizeof(int)),1201);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_njmax_crust_mantle,njmax_crust_mantle,
@@ -885,15 +892,15 @@
                                      2*(*NSPEC2DMAX_YMIN_YMAX_CM)*sizeof(int)),1201);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_nimax_crust_mantle,nimax_crust_mantle,
                                      2*(*NSPEC2DMAX_YMIN_YMAX_CM)*sizeof(int),cudaMemcpyHostToDevice),1202);
-  
-  
-  // xmin  
+
+
+  // xmin
   if( mp->nspec2D_xmin_crust_mantle > 0 ){
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibelm_xmin_crust_mantle,
                                        (mp->nspec2D_xmin_crust_mantle)*sizeof(int)),1201);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_xmin_crust_mantle,ibelm_xmin_crust_mantle,
                                        (mp->nspec2D_xmin_crust_mantle)*sizeof(int),cudaMemcpyHostToDevice),1202);
-    
+
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_normal_xmin_crust_mantle,
                             NDIM*NGLL2*(mp->nspec2D_xmin_crust_mantle)*sizeof(realw)),1201);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_xmin_crust_mantle,normal_xmin_crust_mantle,
@@ -903,12 +910,12 @@
                             NGLL2*(mp->nspec2D_xmin_crust_mantle)*sizeof(realw)),1201);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_xmin_crust_mantle,jacobian2D_xmin_crust_mantle,
                             NGLL2*(mp->nspec2D_xmin_crust_mantle)*sizeof(realw),cudaMemcpyHostToDevice),1202);
-    
+
     // boundary buffer
-    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){    
+    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){
       print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_xmin_crust_mantle,
                               NDIM*NGLL2*(mp->nspec2D_xmin_crust_mantle)*sizeof(realw)),1202);
-    }    
+    }
   }
 
   // xmax
@@ -917,22 +924,22 @@
                                        (mp->nspec2D_xmax_crust_mantle)*sizeof(int)),1201);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_xmax_crust_mantle,ibelm_xmax_crust_mantle,
                                        (mp->nspec2D_xmax_crust_mantle)*sizeof(int),cudaMemcpyHostToDevice),1202);
-        
+
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_normal_xmax_crust_mantle,
                                        NDIM*NGLL2*(mp->nspec2D_xmax_crust_mantle)*sizeof(realw)),1201);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_xmax_crust_mantle,normal_xmax_crust_mantle,
                                        NDIM*NGLL2*(mp->nspec2D_xmax_crust_mantle)*sizeof(realw),cudaMemcpyHostToDevice),1202);
-    
+
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_xmax_crust_mantle,
                                        NGLL2*(mp->nspec2D_xmax_crust_mantle)*sizeof(realw)),1201);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_xmax_crust_mantle,jacobian2D_xmax_crust_mantle,
                                        NGLL2*(mp->nspec2D_xmax_crust_mantle)*sizeof(realw),cudaMemcpyHostToDevice),1202);
-    
+
     // boundary buffer
-    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){    
+    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){
       print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_xmax_crust_mantle,
                                          NDIM*NGLL2*(mp->nspec2D_xmax_crust_mantle)*sizeof(realw)),1202);
-    }    
+    }
   }
 
   // ymin
@@ -941,22 +948,22 @@
                                        (mp->nspec2D_ymin_crust_mantle)*sizeof(int)),1201);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_ymin_crust_mantle,ibelm_ymin_crust_mantle,
                                        (mp->nspec2D_ymin_crust_mantle)*sizeof(int),cudaMemcpyHostToDevice),1202);
-    
+
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_normal_ymin_crust_mantle,
                                        NDIM*NGLL2*(mp->nspec2D_ymin_crust_mantle)*sizeof(realw)),1201);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_ymin_crust_mantle,normal_ymin_crust_mantle,
                                        NDIM*NGLL2*(mp->nspec2D_ymin_crust_mantle)*sizeof(realw),cudaMemcpyHostToDevice),1202);
-    
+
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_ymin_crust_mantle,
                                        NGLL2*(mp->nspec2D_ymin_crust_mantle)*sizeof(realw)),1201);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_ymin_crust_mantle,jacobian2D_ymin_crust_mantle,
                                        NGLL2*(mp->nspec2D_ymin_crust_mantle)*sizeof(realw),cudaMemcpyHostToDevice),1202);
-    
+
     // boundary buffer
-    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){    
+    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){
       print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_ymin_crust_mantle,
                                          NDIM*NGLL2*(mp->nspec2D_ymin_crust_mantle)*sizeof(realw)),1202);
-    }    
+    }
   }
 
   // ymax
@@ -965,87 +972,87 @@
                                        (mp->nspec2D_ymax_crust_mantle)*sizeof(int)),1201);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_ymax_crust_mantle,ibelm_ymax_crust_mantle,
                                        (mp->nspec2D_ymax_crust_mantle)*sizeof(int),cudaMemcpyHostToDevice),1202);
-    
+
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_normal_ymax_crust_mantle,
                                        NDIM*NGLL2*(mp->nspec2D_ymax_crust_mantle)*sizeof(realw)),1201);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_ymax_crust_mantle,normal_ymax_crust_mantle,
                                        NDIM*NGLL2*(mp->nspec2D_ymax_crust_mantle)*sizeof(realw),cudaMemcpyHostToDevice),1202);
-    
+
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_ymax_crust_mantle,
                                        NGLL2*(mp->nspec2D_ymax_crust_mantle)*sizeof(realw)),1201);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_ymax_crust_mantle,jacobian2D_ymax_crust_mantle,
                                        NGLL2*(mp->nspec2D_ymax_crust_mantle)*sizeof(realw),cudaMemcpyHostToDevice),1202);
-    
+
     // boundary buffer
-    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){    
+    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){
       print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_ymax_crust_mantle,
                                          NDIM*NGLL2*(mp->nspec2D_ymax_crust_mantle)*sizeof(realw)),1202);
-    }    
+    }
   }
-  
 
+
   // outer_core
   mp->nspec2D_xmin_outer_core = *nspec2D_xmin_outer_core;
   mp->nspec2D_xmax_outer_core = *nspec2D_xmax_outer_core;
   mp->nspec2D_ymin_outer_core = *nspec2D_ymin_outer_core;
   mp->nspec2D_ymax_outer_core = *nspec2D_ymax_outer_core;
   mp->nspec2D_zmin_outer_core = *nspec2D_zmin_outer_core;
-  
-  // vp 
+
+  // vp
   size = NGLL3*(mp->NSPEC_OUTER_CORE);
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_vp_outer_core,
                                      size*sizeof(realw)),2201);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_vp_outer_core,vp_outer_core,
                                      size*sizeof(realw),cudaMemcpyHostToDevice),2202);
-  
+
   // ijk index arrays
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nkmin_xi_outer_core,
                                      2*(*NSPEC2DMAX_XMIN_XMAX_OC)*sizeof(int)),1201);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_nkmin_xi_outer_core,nkmin_xi_outer_core,
                                      2*(*NSPEC2DMAX_XMIN_XMAX_OC)*sizeof(int),cudaMemcpyHostToDevice),1202);
-  
+
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nkmin_eta_outer_core,
                                      2*(*NSPEC2DMAX_YMIN_YMAX_OC)*sizeof(int)),1201);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_nkmin_eta_outer_core,nkmin_eta_outer_core,
                                      2*(*NSPEC2DMAX_YMIN_YMAX_OC)*sizeof(int),cudaMemcpyHostToDevice),1202);
-  
+
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_njmin_outer_core,
                                      2*(*NSPEC2DMAX_XMIN_XMAX_OC)*sizeof(int)),1201);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_njmin_outer_core,njmin_outer_core,
                                      2*(*NSPEC2DMAX_XMIN_XMAX_OC)*sizeof(int),cudaMemcpyHostToDevice),1202);
-  
+
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_njmax_outer_core,
                                      2*(*NSPEC2DMAX_XMIN_XMAX_OC)*sizeof(int)),1201);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_njmax_outer_core,njmax_outer_core,
                                      2*(*NSPEC2DMAX_XMIN_XMAX_OC)*sizeof(int),cudaMemcpyHostToDevice),1202);
-  
+
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nimin_outer_core,
                                      2*(*NSPEC2DMAX_YMIN_YMAX_OC)*sizeof(int)),1201);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_nimin_outer_core,nimin_outer_core,
                                      2*(*NSPEC2DMAX_YMIN_YMAX_OC)*sizeof(int),cudaMemcpyHostToDevice),1202);
-  
+
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nimax_outer_core,
                                      2*(*NSPEC2DMAX_YMIN_YMAX_OC)*sizeof(int)),1201);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_nimax_outer_core,nimax_outer_core,
                                      2*(*NSPEC2DMAX_YMIN_YMAX_OC)*sizeof(int),cudaMemcpyHostToDevice),1202);
-  
-  // xmin  
+
+  // xmin
   if( mp->nspec2D_xmin_outer_core > 0 ){
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibelm_xmin_outer_core,
                                        (mp->nspec2D_xmin_outer_core)*sizeof(int)),1201);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_xmin_outer_core,ibelm_xmin_outer_core,
                                        (mp->nspec2D_xmin_outer_core)*sizeof(int),cudaMemcpyHostToDevice),1202);
-    
+
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_xmin_outer_core,
                                        NGLL2*(mp->nspec2D_xmin_outer_core)*sizeof(realw)),1201);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_xmin_outer_core,jacobian2D_xmin_outer_core,
                                        NGLL2*(mp->nspec2D_xmin_outer_core)*sizeof(realw),cudaMemcpyHostToDevice),1202);
-    
+
     // boundary buffer
-    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){    
+    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){
       print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_xmin_outer_core,
                                          NGLL2*(mp->nspec2D_xmin_outer_core)*sizeof(realw)),1202);
-    }    
+    }
   }
 
   // xmax
@@ -1054,55 +1061,55 @@
                                        (mp->nspec2D_xmax_outer_core)*sizeof(int)),1201);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_xmax_outer_core,ibelm_xmax_outer_core,
                                        (mp->nspec2D_xmax_outer_core)*sizeof(int),cudaMemcpyHostToDevice),1202);
-    
+
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_xmax_outer_core,
                                        NGLL2*(mp->nspec2D_xmax_outer_core)*sizeof(realw)),1201);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_xmax_outer_core,jacobian2D_xmax_outer_core,
                                        NGLL2*(mp->nspec2D_xmax_outer_core)*sizeof(realw),cudaMemcpyHostToDevice),1202);
-    
+
     // boundary buffer
-    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){    
+    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){
       print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_xmax_outer_core,
                                          NGLL2*(mp->nspec2D_xmax_outer_core)*sizeof(realw)),1202);
-    }    
+    }
   }
-  
+
   // ymin
   if( mp->nspec2D_ymin_outer_core > 0 ){
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibelm_ymin_outer_core,
                                        (mp->nspec2D_ymin_outer_core)*sizeof(int)),1201);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_ymin_outer_core,ibelm_ymin_outer_core,
                                        (mp->nspec2D_ymin_outer_core)*sizeof(int),cudaMemcpyHostToDevice),1202);
-    
+
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_ymin_outer_core,
                                        NGLL2*(mp->nspec2D_ymin_outer_core)*sizeof(realw)),1201);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_ymin_outer_core,jacobian2D_ymin_outer_core,
                                        NGLL2*(mp->nspec2D_ymin_outer_core)*sizeof(realw),cudaMemcpyHostToDevice),1202);
-    
+
     // boundary buffer
-    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){    
+    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){
       print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_ymin_outer_core,
                                          NGLL2*(mp->nspec2D_ymin_outer_core)*sizeof(realw)),1202);
-    }    
+    }
   }
-  
+
   // ymax
   if( mp->nspec2D_ymax_outer_core > 0 ){
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibelm_ymax_outer_core,
                                        (mp->nspec2D_ymax_outer_core)*sizeof(int)),1201);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_ymax_outer_core,ibelm_ymax_outer_core,
                                        (mp->nspec2D_ymax_outer_core)*sizeof(int),cudaMemcpyHostToDevice),1202);
-    
+
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_ymax_outer_core,
                                        NGLL2*(mp->nspec2D_ymax_outer_core)*sizeof(realw)),1201);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_ymax_outer_core,jacobian2D_ymax_outer_core,
                                        NGLL2*(mp->nspec2D_ymax_outer_core)*sizeof(realw),cudaMemcpyHostToDevice),1202);
-    
+
     // boundary buffer
-    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){    
+    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){
       print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_ymax_outer_core,
                                          NGLL2*(mp->nspec2D_ymax_outer_core)*sizeof(realw)),1202);
-    }    
+    }
   }
 
   // zmin
@@ -1111,21 +1118,21 @@
                                        (mp->nspec2D_zmin_outer_core)*sizeof(int)),1201);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_zmin_outer_core,ibelm_bottom_outer_core,
                                        (mp->nspec2D_zmin_outer_core)*sizeof(int),cudaMemcpyHostToDevice),1202);
-    
+
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_zmin_outer_core,
                                        NGLL2*(mp->nspec2D_zmin_outer_core)*sizeof(realw)),1201);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_zmin_outer_core,jacobian2D_bottom_outer_core,
                                        NGLL2*(mp->nspec2D_zmin_outer_core)*sizeof(realw),cudaMemcpyHostToDevice),1202);
-    
+
     // boundary buffer
-    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){    
+    if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){
       print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_zmin_outer_core,
                                          NGLL2*(mp->nspec2D_zmin_outer_core)*sizeof(realw)),1202);
-    }    
+    }
   }
-  
-}  
 
+}
+
 /* ----------------------------------------------------------------------------------------------- */
 
 // MPI interfaces
@@ -1134,7 +1141,7 @@
 
 extern "C"
 void FC_FUNC_(prepare_mpi_buffers_device,
-              PREPARE_MPI_BUFFERS_DEVICE)(long* Mesh_pointer_f, 
+              PREPARE_MPI_BUFFERS_DEVICE)(long* Mesh_pointer_f,
                                           int* num_interfaces_crust_mantle,
                                           int* max_nibool_interfaces_crust_mantle,
                                           int* nibool_interfaces_crust_mantle,
@@ -1146,13 +1153,13 @@
                                           int* num_interfaces_outer_core,
                                           int* max_nibool_interfaces_outer_core,
                                           int* nibool_interfaces_outer_core,
-                                          int* ibool_interfaces_outer_core                                          
+                                          int* ibool_interfaces_outer_core
                                           ){
 
   TRACE("prepare_mpi_buffers_device");
-  
+
   Mesh* mp = (Mesh*)(*Mesh_pointer_f);
-  
+
   // prepares interprocess-edge exchange information
 
   // crust/mantle mesh
@@ -1172,8 +1179,8 @@
                                        cudaMemcpyHostToDevice),1204);
     // allocates mpi buffer for exchange with cpu
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_crust_mantle),
-                                       3*(mp->max_nibool_interfaces_crust_mantle)*(mp->num_interfaces_crust_mantle)*sizeof(realw)),4004);    
-  }  
+                                       3*(mp->max_nibool_interfaces_crust_mantle)*(mp->num_interfaces_crust_mantle)*sizeof(realw)),4004);
+  }
 
   // inner core mesh
   mp->num_interfaces_inner_core = *num_interfaces_inner_core;
@@ -1192,8 +1199,8 @@
                                        cudaMemcpyHostToDevice),1204);
     // allocates mpi buffer for exchange with cpu
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_inner_core),
-                                       3*(mp->max_nibool_interfaces_inner_core)*(mp->num_interfaces_inner_core)*sizeof(realw)),4004);    
-  }  
+                                       3*(mp->max_nibool_interfaces_inner_core)*(mp->num_interfaces_inner_core)*sizeof(realw)),4004);
+  }
 
   // outer core mesh
   // note: uses only scalar wavefield arrays
@@ -1213,10 +1220,10 @@
                                        cudaMemcpyHostToDevice),1204);
     // allocates mpi buffer for exchange with cpu
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_outer_core),
-                                       (mp->max_nibool_interfaces_outer_core)*(mp->num_interfaces_outer_core)*sizeof(realw)),4004);    
-  }  
-  
+                                       (mp->max_nibool_interfaces_outer_core)*(mp->num_interfaces_outer_core)*sizeof(realw)),4004);
+  }
 
+
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -1314,7 +1321,7 @@
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ispec_is_tiso_crust_mantle, (mp->NSPEC_CRUST_MANTLE)*sizeof(int)),1025);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_tiso_crust_mantle, h_ispec_is_tiso,
                                        (mp->NSPEC_CRUST_MANTLE)*sizeof(int),cudaMemcpyHostToDevice),1025);
-    
+
     // kappavstore, kappahstore
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_kappavstore_crust_mantle, size_padded*sizeof(realw)),1010);
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_kappahstore_crust_mantle, size_padded*sizeof(realw)),1010);
@@ -1336,7 +1343,7 @@
     }
   }else{
     // anisotropic 3D mantle
-    
+
     // allocates memory on GPU
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c11store_crust_mantle),
                                        size_padded*sizeof(realw)),4700);
@@ -1445,7 +1452,7 @@
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_zstore_crust_mantle),sizeof(realw)*size_glob),2005);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_zstore_crust_mantle,h_zstore,
                                      sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
-  
+
   // xstore only needed when gravity is on
   if( mp->gravity ){
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_xstore_crust_mantle),sizeof(realw)*size_glob),2005);
@@ -1465,6 +1472,7 @@
 
   // wavefield
   int size = NDIM * mp->NGLOB_CRUST_MANTLE;
+
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ_crust_mantle),sizeof(realw)*size),4001);
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc_crust_mantle),sizeof(realw)*size),4002);
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel_crust_mantle),sizeof(realw)*size),4003);
@@ -1481,6 +1489,20 @@
                                      sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
 
 
+  // kernels
+  if( mp->simulation_type == 3 ){
+    size = NGLL3*(mp->NSPEC_CRUST_MANTLE);
+
+    // preconditioner
+    if( mp->approximate_hess_kl ){
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_kl_crust_mantle),
+                                         size*sizeof(realw)),3030);
+      // initializes with zeros
+      print_CUDA_error_if_any(cudaMemset(mp->d_hess_kl_crust_mantle,0,
+                                         size*sizeof(realw)),3031);
+    }
+  }
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("prepare_crust_mantle_device");
 #endif
@@ -1794,7 +1816,7 @@
 //  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_iboolleft_xi_inner_core,npoin2D_xi*sizeof(int)),280);
 //  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_iboolright_eta_inner_core,npoin2D_eta*sizeof(int)),290);
 //  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_iboolleft_eta_inner_core,npoin2D_eta*sizeof(int)),300);
-//  
+//
 //  print_CUDA_error_if_any(cudaMemcpy(mp->d_iboolright_xi_inner_core,iboolright_xi,
 //                                     npoin2D_xi*sizeof(int),cudaMemcpyHostToDevice),91);
 //  print_CUDA_error_if_any(cudaMemcpy(mp->d_iboolleft_xi_inner_core,iboolleft_xi,
@@ -1803,16 +1825,16 @@
 //                                     npoin2D_eta*sizeof(int),cudaMemcpyHostToDevice),93);
 //  print_CUDA_error_if_any(cudaMemcpy(mp->d_iboolleft_eta_inner_core,iboolleft_eta,
 //                                     npoin2D_eta*sizeof(int),cudaMemcpyHostToDevice),94);
-  
 
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("prepare_inner_core_device");
 #endif
-}                                  
- 
+}
 
 
 
+
 /* ----------------------------------------------------------------------------------------------- */
 
 // for ACOUSTIC simulations
@@ -2607,10 +2629,10 @@
 
     // alloc storage for the surface buffer to be copied
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_noise_surface_movie,
-                                       NDIM*NGLL2*(mp->nspec_top)*sizeof(realw)),7005);    
+                                       NDIM*NGLL2*(mp->nspec_top)*sizeof(realw)),7005);
   }
-  
 
+
   // prepares noise source array
   if( mp->noise_tomography == 1 ){
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_noise_sourcearray,
@@ -2637,12 +2659,12 @@
                                        nface_size*sizeof(realw)),7303);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_z_noise, normal_z_noise,
                                        nface_size*sizeof(realw),cudaMemcpyHostToDevice),7308);
-                                       
+
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_mask_noise,
                                        nface_size*sizeof(realw)),7304);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_mask_noise, mask_noise,
                                        nface_size*sizeof(realw),cudaMemcpyHostToDevice),7309);
-                                       
+
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_jacobian2D_top_crust_mantle,
                                        nface_size*sizeof(realw)),7305);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_top_crust_mantle, jacobian2D_top_crust_mantle,
@@ -2700,7 +2722,7 @@
     cudaFree(mp->d_kappavstore_crust_mantle);
     cudaFree(mp->d_kappahstore_crust_mantle);
     cudaFree(mp->d_muhstore_crust_mantle);
-    cudaFree(mp->d_eta_anisostore_crust_mantle);    
+    cudaFree(mp->d_eta_anisostore_crust_mantle);
     cudaFree(mp->d_ispec_is_tiso_crust_mantle);
   }else{
     cudaFree(mp->d_c11store_crust_mantle);
@@ -2830,7 +2852,11 @@
   cudaFree(mp->d_ispec_selected_source);
 
   // receivers
-  if( mp->nrec_local > 0 ) cudaFree(mp->d_number_receiver_global);
+  if( mp->nrec_local > 0 ) {
+    cudaFree(mp->d_number_receiver_global);
+    cudaFree(mp->d_station_seismo_field);
+    free(mp->h_station_seismo_field);
+  }
   cudaFree(mp->d_ispec_selected_rec);
 
   // rotation arrays

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2012-02-22 04:38:32 UTC (rev 19659)
@@ -110,17 +110,22 @@
 void FC_FUNC_(get_max_accel,
               GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {} 
 
-void FC_FUNC_(get_norm_acoustic_from_device,
-              GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
+void FC_FUNC_(check_norm_acoustic_from_device,
+              CHECK_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
                                                   long* Mesh_pointer_f,
                                                   int* SIMULATION_TYPE) {} 
 
-void FC_FUNC_(get_norm_elastic_from_device,
-              GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
-                                                 long* Mesh_pointer_f,
-                                                 int* SIMULATION_TYPE) {} 
+void FC_FUNC_(check_norm_elastic_from_device,
+              CHECK_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
+                                              long* Mesh_pointer_f,
+                                              int* SIMULATION_TYPE) {} 
 
+void FC_FUNC_(check_norm_strain_from_device,
+              CHECK_NORM_STRAIN_FROM_DEVICE)(realw* norm_strain,
+                                             realw* norm_strain2,
+                                             long* Mesh_pointer_f) {} 
 
+
 //
 // src/cuda/compute_add_sources_elastic_cuda.cu
 //
@@ -216,9 +221,7 @@
 
 void FC_FUNC_(compute_kernels_hess_cuda,
               COMPUTE_KERNELS_HESS_CUDA)(long* Mesh_pointer,
-                                         realw* deltat_f,
-                                         int* ELASTIC_SIMULATION,
-                                         int* ACOUSTIC_SIMULATION) {} 
+                                         realw* deltat_f) {} 
 
 
 //
@@ -245,24 +248,29 @@
 // src/cuda/it_update_displacement_cuda.cu
 //
 
-void FC_FUNC_(it_update_displacement_cuda,
-              IT_UPDATE_DISPLACMENT_CUDA)(long* Mesh_pointer_f,
-                                                 int* size_F,
-                                                 realw* deltat_F,
-                                                 realw* deltatsqover2_F,
-                                                 realw* deltatover2_F,
-                                                 int* SIMULATION_TYPE,
-                                                 realw* b_deltat_F,
-                                                 realw* b_deltatsqover2_F,
-                                                 realw* b_deltatover2_F) {} 
+void FC_FUNC_(it_update_displacement_ic_cuda,
+              IT_UPDATE_DISPLACMENT_IC_CUDA)(long* Mesh_pointer_f,
+                                             realw* deltat_F,
+                                             realw* deltatsqover2_F,
+                                             realw* deltatover2_F,
+                                             realw* b_deltat_F,
+                                             realw* b_deltatsqover2_F,
+                                             realw* b_deltatover2_F) {} 
 
-void FC_FUNC_(it_update_displacement_ac_cuda,
-              it_update_displacement_ac_cuda)(long* Mesh_pointer_f,
-                                               int* size_F,
+void FC_FUNC_(it_update_displacement_cm_cuda,
+              IT_UPDATE_DISPLACMENT_CM_CUDA)(long* Mesh_pointer_f,
+                                             realw* deltat_F,
+                                             realw* deltatsqover2_F,
+                                             realw* deltatover2_F,
+                                             realw* b_deltat_F,
+                                             realw* b_deltatsqover2_F,
+                                             realw* b_deltatover2_F) {} 
+
+void FC_FUNC_(it_update_displacement_oc_cuda,
+              IT_UPDATE_DISPLACEMENT_OC_cuda)(long* Mesh_pointer_f,
                                                realw* deltat_F,
                                                realw* deltatsqover2_F,
                                                realw* deltatover2_F,
-                                               int* SIMULATION_TYPE,
                                                realw* b_deltat_F,
                                                realw* b_deltatsqover2_F,
                                                realw* b_deltatover2_F) {} 
@@ -355,7 +363,8 @@
                                         int* ANISOTROPIC_3D_MANTLE_f,
                                         int* ANISOTROPIC_INNER_CORE_f,
                                         int* SAVE_BOUNDARY_MESH_f,
-                                        int* USE_MESH_COLORING_GPU_f) {} 
+                                        int* USE_MESH_COLORING_GPU_f,
+                                        int* APPROXIMATE_HESS_KL_f) {} 
 
 void FC_FUNC_(prepare_fields_rotation_device,
               PREPARE_FIELDS_ROTATION_DEVICE)(long* Mesh_pointer_f,
@@ -713,12 +722,82 @@
 void FC_FUNC_(transfer_accel_cm_to_device,
               TRNASFER_ACCEL_CM_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {} 
 
+void FC_FUNC_(transfer_displ_cm_from_device,
+              TRANSFER_DISPL_CM_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {} 
+
+void FC_FUNC_(transfer_b_displ_cm_from_device,
+              TRANSFER_B_DISPL_CM_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {} 
+
+void FC_FUNC_(transfer_displ_ic_from_device,
+              TRANSFER_DISPL_IC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {} 
+
+void FC_FUNC_(transfer_b_displ_ic_from_device,
+              TRANSFER_B_DISPL_IC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {} 
+
+void FC_FUNC_(transfer_veloc_cm_from_device,
+              TRANSFER_DISPL_CM_FROM_DEVICE)(int* size, realw* veloc, long* Mesh_pointer_f) {} 
+
 void FC_FUNC_(transfer_accel_cm_from_device,
               TRANSFER_ACCEL_CM_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(transfer_b_accel_cm_from_device,
               TRNASFER_B_ACCEL_CM_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {} 
 
+void FC_FUNC_(transfer_accel_ic_from_device,
+              TRANSFER_ACCEL_IC_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {} 
+
+void FC_FUNC_(transfer_strain_cm_from_device,
+              TRANSFER_STRAIN_CM_FROM_DEVICE)(long* Mesh_pointer,
+                                                  realw* eps_trace_over_3,
+                                                  realw* epsilondev_xx,
+                                                  realw* epsilondev_yy,
+                                                  realw* epsilondev_xy,
+                                                  realw* epsilondev_xz,
+                                                  realw* epsilondev_yz) {} 
+
+void FC_FUNC_(transfer_b_strain_cm_to_device,
+              TRANSFER_B_STRAIN_CM_TO_DEVICE)(long* Mesh_pointer,
+                                              realw* epsilondev_xx,
+                                              realw* epsilondev_yy,
+                                              realw* epsilondev_xy,
+                                              realw* epsilondev_xz,
+                                              realw* epsilondev_yz) {} 
+
+void FC_FUNC_(transfer_strain_ic_from_device,
+              TRANSFER_STRAIN_IC_FROM_DEVICE)(long* Mesh_pointer,
+                                              realw* eps_trace_over_3,
+                                              realw* epsilondev_xx,
+                                              realw* epsilondev_yy,
+                                              realw* epsilondev_xy,
+                                              realw* epsilondev_xz,
+                                              realw* epsilondev_yz) {} 
+
+void FC_FUNC_(transfer_b_strain_ic_to_device,
+              TRANSFER_B_STRAIN_IC_TO_DEVICE)(long* Mesh_pointer,
+                                              realw* epsilondev_xx,
+                                              realw* epsilondev_yy,
+                                              realw* epsilondev_xy,
+                                              realw* epsilondev_xz,
+                                              realw* epsilondev_yz) {} 
+
+void FC_FUNC_(transfer_rotation_from_device,
+              TRANSFER_ROTATION_FROM_DEVICE)(long* Mesh_pointer,
+                                             realw* A_array_rotation,
+                                             realw* B_array_rotation) {} 
+
+void FC_FUNC_(transfer_b_rotation_to_device,
+              TRANSFER_B_ROTATION_TO_DEVICE)(long* Mesh_pointer,
+                                              realw* A_array_rotation,
+                                              realw* B_array_rotation) {} 
+
+void FC_FUNC_(transfer_b_att_cm_to_device,
+              TRANSFER_B_ATT_CM_TO_DEVICE)(long* Mesh_pointer,
+                                           realw* R_xx,
+                                           realw* R_yy,
+                                           realw* R_xy,
+                                           realw* R_xz,
+                                           realw* R_yz) {} 
+
 void FC_FUNC_(transfer_sigma_from_device,
               TRANSFER_SIGMA_FROM_DEVICE)(int* size, realw* sigma_kl,long* Mesh_pointer_f) {} 
 
@@ -834,10 +913,10 @@
                                                              realw* h_kappa_ac_kl,
                                                              int* NSPEC_AB) {} 
 
-void FC_FUNC_(transfer_kernels_hess_el_tohost,
-              TRANSFER_KERNELS_HESS_EL_TOHOST)(long* Mesh_pointer,
+void FC_FUNC_(transfer_kernels_hess_cm_tohost,
+              TRANSFER_KERNELS_HESS_CM_TOHOST)(long* Mesh_pointer,
                                               realw* h_hess_kl,
-                                              int* NSPEC_AB) {} 
+                                              int* NSPEC) {} 
 
 void FC_FUNC_(transfer_kernels_hess_ac_tohost,
               TRANSFER_KERNELS_HESS_AC_TOHOST)(long* Mesh_pointer,
@@ -849,12 +928,20 @@
 // src/cuda/write_seismograms_cuda.cu
 //
 
-void FC_FUNC_(transfer_station_el_from_device,
-              TRANSFER_STATION_EL_FROM_DEVICE)(realw* displ,realw* veloc,realw* accel,
-                                                   realw* b_displ, realw* b_veloc, realw* b_accel,
-                                                   long* Mesh_pointer_f,int* number_receiver_global,
-                                                   int* ispec_selected_rec,int* ispec_selected_source,
-                                                   int* ibool,int* SIMULATION_TYPEf) {} 
+void FC_FUNC_(write_seismograms_transfer_cuda,
+              WRITE_SEISMOGRAMS_TRANSFER_CUDA)(realw* displ,
+                                               realw* b_displ,
+                                               realw* eps_trace_over_3,
+                                               realw* epsilondev_xx,
+                                               realw* epsilondev_yy,
+                                               realw* epsilondev_xy,
+                                               realw* epsilondev_xz,
+                                               realw* epsilondev_yz,
+                                               long* Mesh_pointer_f,
+                                               int* number_receiver_global,
+                                               int* ispec_selected_rec,
+                                               int* ispec_selected_source,
+                                               int* ibool) {} 
 
 void FC_FUNC_(transfer_station_ac_from_device,
               TRANSFER_STATION_AC_FROM_DEVICE)(

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/transfer_fields_cuda.cu	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/transfer_fields_cuda.cu	2012-02-22 04:38:32 UTC (rev 19659)
@@ -168,6 +168,8 @@
 
 }
 
+/* ----------------------------------------------------------------------------------------------- */
+
 // inner_core
 extern "C"
 void FC_FUNC_(transfer_fields_ic_from_device,
@@ -183,6 +185,8 @@
 
 }
 
+/* ----------------------------------------------------------------------------------------------- */
+
 // outer_core
 extern "C"
 void FC_FUNC_(transfer_fields_oc_from_device,
@@ -218,36 +222,40 @@
 
 }
 
+/* ----------------------------------------------------------------------------------------------- */
+
 // inner_core
 extern "C"
 void FC_FUNC_(transfer_b_fields_ic_from_device,
               TRANSFER_B_FIELDS_IC_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
                                                 long* Mesh_pointer_f) {
-  
+
   TRACE("transfer_fields_b_ic_from_device_");
-  
+
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-  
+
   print_CUDA_error_if_any(cudaMemcpy(b_displ,mp->d_b_displ_inner_core,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40006);
   print_CUDA_error_if_any(cudaMemcpy(b_veloc,mp->d_b_veloc_inner_core,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40007);
   print_CUDA_error_if_any(cudaMemcpy(b_accel,mp->d_b_accel_inner_core,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40008);
-  
+
 }
 
+/* ----------------------------------------------------------------------------------------------- */
+
 // outer_core
 extern "C"
 void FC_FUNC_(transfer_b_fields_oc_from_device,
               TRANSFER_B_FIELDS_OC_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
                                                 long* Mesh_pointer_f) {
-  
+
   TRACE("transfer_b_fields_oc_from_device_");
-  
+
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-  
+
   print_CUDA_error_if_any(cudaMemcpy(b_displ,mp->d_b_displ_outer_core,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40006);
   print_CUDA_error_if_any(cudaMemcpy(b_veloc,mp->d_b_veloc_outer_core,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40007);
   print_CUDA_error_if_any(cudaMemcpy(b_accel,mp->d_b_accel_outer_core,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40008);
-  
+
 }
 
 
@@ -265,13 +273,79 @@
 
 }
 
+
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
+void FC_FUNC_(transfer_displ_cm_from_device,
+              TRANSFER_DISPL_CM_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {
+
+  TRACE("transfer_displ_cm_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ_crust_mantle,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40006);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_b_displ_cm_from_device,
+              TRANSFER_B_DISPL_CM_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {
+
+  TRACE("transfer_b_displ_cm_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_b_displ_crust_mantle,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40006);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_displ_ic_from_device,
+              TRANSFER_DISPL_IC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {
+
+  TRACE("transfer_displ_ic_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ_inner_core,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40006);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_b_displ_ic_from_device,
+              TRANSFER_B_DISPL_IC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {
+
+  TRACE("transfer_b_displ_ic_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_b_displ_inner_core,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40006);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_veloc_cm_from_device,
+              TRANSFER_DISPL_CM_FROM_DEVICE)(int* size, realw* veloc, long* Mesh_pointer_f) {
+
+  TRACE("transfer_veloc_cm_from_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  print_CUDA_error_if_any(cudaMemcpy(veloc,mp->d_veloc_crust_mantle,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40007);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
 void FC_FUNC_(transfer_accel_cm_from_device,
               TRANSFER_ACCEL_CM_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {
 
-TRACE("transfer_accel_cm_from_device");
+  TRACE("transfer_accel_cm_from_device");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
@@ -293,20 +367,248 @@
 
 }
 
+/* ----------------------------------------------------------------------------------------------- */
 
+extern "C"
+void FC_FUNC_(transfer_accel_ic_from_device,
+              TRANSFER_ACCEL_IC_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {
 
+  TRACE("transfer_accel_ic_from_device");
 
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
+  print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel_inner_core,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40026);
 
+}
 
+/* ----------------------------------------------------------------------------------------------- */
 
+// strain fields
 
+/* ----------------------------------------------------------------------------------------------- */
 
+// crust/mantle
+extern "C"
+void FC_FUNC_(transfer_strain_cm_from_device,
+              TRANSFER_STRAIN_CM_FROM_DEVICE)(long* Mesh_pointer,
+                                                  realw* eps_trace_over_3,
+                                                  realw* epsilondev_xx,
+                                                  realw* epsilondev_yy,
+                                                  realw* epsilondev_xy,
+                                                  realw* epsilondev_xz,
+                                                  realw* epsilondev_yz) {
+  TRACE("transfer_strain_cm_from_device");
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
 
+  int size = NGLL3*mp->NSPEC_CRUST_MANTLE;
 
+  cudaMemcpy(eps_trace_over_3,mp->d_eps_trace_over_3_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost);
 
+  cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost);
 
 
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after transfer_strain_cm_from_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// crust/mantle
+
+extern "C"
+void FC_FUNC_(transfer_b_strain_cm_to_device,
+              TRANSFER_B_STRAIN_CM_TO_DEVICE)(long* Mesh_pointer,
+                                              realw* epsilondev_xx,
+                                              realw* epsilondev_yy,
+                                              realw* epsilondev_xy,
+                                              realw* epsilondev_xz,
+                                              realw* epsilondev_yz) {
+  TRACE("transfer_b_strain_cm_to_device");
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
+
+  int size = NGLL3*mp->NSPEC_CRUST_MANTLE;
+
+  cudaMemcpy(mp->d_b_epsilondev_xx_crust_mantle,epsilondev_xx,size*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_epsilondev_yy_crust_mantle,epsilondev_yy,size*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_epsilondev_xy_crust_mantle,epsilondev_xy,size*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_epsilondev_xz_crust_mantle,epsilondev_xz,size*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_epsilondev_yz_crust_mantle,epsilondev_yz,size*sizeof(realw),cudaMemcpyHostToDevice);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after transfer_b_strain_cm_to_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// inner core
+
+extern "C"
+void FC_FUNC_(transfer_strain_ic_from_device,
+              TRANSFER_STRAIN_IC_FROM_DEVICE)(long* Mesh_pointer,
+                                              realw* eps_trace_over_3,
+                                              realw* epsilondev_xx,
+                                              realw* epsilondev_yy,
+                                              realw* epsilondev_xy,
+                                              realw* epsilondev_xz,
+                                              realw* epsilondev_yz) {
+  TRACE("transfer_strain_ic_from_device");
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
+
+  int size = NGLL3*mp->NSPEC_INNER_CORE;
+
+  cudaMemcpy(eps_trace_over_3,mp->d_eps_trace_over_3_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost);
+
+  cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost);
+
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after transfer_strain_ic_from_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// inner_core
+
+extern "C"
+void FC_FUNC_(transfer_b_strain_ic_to_device,
+              TRANSFER_B_STRAIN_IC_TO_DEVICE)(long* Mesh_pointer,
+                                              realw* epsilondev_xx,
+                                              realw* epsilondev_yy,
+                                              realw* epsilondev_xy,
+                                              realw* epsilondev_xz,
+                                              realw* epsilondev_yz) {
+  TRACE("transfer_b_strain_cm_to_device");
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
+
+  int size = NGLL3*mp->NSPEC_INNER_CORE;
+
+  cudaMemcpy(mp->d_b_epsilondev_xx_inner_core,epsilondev_xx,size*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_epsilondev_yy_inner_core,epsilondev_yy,size*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_epsilondev_xy_inner_core,epsilondev_xy,size*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_epsilondev_xz_inner_core,epsilondev_xz,size*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_epsilondev_yz_inner_core,epsilondev_yz,size*sizeof(realw),cudaMemcpyHostToDevice);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after transfer_b_strain_ic_to_device");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// rotation arrays
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// for outer core
+
+extern "C"
+void FC_FUNC_(transfer_rotation_from_device,
+              TRANSFER_ROTATION_FROM_DEVICE)(long* Mesh_pointer,
+                                             realw* A_array_rotation,
+                                             realw* B_array_rotation) {
+  TRACE("transfer_rotation_from_device");
+
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
+
+  int size = NGLL3*mp->NSPEC_OUTER_CORE;
+
+  cudaMemcpy(A_array_rotation,mp->d_A_array_rotation,size*sizeof(realw),cudaMemcpyDeviceToHost);
+  cudaMemcpy(B_array_rotation,mp->d_B_array_rotation,size*sizeof(realw),cudaMemcpyDeviceToHost);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after transfer_rotation_from_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// for outer core
+
+extern "C"
+void FC_FUNC_(transfer_b_rotation_to_device,
+              TRANSFER_B_ROTATION_TO_DEVICE)(long* Mesh_pointer,
+                                              realw* A_array_rotation,
+                                              realw* B_array_rotation) {
+  TRACE("transfer_b_rotation_to_device");
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
+
+  int size = NGLL3*mp->NSPEC_OUTER_CORE;
+
+  cudaMemcpy(mp->d_b_A_array_rotation,A_array_rotation,size*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_B_array_rotation,B_array_rotation,size*sizeof(realw),cudaMemcpyHostToDevice);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after transfer_b_rotation_to_device");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// attenuation fields
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/*
+// feature not used so far ...
+// crust_mantle
+extern "C"
+void FC_FUNC_(transfer_b_att_cm_to_device,
+              TRANSFER_B_ATT_CM_TO_DEVICE)(long* Mesh_pointer,
+                                           realw* R_xx,
+                                           realw* R_yy,
+                                           realw* R_xy,
+                                           realw* R_xz,
+                                           realw* R_yz) {
+  TRACE("transfer_b_att_cm_to_device");
+
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer);
+
+  if( ! mp->use_attenuation_mimic){ exit_on_cuda_error("not supported attenuation feature yet");}
+
+  // not used so far...
+  // see notes about USE_ATTENUATION_MIMIC
+  int size = N_SLS*NGLL3*mp->NSPEC_CRUST_MANTLE;
+
+  cudaMemcpy(mp->d_b_R_xx_crust_mantle,R_xx,size*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_R_yy_crust_mantle,R_yy,size*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_R_xy_crust_mantle,R_xy,size*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_R_xz_crust_mantle,R_xz,size*sizeof(realw),cudaMemcpyHostToDevice);
+  cudaMemcpy(mp->d_b_R_yz_crust_mantle,R_yz,size*sizeof(realw),cudaMemcpyHostToDevice);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after transfer_b_att_cm_to_device");
+#endif
+}
+*/
+
+
+
+
+
+
+
+
+
 //daniel: TODO old code routines...
 
 
@@ -326,7 +628,7 @@
 }
 
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 extern "C"
 void FC_FUNC_(transfer_b_displ_from_device,
               TRANSFER_B_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {
@@ -338,9 +640,9 @@
   print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40056);
 
 }
-
+*/
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 extern "C"
 void FC_FUNC_(transfer_displ_from_device,
               TRANSFER_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {
@@ -352,7 +654,7 @@
   print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40066);
 
 }
-
+*/
 /* ----------------------------------------------------------------------------------------------- */
 /*
 extern "C"
@@ -426,7 +728,7 @@
 */
 
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 // attenuation fields
 
 extern "C"
@@ -461,9 +763,9 @@
   exit_on_cuda_error("after transfer_b_fields_att_to_device");
 #endif
 }
-
+*/
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 // attenuation fields
 
 extern "C"
@@ -498,8 +800,8 @@
   exit_on_cuda_error("after transfer_fields_att_from_device");
 #endif
 }
+*/
 
-
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
@@ -548,7 +850,7 @@
 // for ACOUSTIC simulations
 
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 extern "C"
 void FC_FUNC_(transfer_fields_ac_to_device,
               TRANSFER_FIELDS_AC_TO_DEVICE)(
@@ -572,9 +874,9 @@
   exit_on_cuda_error("after transfer_fields_ac_to_device");
 #endif
 }
-
+*/
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 extern "C"
 void FC_FUNC_(transfer_b_fields_ac_to_device,
               TRANSFER_B_FIELDS_AC_TO_DEVICE)(
@@ -598,10 +900,10 @@
   exit_on_cuda_error("after transfer_b_fields_ac_to_device");
 #endif
 }
+*/
 
-
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 extern "C"
 void FC_FUNC_(transfer_fields_ac_from_device,TRANSFER_FIELDS_AC_FROM_DEVICE)(
                                                                                          int* size,
@@ -624,9 +926,9 @@
   exit_on_cuda_error("after transfer_fields_ac_from_device");
 #endif
 }
-
+*/
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 extern "C"
 void FC_FUNC_(transfer_b_fields_ac_from_device,
               TRANSFER_B_FIELDS_AC_FROM_DEVICE)(
@@ -650,9 +952,9 @@
   exit_on_cuda_error("after transfer_b_fields_ac_from_device");
 #endif
 }
-
+*/
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 extern "C"
 void FC_FUNC_(transfer_dot_dot_from_device,
               TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, realw* potential_dot_dot_acoustic,long* Mesh_pointer_f) {
@@ -665,9 +967,9 @@
                                      sizeof(realw)*(*size),cudaMemcpyDeviceToHost),50041);
 
 }
-
+*/
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 extern "C"
 void FC_FUNC_(transfer_b_dot_dot_from_device,
               TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, realw* b_potential_dot_dot_acoustic,long* Mesh_pointer_f) {
@@ -680,8 +982,8 @@
                                      sizeof(realw)*(*size),cudaMemcpyDeviceToHost),50042);
 
 }
+*/
 
-
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
@@ -711,20 +1013,20 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
-void FC_FUNC_(transfer_kernels_hess_el_tohost,
-              TRANSFER_KERNELS_HESS_EL_TOHOST)(long* Mesh_pointer,
+void FC_FUNC_(transfer_kernels_hess_cm_tohost,
+              TRANSFER_KERNELS_HESS_CM_TOHOST)(long* Mesh_pointer,
                                               realw* h_hess_kl,
-                                              int* NSPEC_AB) {
-TRACE("transfer_kernels_hess_el_tohost");
+                                              int* NSPEC) {
+TRACE("transfer_kernels_hess_cm_tohost");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
-  print_CUDA_error_if_any(cudaMemcpy(h_hess_kl,mp->d_hess_el_kl,NGLL3*(*NSPEC_AB)*sizeof(realw),
+  print_CUDA_error_if_any(cudaMemcpy(h_hess_kl,mp->d_hess_kl_crust_mantle,NGLL3*(*NSPEC)*sizeof(realw),
                                      cudaMemcpyDeviceToHost),70201);
 }
 
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 extern "C"
 void FC_FUNC_(transfer_kernels_hess_ac_tohost,
               TRANSFER_KERNELS_HESS_AC_TOHOST)(long* Mesh_pointer,
@@ -737,5 +1039,5 @@
   print_CUDA_error_if_any(cudaMemcpy(h_hess_ac_kl,mp->d_hess_ac_kl,NGLL3*(*NSPEC_AB)*sizeof(realw),
                                      cudaMemcpyDeviceToHost),70202);
 }
+*/
 
-

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/write_seismograms_cuda.cu	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/write_seismograms_cuda.cu	2012-02-22 04:38:32 UTC (rev 19659)
@@ -43,14 +43,18 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-__global__ void transfer_stations_fields_from_device_kernel(int* number_receiver_global,
+__global__ void write_seismograms_transfer_from_device_kernel(int* number_receiver_global,
                                                             int* ispec_selected_rec,
                                                             int* ibool,
                                                             realw* station_seismo_field,
                                                             realw* desired_field,
                                                             int nrec_local) {
+
+// vector fields
+
   int blockID = blockIdx.x + blockIdx.y*gridDim.x;
-  if(blockID<nrec_local) {
+
+  if(blockID < nrec_local) {
     int irec = number_receiver_global[blockID]-1;
     int ispec = ispec_selected_rec[irec]-1;
     int iglob = ibool[threadIdx.x + NGLL3*ispec]-1;
@@ -61,16 +65,37 @@
   }
 }
 
+/* ----------------------------------------------------------------------------------------------- */
 
+__global__ void write_seismograms_transfer_scalar_from_device_kernel(int* number_receiver_global,
+                                                                     int* ispec_selected_rec,
+                                                                     int* ibool,
+                                                                     realw* station_seismo_field,
+                                                                     realw* desired_field,
+                                                                     int nrec_local) {
+
+// scalar fields
+
+  int blockID = blockIdx.x + blockIdx.y*gridDim.x;
+
+  if(blockID < nrec_local) {
+    int irec = number_receiver_global[blockID]-1;
+    int ispec = ispec_selected_rec[irec]-1;
+    int iglob = ibool[threadIdx.x + NGLL3*ispec]-1;
+
+    station_seismo_field[NGLL3*blockID + threadIdx.x] = desired_field[iglob];
+  }
+}
+
 /* ----------------------------------------------------------------------------------------------- */
 
-void transfer_field_from_device(Mesh* mp, realw* d_field,realw* h_field,
+void write_seismograms_transfer_from_device(Mesh* mp, realw* d_field,realw* h_field,
                                           int* number_receiver_global,
                                           int* d_ispec_selected,
                                           int* h_ispec_selected,
                                           int* ibool) {
 
-TRACE("transfer_field_from_device");
+TRACE("write_seismograms_transfer_from_device");
 
   // checks if anything to do
   if( mp->nrec_local == 0 ) return;
@@ -82,80 +107,170 @@
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
-
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
   // prepare field transfer array on device
-  transfer_stations_fields_from_device_kernel<<<grid,threads>>>(mp->d_number_receiver_global,
-                                                                d_ispec_selected,
-                                                                mp->d_ibool,
-                                                                mp->d_station_seismo_field,
-                                                                d_field,
-                                                                mp->nrec_local);
+  write_seismograms_transfer_from_device_kernel<<<grid,threads>>>(mp->d_number_receiver_global,
+                                                                  d_ispec_selected,
+                                                                  mp->d_ibool_crust_mantle,
+                                                                  mp->d_station_seismo_field,
+                                                                  d_field,
+                                                                  mp->nrec_local);
 
+  // copies array to CPU
   cudaMemcpy(mp->h_station_seismo_field,mp->d_station_seismo_field,
-       (3*NGLL3)*(mp->nrec_local)*sizeof(realw),cudaMemcpyDeviceToHost);
+             3*NGLL3*(mp->nrec_local)*sizeof(realw),cudaMemcpyDeviceToHost);
 
   int irec_local;
-  for(irec_local=0;irec_local<mp->nrec_local;irec_local++) {
+  for(irec_local = 0 ; irec_local < mp->nrec_local; irec_local++) {
     int irec = number_receiver_global[irec_local] - 1;
     int ispec = h_ispec_selected[irec] - 1;
 
-    for(int i=0;i<NGLL3;i++) {
+    for(int i = 0; i < NGLL3; i++) {
       int iglob = ibool[i+NGLL3*ispec] - 1;
       h_field[0+3*iglob] = mp->h_station_seismo_field[0+3*i+irec_local*NGLL3*3];
       h_field[1+3*iglob] = mp->h_station_seismo_field[1+3*i+irec_local*NGLL3*3];
       h_field[2+3*iglob] = mp->h_station_seismo_field[2+3*i+irec_local*NGLL3*3];
     }
+  }
 
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("write_seismograms_transfer_from_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void write_seismograms_transfer_scalar_from_device(Mesh* mp,
+                                                   realw* d_field,realw* h_field,
+                                                   int* number_receiver_global,
+                                                   int* d_ispec_selected,
+                                                   int* h_ispec_selected,
+                                                   int* ibool) {
+
+  TRACE("write_seismograms_transfer_scalar_from_device");
+
+  // checks if anything to do
+  if( mp->nrec_local == 0 ) return;
+
+  int blocksize = NGLL3;
+  int num_blocks_x = mp->nrec_local;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
   }
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  // prepare field transfer array on device
+  write_seismograms_transfer_scalar_from_device_kernel<<<grid,threads>>>(mp->d_number_receiver_global,
+                                                                         d_ispec_selected,
+                                                                         mp->d_ibool_crust_mantle,
+                                                                         mp->d_station_seismo_field,
+                                                                         d_field,
+                                                                         mp->nrec_local);
+
+  // copies array to CPU
+  cudaMemcpy(mp->h_station_seismo_field,mp->d_station_seismo_field,
+             1*NGLL3*(mp->nrec_local)*sizeof(realw),cudaMemcpyDeviceToHost);
+
+  int irec_local;
+  for(irec_local = 0 ; irec_local < mp->nrec_local; irec_local++) {
+    int irec = number_receiver_global[irec_local] - 1;
+    int ispec = h_ispec_selected[irec] - 1;
+
+    for(int i = 0; i < NGLL3; i++) {
+      int iglob = ibool[i+NGLL3*ispec] - 1;
+      h_field[iglob] = mp->h_station_seismo_field[i+irec_local*NGLL3];
+    }
+  }
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("transfer_field_from_device");
+  exit_on_cuda_error("write_seismograms_transfer_scalar_from_device");
 #endif
 }
 
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
-void FC_FUNC_(transfer_station_el_from_device,
-              TRANSFER_STATION_EL_FROM_DEVICE)(realw* displ,realw* veloc,realw* accel,
-                                                   realw* b_displ, realw* b_veloc, realw* b_accel,
-                                                   long* Mesh_pointer_f,int* number_receiver_global,
-                                                   int* ispec_selected_rec,int* ispec_selected_source,
-                                                   int* ibool,int* SIMULATION_TYPEf) {
-TRACE("transfer_station_el_from_device");
+void FC_FUNC_(write_seismograms_transfer_cuda,
+              WRITE_SEISMOGRAMS_TRANSFER_CUDA)(realw* displ,
+                                               realw* b_displ,
+                                               realw* eps_trace_over_3,
+                                               realw* epsilondev_xx,
+                                               realw* epsilondev_yy,
+                                               realw* epsilondev_xy,
+                                               realw* epsilondev_xz,
+                                               realw* epsilondev_yz,
+                                               long* Mesh_pointer_f,
+                                               int* number_receiver_global,
+                                               int* ispec_selected_rec,
+                                               int* ispec_selected_source,
+                                               int* ibool) {
+TRACE("write_seismograms_transfer_cuda");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
   // checks if anything to do
   if( mp->nrec_local == 0 ) return;
 
-  int SIMULATION_TYPE = *SIMULATION_TYPEf;
+  switch( mp->simulation_type ){
+    case 1:
+      write_seismograms_transfer_from_device(mp,mp->d_displ_crust_mantle,
+                                           displ,
+                                           number_receiver_global,
+                                           mp->d_ispec_selected_rec,
+                                           ispec_selected_rec, ibool);
+      break;
 
-  if(SIMULATION_TYPE == 1) {
-    transfer_field_from_device(mp,mp->d_displ,displ, number_receiver_global,
-             mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
-    transfer_field_from_device(mp,mp->d_veloc,veloc, number_receiver_global,
-             mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
-    transfer_field_from_device(mp,mp->d_accel,accel, number_receiver_global,
-             mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
+    case 2:
+      write_seismograms_transfer_from_device(mp,mp->d_displ_crust_mantle,
+                                             displ,
+                                             number_receiver_global,
+                                             mp->d_ispec_selected_source,
+                                             ispec_selected_source, ibool);
+
+      write_seismograms_transfer_scalar_from_device(mp,mp->d_eps_trace_over_3_crust_mantle,
+                                                    eps_trace_over_3,
+                                                    number_receiver_global,
+                                                    mp->d_ispec_selected_source,
+                                                    ispec_selected_source, ibool);
+      write_seismograms_transfer_scalar_from_device(mp,mp->d_epsilondev_xx_crust_mantle,
+                                                    epsilondev_xx,
+                                                    number_receiver_global,
+                                                    mp->d_ispec_selected_source,
+                                                    ispec_selected_source, ibool);
+      write_seismograms_transfer_scalar_from_device(mp,mp->d_epsilondev_yy_crust_mantle,
+                                                    epsilondev_yy,
+                                                    number_receiver_global,
+                                                    mp->d_ispec_selected_source,
+                                                    ispec_selected_source, ibool);
+      write_seismograms_transfer_scalar_from_device(mp,mp->d_epsilondev_xy_crust_mantle,
+                                                    epsilondev_xy,
+                                                    number_receiver_global,
+                                                    mp->d_ispec_selected_source,
+                                                    ispec_selected_source, ibool);
+      write_seismograms_transfer_scalar_from_device(mp,mp->d_epsilondev_xz_crust_mantle,
+                                                    epsilondev_xz,
+                                                    number_receiver_global,
+                                                    mp->d_ispec_selected_source,
+                                                    ispec_selected_source, ibool);
+      write_seismograms_transfer_scalar_from_device(mp,mp->d_epsilondev_yz_crust_mantle,
+                                                    epsilondev_yz,
+                                                    number_receiver_global,
+                                                    mp->d_ispec_selected_source,
+                                                    ispec_selected_source, ibool);
+      break;
+
+    case 3:
+      write_seismograms_transfer_from_device(mp,mp->d_b_displ_crust_mantle,b_displ,
+                                                 number_receiver_global,
+                                                 mp->d_ispec_selected_rec,
+                                                 ispec_selected_rec, ibool);
+      break;
   }
-  else if(SIMULATION_TYPE == 2) {
-    transfer_field_from_device(mp,mp->d_displ,displ, number_receiver_global,
-             mp->d_ispec_selected_source, ispec_selected_source, ibool);
-    transfer_field_from_device(mp,mp->d_veloc,veloc, number_receiver_global,
-             mp->d_ispec_selected_source, ispec_selected_source, ibool);
-    transfer_field_from_device(mp,mp->d_accel,accel, number_receiver_global,
-             mp->d_ispec_selected_source, ispec_selected_source, ibool);
-  }
-  else if(SIMULATION_TYPE == 3) {
-    transfer_field_from_device(mp,mp->d_b_displ,b_displ, number_receiver_global,
-             mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
-    transfer_field_from_device(mp,mp->d_b_veloc,b_veloc, number_receiver_global,
-             mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
-    transfer_field_from_device(mp,mp->d_b_accel,b_accel, number_receiver_global,
-             mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
-  }
 
 }
 
@@ -164,7 +279,7 @@
 // ACOUSTIC simulations
 
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 __global__ void transfer_stations_fields_acoustic_from_device_kernel(int* number_receiver_global,
                                                                      int* ispec_selected_rec,
                                                                      int* ibool,
@@ -182,9 +297,9 @@
 
   station_seismo_potential[nodeID] = desired_potential[iglob];
 }
-
+*/
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 void transfer_field_acoustic_from_device(Mesh* mp,
                                          realw* d_potential,
                                          realw* h_potential,
@@ -245,9 +360,9 @@
   exit_on_cuda_error("transfer_field_acoustic_from_device");
 #endif
 }
-
+*/
 /* ----------------------------------------------------------------------------------------------- */
-
+/*
 extern "C"
 void FC_FUNC_(transfer_station_ac_from_device,
               TRANSFER_STATION_AC_FROM_DEVICE)(
@@ -313,4 +428,4 @@
   exit_on_cuda_error("transfer_station_ac_from_device");
 #endif
 }
-
+*/

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/exit_mpi.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/exit_mpi.f90	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/exit_mpi.f90	2012-02-22 04:38:32 UTC (rev 19659)
@@ -120,9 +120,9 @@
 
   ! gets callers rank
   call MPI_COMM_RANK(MPI_COMM_WORLD,rank,ier)
-  
-  ! synchronizes MPI processes  
+
+  ! synchronizes MPI processes
   call MPI_BARRIER(MPI_COMM_WORLD,ier)
   if( ier /= 0 ) call exit_mpi(rank,'error synchronize MPI processes')
-  
+
   end subroutine sync_all

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_scalar.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_scalar.f90	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_scalar.f90	2012-02-22 04:38:32 UTC (rev 19659)
@@ -736,7 +736,7 @@
 
   use constants
   use specfem_par,only: Mesh_pointer
-  
+
   implicit none
 
   integer :: NPROC
@@ -799,7 +799,7 @@
 
   use constants
   use specfem_par,only: Mesh_pointer
-  
+
   implicit none
 
   integer :: NPROC
@@ -813,7 +813,7 @@
   integer :: FORWARD_OR_ADJOINT
 
   ! local parameters
-  integer :: iinterface 
+  integer :: iinterface
 
 ! assemble only if more than one partition
   if(NPROC > 1) then

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90	2012-02-22 04:38:32 UTC (rev 19659)
@@ -956,10 +956,10 @@
 
   integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh,my_neighbours_ext_mesh
   integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
-  
+
   integer :: IREGION
   integer :: FORWARD_OR_ADJOINT
-  
+
   ! local parameters
   integer iinterface
 
@@ -1068,7 +1068,7 @@
 ! waits for data to receive and assembles
   use constants
   use specfem_par,only: Mesh_pointer
-  
+
   implicit none
 
   integer :: NPROC
@@ -1083,7 +1083,7 @@
   integer :: FORWARD_OR_ADJOINT
 
   ! local parameters
-  
+
   integer iinterface
 
 ! here we have to assemble all the contributions between partitions using MPI
@@ -1098,7 +1098,7 @@
 
     ! adding contributions of neighbours
     call transfer_asmbl_accel_to_device(Mesh_pointer, &
-                                      buffer_recv_vector_ext_mesh, &                                      
+                                      buffer_recv_vector_ext_mesh, &
                                       IREGION,FORWARD_OR_ADJOINT)
 
     ! This step is done via previous function transfer_and_assemble...

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/check_simulation_stability.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/check_simulation_stability.f90	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/check_simulation_stability.f90	2012-02-22 04:38:32 UTC (rev 19659)
@@ -31,14 +31,15 @@
                           epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
                           epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
                           SIMULATION_TYPE,OUTPUT_FILES,time_start,DT,t0,NSTEP, &
-                          myrank) !COMPUTE_AND_STORE_STRAIN,myrank)
+                          myrank)
 
+  use constants
+  use specfem_par,only: GPU_MODE,Mesh_pointer
+
   implicit none
 
   include 'mpif.h'
-  include "constants.h"
   include "precision.h"
-  include "OUTPUT_FILES/values_from_mesher.h"
 
   ! time step
   integer it,NSTEP,myrank
@@ -71,7 +72,7 @@
   ! local parameters
   ! maximum of the norm of the displacement and of the potential in the fluid
   real(kind=CUSTOM_REAL) Usolidnorm,Usolidnorm_all,Ufluidnorm,Ufluidnorm_all
-  real(kind=CUSTOM_REAL) Strain_norm,Strain_norm_all,strain2_norm,strain2_norm_all
+  real(kind=CUSTOM_REAL) Strain_norm,Strain_norm_all,Strain2_norm,Strain2_norm_all
   real(kind=CUSTOM_REAL) b_Usolidnorm,b_Usolidnorm_all,b_Ufluidnorm,b_Ufluidnorm_all
   ! names of the data files for all the processors in MPI
   character(len=150) outputname
@@ -98,13 +99,25 @@
 
 
   ! compute maximum of norm of displacement in each slice
-  Usolidnorm = max( &
-      maxval(sqrt(displ_crust_mantle(1,:)**2 + &
-                  displ_crust_mantle(2,:)**2 + displ_crust_mantle(3,:)**2)), &
-      maxval(sqrt(displ_inner_core(1,:)**2 + displ_inner_core(2,:)**2 + displ_inner_core(3,:)**2)))
+  if( .not. GPU_MODE) then
+    ! on CPU
+    Usolidnorm = max( &
+        maxval(sqrt(displ_crust_mantle(1,:)**2 + &
+                    displ_crust_mantle(2,:)**2 + &
+                    displ_crust_mantle(3,:)**2)), &
+        maxval(sqrt(displ_inner_core(1,:)**2 + &
+                    displ_inner_core(2,:)**2 + &
+                    displ_inner_core(3,:)**2)))
 
-  Ufluidnorm = maxval(abs(displ_outer_core))
+    Ufluidnorm = maxval(abs(displ_outer_core))
 
+  else
+    ! on GPU
+    ! way 2: just get maximum of fields from GPU
+    call check_norm_elastic_from_device(Usolidnorm,Mesh_pointer,1)
+    call check_norm_acoustic_from_device(Ufluidnorm,Mesh_pointer,1)
+  endif
+
   ! compute the maximum of the maxima for all the slices using an MPI reduction
   call MPI_REDUCE(Usolidnorm,Usolidnorm_all,1,CUSTOM_MPI_TYPE,MPI_MAX,0, &
                       MPI_COMM_WORLD,ier)
@@ -112,14 +125,21 @@
                       MPI_COMM_WORLD,ier)
 
   if (SIMULATION_TYPE == 3) then
-    b_Usolidnorm = max( &
+    if( .not. GPU_MODE) then
+      ! on CPU
+      b_Usolidnorm = max( &
              maxval(sqrt(b_displ_crust_mantle(1,:)**2 + &
                           b_displ_crust_mantle(2,:)**2 + b_displ_crust_mantle(3,:)**2)), &
              maxval(sqrt(b_displ_inner_core(1,:)**2  &
                         + b_displ_inner_core(2,:)**2 &
                         + b_displ_inner_core(3,:)**2)))
 
-    b_Ufluidnorm = maxval(abs(b_displ_outer_core))
+      b_Ufluidnorm = maxval(abs(b_displ_outer_core))
+    else
+      ! on GPU
+      call check_norm_elastic_from_device(b_Usolidnorm,Mesh_pointer,3)
+      call check_norm_acoustic_from_device(b_Ufluidnorm,Mesh_pointer,3)
+    endif
 
     ! compute the maximum of the maxima for all the slices using an MPI reduction
     call MPI_REDUCE(b_Usolidnorm,b_Usolidnorm_all,1,CUSTOM_MPI_TYPE,MPI_MAX,0, &
@@ -129,10 +149,19 @@
   endif
 
   if (COMPUTE_AND_STORE_STRAIN) then
-    Strain_norm = maxval(abs(eps_trace_over_3_crust_mantle))
-    strain2_norm= max( maxval(abs(epsilondev_xx_crust_mantle)),maxval(abs(epsilondev_yy_crust_mantle)), &
-                       maxval(abs(epsilondev_xy_crust_mantle)),maxval(abs(epsilondev_xz_crust_mantle)), &
-                       maxval(abs(epsilondev_yz_crust_mantle)) )
+    if( .not. GPU_MODE) then
+      ! on CPU
+      Strain_norm = maxval(abs(eps_trace_over_3_crust_mantle))
+      Strain2_norm= max( maxval(abs(epsilondev_xx_crust_mantle)), &
+                         maxval(abs(epsilondev_yy_crust_mantle)), &
+                         maxval(abs(epsilondev_xy_crust_mantle)), &
+                         maxval(abs(epsilondev_xz_crust_mantle)), &
+                         maxval(abs(epsilondev_yz_crust_mantle)) )
+    else
+      ! on GPU
+      call check_norm_strain_from_device(Strain_norm,Strain2_norm,Mesh_pointer)
+    endif
+
     call MPI_REDUCE(Strain_norm,Strain_norm_all,1,CUSTOM_MPI_TYPE,MPI_MAX,0, &
              MPI_COMM_WORLD,ier)
     call MPI_REDUCE(Strain2_norm,Strain2_norm_all,1,CUSTOM_MPI_TYPE,MPI_MAX,0, &

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_add_sources.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_add_sources.f90	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_add_sources.f90	2012-02-22 04:38:32 UTC (rev 19659)
@@ -25,10 +25,10 @@
 !
 !=====================================================================
 
-  subroutine compute_add_sources() 
+  subroutine compute_add_sources()
 
   use specfem_par
-  use specfem_par_crustmantle,only: accel_crust_mantle,ibool_crust_mantle  
+  use specfem_par_crustmantle,only: accel_crust_mantle,ibool_crust_mantle
   implicit none
 
   ! local parameters
@@ -106,8 +106,6 @@
 
   else
     ! on GPU
-    call load_GPU_elastic()
-    
     ! prepares buffer with source time function values, to be copied onto GPU
     if(USE_FORCE_POINT_SOURCE) then
       do isource = 1,NSOURCES
@@ -117,13 +115,11 @@
     else
       do isource = 1,NSOURCES
         stf_pre_compute(isource) = &
-          comp_source_time_function(dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))      
+          comp_source_time_function(dble(it-1)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
       enddo
     endif
     ! adds sources: only implements SIMTYPE=1 and NOISE_TOM=0
     call compute_add_sources_el_cuda(Mesh_pointer,NSOURCES,stf_pre_compute)
-
-    call load_CPU_elastic()    
   endif
 
 
@@ -136,7 +132,7 @@
   subroutine compute_add_sources_adjoint()
 
   use specfem_par
-  use specfem_par_crustmantle,only: accel_crust_mantle,ibool_crust_mantle  
+  use specfem_par_crustmantle,only: accel_crust_mantle,ibool_crust_mantle
   implicit none
 
   ! local parameters
@@ -278,13 +274,9 @@
 
   else
     ! on GPU
-    call load_GPU_elastic()
-    
     call add_sources_el_sim_type_2_or_3(Mesh_pointer,nrec,adj_sourcearrays, &
                                        islice_selected_rec,ispec_selected_rec, &
-                                       iadj_vec(it))    
-
-    call load_CPU_elastic()                                       
+                                       iadj_vec(it))
   endif
 
   end subroutine compute_add_sources_adjoint
@@ -386,8 +378,6 @@
 
   else
     ! on GPU
-    call load_GPU_elastic()
-    
     ! prepares buffer with source time function values, to be copied onto GPU
     if(USE_FORCE_POINT_SOURCE) then
       do isource = 1,NSOURCES
@@ -397,13 +387,11 @@
     else
       do isource = 1,NSOURCES
         stf_pre_compute(isource) = &
-          comp_source_time_function(dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))      
+          comp_source_time_function(dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
       enddo
     endif
     ! adds sources: only implements SIMTYPE=3 (and NOISE_TOM=0)
     call compute_add_sources_el_s3_cuda(Mesh_pointer,NSOURCES,stf_pre_compute)
-
-    call load_CPU_elastic()    
   endif
 
   end subroutine compute_add_sources_backward

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_acoustic.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_acoustic.F90	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_acoustic.F90	2012-02-22 04:38:32 UTC (rev 19659)
@@ -81,7 +81,7 @@
     endif
 
     if( .not. GPU_MODE ) then
-      ! on CPU    
+      ! on CPU
       if( USE_DEVILLE_PRODUCTS_VAL ) then
         ! uses Deville et al. (2002) routine
         call compute_forces_outer_core_Dev(time,deltat,two_omega_earth, &
@@ -114,12 +114,8 @@
 
     else
       ! on GPU
-      call load_GPU_acoustic()
-      
       ! includes both forward and adjoint/kernel simulations
       call compute_forces_outer_core_cuda(Mesh_pointer,iphase,time,b_time)
-
-      call load_CPU_acoustic()
     endif
 
 
@@ -135,6 +131,9 @@
       !---
       !--- couple with mantle at the top of the outer core
       !---
+      call load_CPU_acoustic()
+      call load_CPU_elastic()
+
       if(ACTUALLY_COUPLE_FLUID_CMB) &
         call compute_coupling_fluid_CMB(displ_crust_mantle,b_displ_crust_mantle, &
                               ibool_crust_mantle,ibelm_bottom_crust_mantle,  &
@@ -153,6 +152,9 @@
                               normal_bottom_outer_core,jacobian2D_bottom_outer_core, &
                               wgllwgll_xy,ibool_outer_core,ibelm_bottom_outer_core, &
                               SIMULATION_TYPE,NSPEC2D_BOTTOM(IREGION_OUTER_CORE))
+
+      call load_GPU_acoustic()
+
     endif
 
 
@@ -160,7 +162,7 @@
     ! in outer core
     if( iphase == 1 ) then
       ! sends out MPI interface data (non-blocking)
-      
+
       if(.NOT. GPU_MODE) then
         ! on CPU
         call assemble_MPI_scalar_ext_mesh_s(NPROCTOT_VAL,NGLOB_OUTER_CORE, &
@@ -172,7 +174,6 @@
                                 request_send_scalar_outer_core,request_recv_scalar_outer_core)
       else
         ! on GPU
-        call load_GPU_acoustic()
         ! outer core
         call assemble_MPI_scalar_send_cuda(NPROCTOT_VAL, &
                                 buffer_send_scalar_outer_core,buffer_recv_scalar_outer_core, &
@@ -196,7 +197,6 @@
                                 b_request_send_scalar_outer_core,b_request_recv_scalar_outer_core)
         else
           ! on GPU
-          call load_GPU_acoustic()
           ! outer core
           call assemble_MPI_scalar_send_cuda(NPROCTOT_VAL, &
                                 b_buffer_send_scalar_outer_core,b_buffer_recv_scalar_outer_core, &
@@ -207,10 +207,10 @@
                                 3) ! <-- 3 == adjoint b_accel
         endif ! GPU
       endif ! SIMULATION_TYPE == 3
-      
+
     else
       ! make sure the last communications are finished and processed
-      ! waits for send/receive requests to be completed and assembles values      
+      ! waits for send/receive requests to be completed and assembles values
       if(.NOT. GPU_MODE) then
         ! on CPU
         call assemble_MPI_scalar_ext_mesh_w(NPROCTOT_VAL,NGLOB_OUTER_CORE, &
@@ -226,7 +226,6 @@
                                 num_interfaces_outer_core,max_nibool_interfaces_outer_core, &
                                 request_send_scalar_outer_core,request_recv_scalar_outer_core, &
                                 1) ! <-- 1 == fwd accel
-        call load_CPU_acoustic()
       endif
 
       ! adjoint simulations
@@ -245,8 +244,7 @@
                                 b_buffer_recv_scalar_outer_core, &
                                 num_interfaces_outer_core,max_nibool_interfaces_outer_core, &
                                 b_request_send_scalar_outer_core,b_request_recv_scalar_outer_core, &
-                                3) ! <-- 3 == adjoint b_accel 
-          call load_CPU_acoustic()
+                                3) ! <-- 3 == adjoint b_accel
         endif
       endif ! SIMULATION_TYPE == 3
     endif ! iphase == 1
@@ -264,14 +262,12 @@
     if (SIMULATION_TYPE == 3) &
       call compute_forces_ac_update_veloc(b_veloc_outer_core,b_accel_outer_core, &
                                          b_deltatover2,rmass_outer_core)
-  else    
+  else
     ! on GPU
-    call load_GPU_acoustic()
     call kernel_3_outer_core_cuda(Mesh_pointer, &
                                 deltatover2,SIMULATION_TYPE,b_deltatover2)
-    call load_CPU_acoustic()
   endif
-  
+
   end subroutine compute_forces_acoustic
 
 !=====================================================================
@@ -332,30 +328,30 @@
 !=====================================================================
 
   subroutine load_GPU_acoustic
-  
+
   use specfem_par
   use specfem_par_outercore
   implicit none
-  
+
   ! daniel: TODO - temporary transfers to the GPU
   call transfer_fields_oc_to_device(NGLOB_OUTER_CORE,displ_outer_core, &
                                 veloc_outer_core,accel_outer_core,Mesh_pointer)
 
   if( SIMULATION_TYPE == 3 ) then
     call transfer_b_fields_oc_to_device(NGLOB_OUTER_CORE,b_displ_outer_core, &
-                                b_veloc_outer_core,b_accel_outer_core,Mesh_pointer)  
+                                b_veloc_outer_core,b_accel_outer_core,Mesh_pointer)
   endif
-    
-  end subroutine 
 
+  end subroutine
+
 !=====================================================================
 
   subroutine load_CPU_acoustic
-  
+
   use specfem_par
   use specfem_par_outercore
   implicit none
-  
+
   ! daniel: TODO - temporary transfers to the CPU
   call transfer_fields_oc_from_device(NGLOB_OUTER_CORE,displ_outer_core, &
                                 veloc_outer_core,accel_outer_core,Mesh_pointer)
@@ -364,5 +360,5 @@
     call transfer_b_fields_oc_from_device(NGLOB_OUTER_CORE,b_displ_outer_core, &
                                 b_veloc_outer_core,b_accel_outer_core,Mesh_pointer)
   endif
-    
+
   end subroutine

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90	2012-02-22 04:38:32 UTC (rev 19659)
@@ -185,20 +185,15 @@
       endif !SIMULATION_TYPE == 3
 
     else
-      ! on GPU      
-      call load_GPU_elastic()      
-      
+      ! on GPU
       ! contains both forward SIM_TYPE==1 and backward SIM_TYPE==3 simulations
       ! for crust/mantle
       call compute_forces_crust_mantle_cuda(Mesh_pointer,iphase)
       ! for inner core
       call compute_forces_inner_core_cuda(Mesh_pointer,iphase)
-            
-      call load_CPU_elastic()      
     endif ! GPU_MODE
 
 
-    
     ! computes additional contributions to acceleration field
     if( iphase == 1 ) then
 
@@ -226,7 +221,7 @@
         ! hence, instead of a moment tensor 'sourcearrays', a 'noise_sourcearray' for a point force is needed.
         ! furthermore, the CMTSOLUTION needs to be zero, i.e., no earthquakes.
         ! now this must be manually set in DATA/CMTSOLUTION, by USERS.
-        call noise_add_source_master_rec()         
+        call noise_add_source_master_rec()
       case( 2 )
         ! second step of noise tomography, i.e., read the surface movie saved at every timestep
         ! use the movie to drive the ensemble forward wavefield
@@ -241,13 +236,15 @@
         ! use the movie to reconstruct the ensemble forward wavefield
         ! the ensemble adjoint wavefield is done as usual
         ! note instead of "NSTEP-it+1", now we us "it", since reconstruction is a reversal of reversal
-        call noise_read_add_surface_movie(b_accel_crust_mantle,it)                                
+        call noise_read_add_surface_movie(b_accel_crust_mantle,it)
       end select
 
 
       ! ****************************************************
       ! **********  add matching with fluid part  **********
       ! ****************************************************
+      call load_CPU_elastic()
+      call load_CPU_acoustic()
 
       ! only for elements in first matching layer in the solid
 
@@ -277,18 +274,20 @@
                               RHO_BOTTOM_OC,minus_g_icb, &
                               SIMULATION_TYPE,NSPEC2D_TOP(IREGION_INNER_CORE))
 
+      call load_GPU_elastic()
+
     endif ! iphase == 1
 
-    
+
     ! assemble all the contributions between slices using MPI
 
     ! crust/mantle and inner core handled in the same call
     ! in order to reduce the number of MPI messages by 2
 
     if( iphase == 1 ) then
-      ! sends out MPI interface data      
+      ! sends out MPI interface data
       if(.NOT. GPU_MODE) then
-        ! on CPU      
+        ! on CPU
         ! sends accel values to corresponding MPI interface neighbors
         ! crust mantle
         call assemble_MPI_vector_ext_mesh_s(NPROCTOT_VAL,NGLOB_CRUST_MANTLE, &
@@ -307,8 +306,7 @@
                       my_neighbours_inner_core, &
                       request_send_vector_inner_core,request_recv_vector_inner_core)
       else
-        ! on GPU    
-        call load_GPU_elastic()
+        ! on GPU
         ! crust mantle
         call assemble_MPI_vector_send_cuda(NPROCTOT_VAL, &
                       buffer_send_vector_crust_mantle,buffer_recv_vector_crust_mantle, &
@@ -332,7 +330,7 @@
       ! adjoint / kernel runs
       if (SIMULATION_TYPE == 3) then
         if(.NOT. GPU_MODE) then
-          ! on CPU          
+          ! on CPU
           ! sends accel values to corresponding MPI interface neighbors
           ! crust mantle
           call assemble_MPI_vector_ext_mesh_s(NPROCTOT_VAL,NGLOB_CRUST_MANTLE, &
@@ -350,9 +348,8 @@
                         nibool_interfaces_inner_core,ibool_interfaces_inner_core,&
                         my_neighbours_inner_core, &
                         b_request_send_vector_inner_core,b_request_recv_vector_inner_core)
-        else 
+        else
           ! on GPU
-          call load_GPU_elastic()
           ! crust mantle
           call assemble_MPI_vector_send_cuda(NPROCTOT_VAL, &
                       b_buffer_send_vector_crust_mantle,b_buffer_recv_vector_crust_mantle, &
@@ -361,7 +358,7 @@
                       my_neighbours_crust_mantle, &
                       b_request_send_vector_crust_mantle,b_request_recv_vector_crust_mantle, &
                       IREGION_CRUST_MANTLE, &
-                      3) ! <-- 3 == adjoint b_accel 
+                      3) ! <-- 3 == adjoint b_accel
           ! inner core
           call assemble_MPI_vector_send_cuda(NPROCTOT_VAL, &
                       b_buffer_send_vector_inner_core,b_buffer_recv_vector_inner_core, &
@@ -375,7 +372,7 @@
       endif ! SIMULATION_TYPE == 3
 
     else
-      ! waits for send/receive requests to be completed and assembles values      
+      ! waits for send/receive requests to be completed and assembles values
       if(.NOT. GPU_MODE) then
         ! on CPU
         ! crust mantle
@@ -394,7 +391,7 @@
                               request_send_vector_inner_core,request_recv_vector_inner_core)
       else
         ! on GPU
-        ! crust mantle  
+        ! crust mantle
         call assemble_MPI_vector_write_cuda(NPROCTOT_VAL, &
                             buffer_recv_vector_crust_mantle, &
                             num_interfaces_crust_mantle,max_nibool_interfaces_crust_mantle, &
@@ -407,14 +404,13 @@
                             num_interfaces_inner_core,max_nibool_interfaces_inner_core, &
                             request_send_vector_inner_core,request_recv_vector_inner_core, &
                             IREGION_INNER_CORE, &
-                            1) 
-        call load_CPU_elastic()
+                            1)
       endif
-      
 
+
       ! adjoint / kernel runs
       if (SIMULATION_TYPE == 3) then
-        ! waits for send/receive requests to be completed and assembles values      
+        ! waits for send/receive requests to be completed and assembles values
         if(.NOT. GPU_MODE) then
           ! on CPU
           ! crust mantle
@@ -431,7 +427,7 @@
                               max_nibool_interfaces_inner_core, &
                               nibool_interfaces_inner_core,ibool_interfaces_inner_core, &
                               b_request_send_vector_inner_core,b_request_recv_vector_inner_core)
-                            
+
         else
           ! on GPU
           ! crust mantle
@@ -440,16 +436,15 @@
                             num_interfaces_crust_mantle,max_nibool_interfaces_crust_mantle, &
                             b_request_send_vector_crust_mantle,b_request_recv_vector_crust_mantle, &
                             IREGION_CRUST_MANTLE, &
-                            3) ! <-- 3 == adjoint b_accel 
+                            3) ! <-- 3 == adjoint b_accel
           ! inner core
           call assemble_MPI_vector_write_cuda(NPROCTOT_VAL,&
                             b_buffer_recv_vector_inner_core, &
                             num_interfaces_inner_core,max_nibool_interfaces_inner_core, &
                             b_request_send_vector_inner_core,b_request_recv_vector_inner_core, &
                             IREGION_INNER_CORE, &
-                            3) 
-          call load_CPU_elastic()
-        endif                            
+                            3)
+        endif
       endif ! SIMULATION_TYPE == 3
     endif ! iphase == 1
 
@@ -466,20 +461,22 @@
                                       b_two_omega_earth,rmass_crust_mantle)
   else
     ! on GPU
-    call load_GPU_elastic()    
     call kernel_3_a_cuda(Mesh_pointer, &
                         deltatover2,SIMULATION_TYPE,b_deltatover2,OCEANS_VAL)
-    call load_CPU_elastic()
   endif
-  
+
   ! couples ocean with crust mantle
   ! (updates acceleration with ocean load approximation)
-  if(OCEANS_VAL) &
+  if(OCEANS_VAL) then
+    call load_CPU_elastic()
+
     call compute_coupling_ocean(accel_crust_mantle,b_accel_crust_mantle, &
                           rmass_crust_mantle,rmass_ocean_load,normal_top_crust_mantle, &
                           ibool_crust_mantle,ibelm_top_crust_mantle, &
                           updated_dof_ocean_load, &
                           SIMULATION_TYPE,NSPEC2D_TOP(IREGION_CRUST_MANTLE))
+    call load_GPU_elastic()
+  endif
 
   ! Newmark time scheme:
   ! corrector terms for elastic parts
@@ -496,12 +493,10 @@
                                         b_deltatover2,b_two_omega_earth,rmass_inner_core)
   else
     ! on GPU
-    call load_GPU_elastic()
     call kernel_3_b_cuda(Mesh_pointer, &
                         deltatover2,SIMULATION_TYPE,b_deltatover2,OCEANS_VAL)
-    call load_CPU_elastic()    
   endif
-  
+
   end subroutine compute_forces_elastic
 
 
@@ -694,12 +689,12 @@
 !=====================================================================
 
   subroutine load_GPU_elastic
-  
+
   use specfem_par
   use specfem_par_crustmantle
   use specfem_par_innercore
   implicit none
-  
+
   ! daniel: TODO - temporary transfers to the GPU
   call transfer_fields_cm_to_device(NDIM*NGLOB_CRUST_MANTLE,displ_crust_mantle, &
                                   veloc_crust_mantle,accel_crust_mantle,Mesh_pointer)
@@ -712,29 +707,29 @@
     call transfer_b_fields_ic_to_device(NDIM*NGLOB_INNER_CORE,b_displ_inner_core, &
                                   b_veloc_inner_core,b_accel_inner_core,Mesh_pointer)
   endif
-    
-  end subroutine 
 
+  end subroutine
+
 !=====================================================================
 
   subroutine load_CPU_elastic
-  
+
   use specfem_par
   use specfem_par_crustmantle
   use specfem_par_innercore
   implicit none
-  
+
   ! daniel: TODO - temporary transfers back to the CPU
   call transfer_fields_cm_from_device(NDIM*NGLOB_CRUST_MANTLE,displ_crust_mantle, &
-                                  veloc_crust_mantle,accel_crust_mantle,Mesh_pointer)                                  
+                                  veloc_crust_mantle,accel_crust_mantle,Mesh_pointer)
   call transfer_fields_ic_from_device(NDIM*NGLOB_INNER_CORE,displ_inner_core, &
                                   veloc_inner_core,accel_inner_core,Mesh_pointer)
 
   if( SIMULATION_TYPE == 3 ) then
     call transfer_b_fields_cm_from_device(NDIM*NGLOB_CRUST_MANTLE,b_displ_crust_mantle, &
-                                  b_veloc_crust_mantle,b_accel_crust_mantle,Mesh_pointer)                                  
+                                  b_veloc_crust_mantle,b_accel_crust_mantle,Mesh_pointer)
     call transfer_b_fields_ic_from_device(NDIM*NGLOB_INNER_CORE,b_displ_inner_core, &
-                                  b_veloc_inner_core,b_accel_inner_core,Mesh_pointer)  
+                                  b_veloc_inner_core,b_accel_inner_core,Mesh_pointer)
   endif
-    
+
   end subroutine

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core.f90	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core.f90	2012-02-22 04:38:32 UTC (rev 19659)
@@ -101,7 +101,7 @@
   logical :: phase_is_inner
 
   ! local parameters
-  
+
   real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ) :: epsilondev_loc
   real(kind=CUSTOM_REAL), dimension(NGLLX, NGLLY, NGLLZ) :: factor_common_use
   real(kind=CUSTOM_REAL) R_xx_val,R_yy_val

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_kernels.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_kernels.f90	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_kernels.f90	2012-02-22 04:38:32 UTC (rev 19659)
@@ -94,6 +94,20 @@
 
   ! --- boundary kernels ------
   if (SAVE_BOUNDARY_MESH) then
+
+    ! transfers wavefields onto CPU
+    if( GPU_MODE ) then
+      ! crust/mantle
+      call transfer_accel_cm_from_device(NDIM*NGLOB_CRUST_MANTLE,accel_crust_mantle,Mesh_pointer)
+      call transfer_displ_cm_from_device(NDIM*NGLOB_CRUST_MANTLE,displ_crust_mantle,Mesh_pointer)
+      call transfer_b_displ_cm_from_device(NDIM*NGLOB_CRUST_MANTLE,b_displ_crust_mantle,Mesh_pointer)
+      ! inner core
+      call transfer_accel_ic_from_device(NDIM*NGLOB_CRUST_MANTLE,accel_inner_core,Mesh_pointer)
+      call transfer_displ_ic_from_device(NDIM*NGLOB_CRUST_MANTLE,displ_inner_core,Mesh_pointer)
+      call transfer_b_displ_ic_from_device(NDIM*NGLOB_CRUST_MANTLE,b_displ_inner_core,Mesh_pointer)
+    endif
+
+    ! updates kernels on CPU
     fluid_solid_boundary = .false.
     iregion_code = IREGION_CRUST_MANTLE
 
@@ -1294,11 +1308,10 @@
                                     accel_crust_mantle,b_accel_crust_mantle, &
                                     deltat)
 
+  use constants
+  use specfem_par,only: GPU_MODE,Mesh_pointer
   implicit none
 
-  include "constants.h"
-  include "OUTPUT_FILES/values_from_mesher.h"
-
   integer, dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE) :: ibool_crust_mantle
 
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_ADJOINT) :: &
@@ -1314,26 +1327,34 @@
   ! local parameters
   integer :: i,j,k,ispec,iglob
 
-  ! crust_mantle
-  do ispec = 1, NSPEC_CRUST_MANTLE
-    do k = 1, NGLLZ
-      do j = 1, NGLLY
-        do i = 1, NGLLX
-          iglob = ibool_crust_mantle(i,j,k,ispec)
+  if( .not. GPU_MODE ) then
+    ! on CPU
+    ! crust_mantle
+    do ispec = 1, NSPEC_CRUST_MANTLE
+      do k = 1, NGLLZ
+        do j = 1, NGLLY
+          do i = 1, NGLLX
+            iglob = ibool_crust_mantle(i,j,k,ispec)
 
-          ! approximates hessian
-          ! term with adjoint acceleration and backward/reconstructed acceleration
-          hess_kl_crust_mantle(i,j,k,ispec) =  hess_kl_crust_mantle(i,j,k,ispec) &
-             + deltat * (accel_crust_mantle(1,iglob) * b_accel_crust_mantle(1,iglob) &
-             + accel_crust_mantle(2,iglob) * b_accel_crust_mantle(2,iglob) &
-             + accel_crust_mantle(3,iglob) * b_accel_crust_mantle(3,iglob) )
+            ! approximates hessian
+            ! term with adjoint acceleration and backward/reconstructed acceleration
+            hess_kl_crust_mantle(i,j,k,ispec) =  hess_kl_crust_mantle(i,j,k,ispec) &
+               + deltat * (accel_crust_mantle(1,iglob) * b_accel_crust_mantle(1,iglob) &
+               + accel_crust_mantle(2,iglob) * b_accel_crust_mantle(2,iglob) &
+               + accel_crust_mantle(3,iglob) * b_accel_crust_mantle(3,iglob) )
 
+          enddo
         enddo
       enddo
     enddo
-  enddo
 
+  else
+    ! updates kernels on GPU
 
+    ! computes contribution to density and bulk modulus kernel
+    call compute_kernels_hess_cuda(Mesh_pointer,deltat)
+  endif
+
   end subroutine compute_kernels_hessian
 
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_crust_mantle.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_crust_mantle.f90	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_crust_mantle.f90	2012-02-22 04:38:32 UTC (rev 19659)
@@ -32,7 +32,7 @@
   use specfem_par,only: &
     ichunk,SIMULATION_TYPE,SAVE_FORWARD,NSTEP,it, &
     wgllwgll_xz,wgllwgll_yz,wgllwgll_xy
-  
+
   use specfem_par,only: GPU_MODE,Mesh_pointer
 
   use specfem_par_crustmantle, only: &
@@ -73,8 +73,6 @@
 
 
   ! crust & mantle
-  if( GPU_MODE ) call load_GPU_elastic()   
-  
   !   xmin
   ! if two chunks exclude this face for one of them
   if(NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC) then
@@ -130,19 +128,19 @@
           enddo
         enddo
       enddo
-      
+
     else
       ! on GPU
       if( nspec2D_xmin_crust_mantle > 0 ) call compute_stacey_elastic_cuda(Mesh_pointer, &
                                                                 absorb_xmin_crust_mantle, &
-                                                                0) ! <= xmin              
+                                                                0) ! <= xmin
     endif
-    
+
     ! writes absorbing boundary values
     if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_xmin_crust_mantle > 0 ) then
       call write_abs(0,absorb_xmin_crust_mantle, reclen_xmin_crust_mantle,it)
     endif
-    
+
   endif ! NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC
 
   !   xmax
@@ -204,15 +202,15 @@
       ! on GPU
       if( nspec2D_xmax_crust_mantle > 0 ) call compute_stacey_elastic_cuda(Mesh_pointer, &
                                                                 absorb_xmax_crust_mantle, &
-                                                                1) ! <= xmin              
+                                                                1) ! <= xmin
     endif
 
-  
+
     ! writes absorbing boundary values
     if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_xmax_crust_mantle > 0 ) then
       call write_abs(1,absorb_xmax_crust_mantle,reclen_xmax_crust_mantle,it)
     endif
-    
+
   endif ! NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AB
 
   !   ymin
@@ -272,7 +270,7 @@
     ! on GPU
     if( nspec2D_ymin_crust_mantle > 0 ) call compute_stacey_elastic_cuda(Mesh_pointer, &
                                                               absorb_ymin_crust_mantle, &
-                                                              2) ! <= ymin              
+                                                              2) ! <= ymin
   endif
 
 
@@ -340,7 +338,7 @@
     ! on GPU
     if( nspec2D_ymax_crust_mantle > 0 ) call compute_stacey_elastic_cuda(Mesh_pointer, &
                                                               absorb_ymax_crust_mantle, &
-                                                              3) ! <= ymax              
+                                                              3) ! <= ymax
   endif
 
   ! writes absorbing boundary values
@@ -348,7 +346,5 @@
     call write_abs(3,absorb_ymax_crust_mantle,reclen_ymax_crust_mantle,it)
   endif
 
-  if( GPU_MODE ) call load_CPU_elastic()   
-
   end subroutine compute_stacey_crust_mantle
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90	2012-02-22 04:38:32 UTC (rev 19659)
@@ -32,7 +32,7 @@
 
   use specfem_par,only: &
     ichunk,SIMULATION_TYPE,SAVE_FORWARD,NSTEP,it, &
-    wgllwgll_xz,wgllwgll_yz,wgllwgll_xy    
+    wgllwgll_xz,wgllwgll_yz,wgllwgll_xy
 
   use specfem_par,only: GPU_MODE,Mesh_pointer
 
@@ -70,8 +70,7 @@
   !           file access (by process rank modulo 8) showed that the following,
   !           simple approach is still fastest. (assuming that files are accessed on a local scratch disk)
 
-  if( GPU_MODE ) call load_GPU_acoustic()   
-
+  ! outer core
   !   xmin
   ! if two chunks exclude this face for one of them
   if(NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC) then
@@ -116,7 +115,7 @@
       ! on GPU
       if( nspec2D_xmin_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
                                                                 absorb_xmin_outer_core, &
-                                                                4) ! <= xmin              
+                                                                4) ! <= xmin
     endif
 
     ! writes absorbing boundary values
@@ -311,11 +310,8 @@
                                                               8) ! <= zmin
   endif
 
-
   if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. nspec2D_zmin_outer_core > 0 ) then
     call write_abs(8,absorb_zmin_outer_core,reclen_zmin,it)
   endif
 
-  if( GPU_MODE) call load_CPU_acoustic()
-
   end subroutine compute_stacey_outer_core

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/finalize_simulation.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/finalize_simulation.f90	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/finalize_simulation.f90	2012-02-22 04:38:32 UTC (rev 19659)
@@ -147,14 +147,14 @@
     ! boundary kernel
     if (SAVE_BOUNDARY_MESH) then
       call save_kernels_boundary_kl(myrank,scale_t,scale_displ, &
-                                  moho_kl,d400_kl,d670_kl,cmb_kl,icb_kl, &
-                                  LOCAL_PATH,HONOR_1D_SPHERICAL_MOHO)
+                                   moho_kl,d400_kl,d670_kl,cmb_kl,icb_kl, &
+                                   LOCAL_PATH,HONOR_1D_SPHERICAL_MOHO)
     endif
 
     ! approximate hessian
     if( APPROXIMATE_HESS_KL ) then
       call save_kernels_hessian(myrank,scale_t,scale_displ, &
-                                            hess_kl_crust_mantle,LOCAL_PATH)
+                               hess_kl_crust_mantle,LOCAL_PATH)
     endif
   endif
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/iterate_time.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/iterate_time.F90	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/iterate_time.F90	2012-02-22 04:38:32 UTC (rev 19659)
@@ -100,7 +100,9 @@
     endif
 
     ! write the seismograms with time shift
-    call write_seismograms()
+    if( nrec_local > 0 .or. ( WRITE_SEISMOGRAMS_BY_MASTER .and. myrank == 0 ) ) then
+      call write_seismograms()
+    endif
 
     ! adjoint simulations: kernels
     if( SIMULATION_TYPE == 3 ) then
@@ -116,9 +118,6 @@
     ! modified from the subroutine 'write_movie_surface'
     if ( NOISE_TOMOGRAPHY == 1 ) then
       call noise_save_surface_movie()
-!                              displ_crust_mantle, &
-!                              ibelm_top_crust_mantle,ibool_crust_mantle, &
-!                              NSPEC2D_TOP(IREGION_CRUST_MANTLE),noise_surface_movie,it)
     endif
 
   enddo   ! end of main time loop
@@ -175,7 +174,11 @@
   ! local parameters
   integer :: i
 
-  ! Newmark time scheme update
+  ! updates wavefields
+  if( .not. GPU_MODE) then
+  ! on CPU
+
+    ! Newmark time scheme update
 #ifdef _HANDOPT_NEWMARK
 ! way 2:
 ! One common technique in computational science to help enhance pipelining is loop unrolling
@@ -186,240 +189,263 @@
 ! in most cases a real (CUSTOM_REAL) value will have 4 bytes,
 ! assuming a default cache size of about 128 bytes, we unroll here in steps of 3, thus 29 reals or 118 bytes,
 ! rather than with steps of 4
-  ! mantle
-  if(imodulo_NGLOB_CRUST_MANTLE >= 1) then
-    do i = 1,imodulo_NGLOB_CRUST_MANTLE
+    ! mantle
+    if(imodulo_NGLOB_CRUST_MANTLE >= 1) then
+      do i = 1,imodulo_NGLOB_CRUST_MANTLE
+        displ_crust_mantle(:,i) = displ_crust_mantle(:,i) &
+          + deltat*veloc_crust_mantle(:,i) + deltatsqover2*accel_crust_mantle(:,i)
+
+        veloc_crust_mantle(:,i) = veloc_crust_mantle(:,i) &
+          + deltatover2*accel_crust_mantle(:,i)
+
+        accel_crust_mantle(:,i) = 0._CUSTOM_REAL
+      enddo
+    endif
+    do i = imodulo_NGLOB_CRUST_MANTLE+1,NGLOB_CRUST_MANTLE, 3 ! in steps of 3
       displ_crust_mantle(:,i) = displ_crust_mantle(:,i) &
         + deltat*veloc_crust_mantle(:,i) + deltatsqover2*accel_crust_mantle(:,i)
+      displ_crust_mantle(:,i+1) = displ_crust_mantle(:,i+1) &
+        + deltat*veloc_crust_mantle(:,i+1) + deltatsqover2*accel_crust_mantle(:,i+1)
+      displ_crust_mantle(:,i+2) = displ_crust_mantle(:,i+2) &
+        + deltat*veloc_crust_mantle(:,i+2) + deltatsqover2*accel_crust_mantle(:,i+2)
 
+
       veloc_crust_mantle(:,i) = veloc_crust_mantle(:,i) &
         + deltatover2*accel_crust_mantle(:,i)
+      veloc_crust_mantle(:,i+1) = veloc_crust_mantle(:,i+1) &
+        + deltatover2*accel_crust_mantle(:,i+1)
+      veloc_crust_mantle(:,i+2) = veloc_crust_mantle(:,i+2) &
+        + deltatover2*accel_crust_mantle(:,i+2)
 
+      ! set acceleration to zero
+      ! note: we do initialize acceleration in this loop since it is read already into the cache,
+      !           otherwise it would have to be read in again for this explicitly,
+      !           which would make this step more expensive
       accel_crust_mantle(:,i) = 0._CUSTOM_REAL
+      accel_crust_mantle(:,i+1) = 0._CUSTOM_REAL
+      accel_crust_mantle(:,i+2) = 0._CUSTOM_REAL
     enddo
-  endif
-  do i = imodulo_NGLOB_CRUST_MANTLE+1,NGLOB_CRUST_MANTLE, 3 ! in steps of 3
-    displ_crust_mantle(:,i) = displ_crust_mantle(:,i) &
-      + deltat*veloc_crust_mantle(:,i) + deltatsqover2*accel_crust_mantle(:,i)
-    displ_crust_mantle(:,i+1) = displ_crust_mantle(:,i+1) &
-      + deltat*veloc_crust_mantle(:,i+1) + deltatsqover2*accel_crust_mantle(:,i+1)
-    displ_crust_mantle(:,i+2) = displ_crust_mantle(:,i+2) &
-      + deltat*veloc_crust_mantle(:,i+2) + deltatsqover2*accel_crust_mantle(:,i+2)
 
+    ! outer core
+    do i=1,NGLOB_OUTER_CORE
+      displ_outer_core(i) = displ_outer_core(i) &
+        + deltat*veloc_outer_core(i) + deltatsqover2*accel_outer_core(i)
 
-    veloc_crust_mantle(:,i) = veloc_crust_mantle(:,i) &
-      + deltatover2*accel_crust_mantle(:,i)
-    veloc_crust_mantle(:,i+1) = veloc_crust_mantle(:,i+1) &
-      + deltatover2*accel_crust_mantle(:,i+1)
-    veloc_crust_mantle(:,i+2) = veloc_crust_mantle(:,i+2) &
-      + deltatover2*accel_crust_mantle(:,i+2)
+      veloc_outer_core(i) = veloc_outer_core(i) &
+        + deltatover2*accel_outer_core(i)
 
-    ! set acceleration to zero
-    ! note: we do initialize acceleration in this loop since it is read already into the cache,
-    !           otherwise it would have to be read in again for this explicitly,
-    !           which would make this step more expensive
-    accel_crust_mantle(:,i) = 0._CUSTOM_REAL
-    accel_crust_mantle(:,i+1) = 0._CUSTOM_REAL
-    accel_crust_mantle(:,i+2) = 0._CUSTOM_REAL
-  enddo
+      accel_outer_core(i) = 0._CUSTOM_REAL
+    enddo
 
-  ! outer core
-  do i=1,NGLOB_OUTER_CORE
-    displ_outer_core(i) = displ_outer_core(i) &
-      + deltat*veloc_outer_core(i) + deltatsqover2*accel_outer_core(i)
+    ! inner core
+    if(imodulo_NGLOB_INNER_CORE >= 1) then
+      do i = 1,imodulo_NGLOB_INNER_CORE
+        displ_inner_core(:,i) = displ_inner_core(:,i) &
+          + deltat*veloc_inner_core(:,i) + deltatsqover2*accel_inner_core(:,i)
 
-    veloc_outer_core(i) = veloc_outer_core(i) &
-      + deltatover2*accel_outer_core(i)
+        veloc_inner_core(:,i) = veloc_inner_core(:,i) &
+          + deltatover2*accel_inner_core(:,i)
 
-    accel_outer_core(i) = 0._CUSTOM_REAL
-  enddo
-
-  ! inner core
-  if(imodulo_NGLOB_INNER_CORE >= 1) then
-    do i = 1,imodulo_NGLOB_INNER_CORE
+        accel_inner_core(:,i) = 0._CUSTOM_REAL
+      enddo
+    endif
+    do i = imodulo_NGLOB_INNER_CORE+1,NGLOB_INNER_CORE, 3 ! in steps of 3
       displ_inner_core(:,i) = displ_inner_core(:,i) &
         + deltat*veloc_inner_core(:,i) + deltatsqover2*accel_inner_core(:,i)
+      displ_inner_core(:,i+1) = displ_inner_core(:,i+1) &
+        + deltat*veloc_inner_core(:,i+1) + deltatsqover2*accel_inner_core(:,i+1)
+      displ_inner_core(:,i+2) = displ_inner_core(:,i+2) &
+        + deltat*veloc_inner_core(:,i+2) + deltatsqover2*accel_inner_core(:,i+2)
 
+
       veloc_inner_core(:,i) = veloc_inner_core(:,i) &
         + deltatover2*accel_inner_core(:,i)
+      veloc_inner_core(:,i+1) = veloc_inner_core(:,i+1) &
+        + deltatover2*accel_inner_core(:,i+1)
+      veloc_inner_core(:,i+2) = veloc_inner_core(:,i+2) &
+        + deltatover2*accel_inner_core(:,i+2)
 
       accel_inner_core(:,i) = 0._CUSTOM_REAL
+      accel_inner_core(:,i+1) = 0._CUSTOM_REAL
+      accel_inner_core(:,i+2) = 0._CUSTOM_REAL
     enddo
-  endif
-  do i = imodulo_NGLOB_INNER_CORE+1,NGLOB_INNER_CORE, 3 ! in steps of 3
-    displ_inner_core(:,i) = displ_inner_core(:,i) &
-      + deltat*veloc_inner_core(:,i) + deltatsqover2*accel_inner_core(:,i)
-    displ_inner_core(:,i+1) = displ_inner_core(:,i+1) &
-      + deltat*veloc_inner_core(:,i+1) + deltatsqover2*accel_inner_core(:,i+1)
-    displ_inner_core(:,i+2) = displ_inner_core(:,i+2) &
-      + deltat*veloc_inner_core(:,i+2) + deltatsqover2*accel_inner_core(:,i+2)
-
-
-    veloc_inner_core(:,i) = veloc_inner_core(:,i) &
-      + deltatover2*accel_inner_core(:,i)
-    veloc_inner_core(:,i+1) = veloc_inner_core(:,i+1) &
-      + deltatover2*accel_inner_core(:,i+1)
-    veloc_inner_core(:,i+2) = veloc_inner_core(:,i+2) &
-      + deltatover2*accel_inner_core(:,i+2)
-
-    accel_inner_core(:,i) = 0._CUSTOM_REAL
-    accel_inner_core(:,i+1) = 0._CUSTOM_REAL
-    accel_inner_core(:,i+2) = 0._CUSTOM_REAL
-  enddo
-
 #else
 ! way 1:
-  ! mantle
-  do i=1,NGLOB_CRUST_MANTLE
-    displ_crust_mantle(:,i) = displ_crust_mantle(:,i) &
-      + deltat*veloc_crust_mantle(:,i) + deltatsqover2*accel_crust_mantle(:,i)
-    veloc_crust_mantle(:,i) = veloc_crust_mantle(:,i) &
-      + deltatover2*accel_crust_mantle(:,i)
-    accel_crust_mantle(:,i) = 0._CUSTOM_REAL
-  enddo
-  ! outer core
-  do i=1,NGLOB_OUTER_CORE
-    displ_outer_core(i) = displ_outer_core(i) &
-      + deltat*veloc_outer_core(i) + deltatsqover2*accel_outer_core(i)
-    veloc_outer_core(i) = veloc_outer_core(i) &
-      + deltatover2*accel_outer_core(i)
-    accel_outer_core(i) = 0._CUSTOM_REAL
-  enddo
-  ! inner core
-  do i=1,NGLOB_INNER_CORE
-    displ_inner_core(:,i) = displ_inner_core(:,i) &
-      + deltat*veloc_inner_core(:,i) + deltatsqover2*accel_inner_core(:,i)
-    veloc_inner_core(:,i) = veloc_inner_core(:,i) &
-      + deltatover2*accel_inner_core(:,i)
-    accel_inner_core(:,i) = 0._CUSTOM_REAL
-  enddo
+    ! mantle
+    do i=1,NGLOB_CRUST_MANTLE
+      displ_crust_mantle(:,i) = displ_crust_mantle(:,i) &
+        + deltat*veloc_crust_mantle(:,i) + deltatsqover2*accel_crust_mantle(:,i)
+      veloc_crust_mantle(:,i) = veloc_crust_mantle(:,i) &
+        + deltatover2*accel_crust_mantle(:,i)
+      accel_crust_mantle(:,i) = 0._CUSTOM_REAL
+    enddo
+    ! outer core
+    do i=1,NGLOB_OUTER_CORE
+      displ_outer_core(i) = displ_outer_core(i) &
+        + deltat*veloc_outer_core(i) + deltatsqover2*accel_outer_core(i)
+      veloc_outer_core(i) = veloc_outer_core(i) &
+        + deltatover2*accel_outer_core(i)
+      accel_outer_core(i) = 0._CUSTOM_REAL
+    enddo
+    ! inner core
+    do i=1,NGLOB_INNER_CORE
+      displ_inner_core(:,i) = displ_inner_core(:,i) &
+        + deltat*veloc_inner_core(:,i) + deltatsqover2*accel_inner_core(:,i)
+      veloc_inner_core(:,i) = veloc_inner_core(:,i) &
+        + deltatover2*accel_inner_core(:,i)
+      accel_inner_core(:,i) = 0._CUSTOM_REAL
+    enddo
 #endif
 
+    ! backward field
+    if (SIMULATION_TYPE == 3) then
 
-
-
-  ! backward field
-  if (SIMULATION_TYPE == 3) then
-
 #ifdef _HANDOPT_NEWMARK
 ! way 2:
-    ! mantle
-    if(imodulo_NGLOB_CRUST_MANTLE >= 1) then
-      do i=1,imodulo_NGLOB_CRUST_MANTLE
+      ! mantle
+      if(imodulo_NGLOB_CRUST_MANTLE >= 1) then
+        do i=1,imodulo_NGLOB_CRUST_MANTLE
+          b_displ_crust_mantle(:,i) = b_displ_crust_mantle(:,i) &
+            + b_deltat*b_veloc_crust_mantle(:,i) + b_deltatsqover2*b_accel_crust_mantle(:,i)
+          b_veloc_crust_mantle(:,i) = b_veloc_crust_mantle(:,i) &
+            + b_deltatover2*b_accel_crust_mantle(:,i)
+          b_accel_crust_mantle(:,i) = 0._CUSTOM_REAL
+        enddo
+      endif
+      do i=imodulo_NGLOB_CRUST_MANTLE+1,NGLOB_CRUST_MANTLE,3
         b_displ_crust_mantle(:,i) = b_displ_crust_mantle(:,i) &
           + b_deltat*b_veloc_crust_mantle(:,i) + b_deltatsqover2*b_accel_crust_mantle(:,i)
+        b_displ_crust_mantle(:,i+1) = b_displ_crust_mantle(:,i+1) &
+          + b_deltat*b_veloc_crust_mantle(:,i+1) + b_deltatsqover2*b_accel_crust_mantle(:,i+1)
+        b_displ_crust_mantle(:,i+2) = b_displ_crust_mantle(:,i+2) &
+          + b_deltat*b_veloc_crust_mantle(:,i+2) + b_deltatsqover2*b_accel_crust_mantle(:,i+2)
+
+
         b_veloc_crust_mantle(:,i) = b_veloc_crust_mantle(:,i) &
           + b_deltatover2*b_accel_crust_mantle(:,i)
+        b_veloc_crust_mantle(:,i+1) = b_veloc_crust_mantle(:,i+1) &
+          + b_deltatover2*b_accel_crust_mantle(:,i+1)
+        b_veloc_crust_mantle(:,i+2) = b_veloc_crust_mantle(:,i+2) &
+          + b_deltatover2*b_accel_crust_mantle(:,i+2)
+
         b_accel_crust_mantle(:,i) = 0._CUSTOM_REAL
+        b_accel_crust_mantle(:,i+1) = 0._CUSTOM_REAL
+        b_accel_crust_mantle(:,i+2) = 0._CUSTOM_REAL
       enddo
-    endif
-    do i=imodulo_NGLOB_CRUST_MANTLE+1,NGLOB_CRUST_MANTLE,3
-      b_displ_crust_mantle(:,i) = b_displ_crust_mantle(:,i) &
-        + b_deltat*b_veloc_crust_mantle(:,i) + b_deltatsqover2*b_accel_crust_mantle(:,i)
-      b_displ_crust_mantle(:,i+1) = b_displ_crust_mantle(:,i+1) &
-        + b_deltat*b_veloc_crust_mantle(:,i+1) + b_deltatsqover2*b_accel_crust_mantle(:,i+1)
-      b_displ_crust_mantle(:,i+2) = b_displ_crust_mantle(:,i+2) &
-        + b_deltat*b_veloc_crust_mantle(:,i+2) + b_deltatsqover2*b_accel_crust_mantle(:,i+2)
 
+      ! outer core
+      do i=1,NGLOB_OUTER_CORE
+        b_displ_outer_core(i) = b_displ_outer_core(i) &
+          + b_deltat*b_veloc_outer_core(i) + b_deltatsqover2*b_accel_outer_core(i)
+        b_veloc_outer_core(i) = b_veloc_outer_core(i) &
+          + b_deltatover2*b_accel_outer_core(i)
+        b_accel_outer_core(i) = 0._CUSTOM_REAL
+      enddo
 
-      b_veloc_crust_mantle(:,i) = b_veloc_crust_mantle(:,i) &
-        + b_deltatover2*b_accel_crust_mantle(:,i)
-      b_veloc_crust_mantle(:,i+1) = b_veloc_crust_mantle(:,i+1) &
-        + b_deltatover2*b_accel_crust_mantle(:,i+1)
-      b_veloc_crust_mantle(:,i+2) = b_veloc_crust_mantle(:,i+2) &
-        + b_deltatover2*b_accel_crust_mantle(:,i+2)
+      ! inner core
+      if(imodulo_NGLOB_INNER_CORE >= 1) then
+        do i=1,imodulo_NGLOB_INNER_CORE
+          b_displ_inner_core(:,i) = b_displ_inner_core(:,i) &
+            + b_deltat*b_veloc_inner_core(:,i) + b_deltatsqover2*b_accel_inner_core(:,i)
+          b_veloc_inner_core(:,i) = b_veloc_inner_core(:,i) &
+            + b_deltatover2*b_accel_inner_core(:,i)
+          b_accel_inner_core(:,i) = 0._CUSTOM_REAL
+        enddo
+      endif
+      do i=imodulo_NGLOB_INNER_CORE+1,NGLOB_INNER_CORE,3
+        b_displ_inner_core(:,i) = b_displ_inner_core(:,i) &
+          + b_deltat*b_veloc_inner_core(:,i) + b_deltatsqover2*b_accel_inner_core(:,i)
+        b_displ_inner_core(:,i+1) = b_displ_inner_core(:,i+1) &
+          + b_deltat*b_veloc_inner_core(:,i+1) + b_deltatsqover2*b_accel_inner_core(:,i+1)
+        b_displ_inner_core(:,i+2) = b_displ_inner_core(:,i+2) &
+          + b_deltat*b_veloc_inner_core(:,i+2) + b_deltatsqover2*b_accel_inner_core(:,i+2)
 
-      b_accel_crust_mantle(:,i) = 0._CUSTOM_REAL
-      b_accel_crust_mantle(:,i+1) = 0._CUSTOM_REAL
-      b_accel_crust_mantle(:,i+2) = 0._CUSTOM_REAL
-    enddo
+        b_veloc_inner_core(:,i) = b_veloc_inner_core(:,i) &
+          + b_deltatover2*b_accel_inner_core(:,i)
+        b_veloc_inner_core(:,i+1) = b_veloc_inner_core(:,i+1) &
+          + b_deltatover2*b_accel_inner_core(:,i+1)
+        b_veloc_inner_core(:,i+2) = b_veloc_inner_core(:,i+2) &
+          + b_deltatover2*b_accel_inner_core(:,i+2)
 
-    ! outer core
-    do i=1,NGLOB_OUTER_CORE
-      b_displ_outer_core(i) = b_displ_outer_core(i) &
-        + b_deltat*b_veloc_outer_core(i) + b_deltatsqover2*b_accel_outer_core(i)
-      b_veloc_outer_core(i) = b_veloc_outer_core(i) &
-        + b_deltatover2*b_accel_outer_core(i)
-      b_accel_outer_core(i) = 0._CUSTOM_REAL
-    enddo
-
-    ! inner core
-    if(imodulo_NGLOB_INNER_CORE >= 1) then
-      do i=1,imodulo_NGLOB_INNER_CORE
+        b_accel_inner_core(:,i) = 0._CUSTOM_REAL
+        b_accel_inner_core(:,i+1) = 0._CUSTOM_REAL
+        b_accel_inner_core(:,i+2) = 0._CUSTOM_REAL
+      enddo
+#else
+! way 1:
+      ! mantle
+      do i=1,NGLOB_CRUST_MANTLE
+        b_displ_crust_mantle(:,i) = b_displ_crust_mantle(:,i) &
+          + b_deltat*b_veloc_crust_mantle(:,i) + b_deltatsqover2*b_accel_crust_mantle(:,i)
+        b_veloc_crust_mantle(:,i) = b_veloc_crust_mantle(:,i) &
+          + b_deltatover2*b_accel_crust_mantle(:,i)
+        b_accel_crust_mantle(:,i) = 0._CUSTOM_REAL
+      enddo
+      ! outer core
+      do i=1,NGLOB_OUTER_CORE
+        b_displ_outer_core(i) = b_displ_outer_core(i) &
+          + b_deltat*b_veloc_outer_core(i) + b_deltatsqover2*b_accel_outer_core(i)
+        b_veloc_outer_core(i) = b_veloc_outer_core(i) &
+          + b_deltatover2*b_accel_outer_core(i)
+        b_accel_outer_core(i) = 0._CUSTOM_REAL
+      enddo
+      ! inner core
+      do i=1,NGLOB_INNER_CORE
         b_displ_inner_core(:,i) = b_displ_inner_core(:,i) &
           + b_deltat*b_veloc_inner_core(:,i) + b_deltatsqover2*b_accel_inner_core(:,i)
         b_veloc_inner_core(:,i) = b_veloc_inner_core(:,i) &
           + b_deltatover2*b_accel_inner_core(:,i)
         b_accel_inner_core(:,i) = 0._CUSTOM_REAL
       enddo
-    endif
-    do i=imodulo_NGLOB_INNER_CORE+1,NGLOB_INNER_CORE,3
-      b_displ_inner_core(:,i) = b_displ_inner_core(:,i) &
-        + b_deltat*b_veloc_inner_core(:,i) + b_deltatsqover2*b_accel_inner_core(:,i)
-      b_displ_inner_core(:,i+1) = b_displ_inner_core(:,i+1) &
-        + b_deltat*b_veloc_inner_core(:,i+1) + b_deltatsqover2*b_accel_inner_core(:,i+1)
-      b_displ_inner_core(:,i+2) = b_displ_inner_core(:,i+2) &
-        + b_deltat*b_veloc_inner_core(:,i+2) + b_deltatsqover2*b_accel_inner_core(:,i+2)
+#endif
+    endif ! SIMULATION_TYPE == 3
+  else
+    ! on GPU
+    ! Includes SIM_TYPE 1 & 3
 
-      b_veloc_inner_core(:,i) = b_veloc_inner_core(:,i) &
-        + b_deltatover2*b_accel_inner_core(:,i)
-      b_veloc_inner_core(:,i+1) = b_veloc_inner_core(:,i+1) &
-        + b_deltatover2*b_accel_inner_core(:,i+1)
-      b_veloc_inner_core(:,i+2) = b_veloc_inner_core(:,i+2) &
-        + b_deltatover2*b_accel_inner_core(:,i+2)
+    ! outer core region
+    call it_update_displacement_oc_cuda(Mesh_pointer, &
+                                       deltat, deltatsqover2, deltatover2, &
+                                       b_deltat, b_deltatsqover2, b_deltatover2)
+    ! inner core region
+    call it_update_displacement_ic_cuda(Mesh_pointer, &
+                                       deltat, deltatsqover2, deltatover2, &
+                                       b_deltat, b_deltatsqover2, b_deltatover2)
 
-      b_accel_inner_core(:,i) = 0._CUSTOM_REAL
-      b_accel_inner_core(:,i+1) = 0._CUSTOM_REAL
-      b_accel_inner_core(:,i+2) = 0._CUSTOM_REAL
-    enddo
-#else
-! way 1:
-    ! mantle
-    do i=1,NGLOB_CRUST_MANTLE
-      b_displ_crust_mantle(:,i) = b_displ_crust_mantle(:,i) &
-        + b_deltat*b_veloc_crust_mantle(:,i) + b_deltatsqover2*b_accel_crust_mantle(:,i)
-      b_veloc_crust_mantle(:,i) = b_veloc_crust_mantle(:,i) &
-        + b_deltatover2*b_accel_crust_mantle(:,i)
-      b_accel_crust_mantle(:,i) = 0._CUSTOM_REAL
-    enddo
-    ! outer core
-    do i=1,NGLOB_OUTER_CORE
-      b_displ_outer_core(i) = b_displ_outer_core(i) &
-        + b_deltat*b_veloc_outer_core(i) + b_deltatsqover2*b_accel_outer_core(i)
-      b_veloc_outer_core(i) = b_veloc_outer_core(i) &
-        + b_deltatover2*b_accel_outer_core(i)
-      b_accel_outer_core(i) = 0._CUSTOM_REAL
-    enddo
-    ! inner core
-    do i=1,NGLOB_INNER_CORE
-      b_displ_inner_core(:,i) = b_displ_inner_core(:,i) &
-        + b_deltat*b_veloc_inner_core(:,i) + b_deltatsqover2*b_accel_inner_core(:,i)
-      b_veloc_inner_core(:,i) = b_veloc_inner_core(:,i) &
-        + b_deltatover2*b_accel_inner_core(:,i)
-      b_accel_inner_core(:,i) = 0._CUSTOM_REAL
-    enddo
-#endif
-  endif ! SIMULATION_TYPE == 3
+    ! crust/mantle region
+    call it_update_displacement_cm_cuda(Mesh_pointer, &
+                                       deltat, deltatsqover2, deltatover2, &
+                                       b_deltat, b_deltatsqover2, b_deltatover2)
+  endif
 
   ! integral of strain for adjoint movie volume
   if(MOVIE_VOLUME .and. (MOVIE_VOLUME_TYPE == 2 .or. MOVIE_VOLUME_TYPE == 3) ) then
-!    Iepsilondev_crust_mantle(:,:,:,:,:) = Iepsilondev_crust_mantle(:,:,:,:,:)  &
-!                                            + deltat*epsilondev_crust_mantle(:,:,:,:,:)
+    if( GPU_MODE ) then
+      ! transfers strain arrays onto CPU
+      call transfer_strain_cm_from_device(Mesh_pointer,eps_trace_over_3_crust_mantle, &
+                                         epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle, &
+                                         epsilondev_xy_crust_mantle,epsilondev_xz_crust_mantle, &
+                                         epsilondev_yz_crust_mantle)
+    endif
+
+    ! updates integral values
     Iepsilondev_crust_mantle(1,:,:,:,:) = Iepsilondev_crust_mantle(1,:,:,:,:)  &
-                                            + deltat*epsilondev_xx_crust_mantle(:,:,:,:)
+                                              + deltat*epsilondev_xx_crust_mantle(:,:,:,:)
     Iepsilondev_crust_mantle(2,:,:,:,:) = Iepsilondev_crust_mantle(2,:,:,:,:)  &
-                                            + deltat*epsilondev_yy_crust_mantle(:,:,:,:)
+                                              + deltat*epsilondev_yy_crust_mantle(:,:,:,:)
     Iepsilondev_crust_mantle(3,:,:,:,:) = Iepsilondev_crust_mantle(3,:,:,:,:)  &
-                                            + deltat*epsilondev_xy_crust_mantle(:,:,:,:)
+                                              + deltat*epsilondev_xy_crust_mantle(:,:,:,:)
     Iepsilondev_crust_mantle(4,:,:,:,:) = Iepsilondev_crust_mantle(4,:,:,:,:)  &
-                                            + deltat*epsilondev_xz_crust_mantle(:,:,:,:)
+                                              + deltat*epsilondev_xz_crust_mantle(:,:,:,:)
     Iepsilondev_crust_mantle(5,:,:,:,:) = Iepsilondev_crust_mantle(5,:,:,:,:)  &
-                                            + deltat*epsilondev_yz_crust_mantle(:,:,:,:)
+                                              + deltat*epsilondev_yz_crust_mantle(:,:,:,:)
 
     Ieps_trace_over_3_crust_mantle(:,:,:,:) = Ieps_trace_over_3_crust_mantle(:,:,:,:) &
-                                            + deltat*eps_trace_over_3_crust_mantle(:,:,:,:)
+                                              + deltat*eps_trace_over_3_crust_mantle(:,:,:,:)
   endif
 
+
+
   end subroutine it_update_displacement_scheme
 
 
@@ -477,8 +503,62 @@
 ! transfers fields on GPU back onto CPU
 
   use specfem_par
+  use specfem_par_crustmantle
+  use specfem_par_innercore
+  use specfem_par_outercore
   implicit none
 
+  ! to store forward wave fields
+  if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+
+    call transfer_fields_cm_from_device(NDIM*NGLOB_CRUST_MANTLE, &
+                                    displ_crust_mantle,veloc_crust_mantle,accel_crust_mantle, &
+                                    Mesh_pointer)
+    call transfer_fields_ic_from_device(NDIM*NGLOB_INNER_CORE, &
+                                    displ_inner_core,veloc_inner_core,accel_inner_core, &
+                                    Mesh_pointer)
+    call transfer_fields_oc_from_device(NGLOB_OUTER_CORE, &
+                                    displ_outer_core,veloc_outer_core,accel_outer_core, &
+                                    Mesh_pointer)
+
+    call transfer_strain_cm_from_device(Mesh_pointer, &
+                                    epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle, &
+                                    epsilondev_xy_crust_mantle,epsilondev_xz_crust_mantle, &
+                                    epsilondev_yz_crust_mantle)
+    call transfer_strain_ic_from_device(Mesh_pointer, &
+                                    epsilondev_xx_inner_core,epsilondev_yy_inner_core, &
+                                    epsilondev_xy_inner_core,epsilondev_xz_inner_core, &
+                                    epsilondev_yz_inner_core)
+
+    if (ROTATION_VAL) then
+      call transfer_rotation_from_device(Mesh_pointer,A_array_rotation,B_array_rotation)
+    endif
+
+    ! note: for kernel simulations (SIMULATION_TYPE == 3), attenuation is
+    !          only mimicking effects on phase shifts, but not on amplitudes.
+    !          flag USE_ATTENUATION_MIMIC will have to be set to true in this case.
+    !
+    ! arrays b_R_xx, ... are not used when USE_ATTENUATION_MIMIC is set,
+    ! therefore no need to transfer arrays from GPU to CPU
+    !if (ATTENUATION) then
+    !endif
+
+  else if (SIMULATION_TYPE == 3) then
+    ! to store kernels
+    !call transfer_kernels_ac_to_host(Mesh_pointer,rho_ac_kl,kappa_ac_kl,NSPEC_AB)
+    !call transfer_kernels_el_to_host(Mesh_pointer,rho_kl,mu_kl,kappa_kl,NSPEC_AB)
+
+    ! specific noise strength kernel
+    if( NOISE_TOMOGRAPHY == 3 ) then
+      !call transfer_kernels_noise_to_host(Mesh_pointer,Sigma_kl,NSPEC_AB)
+    endif
+
+    ! approximative hessian for preconditioning kernels
+    if ( APPROXIMATE_HESS_KL ) then
+      call transfer_kernels_hess_cm_tohost(Mesh_pointer,hess_kl_crust_mantle,NSPEC_CRUST_MANTLE)
+    endif
+  endif
+
   ! frees allocated memory on GPU
   call prepare_cleanup_device(Mesh_pointer)
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/locate_receivers.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/locate_receivers.f90	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/locate_receivers.f90	2012-02-22 04:38:32 UTC (rev 19659)
@@ -703,7 +703,7 @@
 
 ! main process broadcasts the results to all the slices
   call MPI_BCAST(nrec,1,MPI_INTEGER,0,MPI_COMM_WORLD,ier)
-  
+
   call MPI_BCAST(islice_selected_rec,nrec,MPI_INTEGER,0,MPI_COMM_WORLD,ier)
   call MPI_BCAST(ispec_selected_rec,nrec,MPI_INTEGER,0,MPI_COMM_WORLD,ier)
   call MPI_BCAST(xi_receiver,nrec,MPI_DOUBLE_PRECISION,0,MPI_COMM_WORLD,ier)

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/noise_tomography.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/noise_tomography.f90	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/noise_tomography.f90	2012-02-22 04:38:32 UTC (rev 19659)
@@ -417,10 +417,10 @@
 ! furthermore, the CMTSOLUTION needs to be zero, i.e., no earthquakes.
 ! now this must be manually set in DATA/CMTSOLUTION, by USERS.
 
-  use specfem_par  
+  use specfem_par
   use specfem_par_crustmantle
   implicit none
-  
+
   ! local parameters
   integer :: i,j,k,iglob
 
@@ -442,11 +442,7 @@
 
   else
     ! on GPU
-    call load_GPU_elastic()
-    
-    call noise_add_source_master_rec_cu(Mesh_pointer,it,irec_master_noise,islice_selected_rec)  
-    
-    call load_CPU_elastic()
+    call noise_add_source_master_rec_cu(Mesh_pointer,it,irec_master_noise,islice_selected_rec)
   endif
 
   end subroutine noise_add_source_master_rec
@@ -475,7 +471,7 @@
 
   ! local parameters
   integer :: ispec2D,ispec,i,j,k,iglob
-  
+
   ! get coordinates of surface mesh and surface displacement
   if( .not. GPU_MODE ) then
     ! on CPU
@@ -493,7 +489,7 @@
     ! on GPU
     call noise_transfer_surface_to_host(Mesh_pointer,noise_surface_movie)
   endif
-  
+
   ! save surface motion to disk
   call write_abs(9,noise_surface_movie,CUSTOM_REAL*NDIM*NGLLX*NGLLY*nspec_top,it)
 
@@ -522,17 +518,17 @@
 
   use specfem_par
   use specfem_par_crustmantle
-  
+
   implicit none
 
   real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_CRUST_MANTLE),intent(inout) :: accel
   integer,intent(in) :: it_index
-  
+
   ! local parameters
   integer :: ipoin,ispec2D,ispec,i,j,k,iglob
   real(kind=CUSTOM_REAL) :: eta
-  
-  
+
+
   ! read surface movie
   call read_abs(9,noise_surface_movie,CUSTOM_REAL*NDIM*NGLLX*NGLLY*nspec_top,it_index)
 
@@ -572,7 +568,7 @@
 
   else
     ! on GPU
-    call noise_add_surface_movie_cuda(Mesh_pointer,noise_surface_movie)    
+    call noise_add_surface_movie_cuda(Mesh_pointer,noise_surface_movie)
   endif
 
   end subroutine noise_read_add_surface_movie

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2012-02-22 04:38:32 UTC (rev 19659)
@@ -30,12 +30,12 @@
   use specfem_par
   use specfem_par_movie
   implicit none
-  
+
   include 'mpif.h'
 
   ! get MPI starting time
   time_start = MPI_WTIME()
-  
+
   ! user output infos
   call prepare_timerun_user_output()
 
@@ -44,7 +44,7 @@
 
   ! convert x/y/z into r/theta/phi spherical coordinates
   call prepare_timerun_convert_coord()
-  
+
   ! allocate files to save movies
   ! for noise tomography, store_val_x/y/z/ux/uy/uz needed for 'surface movie'
   if(MOVIE_SURFACE .or. NOISE_TOMOGRAPHY /= 0 ) then
@@ -310,13 +310,13 @@
   use specfem_par
   use specfem_par_crustmantle
   use specfem_par_innercore
-  use specfem_par_outercore  
+  use specfem_par_outercore
   implicit none
 
   ! local parameters
   integer :: i
   real(kind=CUSTOM_REAL) :: rval,thetaval,phival
-  
+
   ! change x, y, z to r, theta and phi once and for all
   ! IMPROVE dangerous: old name kept (xstore ystore zstore) for new values
 
@@ -361,7 +361,7 @@
   use specfem_par
   use specfem_par_movie
   implicit none
-  
+
   ! local parameters
   integer :: ier
 
@@ -415,7 +415,7 @@
   use specfem_par_crustmantle
   use specfem_par_movie
   implicit none
-  
+
   ! local parameters
   integer :: ier
 
@@ -815,7 +815,7 @@
   subroutine prepare_timerun_init_wavefield()
 
 ! initializes arrays
- 
+
   use specfem_par
   use specfem_par_crustmantle
   use specfem_par_innercore
@@ -1082,7 +1082,8 @@
                                   ANISOTROPIC_3D_MANTLE_VAL, &
                                   ANISOTROPIC_INNER_CORE_VAL, &
                                   SAVE_BOUNDARY_MESH, &
-                                  USE_MESH_COLORING_GPU)
+                                  USE_MESH_COLORING_GPU, &
+                                  APPROXIMATE_HESS_KL)
   call sync_all()
 
   ! prepares rotation arrays
@@ -1209,13 +1210,13 @@
                                     jacobian2D_ymin_outer_core,jacobian2D_ymax_outer_core, &
                                     jacobian2D_bottom_outer_core, &
                                     vp_outer_core)
-    
+
   endif
   call sync_all()
-  
+
   ! prepares MPI interfaces
   if(myrank == 0 ) write(IMAIN,*) "  loading mpi interfaces"
-  
+
   call prepare_mpi_buffers_device(Mesh_pointer, &
                                 num_interfaces_crust_mantle,max_nibool_interfaces_crust_mantle, &
                                 nibool_interfaces_crust_mantle,ibool_interfaces_crust_mantle, &

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_forward_arrays.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_forward_arrays.f90	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_forward_arrays.f90	2012-02-22 04:38:32 UTC (rev 19659)
@@ -73,7 +73,7 @@
   if(NUMBER_OF_THIS_RUN > 1) then
     write(outputname,"('dump_all_arrays',i6.6)") myrank
     open(unit=55,file=trim(LOCAL_PATH)//'/'//outputname,status='old',action='read',form='unformatted')
-    
+
     read(55) displ_crust_mantle
     read(55) veloc_crust_mantle
     read(55) accel_crust_mantle
@@ -175,11 +175,18 @@
   implicit none
 
   !local parameters
+  integer :: ier
   character(len=150) outputname
 
   write(outputname,'(a,i6.6,a)') 'proc',myrank,'_save_forward_arrays.bin'
-  open(unit=55,file=trim(LOCAL_PATH)//'/'//outputname,status='old',action='read',form='unformatted')
-  
+  open(unit=55,file=trim(LOCAL_PATH)//'/'//outputname, &
+        status='old',action='read',form='unformatted',iostat=ier)
+  if( ier /= 0 ) then
+    print*,'error: opening proc_****_save_forward_arrays.bin'
+    print*,'path: ',trim(LOCAL_PATH)//'/'//outputname
+    call exit_mpi(myrank,'error open file save_forward_arrays.bin')
+  endif
+
   read(55) b_displ_crust_mantle
   read(55) b_veloc_crust_mantle
   read(55) b_accel_crust_mantle
@@ -202,10 +209,41 @@
   read(55) b_epsilondev_xz_inner_core
   read(55) b_epsilondev_yz_inner_core
 
+  ! transfers fields onto GPU
+  if(GPU_MODE) then
+    call transfer_b_fields_cm_to_device(NDIM*NGLOB_CRUST_MANTLE, &
+                                    b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle, &
+                                    Mesh_pointer)
+
+    call transfer_b_fields_ic_to_device(NDIM*NGLOB_INNER_CORE, &
+                                    b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core, &
+                                    Mesh_pointer)
+
+    call transfer_b_fields_oc_to_device(NGLOB_OUTER_CORE, &
+                                    b_displ_outer_core,b_veloc_outer_core,b_accel_outer_core, &
+                                    Mesh_pointer)
+
+    call transfer_b_strain_cm_to_device(Mesh_pointer, &
+                                    b_epsilondev_xx_crust_mantle,b_epsilondev_yy_crust_mantle, &
+                                    b_epsilondev_xy_crust_mantle,b_epsilondev_xz_crust_mantle, &
+                                    b_epsilondev_yz_crust_mantle)
+
+    call transfer_b_strain_ic_to_device(Mesh_pointer, &
+                                    b_epsilondev_xx_inner_core,b_epsilondev_yy_inner_core, &
+                                    b_epsilondev_xy_inner_core,b_epsilondev_xz_inner_core, &
+                                    b_epsilondev_yz_inner_core)
+  endif
+
+
   if (ROTATION_VAL) then
     read(55) b_A_array_rotation
     read(55) b_B_array_rotation
+    ! transfers to GPU
+    if(GPU_MODE) then
+      call transfer_b_rotation_to_device(Mesh_pointer,b_A_array_rotation,b_B_array_rotation)
+    endif
   endif
+
   if (ATTENUATION_VAL) then
     read(55) b_R_xx_crust_mantle
     read(55) b_R_yy_crust_mantle
@@ -219,8 +257,17 @@
     read(55) b_R_xz_inner_core
     read(55) b_R_yz_inner_core
 
+    ! note: for kernel simulations (SIMULATION_TYPE == 3), attenuation is
+    !          only mimicking effects on phase shifts, but not on amplitudes.
+    !          flag USE_ATTENUATION_MIMIC will have to be set to true in this case.
+    !
+    ! arrays b_R_xx, ... are not used when USE_ATTENUATION_MIMIC is set,
+    ! therefore no need to transfer arrays onto GPU
+    !if(GPU_MODE) then
+    !endif
+
   endif
-  
+
   close(55)
 
   end subroutine read_forward_arrays

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.f90	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.f90	2012-02-22 04:38:32 UTC (rev 19659)
@@ -34,7 +34,7 @@
   implicit none
 
   include 'mpif.h'
-  
+
   ! local parameters
   integer :: ier
 
@@ -53,16 +53,16 @@
 
   ! reads "solver_data_1.bin" & "solver_data_2.bin" files for crust and mantle
   call read_mesh_databases_CM()
-  
+
   ! reads "solver_data_1.bin" & "solver_data_2.bin" files for outer core
   call read_mesh_databases_OC()
-  
+
   ! reads "solver_data_1.bin" & "solver_data_2.bin" files for inner core
   call read_mesh_databases_IC()
 
   ! reads "boundary.bin" files to couple mantle with outer core and inner core boundaries
   call read_mesh_databases_coupling()
-  
+
   ! reads "addressing.txt" 2-D addressing for summation between slices with MPI
   call read_mesh_databases_addressing()
 
@@ -89,7 +89,7 @@
     write(IMAIN,*) 'Elapsed time for reading mesh in seconds = ',sngl(tCPU)
     write(IMAIN,*)
   endif
-  
+
   ! frees temporary allocated arrays
   deallocate(is_on_a_slice_edge_crust_mantle, &
             is_on_a_slice_edge_outer_core, &
@@ -134,7 +134,7 @@
     READ_KAPPA_MU = .true.
     READ_TISO = .true.
   endif
-  
+
   call read_arrays_solver(IREGION_CRUST_MANTLE,myrank, &
             rho_vp_crust_mantle,rho_vs_crust_mantle, &
             xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
@@ -181,13 +181,13 @@
   ! local parameters
   integer :: nspec_iso,nspec_tiso,nspec_ani
   logical :: READ_KAPPA_MU,READ_TISO
-  
+
   ! dummy array that does not need to be actually read
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,1) :: dummy_array
 
   logical, dimension(:),allocatable:: dummy_ispec_is_tiso
   integer, dimension(:),allocatable :: dummy_idoubling_outer_core
-  
+
   ! outer core (no anisotropy nor S velocity)
   ! rmass_ocean_load is not used in this routine because it is meaningless in the outer core
   READ_KAPPA_MU = .false.
@@ -195,7 +195,7 @@
   nspec_iso = NSPEC_OUTER_CORE
   nspec_tiso = 1
   nspec_ani = 1
-  
+
   ! dummy allocation
   allocate(dummy_ispec_is_tiso(NSPEC_OUTER_CORE))
   allocate(dummy_idoubling_outer_core(NSPEC_OUTER_CORE))
@@ -251,7 +251,7 @@
   ! dummy array that does not need to be actually read
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,1) :: dummy_array
   logical, dimension(:),allocatable:: dummy_ispec_is_tiso
-  
+
   ! inner core (no anisotropy)
   ! rmass_ocean_load is not used in this routine because it is meaningless in the inner core
   READ_KAPPA_MU = .true.
@@ -263,10 +263,10 @@
   else
     nspec_ani = 1
   endif
-  
+
   ! dummy allocation
   allocate(dummy_ispec_is_tiso(NSPEC_INNER_CORE))
-  
+
   call read_arrays_solver(IREGION_INNER_CORE,myrank, &
             dummy_array,dummy_array, &
             xstore_inner_core,ystore_inner_core,zstore_inner_core, &
@@ -313,7 +313,7 @@
   implicit none
 
   include 'mpif.h'
-  
+
   ! local parameters
   integer njunk1,njunk2,njunk3
 
@@ -595,7 +595,7 @@
   use specfem_par
   use specfem_par_crustmantle
   use specfem_par_innercore
-  use specfem_par_outercore  
+  use specfem_par_outercore
   implicit none
 
   ! local parameters
@@ -606,7 +606,7 @@
 
   ! mantle and crust
   if(myrank == 0) write(IMAIN,*) 'crust/mantle region:'
-  
+
   call read_arrays_buffers_solver(IREGION_CRUST_MANTLE,myrank,iboolleft_xi_crust_mantle, &
      iboolright_xi_crust_mantle,iboolleft_eta_crust_mantle,iboolright_eta_crust_mantle, &
      npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle, &
@@ -634,7 +634,7 @@
 
   ! inner core
   if(myrank == 0) write(IMAIN,*) 'inner core region:'
-  
+
   call read_arrays_buffers_solver(IREGION_INNER_CORE,myrank, &
      iboolleft_xi_inner_core,iboolright_xi_inner_core,iboolleft_eta_inner_core,iboolright_eta_inner_core, &
      npoin2D_xi_inner_core,npoin2D_eta_inner_core, &
@@ -721,11 +721,11 @@
 
   ! note: fix_... routines below update is_on_a_slice_edge_.. arrays:
   !          assign flags for each element which is on a rim of the slice
-  !          thus, they include elements on top and bottom not shared with other MPI partitions  
+  !          thus, they include elements on top and bottom not shared with other MPI partitions
   !
   !          we will re-set these flags when setting up inner/outer elements, but will
-  !          use these arrays for now as initial guess for the search for elements which share a global point 
-  !          between different MPI processes  
+  !          use these arrays for now as initial guess for the search for elements which share a global point
+  !          between different MPI processes
   call fix_non_blocking_slices(is_on_a_slice_edge_crust_mantle,iboolright_xi_crust_mantle, &
          iboolleft_xi_crust_mantle,iboolright_eta_crust_mantle,iboolleft_eta_crust_mantle, &
          npoin2D_xi_crust_mantle,npoin2D_eta_crust_mantle,ibool_crust_mantle, &
@@ -748,31 +748,31 @@
          idoubling_inner_core,npoin2D_cube_from_slices,ibool_central_cube, &
          NSPEC2D_BOTTOM(IREGION_INNER_CORE),ichunk)
   endif
-  
+
   ! debug: saves element flags
   ! crust mantle
-  !write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_is_on_a_slice_edge_crust_mantle_proc',myrank  
+  !write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_is_on_a_slice_edge_crust_mantle_proc',myrank
   !call write_VTK_data_elem_l(NSPEC_CRUST_MANTLE,NGLOB_CRUST_MANTLE, &
   !                          xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
   !                          ibool_crust_mantle, &
-  !                          is_on_a_slice_edge_crust_mantle,filename)  
+  !                          is_on_a_slice_edge_crust_mantle,filename)
   ! outer core
-  !write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_is_on_a_slice_edge_outer_core_proc',myrank  
+  !write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_is_on_a_slice_edge_outer_core_proc',myrank
   !call write_VTK_data_elem_l(NSPEC_OUTER_CORE,NGLOB_OUTER_CORE, &
   !                          xstore_outer_core,ystore_outer_core,zstore_outer_core, &
   !                          ibool_outer_core, &
   !                          is_on_a_slice_edge_outer_core,filename)
-!daniel  
+!daniel
   ! inner core
-  write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_is_on_a_slice_edge_inner_core_proc',myrank  
+  write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_is_on_a_slice_edge_inner_core_proc',myrank
   call write_VTK_data_elem_l(NSPEC_INNER_CORE,NGLOB_INNER_CORE, &
                             xstore_inner_core,ystore_inner_core,zstore_inner_core, &
                             ibool_inner_core, &
                             is_on_a_slice_edge_inner_core,filename)
-  
+
   end subroutine read_mesh_databases_MPIbuffers
-  
-  
+
+
 !
 !-------------------------------------------------------------------------------------------------
 !
@@ -784,29 +784,29 @@
   use specfem_par
   use specfem_par_crustmantle
   use specfem_par_innercore
-  use specfem_par_outercore  
+  use specfem_par_outercore
   implicit none
 
   include 'mpif.h'
-  
-  ! local parameters  
+
+  ! local parameters
   integer :: ier,ndim_assemble
   character(len=150) :: filename
 
   ! temporary buffers for send and receive between faces of the slices and the chunks
   real(kind=CUSTOM_REAL), dimension(npoin2D_max_all_CM_IC) ::  &
     buffer_send_faces_scalar,buffer_received_faces_scalar
-  
+
   ! assigns initial maximum arrays
   ! for global slices, maximum number of neighbor is around 17 ( 8 horizontal, max of 8 on bottom )
   integer, parameter :: MAX_NEIGHBOURS = 8 + NCORNERSCHUNKS_VAL
   integer, dimension(MAX_NEIGHBOURS) :: my_neighbours,nibool_neighbours
   integer, dimension(:,:),allocatable :: ibool_neighbours
-  integer :: max_nibool  
+  integer :: max_nibool
   real(kind=CUSTOM_REAL),dimension(:),allocatable :: test_flag
   real(kind=CUSTOM_REAL),dimension(:),allocatable :: test_flag_cc
   integer :: i,j,k,ispec,iglob
-  
+
   ! estimates initial maximum ibool array
   max_nibool = npoin2D_max_all_CM_IC * NUMFACES_SHARED &
                + non_zero_nb_msgs_theor_in_cube*npoin2D_cube_from_slices
@@ -815,13 +815,13 @@
   if( ier /= 0 ) call exit_mpi(myrank,'error allocating ibool_neighbours')
 
 
-! sets up MPI interfaces  
-! crust mantle region  
+! sets up MPI interfaces
+! crust mantle region
   if( myrank == 0 ) write(IMAIN,*) 'crust mantle mpi:'
   allocate(test_flag(NGLOB_CRUST_MANTLE), &
           stat=ier)
   if( ier /= 0 ) call exit_mpi(myrank,'error allocating test_flag')
-  
+
   ! sets flag to rank id (+1 to avoid problems with zero rank)
   test_flag(:) = myrank + 1.0
 
@@ -839,7 +839,7 @@
             NUMMSGS_FACES,NUM_MSG_TYPES,NCORNERSCHUNKS, &
             NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL(IREGION_CRUST_MANTLE), &
             NGLOB2DMAX_XMIN_XMAX(IREGION_CRUST_MANTLE),NGLOB2DMAX_YMIN_YMAX(IREGION_CRUST_MANTLE),NGLOB2DMAX_XY,NCHUNKS_VAL)
-    
+
   ! removes own myrank id (+1)
   test_flag(:) = test_flag(:) - ( myrank + 1.0)
 
@@ -848,21 +848,21 @@
   !call write_VTK_glob_points(NGLOB_CRUST_MANTLE, &
   !                      xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
   !                      test_flag,filename)
-  
-  ! determines neighbor rank for shared faces 
+
+  ! determines neighbor rank for shared faces
   call rmd_get_MPI_interfaces(myrank,NGLOB_CRUST_MANTLE,NSPEC_CRUST_MANTLE, &
                             test_flag,my_neighbours,nibool_neighbours,ibool_neighbours, &
                             num_interfaces_crust_mantle,max_nibool_interfaces_crust_mantle, &
                             max_nibool,MAX_NEIGHBOURS, &
                             ibool_crust_mantle,&
                             is_on_a_slice_edge_crust_mantle, &
-                            IREGION_CRUST_MANTLE,.false.)  
+                            IREGION_CRUST_MANTLE,.false.)
 
   deallocate(test_flag)
-    
+
   ! stores MPI interfaces informations
   allocate(my_neighbours_crust_mantle(num_interfaces_crust_mantle), &
-          nibool_interfaces_crust_mantle(num_interfaces_crust_mantle), &          
+          nibool_interfaces_crust_mantle(num_interfaces_crust_mantle), &
           stat=ier)
   if( ier /= 0 ) call exit_mpi(myrank,'error allocating array my_neighbours_crust_mantle etc.')
 
@@ -877,13 +877,13 @@
     ! number of global ibool entries on each interface
     nibool_interfaces_crust_mantle(:) = nibool_neighbours(1:num_interfaces_crust_mantle)
     ! global iglob point ids on each interface
-    ibool_interfaces_crust_mantle(:,:) = ibool_neighbours(1:max_nibool_interfaces_crust_mantle,1:num_interfaces_crust_mantle)    
+    ibool_interfaces_crust_mantle(:,:) = ibool_neighbours(1:max_nibool_interfaces_crust_mantle,1:num_interfaces_crust_mantle)
   else
     ! dummy allocation (fortran90 should allow allocate statement with zero array size)
     max_nibool_interfaces_crust_mantle = 0
     allocate(ibool_interfaces_crust_mantle(0,0),stat=ier)
   endif
-  
+
   ! debug: saves 1. MPI interface
   !if( num_interfaces_crust_mantle >= 1 ) then
   !  write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_1_points_crust_mantle_proc',myrank
@@ -894,13 +894,13 @@
   !endif
 
 
-! outer core region  
+! outer core region
   if( myrank == 0 ) write(IMAIN,*) 'outer core mpi:'
 
   allocate(test_flag(NGLOB_OUTER_CORE), &
           stat=ier)
   if( ier /= 0 ) call exit_mpi(myrank,'error allocating test_flag outer core')
-  
+
   ! sets flag to rank id (+1 to avoid problems with zero rank)
   test_flag(:) = myrank + 1.0
 
@@ -919,7 +919,7 @@
             NPROC_XI_VAL,NPROC_ETA_VAL,NGLOB1D_RADIAL(IREGION_OUTER_CORE), &
             NGLOB2DMAX_XMIN_XMAX(IREGION_OUTER_CORE),NGLOB2DMAX_YMIN_YMAX(IREGION_OUTER_CORE),NGLOB2DMAX_XY,NCHUNKS_VAL)
 
-    
+
   ! removes own myrank id (+1)
   test_flag(:) = test_flag(:) - ( myrank + 1.0)
 
@@ -929,21 +929,21 @@
   !                      xstore_outer_core,ystore_outer_core,zstore_outer_core, &
   !                      test_flag,filename)
 
-  
-  ! determines neighbor rank for shared faces 
+
+  ! determines neighbor rank for shared faces
   call rmd_get_MPI_interfaces(myrank,NGLOB_OUTER_CORE,NSPEC_OUTER_CORE, &
                             test_flag,my_neighbours,nibool_neighbours,ibool_neighbours, &
                             num_interfaces_outer_core,max_nibool_interfaces_outer_core, &
                             max_nibool,MAX_NEIGHBOURS, &
                             ibool_outer_core,&
                             is_on_a_slice_edge_outer_core, &
-                            IREGION_OUTER_CORE,.false.)  
+                            IREGION_OUTER_CORE,.false.)
 
   deallocate(test_flag)
-    
+
   ! stores MPI interfaces informations
   allocate(my_neighbours_outer_core(num_interfaces_outer_core), &
-          nibool_interfaces_outer_core(num_interfaces_outer_core), &          
+          nibool_interfaces_outer_core(num_interfaces_outer_core), &
           stat=ier)
   if( ier /= 0 ) call exit_mpi(myrank,'error allocating array my_neighbours_outer_core etc.')
 
@@ -958,7 +958,7 @@
     ! number of global ibool entries on each interface
     nibool_interfaces_outer_core(:) = nibool_neighbours(1:num_interfaces_outer_core)
     ! global iglob point ids on each interface
-    ibool_interfaces_outer_core(:,:) = ibool_neighbours(1:max_nibool_interfaces_outer_core,1:num_interfaces_outer_core)    
+    ibool_interfaces_outer_core(:,:) = ibool_neighbours(1:max_nibool_interfaces_outer_core,1:num_interfaces_outer_core)
   else
     ! dummy allocation (fortran90 should allow allocate statement with zero array size)
     max_nibool_interfaces_outer_core = 0
@@ -980,16 +980,16 @@
   allocate(test_flag(NGLOB_INNER_CORE), &
           stat=ier)
   if( ier /= 0 ) call exit_mpi(myrank,'error allocating test_flag inner core')
-  
+
   ! sets flag to rank id (+1 to avoid problems with zero rank)
   test_flag(:) = 0.0
   do ispec=1,NSPEC_INNER_CORE
     ! suppress fictitious elements in central cube
-    if(idoubling_inner_core(ispec) == IFLAG_IN_FICTITIOUS_CUBE) cycle    
+    if(idoubling_inner_core(ispec) == IFLAG_IN_FICTITIOUS_CUBE) cycle
     do k = 1,NGLLZ
       do j = 1,NGLLY
         do i = 1,NGLLX
-          iglob = ibool_inner_core(i,j,k,ispec)          
+          iglob = ibool_inner_core(i,j,k,ispec)
           test_flag(iglob) = myrank + 1.0
         enddo
       enddo
@@ -1012,7 +1012,7 @@
             NGLOB2DMAX_XMIN_XMAX(IREGION_INNER_CORE),NGLOB2DMAX_YMIN_YMAX(IREGION_INNER_CORE),NGLOB2DMAX_XY,NCHUNKS_VAL)
 
   ! removes own myrank id (+1)
-  test_flag(:) = test_flag(:) - ( myrank + 1.0)  
+  test_flag(:) = test_flag(:) - ( myrank + 1.0)
   where( test_flag(:) < 0.0 ) test_flag(:) = 0.0
 
   ! debug: saves array
@@ -1022,14 +1022,14 @@
                         test_flag,filename)
 
 !  ! gets new interfaces for inner_core without central cube yet
-!  ! determines neighbor rank for shared faces 
+!  ! determines neighbor rank for shared faces
 !  call rmd_get_MPI_interfaces(myrank,NGLOB_INNER_CORE,NSPEC_INNER_CORE, &
 !                            test_flag,my_neighbours,nibool_neighbours,ibool_neighbours, &
 !                            num_interfaces_inner_core,max_nibool_interfaces_inner_core, &
 !                            max_nibool,MAX_NEIGHBOURS, &
 !                            ibool_inner_core,&
 !                            is_on_a_slice_edge_inner_core, &
-!                            IREGION_INNER_CORE,.false.,idoubling_inner_core)  
+!                            IREGION_INNER_CORE,.false.,idoubling_inner_core)
 
   ! including central cube
   if(INCLUDE_CENTRAL_CUBE) then
@@ -1038,22 +1038,22 @@
     allocate(test_flag_cc(NGLOB_INNER_CORE), &
             stat=ier)
     if( ier /= 0 ) call exit_mpi(myrank,'error allocating test_flag_cc inner core')
-  
+
     ! re-sets flag to rank id (+1 to avoid problems with zero rank)
     test_flag_cc(:) = 0.0
     do ispec=1,NSPEC_INNER_CORE
       ! suppress fictitious elements in central cube
-      if(idoubling_inner_core(ispec) == IFLAG_IN_FICTITIOUS_CUBE) cycle    
+      if(idoubling_inner_core(ispec) == IFLAG_IN_FICTITIOUS_CUBE) cycle
       do k = 1,NGLLZ
         do j = 1,NGLLY
           do i = 1,NGLLX
-            iglob = ibool_inner_core(i,j,k,ispec)          
+            iglob = ibool_inner_core(i,j,k,ispec)
             test_flag_cc(iglob) = myrank + 1.0
           enddo
         enddo
       enddo
-    enddo  
-  
+    enddo
+
     ! test_flag is a scalar, not a vector
     ndim_assemble = 1
     ! use central cube buffers to assemble the inner core mass matrix with the central cube
@@ -1065,12 +1065,12 @@
                  ibelm_bottom_inner_core, NSPEC2D_BOTTOM(IREGION_INNER_CORE), &
                  NGLOB_INNER_CORE, &
                  test_flag_cc,ndim_assemble)
-                 
 
+
     ! removes own myrank id (+1)
-    test_flag_cc(:) = test_flag_cc(:) - ( myrank + 1.0)  
+    test_flag_cc(:) = test_flag_cc(:) - ( myrank + 1.0)
     where( test_flag_cc(:) < 0.0 ) test_flag_cc(:) = 0.0
-                 
+
     write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_test_flag_inner_core_B_proc',myrank
     call write_VTK_glob_points(NGLOB_INNER_CORE, &
                         xstore_inner_core,ystore_inner_core,zstore_inner_core, &
@@ -1084,35 +1084,35 @@
 !                            ibool_inner_core,&
 !                            is_on_a_slice_edge_inner_core, &
 !                            IREGION_INNER_CORE,.true.,idoubling_inner_core)
-     
-    ! adds both together 
+
+    ! adds both together
     test_flag(:) = test_flag(:) + test_flag_cc(:)
 
-    deallocate(test_flag_cc)    
+    deallocate(test_flag_cc)
 
     ! debug: saves array
     write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_test_flag_inner_core_C_proc',myrank
     call write_VTK_glob_points(NGLOB_INNER_CORE, &
                               xstore_inner_core,ystore_inner_core,zstore_inner_core, &
                               test_flag,filename)
-  
+
   endif
 
   ! gets new interfaces for inner_core without central cube yet
-  ! determines neighbor rank for shared faces 
+  ! determines neighbor rank for shared faces
   call rmd_get_MPI_interfaces(myrank,NGLOB_INNER_CORE,NSPEC_INNER_CORE, &
                             test_flag,my_neighbours,nibool_neighbours,ibool_neighbours, &
                             num_interfaces_inner_core,max_nibool_interfaces_inner_core, &
                             max_nibool,MAX_NEIGHBOURS, &
                             ibool_inner_core,&
                             is_on_a_slice_edge_inner_core, &
-                            IREGION_INNER_CORE,.false.,idoubling_inner_core)  
-      
+                            IREGION_INNER_CORE,.false.,idoubling_inner_core)
+
   deallocate(test_flag)
-  
+
   ! stores MPI interfaces informations
   allocate(my_neighbours_inner_core(num_interfaces_inner_core), &
-          nibool_interfaces_inner_core(num_interfaces_inner_core), &          
+          nibool_interfaces_inner_core(num_interfaces_inner_core), &
           stat=ier)
   if( ier /= 0 ) call exit_mpi(myrank,'error allocating array my_neighbours_inner_core etc.')
 
@@ -1127,7 +1127,7 @@
     ! number of global ibool entries on each interface
     nibool_interfaces_inner_core(:) = nibool_neighbours(1:num_interfaces_inner_core)
     ! global iglob point ids on each interface
-    ibool_interfaces_inner_core(:,:) = ibool_neighbours(1:max_nibool_interfaces_inner_core,1:num_interfaces_inner_core)    
+    ibool_interfaces_inner_core(:,:) = ibool_neighbours(1:max_nibool_interfaces_inner_core,1:num_interfaces_inner_core)
   else
     ! dummy allocation (fortran90 should allow allocate statement with zero array size)
     max_nibool_interfaces_inner_core = 0
@@ -1136,19 +1136,19 @@
 
   ! debug: saves 1. MPI interface
   if( myrank == 0 .and. num_interfaces_inner_core >= 1 ) then
-    write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_1_points_inner_core_proc',myrank  
+    write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_1_points_inner_core_proc',myrank
     call write_VTK_data_points(NGLOB_INNER_CORE, &
                         xstore_inner_core,ystore_inner_core,zstore_inner_core, &
                         ibool_interfaces_inner_core(1:nibool_interfaces_inner_core(1),1), &
                         nibool_interfaces_inner_core(1),filename)
     !print*,'saved: ',trim(filename)//'.vtk'
-  endif  
+  endif
 
   ! synchronizes MPI processes
   call sync_all()
 
   ! frees temporary array
-  deallocate(ibool_neighbours)  
+  deallocate(ibool_neighbours)
 
 
   ! allocates MPI buffers
@@ -1183,7 +1183,7 @@
             stat=ier)
     if( ier /= 0 ) call exit_mpi(myrank,'error allocating array b_buffer_send_vector_outer_core etc.')
   endif
-  
+
   ! inner core
   allocate(buffer_send_vector_inner_core(NDIM,max_nibool_interfaces_inner_core,num_interfaces_inner_core), &
           buffer_recv_vector_inner_core(NDIM,max_nibool_interfaces_inner_core,num_interfaces_inner_core), &
@@ -1199,7 +1199,7 @@
             stat=ier)
     if( ier /= 0 ) call exit_mpi(myrank,'error allocating array b_buffer_send_vector_inner_core etc.')
   endif
-  
+
   end subroutine read_mesh_databases_MPIinter
 
 
@@ -1216,7 +1216,7 @@
                                     IREGION,add_central_cube,idoubling)
 
   use constants
-  
+
   implicit none
 
   include 'mpif.h'
@@ -1233,14 +1233,14 @@
   integer,intent(out) :: num_interfaces,max_nibool_interfaces
 
   integer,dimension(NGLLX,NGLLY,NGLLZ,NSPEC),intent(in) :: ibool
-  
+
   logical,dimension(NSPEC),intent(inout) :: is_on_a_slice_edge
-  
-  integer,intent(in) :: IREGION  
+
+  integer,intent(in) :: IREGION
   logical,intent(in) :: add_central_cube
   integer,dimension(NSPEC),optional:: idoubling
-  
-  ! local parameters  
+
+  ! local parameters
   integer :: ispec,iglob,j,k
   integer :: iface,iedge,icorner
   integer :: ii,iinterface,icurrent,rank
@@ -1248,12 +1248,12 @@
   logical :: is_done,ispec_is_outer
   integer,dimension(NGLOB) :: work_test_flag
   logical,dimension(NSPEC) :: work_ispec_is_outer
-  
-  ! initializes 
+
+  ! initializes
   if( add_central_cube) then
     ! adds points to existing inner_core interfaces
     iinterface = num_interfaces
-    work_ispec_is_outer(:) = is_on_a_slice_edge(:)    
+    work_ispec_is_outer(:) = is_on_a_slice_edge(:)
   else
     ! creates new interfaces
     iinterface = 0
@@ -1264,7 +1264,7 @@
     ibool_neighbours(:,:) = 0
     work_ispec_is_outer(:) = .false.
   endif
-  
+
   ! makes working copy (converted to nearest integers)
   work_test_flag(:) = nint( test_flag(:) )
 
@@ -1279,44 +1279,44 @@
       if( idoubling(ispec) == IFLAG_IN_FICTITIOUS_CUBE ) cycle
     endif
 
-    ! sets flag if element has global points shared with other processes    
-    ispec_is_outer = .false.    
+    ! sets flag if element has global points shared with other processes
+    ispec_is_outer = .false.
 
     ! 1. finds neighbours which share a whole face with this process
     ! (faces are shared only with 1 other neighbour process)
-        
+
     ! loops over all faces of element
     do iface = 1, 6
-    
+
       ! chooses a point inside face
       select case( iface )
       case( 1 )
-        ! face I == 1    
-        iglob = ibool(1,2,2,ispec) 
+        ! face I == 1
+        iglob = ibool(1,2,2,ispec)
       case( 2 )
         ! face I == NGLLX
-        iglob = ibool(NGLLX,2,2,ispec) 
+        iglob = ibool(NGLLX,2,2,ispec)
       case( 3 )
         ! face J == 1
-        iglob = ibool(2,1,2,ispec) 
+        iglob = ibool(2,1,2,ispec)
       case( 4 )
         ! face J == NGLLY
-        iglob = ibool(2,NGLLY,2,ispec) 
+        iglob = ibool(2,NGLLY,2,ispec)
       case( 5 )
         ! face K == 1
         iglob = ibool(2,2,1,ispec)
       case( 6 )
         ! face K == NGLLZ
         iglob = ibool(2,2,NGLLZ,ispec)
-      end select      
+      end select
 
       ! checks assembled flag on global point
       if( work_test_flag(iglob) > 0 ) then
         ispec_is_outer = .true.
-        
+
         ! rank of neighbor process
         rank = work_test_flag(iglob) - 1
-        
+
         ! checks ranks range
         if( rank < 0 .or. rank >= NPROCTOT_VAL ) then
           print*,'error face rank: ',myrank,'ispec=',ispec
@@ -1324,18 +1324,18 @@
           print*,'  face ',iface
           call exit_mpi(myrank,'error face neighbor mpi rank')
         endif
-        
+
         ! checks if already stored
         icurrent = 0
         is_done = .false.
         do ii = 1,iinterface
-          if( rank == my_neighbours(ii) ) then 
+          if( rank == my_neighbours(ii) ) then
             icurrent = ii
             is_done = .true.
             exit
           endif
         enddo
-        
+
         ! updates interfaces array
         if( .not. is_done ) then
           iinterface = iinterface + 1
@@ -1347,43 +1347,43 @@
         endif
         if( icurrent == 0 ) &
           call exit_mpi(myrank,'could not find current interface for this neighbor, please check my_neighbours')
-        
+
         ! adds interface points and removes neighbor flag from face
         ! assumes NGLLX == NGLLY == NGLLZ
         do k=1,NGLLX
           do j=1,NGLLX
             select case( iface )
             case( 1 )
-              ! face I == 1    
-              iglob = ibool(1,j,k,ispec) 
+              ! face I == 1
+              iglob = ibool(1,j,k,ispec)
             case( 2 )
               ! face I == NGLLX
-              iglob = ibool(NGLLX,j,k,ispec) 
+              iglob = ibool(NGLLX,j,k,ispec)
             case( 3 )
               ! face J == 1
-              iglob = ibool(j,1,k,ispec) 
+              iglob = ibool(j,1,k,ispec)
             case( 4 )
               ! face J == NGLLY
-              iglob = ibool(j,NGLLY,k,ispec) 
+              iglob = ibool(j,NGLLY,k,ispec)
             case( 5 )
               ! face K == 1
               iglob = ibool(j,k,1,ispec)
             case( 6 )
               ! face K == NGLLZ
               iglob = ibool(j,k,NGLLZ,ispec)
-            end select      
-            
+            end select
+
             ! checks that we take each global point (on edges and corners) only once
             if( work_test_flag(iglob) <= 0 ) cycle ! continues to next point
-            
+
             ! increases number of total points on this interface
             nibool_neighbours(icurrent) = nibool_neighbours(icurrent) + 1
             if( nibool_neighbours(icurrent) > max_nibool) &
               call exit_mpi(myrank,'interface face exceeds max_nibool range')
-              
+
             ! stores interface iglob index
             ibool_neighbours( nibool_neighbours(icurrent),icurrent ) = iglob
-            
+
             ! re-sets flag
             work_test_flag(iglob) = work_test_flag(iglob) - ( rank + 1 )
             ! debug
@@ -1394,7 +1394,7 @@
             endif
           enddo
         enddo
-      endif    
+      endif
     enddo ! iface
 
     ! 2. finds neighbours which share a single edge with this process
@@ -1402,12 +1402,12 @@
 
     ! loops over all edges of element
     do iedge = 1, 12
-    
+
       ! chooses a point inside edge but not corner
       select case( iedge )
       case( 1 )
         ! face I == 1, J == 1
-        iglob = ibool(1,1,2,ispec) 
+        iglob = ibool(1,1,2,ispec)
       case( 2 )
         ! face I == 1, J == NGLLY
         iglob = ibool(1,NGLLY,2,ispec)
@@ -1446,10 +1446,10 @@
       ! checks assembled flag on global point
       if( work_test_flag(iglob) > 0 ) then
         ispec_is_outer = .true.
-        
+
         ! rank of neighbor process
         rank = work_test_flag(iglob) - 1
-        
+
         ! checks ranks range
         if( rank < 0 .or. rank >= NPROCTOT_VAL ) then
           print*,'error egde rank: ',myrank
@@ -1457,18 +1457,18 @@
           print*,'  edge ',iedge
           call exit_mpi(myrank,'error edge neighbor mpi rank')
         endif
-        
+
         ! checks if already stored
         icurrent = 0
         is_done = .false.
         do ii = 1,iinterface
-          if( rank == my_neighbours(ii) ) then 
+          if( rank == my_neighbours(ii) ) then
             icurrent = ii
             is_done = .true.
             exit
           endif
         enddo
-        
+
         ! updates interfaces array
         if( .not. is_done ) then
           iinterface = iinterface + 1
@@ -1480,14 +1480,14 @@
         endif
         if( icurrent == 0 ) &
           call exit_mpi(myrank,'could not find current interface for this neighbor, please check my_neighbours')
-        
+
         ! adds interface points and removes neighbor flag from edge
         ! assumes NGLLX == NGLLY == NGLLZ
         do k = 1,NGLLX
           select case( iedge )
           case( 1 )
             ! face I == 1, J == 1
-            iglob = ibool(1,1,k,ispec) 
+            iglob = ibool(1,1,k,ispec)
           case( 2 )
             ! face I == 1, J == NGLLY
             iglob = ibool(1,NGLLY,k,ispec)
@@ -1530,18 +1530,18 @@
           nibool_neighbours(icurrent) = nibool_neighbours(icurrent) + 1
           if( nibool_neighbours(icurrent) > max_nibool) &
             call exit_mpi(myrank,'interface edge exceeds max_nibool range')
-            
-          ! stores interface iglob index          
+
+          ! stores interface iglob index
           ibool_neighbours( nibool_neighbours(icurrent),icurrent ) = iglob
-            
+
           ! re-sets flag
           work_test_flag(iglob) = work_test_flag(iglob) - ( rank + 1 )
 
           ! debug
           if( work_test_flag(iglob) < 0 ) call exit_mpi(myrank,'error edge flag')
-          
-        enddo        
-      endif    
+
+        enddo
+      endif
     enddo ! iedge
 
 
@@ -1550,7 +1550,7 @@
 
     ! loops over all corners of element
     do icorner = 1, 8
-    
+
       ! chooses a corner point
       select case( icorner )
       case( 1 )
@@ -1584,7 +1584,7 @@
       ! note: there can be elements which have an edge or corner shared with
       !          other mpi partitions, but have the work_test_flag value already set to zero
       !          since the iglob point was found before.
-      !          also, this check here would suffice to determine the outer flag, but we also include the 
+      !          also, this check here would suffice to determine the outer flag, but we also include the
       !          check everywhere we encounter it too
       if( test_flag(iglob) > 0.5 ) then
         ispec_is_outer = .true.
@@ -1593,10 +1593,10 @@
       ! checks assembled flag on global point
       if( work_test_flag(iglob) > 0 ) then
         ispec_is_outer = .true.
-        
+
         ! rank of neighbor process
         rank = work_test_flag(iglob) - 1
-        
+
         ! checks ranks range
         if( rank < 0 .or. rank >= NPROCTOT_VAL ) then
           print*,'error corner: ',myrank
@@ -1604,18 +1604,18 @@
           print*,'  corner ',icorner
           call exit_mpi(myrank,'error corner neighbor mpi rank')
         endif
-        
+
         ! checks if already stored
         icurrent = 0
         is_done = .false.
         do ii = 1,iinterface
-          if( rank == my_neighbours(ii) ) then 
+          if( rank == my_neighbours(ii) ) then
             icurrent = ii
             is_done = .true.
             exit
           endif
         enddo
-        
+
         ! updates interfaces array
         if( .not. is_done ) then
           iinterface = iinterface + 1
@@ -1627,41 +1627,41 @@
         endif
         if( icurrent == 0 ) &
           call exit_mpi(myrank,'could not find current interface for this neighbor, please check my_neighbours')
-        
+
         ! adds this corner as interface point and removes neighbor flag from face
         ! increases number of total points on this interface
         nibool_neighbours(icurrent) = nibool_neighbours(icurrent) + 1
         if( nibool_neighbours(icurrent) > max_nibool) &
           call exit_mpi(myrank,'interface corner exceeds max_nibool range')
-          
-        ! stores interface iglob index        
+
+        ! stores interface iglob index
         ibool_neighbours( nibool_neighbours(icurrent),icurrent ) = iglob
-            
+
         ! re-sets flag
         work_test_flag(iglob) = work_test_flag(iglob) - ( rank + 1 )
 
         ! debug
         if( work_test_flag(iglob) < 0 ) call exit_mpi(myrank,'error corner flag')
-        
+
       endif
-      
+
     enddo ! icorner
-        
-    ! stores flags for outer elements when recognized as such 
+
+    ! stores flags for outer elements when recognized as such
     ! (inner/outer elements separated for non-blocking mpi communications)
     if( ispec_is_outer ) then
       work_ispec_is_outer(ispec) = .true.
     endif
-    
-  enddo 
 
+  enddo
+
   ! number of outer elements (on MPI interfaces)
   npoin = count( work_ispec_is_outer )
-    
+
   ! debug: user output
   if( myrank == 0 ) then
     write(IMAIN,*) '  interfaces : ',iinterface
-    write(IMAIN,*) '  my_neighbours: ',my_neighbours(1:iinterface)  
+    write(IMAIN,*) '  my_neighbours: ',my_neighbours(1:iinterface)
     write(IMAIN,*) '  nibool_neighbours: ',nibool_neighbours(1:iinterface)
     write(IMAIN,*) '  test flag min/max: ',minval(work_test_flag),maxval(work_test_flag)
     write(IMAIN,*) '  outer elements: ',npoin
@@ -1688,13 +1688,13 @@
   max_nibool_interfaces = maxval( nibool_neighbours(1:num_interfaces) )
 
   ! optional: ibool usually is already sorted,
-  !                this makes sure ibool_neighbours arrays are still sorted 
-  !               (iglob indices in increasing order; we will access acceleration fields accel(:,iglob), 
+  !                this makes sure ibool_neighbours arrays are still sorted
+  !               (iglob indices in increasing order; we will access acceleration fields accel(:,iglob),
   !                thus it helps if iglob strides are short and accesses are close-by)
   do iinterface = 1,num_interfaces
-    npoin = nibool_neighbours(iinterface)    
+    npoin = nibool_neighbours(iinterface)
     call heap_sort( npoin, ibool_neighbours(1:npoin,iinterface) )
-    
+
     ! debug: checks if unique set of iglob values
     do j=1,npoin-1
       if( ibool_neighbours(j,iinterface) == ibool_neighbours(j+1,iinterface) ) then
@@ -1722,9 +1722,9 @@
   use specfem_par
   use specfem_par_crustmantle
   use specfem_par_innercore
-  use specfem_par_outercore  
+  use specfem_par_outercore
   implicit none
-  
+
   ! local parameters
   real :: percentage_edge
   integer :: ier,ispec,iinner,iouter
@@ -1734,16 +1734,16 @@
   !
   ! note: arrays is_on_a_slice_edge_.. have flags set for elements which need to
   !         communicate with other MPI processes
-  
+
   ! crust_mantle
   nspec_outer_crust_mantle = count( is_on_a_slice_edge_crust_mantle )
   nspec_inner_crust_mantle = NSPEC_CRUST_MANTLE - nspec_outer_crust_mantle
-  
+
   num_phase_ispec_crust_mantle = max(nspec_inner_crust_mantle,nspec_outer_crust_mantle)
-  
+
   allocate(phase_ispec_inner_crust_mantle(num_phase_ispec_crust_mantle,2),stat=ier)
   if( ier /= 0 ) call exit_mpi(myrank,'error allocating array phase_ispec_inner_crust_mantle')
-  
+
   phase_ispec_inner_crust_mantle(:,:) = 0
   iinner = 0
   iouter = 0
@@ -1764,10 +1764,10 @@
   nspec_inner_outer_core = NSPEC_OUTER_CORE - nspec_outer_outer_core
 
   num_phase_ispec_outer_core = max(nspec_inner_outer_core,nspec_outer_outer_core)
-  
+
   allocate(phase_ispec_inner_outer_core(num_phase_ispec_outer_core,2),stat=ier)
   if( ier /= 0 ) call exit_mpi(myrank,'error allocating array phase_ispec_inner_outer_core')
-  
+
   phase_ispec_inner_outer_core(:,:) = 0
   iinner = 0
   iouter = 0
@@ -1788,10 +1788,10 @@
   nspec_inner_inner_core = NSPEC_INNER_CORE - nspec_outer_inner_core
 
   num_phase_ispec_inner_core = max(nspec_inner_inner_core,nspec_outer_inner_core)
-  
+
   allocate(phase_ispec_inner_inner_core(num_phase_ispec_inner_core,2),stat=ier)
   if( ier /= 0 ) call exit_mpi(myrank,'error allocating array phase_ispec_inner_inner_core')
-  
+
   phase_ispec_inner_inner_core(:,:) = 0
   iinner = 0
   iouter = 0
@@ -1806,7 +1806,7 @@
       phase_ispec_inner_inner_core(iinner,2) = ispec
     endif
   enddo
-  
+
   ! user output
   if(myrank == 0) then
 
@@ -1832,19 +1832,19 @@
 
   ! debug: saves element flags
   ! crust mantle
-  !write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_innerouter_crust_mantle_proc',myrank  
+  !write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_innerouter_crust_mantle_proc',myrank
   !call write_VTK_data_elem_l(NSPEC_CRUST_MANTLE,NGLOB_CRUST_MANTLE, &
   !                          xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
   !                          ibool_crust_mantle, &
-  !                          is_on_a_slice_edge_crust_mantle,filename)  
+  !                          is_on_a_slice_edge_crust_mantle,filename)
   ! outer core
-  !write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_innerouter_outer_core_proc',myrank  
+  !write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_innerouter_outer_core_proc',myrank
   !call write_VTK_data_elem_l(NSPEC_OUTER_CORE,NGLOB_OUTER_CORE, &
   !                          xstore_outer_core,ystore_outer_core,zstore_outer_core, &
   !                          ibool_outer_core, &
   !                          is_on_a_slice_edge_outer_core,filename)
   ! inner core
-  write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_innerouter_inner_core_proc',myrank  
+  write(filename,'(a,i6.6)') trim(OUTPUT_FILES)//'/MPI_innerouter_inner_core_proc',myrank
   call write_VTK_data_elem_l(NSPEC_INNER_CORE,NGLOB_INNER_CORE, &
                             xstore_inner_core,ystore_inner_core,zstore_inner_core, &
                             ibool_inner_core, &
@@ -1967,7 +1967,7 @@
 
 
   ! crust and mantle
-  
+
   ! create name of database
   call create_name_database(prname,myrank,IREGION_CRUST_MANTLE,LOCAL_PATH)
 
@@ -2093,7 +2093,7 @@
 
 
   ! outer core
-  
+
   ! create name of database
   call create_name_database(prname,myrank,IREGION_OUTER_CORE,LOCAL_PATH)
 
@@ -2261,23 +2261,23 @@
 
 ! heap sort algorithm
 ! sorts integer array (in increasing order, like 1 - 5 - 6 - 9 - 12 - 13 - 14 -...)
-  
+
   implicit none
   integer,intent(in) :: N
   integer,dimension(N),intent(inout) :: array
-  
+
   ! local parameters
   integer :: tmp
   integer :: i
-  
+
   ! checks if anything to do
   if( N < 2 ) return
-  
+
   ! builds heap
-  do i = N/2, 1, -1  
+  do i = N/2, 1, -1
     call heap_sort_siftdown(N,array,i,N)
   enddo
-    
+
   ! sorts array
   do i = N, 2, -1
     ! swaps last and first entry in this section
@@ -2286,45 +2286,45 @@
     array(i) = tmp
     call heap_sort_siftdown(N,array,1,i-1)
   enddo
-  
+
   end subroutine heap_sort
 
 !
 !----
-!  
+!
 
   subroutine heap_sort_siftdown(N,array,start,bottom)
 
   implicit none
-  
+
   integer,intent(in):: N
   integer,dimension(N),intent(inout) :: array
   integer :: start,bottom
-  
+
   ! local parameters
   integer :: i,j
   integer :: tmp
-  
+
   i = start
-  tmp = array(i)  
+  tmp = array(i)
   j = 2*i
   do while( j <= bottom )
     ! chooses larger value first in this section
     if( j < bottom ) then
       if( array(j) <= array(j+1) ) j = j + 1
     endif
-    
+
     ! checks if section already smaller than inital value
     if( array(j) < tmp ) exit
-        
+
     array(i) = array(j)
     i = j
     j = 2*i
   enddo
-  
-  array(i) = tmp      
+
+  array(i) = tmp
   return
-  
+
   end subroutine heap_sort_siftdown
 
 !
@@ -2381,7 +2381,7 @@
     write(IOVTK,'(3e18.6)') xstore_dummy(iglob),ystore_dummy(iglob),zstore_dummy(iglob)
   enddo
   write(IOVTK,*) ""
-  
+
   close(IOVTK)
 
   end subroutine write_VTK_data_points
@@ -2438,7 +2438,7 @@
     write(IOVTK,*) glob_values(iglob)
   enddo
   write(IOVTK,*) ""
-  
+
   close(IOVTK)
 
   end subroutine write_VTK_glob_points

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_topography_bathymetry.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_topography_bathymetry.f90	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_topography_bathymetry.f90	2012-02-22 04:38:32 UTC (rev 19659)
@@ -37,7 +37,7 @@
 
   ! local parameters
   integer :: ier
-  
+
   ! get MPI starting time
   time_start = MPI_WTIME()
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/save_forward_arrays.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/save_forward_arrays.f90	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/save_forward_arrays.f90	2012-02-22 04:38:32 UTC (rev 19659)
@@ -41,7 +41,7 @@
   if(NUMBER_OF_RUNS > 1 .and. NUMBER_OF_THIS_RUN < NUMBER_OF_RUNS) then
     write(outputname,"('dump_all_arrays',i6.6)") myrank
     open(unit=55,file=trim(LOCAL_PATH)//'/'//outputname,status='unknown',form='unformatted',action='write')
-    
+
     write(55) displ_crust_mantle
     write(55) veloc_crust_mantle
     write(55) accel_crust_mantle
@@ -86,7 +86,7 @@
   if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
     write(outputname,'(a,i6.6,a)') 'proc',myrank,'_save_forward_arrays.bin'
     open(unit=55,file=trim(LOCAL_PATH)//'/'//outputname,status='unknown',form='unformatted',action='write')
-    
+
     write(55) displ_crust_mantle
     write(55) veloc_crust_mantle
     write(55) accel_crust_mantle

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/setup_sources_receivers.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/setup_sources_receivers.f90	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/setup_sources_receivers.f90	2012-02-22 04:38:32 UTC (rev 19659)
@@ -35,10 +35,10 @@
 
   ! reads in stations file and locates receivers
   call setup_receivers()
-  
+
   ! write source and receiver VTK files for Paraview
   call setup_sources_receivers_VTKfile()
-  
+
   ! pre-compute source arrays
   call setup_sources_precompute_arrays()
 
@@ -46,7 +46,7 @@
   call setup_receivers_precompute_intp()
 
   ! user output
-  if(myrank == 0) then  
+  if(myrank == 0) then
     write(IMAIN,*)
     write(IMAIN,*) 'Total number of samples for seismograms = ',NSTEP
     write(IMAIN,*)
@@ -62,7 +62,7 @@
   subroutine setup_sources()
 
   use specfem_par
-  use specfem_par_crustmantle  
+  use specfem_par_crustmantle
   use specfem_par_movie
   implicit none
 
@@ -70,7 +70,7 @@
   double precision :: min_tshift_cmt_original
   double precision :: sec
   integer :: yr,jda,ho,mi
-  integer :: isource 
+  integer :: isource
   character(len=256) :: filename
   integer :: ier
 
@@ -249,7 +249,7 @@
   integer :: irec,isource,nrec_tot_found
   integer :: icomp,itime,nadj_files_found,nadj_files_found_tot
   character(len=3),dimension(NDIM) :: comp
-  character(len=256) :: filename,adj_source_file 
+  character(len=256) :: filename,adj_source_file
   character(len=2) :: bic
   integer :: ier
 
@@ -307,7 +307,7 @@
 
   ! counter for adjoint receiver stations in local slice, used to allocate adjoint source arrays
   nadj_rec_local = 0
-  
+
   ! counts receivers for adjoint simulations
   if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then
     ! by Ebru
@@ -380,7 +380,13 @@
       write(IMAIN,*) 'this total is okay'
     endif
   endif
-  
+
+  ! check that the sum of the number of receivers in each slice is nrec
+  if( SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3 ) then
+    if(myrank == 0 .and. nrec_tot_found /= nrec) &
+      call exit_MPI(myrank,'total number of receivers is incorrect')
+  endif
+
   end subroutine setup_receivers
 
 
@@ -442,7 +448,7 @@
 
   ! local parameters
   integer :: ier
-  
+
   ! allocates source arrays
   if (SIMULATION_TYPE == 1  .or. SIMULATION_TYPE == 3) then
     ! source interpolated on all GLL points in source element
@@ -489,7 +495,7 @@
     endif
   endif
 
-  end subroutine setup_sources_precompute_arrays  
+  end subroutine setup_sources_precompute_arrays
 
 !
 !-------------------------------------------------------------------------------------------------
@@ -507,7 +513,7 @@
 
   use specfem_par
   use specfem_par_crustmantle
-  
+
   implicit none
 
 !  include "constants.h"
@@ -669,7 +675,7 @@
 
   ! local parameters
   integer :: ier
-  
+
   ! define local to global receiver numbering mapping
   ! needs to be allocate for subroutine calls (even if nrec_local == 0)
   allocate(number_receiver_global(nrec_local),stat=ier)

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D_par.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D_par.F90	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D_par.F90	2012-02-22 04:38:32 UTC (rev 19659)
@@ -360,11 +360,11 @@
   integer :: num_interfaces_outer_core
   integer :: max_nibool_interfaces_outer_core
   integer, dimension(:), allocatable :: my_neighbours_outer_core,nibool_interfaces_outer_core
-  integer, dimension(:,:), allocatable :: ibool_interfaces_outer_core  
+  integer, dimension(:,:), allocatable :: ibool_interfaces_outer_core
 
   real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: buffer_send_scalar_outer_core,buffer_recv_scalar_outer_core
   real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: b_buffer_send_scalar_outer_core,b_buffer_recv_scalar_outer_core
-  
+
   integer, dimension(:), allocatable :: request_send_scalar_outer_core,request_recv_scalar_outer_core
   integer, dimension(:), allocatable :: b_request_send_scalar_outer_core,b_request_recv_scalar_outer_core
 
@@ -548,7 +548,7 @@
   real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: noise_surface_movie
   integer :: irec_master_noise
   integer :: nspec_top
-  
+
   ! inner / outer elements crust/mantle region
   integer :: num_phase_ispec_crust_mantle
   integer :: nspec_inner_crust_mantle,nspec_outer_crust_mantle

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_movie_output.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_movie_output.f90	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_movie_output.f90	2012-02-22 04:38:32 UTC (rev 19659)
@@ -42,6 +42,14 @@
   ! save movie on surface
   if( MOVIE_SURFACE ) then
     if( mod(it,NTSTEP_BETWEEN_FRAMES) == 0) then
+
+      ! gets resulting array values onto CPU
+      if( GPU_MODE ) then
+        ! transfers whole fields
+        call transfer_displ_cm_from_device(NDIM*NGLOB_CRUST_MANTLE,displ_crust_mantle,Mesh_pointer)
+        call transfer_veloc_cm_from_device(NDIM*NGLOB_CRUST_MANTLE,veloc_crust_mantle,Mesh_pointer)
+      endif
+
       ! save velocity here to avoid static offset on displacement for movies
       call write_movie_surface(myrank,nmovie_points,scale_veloc,veloc_crust_mantle, &
                     scale_displ,displ_crust_mantle, &
@@ -62,8 +70,19 @@
     if( mod(it-MOVIE_START,NTSTEP_BETWEEN_FRAMES) == 0  &
       .and. it >= MOVIE_START .and. it <= MOVIE_STOP) then
 
-      if (MOVIE_VOLUME_TYPE == 1) then  ! output strains
+      select case( MOVIE_VOLUME_TYPE )
+      case( 1 )
+        ! output strains
 
+        ! gets resulting array values onto CPU
+        if( GPU_MODE ) then
+          call transfer_strain_cm_from_device(Mesh_pointer, &
+                                eps_trace_over_3_crust_mantle, &
+                                epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle, &
+                                epsilondev_xy_crust_mantle,epsilondev_xz_crust_mantle, &
+                                epsilondev_yz_crust_mantle)
+        endif
+
         call  write_movie_volume_strains(myrank,npoints_3dmovie, &
                     LOCAL_PATH,MOVIE_VOLUME_TYPE,MOVIE_COARSE, &
                     it,eps_trace_over_3_crust_mantle, &
@@ -72,7 +91,7 @@
                     muvstore_crust_mantle_3dmovie, &
                     mask_3dmovie,nu_3dmovie)
 
-      else if (MOVIE_VOLUME_TYPE == 2 .or. MOVIE_VOLUME_TYPE == 3) then
+      case( 2, 3 )
         ! output the Time Integral of Strain, or \mu*TIS
         call  write_movie_volume_strains(myrank,npoints_3dmovie, &
                     LOCAL_PATH,MOVIE_VOLUME_TYPE,MOVIE_COARSE, &
@@ -80,8 +99,31 @@
                     muvstore_crust_mantle_3dmovie, &
                     mask_3dmovie,nu_3dmovie)
 
-      else if (MOVIE_VOLUME_TYPE == 4) then ! output divergence and curl in whole volume
+      case( 4 )
+        ! output divergence and curl in whole volume
 
+        ! gets resulting array values onto CPU
+        if( GPU_MODE ) then
+          ! strains
+          call transfer_strain_cm_from_device(Mesh_pointer, &
+                                eps_trace_over_3_crust_mantle, &
+                                epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle, &
+                                epsilondev_xy_crust_mantle,epsilondev_xz_crust_mantle, &
+                                epsilondev_yz_crust_mantle)
+          call transfer_strain_ic_from_device(Mesh_pointer, &
+                                eps_trace_over_3_inner_core, &
+                                epsilondev_xx_inner_core,epsilondev_yy_inner_core, &
+                                epsilondev_xy_inner_core,epsilondev_xz_inner_core, &
+                                epsilondev_yz_inner_core)
+          ! wavefields
+          call transfer_fields_cm_from_device(NDIM*NGLOB_CRUST_MANTLE, &
+                                displ_crust_mantle,veloc_crust_mantle,accel_crust_mantle,Mesh_pointer)
+          call transfer_fields_ic_from_device(NDIM*NGLOB_INNER_CORE, &
+                                displ_inner_core,veloc_inner_core,accel_inner_core,Mesh_pointer)
+          call transfer_fields_oc_from_device(NGLOB_OUTER_CORE, &
+                                displ_outer_core,veloc_outer_core,accel_outer_core,Mesh_pointer)
+        endif
+
         call write_movie_volume_divcurl(myrank,it,eps_trace_over_3_crust_mantle,&
                         div_displ_outer_core, &
                         accel_outer_core,kappavstore_outer_core,rhostore_outer_core,ibool_outer_core, &
@@ -96,25 +138,34 @@
                         accel_crust_mantle,accel_inner_core, &
                         ibool_crust_mantle,ibool_inner_core)
 
-      else if (MOVIE_VOLUME_TYPE == 5) then !output displacement
+      case( 5 )
+        !output displacement
+        if( GPU_MODE ) then
+          call transfer_displ_cm_from_device(NDIM*NGLOB_CRUST_MANTLE,displ_crust_mantle,Mesh_pointer)
+        endif
+
         scalingval = scale_displ
         call write_movie_volume_vector(myrank,it,npoints_3dmovie, &
                     LOCAL_PATH,MOVIE_VOLUME_TYPE, &
                     MOVIE_COARSE,ibool_crust_mantle,displ_crust_mantle, &
                     scalingval,mask_3dmovie,nu_3dmovie)
 
-      else if (MOVIE_VOLUME_TYPE == 6) then !output velocity
+      case( 6 )
+        !output velocity
+        if( GPU_MODE ) then
+          call transfer_veloc_cm_from_device(NDIM*NGLOB_CRUST_MANTLE,veloc_crust_mantle,Mesh_pointer)
+        endif
+
         scalingval = scale_veloc
         call write_movie_volume_vector(myrank,it,npoints_3dmovie, &
                     LOCAL_PATH,MOVIE_VOLUME_TYPE, &
                     MOVIE_COARSE,ibool_crust_mantle,veloc_crust_mantle, &
                     scalingval,mask_3dmovie,nu_3dmovie)
 
-      else
-
+      case default
         call exit_MPI(myrank, 'MOVIE_VOLUME_TYPE has to be 1,2,3,4,5 or 6')
+      end select ! MOVIE_VOLUME_TYPE
 
-      endif ! MOVIE_VOLUME_TYPE
     endif
   endif ! MOVIE_VOLUME
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_seismograms.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_seismograms.f90	2012-02-21 07:23:26 UTC (rev 19658)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_seismograms.f90	2012-02-22 04:38:32 UTC (rev 19659)
@@ -40,7 +40,22 @@
 
   ! compute & store the seismograms only if there is at least one receiver located in this slice
   if (nrec_local > 0) then
-    if (SIMULATION_TYPE == 1) then
+
+    ! gets resulting array values onto CPU
+    if( GPU_MODE ) then
+      ! this transfers fields only in elements with stations for efficiency
+      call write_seismograms_transfer_cuda(displ_crust_mantle,b_displ_crust_mantle, &
+                            eps_trace_over_3_crust_mantle, &
+                            epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+                            epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
+                            Mesh_pointer,number_receiver_global, &
+                            ispec_selected_rec,ispec_selected_source, &
+                            ibool_crust_mantle)
+    endif
+
+    ! computes traces at interpolated receiver locations
+    select case( SIMULATION_TYPE )
+    case( 1 )
       call compute_seismograms(nrec_local,nrec,displ_crust_mantle, &
                                 nu,hxir_store,hetar_store,hgammar_store, &
                                 scale_displ,ibool_crust_mantle, &
@@ -48,7 +63,7 @@
                                 seismo_current,NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
                                 seismograms)
 
-    else if (SIMULATION_TYPE == 2) then
+    case( 2 )
       call compute_seismograms_adjoint(NSOURCES,nrec_local,displ_crust_mantle, &
                     eps_trace_over_3_crust_mantle, &
                     epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
@@ -66,7 +81,7 @@
                     ibool_crust_mantle,ispec_selected_source,number_receiver_global, &
                     NSTEP,it,nit_written)
 
-    else if (SIMULATION_TYPE == 3) then
+    case( 3 )
       call compute_seismograms_backward(nrec_local,nrec,b_displ_crust_mantle, &
                                 nu,hxir_store,hetar_store,hgammar_store, &
                                 scale_displ,ibool_crust_mantle, &
@@ -74,22 +89,15 @@
                                 seismo_current,NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
                                 seismograms)
 
-    endif
+    end select
+
   endif ! nrec_local
 
   ! write the current or final seismograms
   if(seismo_current == NTSTEP_BETWEEN_OUTPUT_SEISMOS .or. it == it_end) then
     if (SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3) then
-      call write_seismograms_to_file(myrank,seismograms,number_receiver_global,station_name, &
-            network_name,stlat,stlon,stele,stbur, &
-            nrec,nrec_local,ANGULAR_WIDTH_XI_IN_DEGREES,NEX_XI,DT,t0,it_end, &
-            yr_SAC,jda_SAC,ho_SAC,mi_SAC,sec_SAC,t_cmt_SAC,t_shift_SAC, &
-            elat_SAC,elon_SAC,depth_SAC,event_name_SAC,cmt_lat_SAC,cmt_lon_SAC,&
-            cmt_depth_SAC,cmt_hdur_SAC,NPROCTOT_VAL, &
-            OUTPUT_SEISMOS_ASCII_TEXT,OUTPUT_SEISMOS_SAC_ALPHANUM, &
-            OUTPUT_SEISMOS_SAC_BINARY,ROTATE_SEISMOGRAMS_RT,NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
-            seismo_offset,seismo_current,WRITE_SEISMOGRAMS_BY_MASTER, &
-            SAVE_ALL_SEISMOS_IN_ONE_FILE,USE_BINARY_FOR_LARGE_FILE,MODEL)
+      call write_seismograms_to_file()
+
       if(myrank==0) then
         write(IMAIN,*)
         write(IMAIN,*) ' Total number of time steps written: ', it-it_begin+1
@@ -108,84 +116,46 @@
 
   end subroutine write_seismograms
 
-!=====================================================================
+!
+!-------------------------------------------------------------------------------------------------
+!
 
 ! write seismograms to files
-  subroutine write_seismograms_to_file(myrank,seismograms,number_receiver_global,station_name, &
-            network_name,stlat,stlon,stele,stbur, &
-            nrec,nrec_local,ANGULAR_WIDTH_XI_IN_DEGREES,NEX_XI,DT,hdur,it_end, &
-            yr,jda,ho,mi,sec,tshift_cmt,t_shift, &
-            elat,elon,depth,event_name,cmt_lat,cmt_lon, &
-            cmt_depth,cmt_hdur,NPROCTOT, &
-            OUTPUT_SEISMOS_ASCII_TEXT,OUTPUT_SEISMOS_SAC_ALPHANUM, &
-            OUTPUT_SEISMOS_SAC_BINARY,ROTATE_SEISMOGRAMS_RT,NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
-            seismo_offset,seismo_current,WRITE_SEISMOGRAMS_BY_MASTER,&
-            SAVE_ALL_SEISMOS_IN_ONE_FILE,USE_BINARY_FOR_LARGE_FILE,MODEL)
+  subroutine write_seismograms_to_file()
 
- implicit none
+  use constants
+  use specfem_par,only: &
+          NPROCTOT_VAL,myrank,nrec,nrec_local, &
+          number_receiver_global,seismograms, &
+          islice_selected_rec, &
+          seismo_offset,seismo_current, &
+          OUTPUT_SEISMOS_ASCII_TEXT,OUTPUT_SEISMOS_SAC_ALPHANUM, &
+          OUTPUT_SEISMOS_SAC_BINARY,ROTATE_SEISMOGRAMS_RT,NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
+          SAVE_ALL_SEISMOS_IN_ONE_FILE,USE_BINARY_FOR_LARGE_FILE, &
+          MODEL,OUTPUT_FILES, &
+          WRITE_SEISMOGRAMS_BY_MASTER
 
-! standard include of the MPI library
- include 'mpif.h'
+  implicit none
 
- include "constants.h"
- include "precision.h"
+  ! standard include of the MPI library
+  include 'mpif.h'
+  include "precision.h"
 
-! parameters
- integer nrec,nrec_local,myrank,it_end,NPROCTOT,NEX_XI !,NSOURCES
- character(len=256) sisname
+  ! local parameters
+  double precision :: write_time_begin,write_time
+  real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: one_seismogram
 
- integer :: seismo_offset, seismo_current, NTSTEP_BETWEEN_OUTPUT_SEISMOS
- integer, dimension(nrec_local) :: number_receiver_global
+  integer :: iproc,sender,irec_local,irec,ier,receiver
+  integer :: nrec_local_received
+  integer :: total_seismos,total_seismos_local
+  integer,dimension(:),allocatable:: islice_num_rec_local
+  integer :: msg_status(MPI_STATUS_SIZE)
+  character(len=256) :: sisname
 
- real(kind=CUSTOM_REAL), dimension(NDIM,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS) :: seismograms
- double precision hdur,DT,ANGULAR_WIDTH_XI_IN_DEGREES
-
- character(len=MAX_LENGTH_STATION_NAME), dimension(nrec) :: station_name
- character(len=MAX_LENGTH_NETWORK_NAME), dimension(nrec) :: network_name
- double precision tshift_cmt,t_shift,elat,elon,depth
- double precision cmt_lat,cmt_lon,cmt_depth,cmt_hdur
- double precision, dimension(nrec) :: stlat,stlon,stele,stbur
- integer yr,jda,ho,mi
- double precision sec
- !real mb
-! character(len=12) ename
- character(len=20) event_name
-
-! variables
- integer :: iproc,sender,irec_local,irec,ier,receiver,nrec_local_received,nrec_tot_found
- integer :: total_seismos,total_seismos_local
- double precision :: write_time_begin,write_time
-
- real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: one_seismogram
-
- integer msg_status(MPI_STATUS_SIZE)
-
- character(len=150) OUTPUT_FILES, MODEL
-
-! new flags to decide on seismogram type BS BS 06/2007
-  logical OUTPUT_SEISMOS_ASCII_TEXT, OUTPUT_SEISMOS_SAC_ALPHANUM, &
-          OUTPUT_SEISMOS_SAC_BINARY
-! flag whether seismograms are ouput for North-East-Z component or Radial-Transverse-Z
-  logical ROTATE_SEISMOGRAMS_RT
-
-! flag to decide if seismograms are written by master proc only or
-! by all processes in parallel (doing the later may create problems on some
-! file systems)
-  logical WRITE_SEISMOGRAMS_BY_MASTER
-
-! save all seismograms in one large combined file instead of one file per seismogram
-! to avoid overloading shared non-local file systems such as GPFS for instance
-  logical SAVE_ALL_SEISMOS_IN_ONE_FILE
-  logical USE_BINARY_FOR_LARGE_FILE
-
+  ! allocates single station seismogram
   allocate(one_seismogram(NDIM,NTSTEP_BETWEEN_OUTPUT_SEISMOS),stat=ier)
-  if(ier /= 0) stop 'error while allocating one temporary seismogram'
+  if(ier /= 0) call exit_mpi(myrank,'error while allocating one temporary seismogram')
 
-  ! check that the sum of the number of receivers in each slice is nrec
-  call MPI_REDUCE(nrec_local,nrec_tot_found,1,MPI_INTEGER,MPI_SUM,0,MPI_COMM_WORLD,ier)
-  if(myrank == 0 .and. nrec_tot_found /= nrec) &
-      call exit_MPI(myrank,'total number of receivers is incorrect')
-
   ! get the base pathname for output files
   call get_value_string(OUTPUT_FILES, 'OUTPUT_FILES', 'OUTPUT_FILES')
 
@@ -227,16 +197,7 @@
       one_seismogram = seismograms(:,irec_local,:)
 
       ! write this seismogram
-      call write_one_seismogram(one_seismogram,irec, &
-                             station_name,network_name,stlat,stlon,stele,stbur,nrec, &
-                             ANGULAR_WIDTH_XI_IN_DEGREES,NEX_XI,DT,hdur,it_end, &
-                             yr,jda,ho,mi,sec,tshift_cmt,t_shift, &
-                             elat,elon,depth,event_name,cmt_lat, &
-                             cmt_lon,cmt_depth,cmt_hdur,OUTPUT_FILES, &
-                             OUTPUT_SEISMOS_ASCII_TEXT,OUTPUT_SEISMOS_SAC_ALPHANUM, &
-                             OUTPUT_SEISMOS_SAC_BINARY,ROTATE_SEISMOGRAMS_RT, &
-                             NTSTEP_BETWEEN_OUTPUT_SEISMOS,seismo_offset,seismo_current, &
-                             SAVE_ALL_SEISMOS_IN_ONE_FILE,USE_BINARY_FOR_LARGE_FILE,MODEL,myrank)
+      call write_one_seismogram(one_seismogram,irec)
 
     enddo
 
@@ -245,12 +206,12 @@
 
     if(total_seismos_local/= nrec_local) call exit_MPI(myrank,'incorrect total number of receivers saved')
 
-    write_time = MPI_WTIME() - write_time_begin
 
     if(myrank == 0) then
-     write(IMAIN,*)
-     write(IMAIN,*) 'Writing the seismograms in parallel took ',write_time,' seconds'
-     write(IMAIN,*)
+      write_time = MPI_WTIME() - write_time_begin
+      write(IMAIN,*)
+      write(IMAIN,*) 'Writing the seismograms in parallel took ',write_time,' seconds'
+      write(IMAIN,*)
     endif
 
   ! now only the master process does the writing of seismograms and
@@ -261,96 +222,102 @@
 
     if(myrank == 0) then ! on the master, gather all the seismograms
 
-       ! create one large file instead of one small file per station to avoid file system overload
-       if(OUTPUT_SEISMOS_ASCII_TEXT .and. SAVE_ALL_SEISMOS_IN_ONE_FILE) then
-           write(sisname,'(A)') '/all_seismograms'
+      ! create one large file instead of one small file per station to avoid file system overload
+      if(OUTPUT_SEISMOS_ASCII_TEXT .and. SAVE_ALL_SEISMOS_IN_ONE_FILE) then
+         write(sisname,'(A)') '/all_seismograms'
 
-         if(USE_BINARY_FOR_LARGE_FILE) then
-           if (seismo_offset==0) then
-             open(unit=IOUT,file=trim(OUTPUT_FILES)//trim(sisname)//'.bin',status='unknown',form='unformatted',action='write')
-           else
-             open(unit=IOUT,file=trim(OUTPUT_FILES)//trim(sisname)//'.bin',status='old',&
-                  form='unformatted',position='append',action='write')
-           endif
+       if(USE_BINARY_FOR_LARGE_FILE) then
+         if (seismo_offset==0) then
+           open(unit=IOUT,file=trim(OUTPUT_FILES)//trim(sisname)//'.bin',status='unknown',form='unformatted',action='write')
          else
-           if (seismo_offset==0) then
-             open(unit=IOUT,file=trim(OUTPUT_FILES)//trim(sisname)//'.ascii',status='unknown',form='formatted',action='write')
-           else
-             open(unit=IOUT,file=trim(OUTPUT_FILES)//trim(sisname)//'.ascii',status='old',&
-                  form='formatted',position='append',action='write')
-           endif
+           open(unit=IOUT,file=trim(OUTPUT_FILES)//trim(sisname)//'.bin',status='old',&
+                form='unformatted',position='append',action='write')
          endif
-
+       else
+         if (seismo_offset==0) then
+           open(unit=IOUT,file=trim(OUTPUT_FILES)//trim(sisname)//'.ascii',status='unknown',form='formatted',action='write')
+         else
+           open(unit=IOUT,file=trim(OUTPUT_FILES)//trim(sisname)//'.ascii',status='old',&
+                form='formatted',position='append',action='write')
+         endif
        endif
 
-       total_seismos = 0
+      endif
 
-       ! loop on all the slices
-       do iproc = 0,NPROCTOT-1
+      ! counts number of local receivers for each slice
+      allocate(islice_num_rec_local(0:NPROCTOT_VAL-1),stat=ier)
+      if( ier /= 0 ) call exit_mpi(myrank,'error allocating islice_num_rec_local')
 
-         ! receive except from proc 0, which is me and therefore I already have this value
-         sender = iproc
-         if(iproc /= 0) then
-           call MPI_RECV(nrec_local_received,1,MPI_INTEGER,sender,itag,MPI_COMM_WORLD,msg_status,ier)
-           if(nrec_local_received < 0) call exit_MPI(myrank,'error while receiving local number of receivers')
-         else
-           nrec_local_received = nrec_local
-         endif
-         if (nrec_local_received > 0) then
-           do irec_local = 1,nrec_local_received
-             ! receive except from proc 0, which is myself and therefore I already have these values
-             if(iproc == 0) then
-               ! get global number of that receiver
-               irec = number_receiver_global(irec_local)
-               one_seismogram(:,:) = seismograms(:,irec_local,:)
-             else
-               call MPI_RECV(irec,1,MPI_INTEGER,sender,itag,MPI_COMM_WORLD,msg_status,ier)
-               if(irec < 1 .or. irec > nrec) call exit_MPI(myrank,'error while receiving global receiver number')
-               call MPI_RECV(one_seismogram,NDIM*seismo_current,CUSTOM_MPI_TYPE,sender,itag,MPI_COMM_WORLD,msg_status,ier)
-             endif
+      islice_num_rec_local(:) = 0
+      do irec = 1,nrec
+        iproc = islice_selected_rec(irec)
+        islice_num_rec_local(iproc) = islice_num_rec_local(iproc) + 1
+      enddo
 
-             total_seismos = total_seismos + 1
-             ! write this seismogram
-             call write_one_seismogram(one_seismogram,irec, &
-                                       station_name,network_name,stlat,stlon,stele,stbur,nrec, &
-                                       ANGULAR_WIDTH_XI_IN_DEGREES,NEX_XI,DT,hdur,it_end, &
-                                       yr,jda,ho,mi,sec,tshift_cmt,t_shift, &
-                                       elat,elon,depth,event_name,cmt_lat, &
-                                       cmt_lon,cmt_depth,cmt_hdur,OUTPUT_FILES, &
-                                       OUTPUT_SEISMOS_ASCII_TEXT,OUTPUT_SEISMOS_SAC_ALPHANUM, &
-                                       OUTPUT_SEISMOS_SAC_BINARY,ROTATE_SEISMOGRAMS_RT, &
-                                       NTSTEP_BETWEEN_OUTPUT_SEISMOS,seismo_offset,seismo_current,&
-                                       SAVE_ALL_SEISMOS_IN_ONE_FILE,USE_BINARY_FOR_LARGE_FILE,MODEL,myrank)
-           enddo
-         endif
-       enddo
+      total_seismos = 0
 
-       write(IMAIN,*)
-       write(IMAIN,*) 'Total number of receivers saved is ',total_seismos,' out of ',nrec
-       write(IMAIN,*)
+      ! loop on all the slices
+      do iproc = 0,NPROCTOT_VAL-1
 
-       if(total_seismos /= nrec) call exit_MPI(myrank,'incorrect total number of receivers saved')
+       ! communicates only with processes which contain local receivers
+       if( islice_num_rec_local(iproc) == 0 ) cycle
 
-       ! create one large file instead of one small file per station to avoid file system overload
-       if(SAVE_ALL_SEISMOS_IN_ONE_FILE) close(IOUT)
+       ! receive except from proc 0, which is me and therefore I already have this value
+       sender = iproc
+       if(iproc /= 0) then
+         call MPI_RECV(nrec_local_received,1,MPI_INTEGER,sender,itag,MPI_COMM_WORLD,msg_status,ier)
+         if(nrec_local_received < 0) call exit_MPI(myrank,'error while receiving local number of receivers')
+       else
+         nrec_local_received = nrec_local
+       endif
+       if (nrec_local_received > 0) then
+         do irec_local = 1,nrec_local_received
+           ! receive except from proc 0, which is myself and therefore I already have these values
+           if(iproc == 0) then
+             ! get global number of that receiver
+             irec = number_receiver_global(irec_local)
+             one_seismogram(:,:) = seismograms(:,irec_local,:)
+           else
+             call MPI_RECV(irec,1,MPI_INTEGER,sender,itag,MPI_COMM_WORLD,msg_status,ier)
+             if(irec < 1 .or. irec > nrec) call exit_MPI(myrank,'error while receiving global receiver number')
+             call MPI_RECV(one_seismogram,NDIM*seismo_current,CUSTOM_MPI_TYPE,sender,itag,MPI_COMM_WORLD,msg_status,ier)
+           endif
 
-    else  ! on the nodes, send the seismograms to the master
-       receiver = 0
-       call MPI_SEND(nrec_local,1,MPI_INTEGER,receiver,itag,MPI_COMM_WORLD,ier)
-       if (nrec_local > 0) then
-         do irec_local = 1,nrec_local
-           ! get global number of that receiver
-           irec = number_receiver_global(irec_local)
-           call MPI_SEND(irec,1,MPI_INTEGER,receiver,itag,MPI_COMM_WORLD,ier)
-           one_seismogram(:,:) = seismograms(:,irec_local,:)
-           call MPI_SEND(one_seismogram,NDIM*seismo_current,CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,ier)
+           total_seismos = total_seismos + 1
+           ! write this seismogram
+           call write_one_seismogram(one_seismogram,irec)
+
          enddo
        endif
+      enddo
+      deallocate(islice_num_rec_local)
+
+      write(IMAIN,*)
+      write(IMAIN,*) 'Total number of receivers saved is ',total_seismos,' out of ',nrec
+      write(IMAIN,*)
+
+      if(total_seismos /= nrec) call exit_MPI(myrank,'incorrect total number of receivers saved')
+
+      ! create one large file instead of one small file per station to avoid file system overload
+      if(SAVE_ALL_SEISMOS_IN_ONE_FILE) close(IOUT)
+
+    else  ! on the nodes, send the seismograms to the master
+      receiver = 0
+      call MPI_SEND(nrec_local,1,MPI_INTEGER,receiver,itag,MPI_COMM_WORLD,ier)
+      if (nrec_local > 0) then
+        do irec_local = 1,nrec_local
+          ! get global number of that receiver
+          irec = number_receiver_global(irec_local)
+          call MPI_SEND(irec,1,MPI_INTEGER,receiver,itag,MPI_COMM_WORLD,ier)
+          one_seismogram(:,:) = seismograms(:,irec_local,:)
+          call MPI_SEND(one_seismogram,NDIM*seismo_current,CUSTOM_MPI_TYPE,receiver,itag,MPI_COMM_WORLD,ier)
+        enddo
+      endif
     endif
 
-    write_time  = MPI_WTIME() - write_time_begin
 
     if(myrank == 0) then
+      write_time  = MPI_WTIME() - write_time_begin
       write(IMAIN,*)
       write(IMAIN,*) 'Writing the seismograms by master proc alone took ',write_time,' seconds'
       write(IMAIN,*)
@@ -362,79 +329,50 @@
 
   end subroutine write_seismograms_to_file
 
-!=====================================================================
+!
+!-------------------------------------------------------------------------------------------------
+!
 
-  subroutine write_one_seismogram(one_seismogram,irec, &
-              station_name,network_name,stlat,stlon,stele,stbur,nrec, &
-              ANGULAR_WIDTH_XI_IN_DEGREES,NEX_XI,DT,hdur,it_end, &
-              yr,jda,ho,mi,sec,tshift_cmt,t_shift,&
-              elat,elon,depth,event_name,cmt_lat,cmt_lon,cmt_depth,cmt_hdur, &
-              OUTPUT_FILES, &
-              OUTPUT_SEISMOS_ASCII_TEXT,OUTPUT_SEISMOS_SAC_ALPHANUM, &
-              OUTPUT_SEISMOS_SAC_BINARY,ROTATE_SEISMOGRAMS_RT, &
-              NTSTEP_BETWEEN_OUTPUT_SEISMOS,seismo_offset,seismo_current, &
-              SAVE_ALL_SEISMOS_IN_ONE_FILE,USE_BINARY_FOR_LARGE_FILE,MODEL,myrank)
+  subroutine write_one_seismogram(one_seismogram,irec)
 
-  implicit none
+  use constants
+  use specfem_par,only: &
+          ANGULAR_WIDTH_XI_IN_DEGREES,NEX_XI, &
+          myrank,nrec, &
+          number_receiver_global, &
+          station_name,network_name,stlat,stlon,stele,stbur, &
+          DT,seismo_offset,seismo_current,it_end, &
+          OUTPUT_SEISMOS_ASCII_TEXT,OUTPUT_SEISMOS_SAC_ALPHANUM, &
+          OUTPUT_SEISMOS_SAC_BINARY,ROTATE_SEISMOGRAMS_RT,NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
+          SAVE_ALL_SEISMOS_IN_ONE_FILE,USE_BINARY_FOR_LARGE_FILE, &
+          MODEL,OUTPUT_FILES
 
-  include "constants.h"
+  use specfem_par,only: &
+          hdur=>t0,yr=>yr_SAC,jda=>jda_SAC,ho=>ho_SAC,mi=>mi_SAC,sec=>sec_SAC, &
+          tshift_cmt=>t_cmt_SAC,t_shift=>t_shift_SAC, &
+          elat=>elat_SAC,elon=>elon_SAC,depth=>depth_SAC, &
+          event_name=>event_name_SAC,cmt_lat=>cmt_lat_SAC,cmt_lon=>cmt_lon_SAC,&
+          cmt_depth=>cmt_depth_SAC,cmt_hdur=>cmt_hdur_SAC
 
-  integer nrec,it_end,NEX_XI
+  implicit none
 
-  integer :: seismo_offset, seismo_current, NTSTEP_BETWEEN_OUTPUT_SEISMOS
-
+  integer :: irec
   real(kind=CUSTOM_REAL), dimension(NDIM,NTSTEP_BETWEEN_OUTPUT_SEISMOS) :: one_seismogram
 
+  ! local parameters
   real(kind=CUSTOM_REAL), dimension(5,NTSTEP_BETWEEN_OUTPUT_SEISMOS) :: seismogram_tmp
+  integer :: iorientation,length_station_name,length_network_name
+  character(len=4) :: chn
+  character(len=256) :: sisname,sisname_big_file
+  character(len=2) :: bic
+  integer :: ior_start,ior_end
+  double precision :: backaz
+  real(kind=CUSTOM_REAL) :: phi,cphi,sphi
+  integer :: isample
 
-  integer myrank
-  double precision hdur,DT,ANGULAR_WIDTH_XI_IN_DEGREES
-
-  character(len=MAX_LENGTH_STATION_NAME), dimension(nrec) :: station_name
-  character(len=MAX_LENGTH_NETWORK_NAME), dimension(nrec) :: network_name
-
-  integer irec,length_station_name,length_network_name
-  integer iorientation
-
-  character(len=4) chn
-  character(len=256) sisname,sisname_big_file
-  character(len=150) OUTPUT_FILES
-
-  ! section added for SAC
-  double precision tshift_cmt,t_shift,elat,elon,depth
-  double precision cmt_lat,cmt_lon,cmt_depth,cmt_hdur
-
-  double precision, dimension(nrec) :: stlat,stlon,stele,stbur
-
-  ! variables for SAC header fields
-  integer yr,jda,ho,mi
-  double precision sec
-  character(len=20) event_name
-  character(len=150) MODEL
-
-  ! flags to determine seismogram type
-  logical OUTPUT_SEISMOS_ASCII_TEXT, OUTPUT_SEISMOS_SAC_ALPHANUM, &
-          OUTPUT_SEISMOS_SAC_BINARY
-  ! flag whether seismograms are ouput for North-East-Z component or Radial-Transverse-Z
-  logical ROTATE_SEISMOGRAMS_RT
-
-  ! save all seismograms in one large combined file instead of one file per seismogram
-  ! to avoid overloading shared non-local file systems such as GPFS for instance
-  logical SAVE_ALL_SEISMOS_IN_ONE_FILE
-  logical USE_BINARY_FOR_LARGE_FILE
-
-! local parameters
-  character(len=2) bic
-  ! variables used for calculation of backazimuth and
-  ! rotation of components if ROTATE_SEISMOGRAMS=.true.
-  integer ior_start,ior_end
-  double precision backaz
-  real(kind=CUSTOM_REAL) phi,cphi,sphi
-  integer isample
-
-  !----------------------------------------------------------------
-
+  ! gets band code
   call band_instrument_code(DT,bic)
+
   if (ROTATE_SEISMOGRAMS_RT) then ! iorientation 1=N,2=E,3=Z,4=R,5=T
     ior_start=3    ! starting from Z
     ior_end  =5    ! ending with T => ZRT
@@ -443,8 +381,6 @@
     ior_end  =3    ! ending with Z => NEZ
   endif
 
-    !do iorientation = 1,NDIM
-    !do iorientation = 1,5                   ! BS BS changed from 3 (NEZ) to 5 (NEZRT) components
   do iorientation = ior_start,ior_end      ! BS BS changed according to ROTATE_SEISMOGRAMS_RT
 
     if(iorientation == 1) then
@@ -528,7 +464,6 @@
 
     ! SAC output format
     if (OUTPUT_SEISMOS_SAC_ALPHANUM .or. OUTPUT_SEISMOS_SAC_BINARY) then
-
       call write_output_SAC(seismogram_tmp,irec, &
               station_name,network_name,stlat,stlon,stele,stbur,nrec, &
               ANGULAR_WIDTH_XI_IN_DEGREES,NEX_XI,DT,hdur,it_end, &
@@ -543,7 +478,6 @@
 
     ! ASCII output format
     if(OUTPUT_SEISMOS_ASCII_TEXT) then
-
       call write_output_ASCII(seismogram_tmp, &
               DT,hdur,OUTPUT_FILES, &
               NTSTEP_BETWEEN_OUTPUT_SEISMOS,seismo_offset,seismo_current, &
@@ -556,108 +490,109 @@
 
   end subroutine write_one_seismogram
 
-!=====================================================================
+!
+!-------------------------------------------------------------------------------------------------
+!
 
 ! write adjoint seismograms to text files
 
- subroutine write_adj_seismograms(seismograms,number_receiver_global, &
+  subroutine write_adj_seismograms(seismograms,number_receiver_global, &
               nrec_local,it,nit_written,DT,NSTEP, &
               NTSTEP_BETWEEN_OUTPUT_SEISMOS,hdur,LOCAL_PATH)
 
- implicit none
+  implicit none
 
- include "constants.h"
+  include "constants.h"
 
- integer nrec_local,NSTEP,NTSTEP_BETWEEN_OUTPUT_SEISMOS,it,nit_written
- integer, dimension(nrec_local) :: number_receiver_global
- real(kind=CUSTOM_REAL), dimension(9,nrec_local,NSTEP) :: seismograms
- double precision hdur,DT
- character(len=150) LOCAL_PATH
+  integer :: nrec_local,NSTEP,NTSTEP_BETWEEN_OUTPUT_SEISMOS,it,nit_written
+  integer, dimension(nrec_local) :: number_receiver_global
+  real(kind=CUSTOM_REAL), dimension(9,nrec_local,NSTEP) :: seismograms
+  double precision :: hdur,DT
+  character(len=150) :: LOCAL_PATH
 
- integer irec,irec_local
- integer iorientation,isample
+  integer :: irec,irec_local
+  integer :: iorientation,isample
 
- character(len=4) chn
- character(len=150) clean_LOCAL_PATH,final_LOCAL_PATH
- character(len=256) sisname
- character(len=2) bic
+  character(len=4) :: chn
+  character(len=150) :: clean_LOCAL_PATH,final_LOCAL_PATH
+  character(len=256) :: sisname
+  character(len=2) :: bic
 
- call band_instrument_code(DT,bic)
+  call band_instrument_code(DT,bic)
 
- do irec_local = 1,nrec_local
+  do irec_local = 1,nrec_local
 
-! get global number of that receiver
-   irec = number_receiver_global(irec_local)
+    ! get global number of that receiver
+    irec = number_receiver_global(irec_local)
 
-   do iorientation = 1,9
-
-     if(iorientation == 1) then
+    do iorientation = 1,9
+      if(iorientation == 1) then
        chn = 'SNN'
-     else if(iorientation == 2) then
+      else if(iorientation == 2) then
        chn = 'SEE'
-     else if(iorientation == 3) then
+      else if(iorientation == 3) then
        chn = 'SZZ'
-     else if(iorientation == 4) then
+      else if(iorientation == 4) then
        chn = 'SNE'
-     else if(iorientation == 5) then
+      else if(iorientation == 5) then
        chn = 'SNZ'
-     else if(iorientation == 6) then
+      else if(iorientation == 6) then
        chn = 'SEZ'
-     else if(iorientation == 7) then
+      else if(iorientation == 7) then
        !chn = 'LHN'
        chn = bic(1:2)//'N'
-     else if(iorientation == 8) then
+      else if(iorientation == 8) then
        chn = bic(1:2)//'E'
-     else if(iorientation == 9) then
+      else if(iorientation == 9) then
        chn = bic(1:2)//'Z'
-     endif
+      endif
 
 
-! create the name of the seismogram file for each slice
-! file name includes the name of the station, the network and the component
-     write(sisname,"(a,i6.6,'.',a,'.',a3,'.sem')") 'S',irec,'NT',chn
+      ! create the name of the seismogram file for each slice
+      ! file name includes the name of the station, the network and the component
+      write(sisname,"(a,i6.6,'.',a,'.',a3,'.sem')") 'S',irec,'NT',chn
 
-! suppress white spaces if any
-   clean_LOCAL_PATH = adjustl(LOCAL_PATH)
+      ! suppress white spaces if any
+      clean_LOCAL_PATH = adjustl(LOCAL_PATH)
 
-! create full final local path
-   final_LOCAL_PATH = clean_LOCAL_PATH(1:len_trim(clean_LOCAL_PATH)) // '/'
+      ! create full final local path
+      final_LOCAL_PATH = clean_LOCAL_PATH(1:len_trim(clean_LOCAL_PATH)) // '/'
 
-! save seismograms in text format with no subsampling.
-! Because we do not subsample the output, this can result in large files
-! if the simulation uses many time steps. However, subsampling the output
-! here would result in a loss of accuracy when one later convolves
-! the results with the source time function
-   if(it <= NTSTEP_BETWEEN_OUTPUT_SEISMOS) then
-      !open new file
-      open(unit=IOUT,file=final_LOCAL_PATH(1:len_trim(final_LOCAL_PATH))//sisname(1:len_trim(sisname)),&
-           status='unknown',action='write')
-   else if(it > NTSTEP_BETWEEN_OUTPUT_SEISMOS) then
-      !append to existing file
-      open(unit=IOUT,file=final_LOCAL_PATH(1:len_trim(final_LOCAL_PATH))//sisname(1:len_trim(sisname)),&
-           status='old',position='append',action='write')
-   endif
-! make sure we never write more than the maximum number of time steps
-! subtract half duration of the source to make sure travel time is correct
-     do isample = nit_written+1,min(it,NSTEP)
-! distinguish between single and double precision for reals
-       if(CUSTOM_REAL == SIZE_REAL) then
-         write(IOUT,*) sngl(dble(isample-1)*DT - hdur),' ',seismograms(iorientation,irec_local,isample-nit_written)
-       else
-         write(IOUT,*) dble(isample-1)*DT - hdur,' ',seismograms(iorientation,irec_local,isample-nit_written)
-       endif
-     enddo
+      ! save seismograms in text format with no subsampling.
+      ! Because we do not subsample the output, this can result in large files
+      ! if the simulation uses many time steps. However, subsampling the output
+      ! here would result in a loss of accuracy when one later convolves
+      ! the results with the source time function
+      if(it <= NTSTEP_BETWEEN_OUTPUT_SEISMOS) then
+        !open new file
+        open(unit=IOUT,file=final_LOCAL_PATH(1:len_trim(final_LOCAL_PATH))//sisname(1:len_trim(sisname)),&
+              status='unknown',action='write')
+      else if(it > NTSTEP_BETWEEN_OUTPUT_SEISMOS) then
+        !append to existing file
+        open(unit=IOUT,file=final_LOCAL_PATH(1:len_trim(final_LOCAL_PATH))//sisname(1:len_trim(sisname)),&
+              status='old',position='append',action='write')
+      endif
+      ! make sure we never write more than the maximum number of time steps
+      ! subtract half duration of the source to make sure travel time is correct
+      do isample = nit_written+1,min(it,NSTEP)
+        ! distinguish between single and double precision for reals
+        if(CUSTOM_REAL == SIZE_REAL) then
+          write(IOUT,*) sngl(dble(isample-1)*DT - hdur),' ',seismograms(iorientation,irec_local,isample-nit_written)
+        else
+          write(IOUT,*) dble(isample-1)*DT - hdur,' ',seismograms(iorientation,irec_local,isample-nit_written)
+        endif
+      enddo
 
-     close(IOUT)
+      close(IOUT)
+    enddo
+  enddo
 
-     enddo
+  end subroutine write_adj_seismograms
 
- enddo
+!
+!-------------------------------------------------------------------------------------------------
+!
 
- end subroutine write_adj_seismograms
-
-!=====================================================================
-
  subroutine band_instrument_code(DT,bic)
   ! This subroutine is to choose the appropriate band and instrument codes for channel names of seismograms
   ! based on the IRIS convention (first two letters of channel codes which were LH(Z/E/N) previously).