[cig-commits] r22782 - in seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src: cuda specfem3D

danielpeter at geodynamics.org danielpeter at geodynamics.org
Fri Sep 13 04:23:46 PDT 2013


Author: danielpeter
Date: 2013-09-13 04:23:45 -0700 (Fri, 13 Sep 2013)
New Revision: 22782

Modified:
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_scalar_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_vector_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/update_displacement_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_scalar.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_add_sources.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_coupling.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_acoustic_calling_routine.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_crust_mantle.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_forward_arrays.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/update_displacement_Newmark.f90
Log:
updates CUDA routines for forward & backward wavefields

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_scalar_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_scalar_cuda.cu	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_scalar_cuda.cu	2013-09-13 11:23:45 UTC (rev 22782)
@@ -48,11 +48,16 @@
                                                      int* d_ibool_interfaces) {
 
   int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+  int iglob,iloc;
 
   for( int iinterface=0; iinterface < num_interfaces; iinterface++) {
     if(id<d_nibool_interfaces[iinterface]) {
-      d_send_potential_dot_dot_buffer[(id + max_nibool_interfaces*iinterface)] =
-        d_potential_dot_dot_acoustic[(d_ibool_interfaces[id+max_nibool_interfaces*iinterface]-1)];
+
+      iloc = id + max_nibool_interfaces*iinterface;
+      iglob = d_ibool_interfaces[iloc] - 1;
+
+      // fills buffer
+      d_send_potential_dot_dot_buffer[iloc] = d_potential_dot_dot_acoustic[iglob];
     }
   }
 
@@ -77,13 +82,10 @@
 
   int blocksize = BLOCKSIZE_TRANSFER;
   int size_padded = ((int)ceil(((double)(mp->max_nibool_interfaces_oc))/((double)blocksize)))*blocksize;
-  int num_blocks_x = size_padded/blocksize;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
 
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
@@ -94,20 +96,29 @@
                                                            mp->max_nibool_interfaces_oc,
                                                            mp->d_nibool_interfaces_outer_core,
                                                            mp->d_ibool_interfaces_outer_core);
+
+    print_CUDA_error_if_any(cudaMemcpy(send_potential_dot_dot_buffer,mp->d_send_accel_buffer_outer_core,
+                                       (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw),
+                                       cudaMemcpyDeviceToHost),98000);
+
   }
   else if(*FORWARD_OR_ADJOINT == 3) {
+    // debug
+    DEBUG_EMPTY_BACKWARD();
+
     prepare_boundary_potential_on_device<<<grid,threads>>>(mp->d_b_accel_outer_core,
-                                                           mp->d_send_accel_buffer_outer_core,
+                                                           mp->d_b_send_accel_buffer_outer_core,
                                                            mp->num_interfaces_outer_core,
                                                            mp->max_nibool_interfaces_oc,
                                                            mp->d_nibool_interfaces_outer_core,
                                                            mp->d_ibool_interfaces_outer_core);
+
+    print_CUDA_error_if_any(cudaMemcpy(send_potential_dot_dot_buffer,mp->d_b_send_accel_buffer_outer_core,
+                                       (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw),
+                                       cudaMemcpyDeviceToHost),98001);
+
   }
 
-  print_CUDA_error_if_any(cudaMemcpy(send_potential_dot_dot_buffer,mp->d_send_accel_buffer_outer_core,
-                                     (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw),
-                                     cudaMemcpyDeviceToHost),98000);
-
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("prepare_boundary_kernel");
 #endif
@@ -125,11 +136,17 @@
                                                       int* d_ibool_interfaces) {
 
   int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+  int iglob,iloc;
 
   for( int iinterface=0; iinterface < num_interfaces; iinterface++) {
     if(id<d_nibool_interfaces[iinterface]) {
-      atomicAdd(&d_potential_dot_dot_acoustic[(d_ibool_interfaces[id+max_nibool_interfaces*iinterface]-1)],
-                d_send_potential_dot_dot_buffer[(id + max_nibool_interfaces*iinterface)]);
+
+      iloc = id + max_nibool_interfaces*iinterface;
+
+      iglob = d_ibool_interfaces[iloc]-1;
+
+      // assembles values
+      atomicAdd(&d_potential_dot_dot_acoustic[iglob],d_send_potential_dot_dot_buffer[iloc]);
     }
   }
 }
@@ -151,25 +168,22 @@
   // checks if anything to do
   if( mp->num_interfaces_outer_core == 0 ) return;
 
-  // copies scalar buffer onto GPU
-  cudaMemcpy(mp->d_send_accel_buffer_outer_core, buffer_recv_scalar,
-             (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw),
-             cudaMemcpyHostToDevice);
-
   // assembles on GPU
   int blocksize = BLOCKSIZE_TRANSFER;
   int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_oc)/((double)blocksize)))*blocksize;
-  int num_blocks_x = size_padded/blocksize;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
 
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
   if(*FORWARD_OR_ADJOINT == 1) {
+    // copies scalar buffer onto GPU
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_send_accel_buffer_outer_core, buffer_recv_scalar,
+                                       (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw),
+                                       cudaMemcpyHostToDevice),99000);
+
     //assemble forward field
     assemble_boundary_potential_on_device<<<grid,threads>>>(mp->d_accel_outer_core,
                                                             mp->d_send_accel_buffer_outer_core,
@@ -179,9 +193,17 @@
                                                             mp->d_ibool_interfaces_outer_core);
   }
   else if(*FORWARD_OR_ADJOINT == 3) {
+    // debug
+    DEBUG_EMPTY_BACKWARD();
+
+    // copies scalar buffer onto GPU
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_accel_buffer_outer_core, buffer_recv_scalar,
+                                       (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw),
+                                       cudaMemcpyHostToDevice),99001);
+
     //assemble reconstructed/backward field
     assemble_boundary_potential_on_device<<<grid,threads>>>(mp->d_b_accel_outer_core,
-                                                            mp->d_send_accel_buffer_outer_core,
+                                                            mp->d_b_send_accel_buffer_outer_core,
                                                             mp->num_interfaces_outer_core,
                                                             mp->max_nibool_interfaces_oc,
                                                             mp->d_nibool_interfaces_outer_core,

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_vector_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_vector_cuda.cu	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_vector_cuda.cu	2013-09-13 11:23:45 UTC (rev 22782)
@@ -47,15 +47,18 @@
                                                  int* d_ibool_interfaces) {
 
   int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
-  int iglob;
+  int iglob,iloc;
 
   for( int iinterface=0; iinterface < num_interfaces; iinterface++) {
     if(id<d_nibool_interfaces[iinterface]) {
-      iglob = d_ibool_interfaces[id+max_nibool_interfaces*iinterface]-1;
+
+      iloc = id + max_nibool_interfaces*iinterface;
+      iglob = d_ibool_interfaces[iloc]-1;
+
       // fills buffer
-      d_send_accel_buffer[3*(id + max_nibool_interfaces*iinterface)] = d_accel[3*iglob];
-      d_send_accel_buffer[3*(id + max_nibool_interfaces*iinterface)+1] = d_accel[3*iglob + 1];
-      d_send_accel_buffer[3*(id + max_nibool_interfaces*iinterface)+2] = d_accel[3*iglob + 2];
+      d_send_accel_buffer[3*iloc] = d_accel[3*iglob];
+      d_send_accel_buffer[3*iloc + 1] = d_accel[3*iglob + 1];
+      d_send_accel_buffer[3*iloc + 2] = d_accel[3*iglob + 2];
     }
   }
 
@@ -68,12 +71,15 @@
 extern "C"
 void FC_FUNC_(transfer_boun_accel_from_device,
               TRANSFER_BOUN_ACCEL_FROM_DEVICE)(long* Mesh_pointer_f,
-                                                  realw* send_accel_buffer,
-                                                  int* IREGION,
-                                                  int* FORWARD_OR_ADJOINT){
+                                               realw* send_accel_buffer,
+                                               int* IREGION,
+                                               int* FORWARD_OR_ADJOINT){
   TRACE("transfer_boun_accel_from_device");
-  int blocksize,size_padded,num_blocks_x,num_blocks_y;
 
+  int blocksize,size_padded;
+  int num_blocks_x,num_blocks_y;
+  dim3 grid,threads;
+
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
   // crust/mantle region
@@ -82,15 +88,12 @@
 
       blocksize = BLOCKSIZE_TRANSFER;
       size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_cm)/((double)blocksize)))*blocksize;
-      num_blocks_x = size_padded/blocksize;
-      num_blocks_y = 1;
-      while(num_blocks_x > MAXIMUM_GRID_DIM) {
-        num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-        num_blocks_y = num_blocks_y*2;
-      }
-      dim3 grid(num_blocks_x,num_blocks_y);
-      dim3 threads(blocksize,1,1);
 
+      get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
+      grid = dim3(num_blocks_x,num_blocks_y);
+      threads = dim3(blocksize,1,1);
+
       if(*FORWARD_OR_ADJOINT == 1) {
         prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel_crust_mantle,
                                                            mp->d_send_accel_buffer_crust_mantle,
@@ -98,20 +101,30 @@
                                                            mp->max_nibool_interfaces_cm,
                                                            mp->d_nibool_interfaces_crust_mantle,
                                                            mp->d_ibool_interfaces_crust_mantle);
+
+        // copies buffer to CPU
+        print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer_crust_mantle,
+                                           NDIM*mp->max_nibool_interfaces_cm*mp->num_interfaces_crust_mantle*sizeof(realw),
+                                           cudaMemcpyDeviceToHost),41000);
+
       }
       else if(*FORWARD_OR_ADJOINT == 3) {
+        // debug
+        DEBUG_EMPTY_BACKWARD();
+
         prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
-                                                           mp->d_send_accel_buffer_crust_mantle,
+                                                           mp->d_b_send_accel_buffer_crust_mantle,
                                                            mp->num_interfaces_crust_mantle,
                                                            mp->max_nibool_interfaces_cm,
                                                            mp->d_nibool_interfaces_crust_mantle,
                                                            mp->d_ibool_interfaces_crust_mantle);
+        // copies buffer to CPU
+        print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_b_send_accel_buffer_crust_mantle,
+                                           NDIM*mp->max_nibool_interfaces_cm*mp->num_interfaces_crust_mantle*sizeof(realw),
+                                           cudaMemcpyDeviceToHost),41001);
+
       }
 
-      // copies buffer to CPU
-      cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer_crust_mantle,
-                 3*mp->max_nibool_interfaces_cm*mp->num_interfaces_crust_mantle*sizeof(realw),
-                 cudaMemcpyDeviceToHost);
 
     }
   }
@@ -122,15 +135,12 @@
 
       blocksize = BLOCKSIZE_TRANSFER;
       size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ic)/((double)blocksize)))*blocksize;
-      num_blocks_x = size_padded/blocksize;
-      num_blocks_y = 1;
-      while(num_blocks_x > MAXIMUM_GRID_DIM) {
-        num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-        num_blocks_y = num_blocks_y*2;
-      }
-      dim3 grid(num_blocks_x,num_blocks_y);
-      dim3 threads(blocksize,1,1);
 
+      get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
+      grid = dim3(num_blocks_x,num_blocks_y);
+      threads = dim3(blocksize,1,1);
+
       if(*FORWARD_OR_ADJOINT == 1) {
         prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel_inner_core,
                                                            mp->d_send_accel_buffer_inner_core,
@@ -138,21 +148,29 @@
                                                            mp->max_nibool_interfaces_ic,
                                                            mp->d_nibool_interfaces_inner_core,
                                                            mp->d_ibool_interfaces_inner_core);
+
+        // copies buffer to CPU
+        print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer_inner_core,
+                                           NDIM*mp->max_nibool_interfaces_ic*mp->num_interfaces_inner_core*sizeof(realw),
+                                           cudaMemcpyDeviceToHost),41000);
+
       }
       else if(*FORWARD_OR_ADJOINT == 3) {
+        // debug
+        DEBUG_EMPTY_BACKWARD();
+
         prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel_inner_core,
-                                                           mp->d_send_accel_buffer_inner_core,
+                                                           mp->d_b_send_accel_buffer_inner_core,
                                                            mp->num_interfaces_inner_core,
                                                            mp->max_nibool_interfaces_ic,
                                                            mp->d_nibool_interfaces_inner_core,
                                                            mp->d_ibool_interfaces_inner_core);
+        // copies buffer to CPU
+        print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_b_send_accel_buffer_inner_core,
+                                           NDIM*mp->max_nibool_interfaces_ic*mp->num_interfaces_inner_core*sizeof(realw),
+                                           cudaMemcpyDeviceToHost),41001);
       }
 
-      // copies buffer to CPU
-      cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer_inner_core,
-                 3*mp->max_nibool_interfaces_ic*mp->num_interfaces_inner_core*sizeof(realw),
-                 cudaMemcpyDeviceToHost);
-
     }
   }
 
@@ -170,15 +188,18 @@
                                                   int* d_ibool_interfaces) {
 
   int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
-  int iglob;
+  int iglob,iloc;
 
   for( int iinterface=0; iinterface < num_interfaces; iinterface++) {
     if(id < d_nibool_interfaces[iinterface]) {
-      iglob = d_ibool_interfaces[id + max_nibool_interfaces*iinterface]-1;
+
+      iloc = id + max_nibool_interfaces*iinterface;
+      iglob = d_ibool_interfaces[iloc]-1;
+
       // assembles acceleration: adds contributions from buffer array
-      atomicAdd(&d_accel[3*iglob],d_send_accel_buffer[3*(id + max_nibool_interfaces*iinterface)]);
-      atomicAdd(&d_accel[3*iglob + 1],d_send_accel_buffer[3*(id + max_nibool_interfaces*iinterface)+1]);
-      atomicAdd(&d_accel[3*iglob + 2],d_send_accel_buffer[3*(id + max_nibool_interfaces*iinterface)+2]);
+      atomicAdd(&d_accel[3*iglob],d_send_accel_buffer[3*iloc]);
+      atomicAdd(&d_accel[3*iglob + 1],d_send_accel_buffer[3*iloc + 1]);
+      atomicAdd(&d_accel[3*iglob + 2],d_send_accel_buffer[3*iloc + 2]);
     }
   }
 }
@@ -193,32 +214,31 @@
                                               int* IREGION,
                                               int* FORWARD_OR_ADJOINT) {
   TRACE("transfer_asmbl_accel_to_device");
-  int blocksize,size_padded,num_blocks_x,num_blocks_y;
 
+  int blocksize,size_padded;
+  int num_blocks_x,num_blocks_y;
+  dim3 grid,threads;
+
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   // crust/mantle region
   if( *IREGION == IREGION_CRUST_MANTLE ){
     if( mp->num_interfaces_crust_mantle > 0 ){
-      // copies vector buffer values to GPU
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_send_accel_buffer_crust_mantle, buffer_recv_vector,
-                                         NDIM*(mp->max_nibool_interfaces_cm)*(mp->num_interfaces_crust_mantle)*sizeof(realw),
-                                         cudaMemcpyHostToDevice),41000);
-
       // assembles values
       blocksize = BLOCKSIZE_TRANSFER;
       size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_cm)/((double)blocksize)))*blocksize;
-      num_blocks_x = size_padded/blocksize;
-      num_blocks_y = 1;
-      while(num_blocks_x > MAXIMUM_GRID_DIM) {
-        num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-        num_blocks_y = num_blocks_y*2;
-      }
 
-      dim3 grid(num_blocks_x,num_blocks_y);
-      dim3 threads(blocksize,1,1);
+      get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
 
+      grid = dim3(num_blocks_x,num_blocks_y);
+      threads = dim3(blocksize,1,1);
+
       if(*FORWARD_OR_ADJOINT == 1) {
+        // copies vector buffer values to GPU
+        print_CUDA_error_if_any(cudaMemcpy(mp->d_send_accel_buffer_crust_mantle, buffer_recv_vector,
+                                           NDIM*(mp->max_nibool_interfaces_cm)*(mp->num_interfaces_crust_mantle)*sizeof(realw),
+                                           cudaMemcpyHostToDevice),41000);
+
         //assemble forward accel
         assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel_crust_mantle,
                                                             mp->d_send_accel_buffer_crust_mantle,
@@ -228,9 +248,17 @@
                                                             mp->d_ibool_interfaces_crust_mantle);
       }
       else if(*FORWARD_OR_ADJOINT == 3) {
+        // debug
+        DEBUG_EMPTY_BACKWARD();
+
+        // copies vector buffer values to GPU
+        print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_accel_buffer_crust_mantle, buffer_recv_vector,
+                                           NDIM*(mp->max_nibool_interfaces_cm)*(mp->num_interfaces_crust_mantle)*sizeof(realw),
+                                           cudaMemcpyHostToDevice),41000);
+
         //assemble adjoint accel
         assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
-                                                            mp->d_send_accel_buffer_crust_mantle,
+                                                            mp->d_b_send_accel_buffer_crust_mantle,
                                                             mp->num_interfaces_crust_mantle,
                                                             mp->max_nibool_interfaces_cm,
                                                             mp->d_nibool_interfaces_crust_mantle,
@@ -246,25 +274,21 @@
   // inner core region
   if( *IREGION == IREGION_INNER_CORE ){
     if( mp->num_interfaces_inner_core > 0 ){
-      // copies buffer values to GPU
-      print_CUDA_error_if_any(cudaMemcpy(mp->d_send_accel_buffer_inner_core, buffer_recv_vector,
-                                         NDIM*(mp->max_nibool_interfaces_ic)*(mp->num_interfaces_inner_core)*sizeof(realw),
-                                         cudaMemcpyHostToDevice),41001);
-
       // assembles values
       blocksize = BLOCKSIZE_TRANSFER;
       size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ic)/((double)blocksize)))*blocksize;
-      num_blocks_x = size_padded/blocksize;
-      num_blocks_y = 1;
-      while(num_blocks_x > MAXIMUM_GRID_DIM) {
-        num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-        num_blocks_y = num_blocks_y*2;
-      }
 
-      dim3 grid(num_blocks_x,num_blocks_y);
-      dim3 threads(blocksize,1,1);
+      get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
 
+      grid = dim3(num_blocks_x,num_blocks_y);
+      threads = dim3(blocksize,1,1);
+
       if(*FORWARD_OR_ADJOINT == 1) {
+        // copies buffer values to GPU
+        print_CUDA_error_if_any(cudaMemcpy(mp->d_send_accel_buffer_inner_core, buffer_recv_vector,
+                                           NDIM*(mp->max_nibool_interfaces_ic)*(mp->num_interfaces_inner_core)*sizeof(realw),
+                                           cudaMemcpyHostToDevice),41001);
+
         //assemble forward accel
         assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel_inner_core,
                                                             mp->d_send_accel_buffer_inner_core,
@@ -274,9 +298,17 @@
                                                             mp->d_ibool_interfaces_inner_core);
       }
       else if(*FORWARD_OR_ADJOINT == 3) {
+        // debug
+        DEBUG_EMPTY_BACKWARD();
+
+        // copies buffer values to GPU
+        print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_accel_buffer_inner_core, buffer_recv_vector,
+                                           NDIM*(mp->max_nibool_interfaces_ic)*(mp->num_interfaces_inner_core)*sizeof(realw),
+                                           cudaMemcpyHostToDevice),41001);
+
         //assemble adjoint accel
         assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel_inner_core,
-                                                            mp->d_send_accel_buffer_inner_core,
+                                                            mp->d_b_send_accel_buffer_inner_core,
                                                             mp->num_interfaces_inner_core,
                                                             mp->max_nibool_interfaces_ic,
                                                             mp->d_nibool_interfaces_inner_core,

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu	2013-09-13 11:23:45 UTC (rev 22782)
@@ -212,6 +212,45 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+void synchronize_cuda(){
+#if CUDA_VERSION >= 4000
+    cudaDeviceSynchronize();
+#else
+    cudaThreadSynchronize();
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void synchronize_mpi(){
+#ifdef WITH_MPI
+    MPI_Barrier(MPI_COMM_WORLD);
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+void get_blocks_xy(int num_blocks,int* num_blocks_x,int* num_blocks_y) {
+
+// Initially sets the blocks_x to be the num_blocks, and adds rows as needed (block size limit of 65535).
+// If an additional row is added, the row length is cut in
+// half. If the block count is odd, there will be 1 too many blocks,
+// which must be managed at runtime with an if statement.
+
+  *num_blocks_x = num_blocks;
+  *num_blocks_y = 1;
+
+  while(*num_blocks_x > MAXIMUM_GRID_DIM) {
+    *num_blocks_x = (int) ceil(*num_blocks_x * 0.5f);
+    *num_blocks_y = *num_blocks_y * 2;
+  }
+
+  return;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
 void get_free_memory(double* free_db, double* used_db, double* total_db) {
 
   // gets memory usage in byte
@@ -285,6 +324,56 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+// Auxiliary functions
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/*
+__global__ void memset_to_realw_kernel(realw* array, int size, realw value){
+
+  unsigned int tid = threadIdx.x;
+  unsigned int bx = blockIdx.y*gridDim.x+blockIdx.x;
+  unsigned int i = tid + bx*blockDim.x;
+
+  if( i < size ){
+    array[i] = *value;
+  }
+}
+*/
+
+/* ----------------------------------------------------------------------------------------------- */
+
+realw get_device_array_maximum_value(realw* array, int size){
+
+// get maximum of array on GPU by copying over to CPU and handle it there
+
+  realw max = 0.0f;
+
+  // checks if anything to do
+  if( size > 0 ){
+    realw* h_array;
+
+    // explicitly wait for cuda kernels to finish
+    // (cudaMemcpy implicitly synchronizes all other cuda operations)
+    synchronize_cuda();
+
+    h_array = (realw*)calloc(size,sizeof(realw));
+    print_CUDA_error_if_any(cudaMemcpy(h_array,array,sizeof(realw)*size,cudaMemcpyDeviceToHost),33001);
+
+    // finds maximum value in array
+    max = h_array[0];
+    for( int i=1; i < size; i++){
+      if( abs(h_array[i]) > max ) max = abs(h_array[i]);
+    }
+    free(h_array);
+  }
+  return max;
+}
+
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
 // scalar arrays (acoustic/fluid outer core)
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -340,7 +429,7 @@
 void FC_FUNC_(check_norm_acoustic_from_device,
               CHECK_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
                                                   long* Mesh_pointer_f,
-                                                  int* SIMULATION_TYPE) {
+                                                  int* FORWARD_OR_ADJOINT) {
 
 TRACE("check_norm_acoustic_from_device");
   //double start_time = get_time();
@@ -390,22 +479,19 @@
   int size = mp->NGLOB_OUTER_CORE;
 
   int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
-  int num_blocks_x = size_padded/blocksize;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
 
-  h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
-  cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
+  int num_blocks_x,num_blocks_y;
+  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
 
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-  if(*SIMULATION_TYPE == 1 ){
+  h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
+  cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
+
+  if(*FORWARD_OR_ADJOINT == 1 ){
     get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_displ_outer_core,size,d_max);
-  }else if(*SIMULATION_TYPE == 3 ){
+  }else if(*FORWARD_OR_ADJOINT == 3 ){
     get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_b_displ_outer_core,size,d_max);
   }
 
@@ -513,7 +599,7 @@
 void FC_FUNC_(check_norm_elastic_from_device,
               CHECK_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
                                               long* Mesh_pointer_f,
-                                              int* SIMULATION_TYPE) {
+                                              int* FORWARD_OR_ADJOINT) {
 
   TRACE("check_norm_elastic_from_device");
   //double start_time = get_time();
@@ -522,7 +608,6 @@
 
   realw max,max_crust_mantle,max_inner_core;
   realw *d_max;
-  int num_blocks_x,num_blocks_y;
   int size,size_padded;
   dim3 grid,threads;
 
@@ -535,22 +620,19 @@
   size = mp->NGLOB_CRUST_MANTLE;
 
   size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
-  num_blocks_x = size_padded/blocksize;
-  num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
 
-  h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
-  cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
+  int num_blocks_x,num_blocks_y;
+  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
 
   grid = dim3(num_blocks_x,num_blocks_y);
   threads = dim3(blocksize,1,1);
 
-  if(*SIMULATION_TYPE == 1 ){
+  h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
+  cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
+
+  if(*FORWARD_OR_ADJOINT == 1 ){
     get_maximum_vector_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,size,d_max);
-  }else if(*SIMULATION_TYPE == 3 ){
+  }else if(*FORWARD_OR_ADJOINT == 3 ){
     get_maximum_vector_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,size,d_max);
   }
 
@@ -573,22 +655,18 @@
   size = mp->NGLOB_INNER_CORE;
 
   size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
-  num_blocks_x = size_padded/blocksize;
-  num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
 
-  h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
-  cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
+  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
 
   grid = dim3(num_blocks_x,num_blocks_y);
   threads = dim3(blocksize,1,1);
 
-  if(*SIMULATION_TYPE == 1 ){
+  h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
+  cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
+
+  if(*FORWARD_OR_ADJOINT == 1 ){
     get_maximum_vector_kernel<<<grid,threads>>>(mp->d_displ_inner_core,size,d_max);
-  }else if(*SIMULATION_TYPE == 3 ){
+  }else if(*FORWARD_OR_ADJOINT == 3 ){
     get_maximum_vector_kernel<<<grid,threads>>>(mp->d_b_displ_inner_core,size,d_max);
   }
 
@@ -632,10 +710,9 @@
 
   realw max,max_eps;
   realw *d_max;
-  int num_blocks_x,num_blocks_y;
+  int num_blocks_x, num_blocks_y;
   int size,size_padded;
-  dim3 grid;
-  dim3 threads;
+  dim3 grid,threads;
 
   // launch simple reduction kernel
   realw* h_max;
@@ -645,19 +722,15 @@
   size = NGLL3*(mp->NSPEC_CRUST_MANTLE_STRAIN_ONLY);
 
   size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
-  num_blocks_x = size_padded/blocksize;
-  num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
 
-  h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
-  cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
+  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
 
   grid = dim3(num_blocks_x,num_blocks_y);
   threads = dim3(blocksize,1,1);
 
+  h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
+  cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
+
   max = 0.0f;
 
   // determines max for: eps_trace_over_3_crust_mantle
@@ -681,19 +754,15 @@
   size = NGLL3*(mp->NSPEC_CRUST_MANTLE);
 
   size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
-  num_blocks_x = size_padded/blocksize;
-  num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
 
-  h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
-  cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
+  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
 
   grid = dim3(num_blocks_x,num_blocks_y);
   threads = dim3(blocksize,1,1);
 
+  h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
+  cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
+
   max_eps = 0.0f;
 
   // determines max for: epsilondev_xx_crust_mantle

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu	2013-09-13 11:23:45 UTC (rev 22782)
@@ -86,12 +86,12 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
-void FC_FUNC_(compute_add_sources_el_cuda,
-              COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f,
-                                           int* NSOURCESf,
-                                           double* h_stf_pre_compute) {
+void FC_FUNC_(compute_add_sources_cuda,
+              COMPUTE_ADD_SOURCES_CUDA)(long* Mesh_pointer_f,
+                                        int* NSOURCESf,
+                                        double* h_stf_pre_compute) {
 
-  TRACE("compute_add_sources_el_cuda");
+  TRACE("compute_add_sources_cuda");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
@@ -100,18 +100,15 @@
 
   int NSOURCES = *NSOURCESf;
 
-  int num_blocks_x = NSOURCES;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(NSOURCES,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(NGLLX,NGLLX,NGLLX);
 
   // copies source time function buffer values to GPU
   print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
-                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
+                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),71018);
 
   // adds source contributions
   compute_add_sources_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
@@ -124,7 +121,7 @@
                                                NSOURCES);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("compute_add_sources_kernel");
+  exit_on_cuda_error("compute_add_sources_cuda");
 #endif
 }
 
@@ -135,11 +132,13 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
-void FC_FUNC_(compute_add_sources_el_s3_cuda,
-              COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer_f,
-                                              int* NSOURCESf,
-                                              double* h_stf_pre_compute) {
-  TRACE("compute_add_sources_el_s3_cuda");
+void FC_FUNC_(compute_add_sources_backward_cuda,
+              COMPUTE_ADD_SOURCES_BACKWARD_CUDA)(long* Mesh_pointer_f,
+                                                 int* NSOURCESf,
+                                                 double* h_stf_pre_compute) {
+  TRACE("compute_add_sources_backward_cuda");
+  // debug
+  DEBUG_EMPTY_BACKWARD();
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
@@ -148,18 +147,15 @@
 
   int NSOURCES = *NSOURCESf;
 
-  int num_blocks_x = NSOURCES;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(NSOURCES,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(NGLLX,NGLLX,NGLLX);
 
   // copies source time function buffer values to GPU
   print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
-                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),19);
+                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),71019);
 
   compute_add_sources_kernel<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
                                                mp->d_ibool_crust_mantle,
@@ -171,7 +167,7 @@
                                                NSOURCES);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("compute_add_sources_el_s3_cuda");
+  exit_on_cuda_error("compute_add_sources_backward_cuda");
 #endif
 }
 
@@ -182,13 +178,13 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-__global__ void add_sources_el_SIM_TYPE_2_OR_3_kernel(realw* accel,
-                                                     int nrec,
-                                                     realw* adj_sourcearrays,
-                                                     int* ibool,
-                                                     int* ispec_selected_rec,
-                                                     int* pre_computed_irec,
-                                                     int nadj_rec_local) {
+__global__ void compute_add_sources_adjoint_cuda_kernel(realw* accel,
+                                                        int nrec,
+                                                        realw* adj_sourcearrays,
+                                                        int* ibool,
+                                                        int* ispec_selected_rec,
+                                                        int* pre_computed_irec,
+                                                        int nadj_rec_local) {
 
   int ispec,iglob;
   int irec,i,j,k;
@@ -220,28 +216,24 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
-void FC_FUNC_(add_sources_el_sim_type_2_or_3,
-              ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
-                                              int* nrec,
-                                              realw* h_adj_sourcearrays,
-                                              int* h_islice_selected_rec,
-                                              int* h_ispec_selected_rec,
-                                              int* time_index) {
+void FC_FUNC_(compute_add_sources_adjoint_cuda,
+              COMPUTE_ADD_SOURCES_ADJOINT_CUDA)(long* Mesh_pointer,
+                                                int* nrec,
+                                                realw* h_adj_sourcearrays,
+                                                int* h_islice_selected_rec,
+                                                int* h_ispec_selected_rec,
+                                                int* time_index) {
 
-  TRACE("add_sources_el_sim_type_2_or_3");
+  TRACE("compute_add_sources_adjoint_cuda");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   // check if anything to do
   if( mp->nadj_rec_local == 0 ) return;
 
-  // make sure grid dimension is less than MAXIMUM_GRID_DIM in x dimension
-  int num_blocks_x = mp->nadj_rec_local;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->nadj_rec_local,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y,1);
   dim3 threads(NGLLX,NGLLX,NGLLX);
 
@@ -284,23 +276,23 @@
   if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
 
   // copies extracted array values onto GPU
-  cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
-             (mp->nadj_rec_local)*NDIM*NGLL3*sizeof(realw),cudaMemcpyHostToDevice);
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
+                                     (mp->nadj_rec_local)*NDIM*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),71000);
 
 
   // the irec_local variable needs to be precomputed (as
   // h_pre_comp..), because normally it is in the loop updating accel,
   // and due to how it's incremented, it cannot be parallelized
-  add_sources_el_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
-                                                         *nrec,
-                                                         mp->d_adj_sourcearrays,
-                                                         mp->d_ibool_crust_mantle,
-                                                         mp->d_ispec_selected_rec,
-                                                         mp->d_pre_computed_irec,
-                                                         mp->nadj_rec_local);
+  compute_add_sources_adjoint_cuda_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
+                                                            *nrec,
+                                                            mp->d_adj_sourcearrays,
+                                                            mp->d_ibool_crust_mantle,
+                                                            mp->d_ispec_selected_rec,
+                                                            mp->d_pre_computed_irec,
+                                                            mp->nadj_rec_local);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("add_sources_SIM_TYPE_2_OR_3_kernel");
+  exit_on_cuda_error("compute_add_sources_adjoint_cuda");
 #endif
 }
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_coupling_cuda.cu	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_coupling_cuda.cu	2013-09-13 11:23:45 UTC (rev 22782)
@@ -56,6 +56,7 @@
 
   int i = threadIdx.x;
   int j = threadIdx.y;
+
   int iface = blockIdx.x + gridDim.x*blockIdx.y;
 
   int k,k_corresp,iglob_cm,iglob_oc,ispec,ispec_selected;
@@ -106,6 +107,60 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+extern "C"
+void FC_FUNC_(compute_coupling_fluid_cmb_cuda,
+              COMPUTE_COUPLING_FLUID_CMB_CUDA)(long* Mesh_pointer_f,
+                                               int* FORWARD_OR_ADJOINT) {
+
+  TRACE("compute_coupling_fluid_cmb_cuda");
+  //double start_time = get_time();
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->nspec2D_top_outer_core,&num_blocks_x,&num_blocks_y);
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(NGLLX,NGLLX,1);
+
+  // launches GPU kernel
+  if( *FORWARD_OR_ADJOINT == 1 ){
+    compute_coupling_fluid_CMB_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
+                                                        mp->d_accel_outer_core,
+                                                        mp->d_ibool_crust_mantle,
+                                                        mp->d_ibelm_bottom_crust_mantle,
+                                                        mp->d_normal_top_outer_core,
+                                                        mp->d_jacobian2D_top_outer_core,
+                                                        mp->d_wgllwgll_xy,
+                                                        mp->d_ibool_outer_core,
+                                                        mp->d_ibelm_top_outer_core,
+                                                        mp->nspec2D_top_outer_core);
+  }else if( *FORWARD_OR_ADJOINT == 3 ){
+    // debug
+    DEBUG_EMPTY_BACKWARD();
+
+    // adjoint simulations
+    compute_coupling_fluid_CMB_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,
+                                                        mp->d_b_accel_outer_core,
+                                                        mp->d_ibool_crust_mantle,
+                                                        mp->d_ibelm_bottom_crust_mantle,
+                                                        mp->d_normal_top_outer_core,
+                                                        mp->d_jacobian2D_top_outer_core,
+                                                        mp->d_wgllwgll_xy,
+                                                        mp->d_ibool_outer_core,
+                                                        mp->d_ibelm_top_outer_core,
+                                                        mp->nspec2D_top_outer_core);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  exit_on_cuda_error("compute_coupling_fluid_CMB_kernel");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
 __global__ void compute_coupling_fluid_ICB_kernel(realw* displ_inner_core,
                                                   realw* accel_outer_core,
                                                   int* ibool_inner_core,
@@ -119,6 +174,7 @@
 
   int i = threadIdx.x;
   int j = threadIdx.y;
+
   int iface = blockIdx.x + gridDim.x*blockIdx.y;
 
   int k,k_corresp,iglob_ic,iglob_oc,ispec,ispec_selected;
@@ -168,61 +224,7 @@
   }
 }
 
-/* ----------------------------------------------------------------------------------------------- */
 
-extern "C"
-void FC_FUNC_(compute_coupling_fluid_cmb_cuda,
-              COMPUTE_COUPLING_FLUID_CMB_CUDA)(long* Mesh_pointer_f,
-                                               int* FORWARD_OR_ADJOINT) {
-
-  TRACE("compute_coupling_fluid_cmb_cuda");
-  //double start_time = get_time();
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
-  int num_blocks_x = mp->nspec2D_top_outer_core;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
-
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(5,5,1);
-
-  // launches GPU kernel
-  if( *FORWARD_OR_ADJOINT == 1 ){
-    compute_coupling_fluid_CMB_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
-                                                        mp->d_accel_outer_core,
-                                                        mp->d_ibool_crust_mantle,
-                                                        mp->d_ibelm_bottom_crust_mantle,
-                                                        mp->d_normal_top_outer_core,
-                                                        mp->d_jacobian2D_top_outer_core,
-                                                        mp->d_wgllwgll_xy,
-                                                        mp->d_ibool_outer_core,
-                                                        mp->d_ibelm_top_outer_core,
-                                                        mp->nspec2D_top_outer_core);
-  }else if( *FORWARD_OR_ADJOINT == 3 ){
-    // adjoint simulations
-    compute_coupling_fluid_CMB_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,
-                                                        mp->d_b_accel_outer_core,
-                                                        mp->d_ibool_crust_mantle,
-                                                        mp->d_ibelm_bottom_crust_mantle,
-                                                        mp->d_normal_top_outer_core,
-                                                        mp->d_jacobian2D_top_outer_core,
-                                                        mp->d_wgllwgll_xy,
-                                                        mp->d_ibool_outer_core,
-                                                        mp->d_ibelm_top_outer_core,
-                                                        mp->nspec2D_top_outer_core);
-  }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  //double end_time = get_time();
-  //printf("Elapsed time: %e\n",end_time-start_time);
-  exit_on_cuda_error("compute_coupling_fluid_CMB_kernel");
-#endif
-}
-
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
@@ -235,15 +237,11 @@
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
-  int num_blocks_x = mp->nspec2D_bottom_outer_core;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->nspec2D_bottom_outer_core,&num_blocks_x,&num_blocks_y);
 
   dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(5,5,1);
+  dim3 threads(NGLLX,NGLLX,1);
 
   // launches GPU kernel
   if( *FORWARD_OR_ADJOINT == 1 ){
@@ -258,6 +256,9 @@
                                                         mp->d_ibelm_bottom_outer_core,
                                                         mp->nspec2D_bottom_outer_core);
   }else if( *FORWARD_OR_ADJOINT == 3 ){
+    // debug
+    DEBUG_EMPTY_BACKWARD();
+
     // adjoint simulations
     compute_coupling_fluid_ICB_kernel<<<grid,threads>>>(mp->d_b_displ_inner_core,
                                                         mp->d_b_accel_outer_core,
@@ -297,7 +298,7 @@
                                                   realw RHO_TOP_OC,
                                                   realw minus_g_cmb,
                                                   int GRAVITY,
-                                                  int NSPEC_BOTTOM_CM) {
+                                                  int NSPEC2D_BOTTOM_CM) {
 
   int i = threadIdx.x;
   int j = threadIdx.y;
@@ -309,7 +310,7 @@
   realw weight;
 
   // for surfaces elements exactly at the bottom of the crust mantle (outer core top)
-  if( iface < NSPEC_BOTTOM_CM ){
+  if( iface < NSPEC2D_BOTTOM_CM ){
 
     // "-1" from index values to convert from Fortran-> C indexing
     ispec = ibelm_bottom_crust_mantle[iface] - 1;
@@ -352,6 +353,68 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+extern "C"
+void FC_FUNC_(compute_coupling_cmb_fluid_cuda,
+              COMPUTE_COUPLING_CMB_FLUID_CUDA)(long* Mesh_pointer_f,
+                                               int* FORWARD_OR_ADJOINT) {
+
+  TRACE("compute_coupling_cmb_fluid_cuda");
+  //double start_time = get_time();
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->nspec2D_bottom_crust_mantle,&num_blocks_x,&num_blocks_y);
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(5,5,1);
+
+  // launches GPU kernel
+  if( *FORWARD_OR_ADJOINT == 1 ){
+    compute_coupling_CMB_fluid_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
+                                                        mp->d_accel_crust_mantle,
+                                                        mp->d_accel_outer_core,
+                                                        mp->d_ibool_crust_mantle,
+                                                        mp->d_ibelm_bottom_crust_mantle,
+                                                        mp->d_normal_top_outer_core,
+                                                        mp->d_jacobian2D_top_outer_core,
+                                                        mp->d_wgllwgll_xy,
+                                                        mp->d_ibool_outer_core,
+                                                        mp->d_ibelm_top_outer_core,
+                                                        mp->RHO_TOP_OC,
+                                                        mp->minus_g_cmb,
+                                                        mp->gravity,
+                                                        mp->nspec2D_bottom_crust_mantle);
+  }else if( *FORWARD_OR_ADJOINT == 3 ){
+    // debug
+    DEBUG_EMPTY_BACKWARD();
+
+    //  adjoint simulations
+    compute_coupling_CMB_fluid_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,
+                                                        mp->d_b_accel_crust_mantle,
+                                                        mp->d_b_accel_outer_core,
+                                                        mp->d_ibool_crust_mantle,
+                                                        mp->d_ibelm_bottom_crust_mantle,
+                                                        mp->d_normal_top_outer_core,
+                                                        mp->d_jacobian2D_top_outer_core,
+                                                        mp->d_wgllwgll_xy,
+                                                        mp->d_ibool_outer_core,
+                                                        mp->d_ibelm_top_outer_core,
+                                                        mp->RHO_TOP_OC,
+                                                        mp->minus_g_cmb,
+                                                        mp->gravity,
+                                                        mp->nspec2D_bottom_crust_mantle);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  exit_on_cuda_error("compute_coupling_CMB_fluid_cuda");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
 __global__ void compute_coupling_ICB_fluid_kernel(realw* displ_inner_core,
                                                   realw* accel_inner_core,
                                                   realw* accel_outer_core,
@@ -365,10 +428,11 @@
                                                   realw RHO_BOTTOM_OC,
                                                   realw minus_g_icb,
                                                   int GRAVITY,
-                                                  int NSPEC_TOP_IC) {
+                                                  int NSPEC2D_TOP_IC) {
 
   int i = threadIdx.x;
   int j = threadIdx.y;
+
   int iface = blockIdx.x + gridDim.x*blockIdx.y;
 
   int k,k_corresp,iglob_ic,iglob_oc,ispec,ispec_selected;
@@ -377,7 +441,7 @@
   realw weight;
 
   // for surfaces elements exactly at the top of the inner core (outer core bottom)
-  if( iface < NSPEC_TOP_IC ){
+  if( iface < NSPEC2D_TOP_IC ){
 
     // "-1" from index values to convert from Fortran-> C indexing
     ispec = ibelm_top_inner_core[iface] - 1;
@@ -401,9 +465,9 @@
     // compute pressure, taking gravity into account
     if( GRAVITY ){
       pressure = RHO_BOTTOM_OC * ( - accel_outer_core[iglob_oc]
-        + minus_g_icb * (displ_inner_core[iglob_ic*3]*nx
-                         + displ_inner_core[iglob_ic*3+1]*ny
-                         + displ_inner_core[iglob_ic*3+2]*nz) );
+                                   + minus_g_icb * ( displ_inner_core[iglob_ic*3]*nx
+                                                   + displ_inner_core[iglob_ic*3+1]*ny
+                                                   + displ_inner_core[iglob_ic*3+2]*nz) );
     }else{
       pressure = - RHO_BOTTOM_OC * accel_outer_core[iglob_oc];
     }
@@ -419,69 +483,7 @@
   }
 }
 
-/* ----------------------------------------------------------------------------------------------- */
 
-extern "C"
-void FC_FUNC_(compute_coupling_cmb_fluid_cuda,
-              COMPUTE_COUPLING_CMB_FLUID_CUDA)(long* Mesh_pointer_f,
-                                               int* FORWARD_OR_ADJOINT) {
-
-  TRACE("compute_coupling_cmb_fluid_cuda");
-  //double start_time = get_time();
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
-  int num_blocks_x = mp->nspec2D_bottom_crust_mantle;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
-
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(5,5,1);
-
-  // launches GPU kernel
-  if( *FORWARD_OR_ADJOINT == 1 ){
-    compute_coupling_CMB_fluid_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
-                                                        mp->d_accel_crust_mantle,
-                                                        mp->d_accel_outer_core,
-                                                        mp->d_ibool_crust_mantle,
-                                                        mp->d_ibelm_bottom_crust_mantle,
-                                                        mp->d_normal_top_outer_core,
-                                                        mp->d_jacobian2D_top_outer_core,
-                                                        mp->d_wgllwgll_xy,
-                                                        mp->d_ibool_outer_core,
-                                                        mp->d_ibelm_top_outer_core,
-                                                        mp->RHO_TOP_OC,
-                                                        mp->minus_g_cmb,
-                                                        mp->gravity,
-                                                        mp->nspec2D_bottom_crust_mantle);
-  }else if( *FORWARD_OR_ADJOINT == 3 ){
-    //  adjoint simulations
-    compute_coupling_CMB_fluid_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,
-                                                        mp->d_b_accel_crust_mantle,
-                                                        mp->d_b_accel_outer_core,
-                                                        mp->d_ibool_crust_mantle,
-                                                        mp->d_ibelm_bottom_crust_mantle,
-                                                        mp->d_normal_top_outer_core,
-                                                        mp->d_jacobian2D_top_outer_core,
-                                                        mp->d_wgllwgll_xy,
-                                                        mp->d_ibool_outer_core,
-                                                        mp->d_ibelm_top_outer_core,
-                                                        mp->RHO_TOP_OC,
-                                                        mp->minus_g_cmb,
-                                                        mp->gravity,
-                                                        mp->nspec2D_bottom_crust_mantle);
-  }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  //double end_time = get_time();
-  //printf("Elapsed time: %e\n",end_time-start_time);
-  exit_on_cuda_error("compute_coupling_CMB_fluid_cuda");
-#endif
-}
-
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
@@ -494,15 +496,11 @@
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
-  int num_blocks_x = mp->nspec2D_top_inner_core;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->nspec2D_top_inner_core,&num_blocks_x,&num_blocks_y);
 
   dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(5,5,1);
+  dim3 threads(NGLLX,NGLLX,1);
 
   // launches GPU kernel
   if( *FORWARD_OR_ADJOINT == 1 ){
@@ -521,6 +519,9 @@
                                                         mp->gravity,
                                                         mp->nspec2D_top_inner_core);
   }else if( *FORWARD_OR_ADJOINT == 3 ){
+    // debug
+    DEBUG_EMPTY_BACKWARD();
+
     //  adjoint simulations
     compute_coupling_ICB_fluid_kernel<<<grid,threads>>>(mp->d_b_displ_inner_core,
                                                         mp->d_b_accel_inner_core,
@@ -592,7 +593,7 @@
     additional_term_y = (rmass - rmassy_crust_mantle[iglob]) * force_normal_comp;
     additional_term_z = (rmass - rmassz_crust_mantle[iglob]) * force_normal_comp;
 
-    // since we access this global point only once...
+    // since we access this global point only once, no need to use atomics ...
     accel_crust_mantle[iglob*3] += additional_term_x * nx;
     accel_crust_mantle[iglob*3+1] += additional_term_y * ny;
     accel_crust_mantle[iglob*3+2] += additional_term_z * nz;
@@ -606,8 +607,6 @@
 extern "C"
 void FC_FUNC_(compute_coupling_ocean_cuda,
               COMPUTE_COUPLING_OCEAN_CUDA)(long* Mesh_pointer_f,
-                                           int* NCHUNKS_VAL,
-                                           int* exact_mass_matrix_for_rotation,
                                            int* FORWARD_OR_ADJOINT) {
 
   TRACE("compute_coupling_ocean_cuda");
@@ -617,60 +616,35 @@
   int blocksize = BLOCKSIZE_TRANSFER;
   int size_padded = ((int)ceil(((double)mp->npoin_oceans)/((double)blocksize)))*blocksize;
 
-  int num_blocks_x = size_padded/blocksize;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
 
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-  if( ( *NCHUNKS_VAL != 6 && mp->absorbing_conditions) || (mp->rotation && *exact_mass_matrix_for_rotation) ){
-    // uses corrected mass matrices
-    if( *FORWARD_OR_ADJOINT == 1 ){
-      compute_coupling_ocean_cuda_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
-                                                           mp->d_rmassx_crust_mantle,
-                                                           mp->d_rmassy_crust_mantle,
-                                                           mp->d_rmassz_crust_mantle,
-                                                           mp->d_rmass_ocean_load,
-                                                           mp->npoin_oceans,
-                                                           mp->d_ibool_ocean_load,
-                                                           mp->d_normal_ocean_load);
-    }else if( *FORWARD_OR_ADJOINT == 3){
-      // for backward/reconstructed potentials
-      compute_coupling_ocean_cuda_kernel<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
-                                                           mp->d_b_rmassx_crust_mantle,
-                                                           mp->d_b_rmassy_crust_mantle,
-                                                           mp->d_b_rmassz_crust_mantle,
-                                                           mp->d_rmass_ocean_load,
-                                                           mp->npoin_oceans,
-                                                           mp->d_ibool_ocean_load,
-                                                           mp->d_normal_ocean_load);
-    }
-  }else{
-    // uses only rmassz
-    if( *FORWARD_OR_ADJOINT == 1 ){
-      compute_coupling_ocean_cuda_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
-                                                           mp->d_rmassz_crust_mantle,
-                                                           mp->d_rmassz_crust_mantle,
-                                                           mp->d_rmassz_crust_mantle,
-                                                           mp->d_rmass_ocean_load,
-                                                           mp->npoin_oceans,
-                                                           mp->d_ibool_ocean_load,
-                                                           mp->d_normal_ocean_load);
-    }else if( *FORWARD_OR_ADJOINT == 3){
-      // for backward/reconstructed potentials
-      compute_coupling_ocean_cuda_kernel<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
-                                                           mp->d_b_rmassz_crust_mantle,
-                                                           mp->d_b_rmassz_crust_mantle,
-                                                           mp->d_b_rmassz_crust_mantle,
-                                                           mp->d_rmass_ocean_load,
-                                                           mp->npoin_oceans,
-                                                           mp->d_ibool_ocean_load,
-                                                           mp->d_normal_ocean_load);
-    }
+  // uses corrected mass matrices
+  if( *FORWARD_OR_ADJOINT == 1 ){
+    compute_coupling_ocean_cuda_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
+                                                         mp->d_rmassx_crust_mantle,
+                                                         mp->d_rmassy_crust_mantle,
+                                                         mp->d_rmassz_crust_mantle,
+                                                         mp->d_rmass_ocean_load,
+                                                         mp->npoin_oceans,
+                                                         mp->d_ibool_ocean_load,
+                                                         mp->d_normal_ocean_load);
+  }else if( *FORWARD_OR_ADJOINT == 3){
+    // debug
+    DEBUG_EMPTY_BACKWARD();
+
+    // for backward/reconstructed potentials
+    compute_coupling_ocean_cuda_kernel<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
+                                                         mp->d_b_rmassx_crust_mantle,
+                                                         mp->d_b_rmassy_crust_mantle,
+                                                         mp->d_b_rmassz_crust_mantle,
+                                                         mp->d_rmass_ocean_load,
+                                                         mp->npoin_oceans,
+                                                         mp->d_ibool_ocean_load,
+                                                         mp->d_normal_ocean_load);
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu	2013-09-13 11:23:45 UTC (rev 22782)
@@ -713,7 +713,6 @@
                                           realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
                                           realw* epsilondev_xz,realw* epsilondev_yz,
                                           realw* epsilon_trace_over_3,
-                                          int SIMULATION_TYPE,
                                           int ATTENUATION,
                                           int PARTIAL_PHYS_DISPERSION_ONLY,
                                           int USE_3D_ATTENUATION_ARRAYS,
@@ -862,7 +861,6 @@
   if (active) {
 
 #ifndef MANUALLY_UNROLLED_LOOPS
-
     tempx1l = 0.f;
     tempx2l = 0.f;
     tempx3l = 0.f;
@@ -891,7 +889,6 @@
         tempy3l += s_dummyy_loc[l*NGLL2+J*NGLLX+I]*fac3;
         tempz3l += s_dummyz_loc[l*NGLL2+J*NGLLX+I]*fac3;
     }
-
 #else
 
     tempx1l = s_dummyx_loc[K*NGLL2+J*NGLLX]*sh_hprime_xx[I]
@@ -1305,6 +1302,13 @@
                           realw* d_R_xy,
                           realw* d_R_xz,
                           realw* d_R_yz,
+                          realw* d_c11store,realw* d_c12store,realw* d_c13store,
+                          realw* d_c14store,realw* d_c15store,realw* d_c16store,
+                          realw* d_c22store,realw* d_c23store,realw* d_c24store,
+                          realw* d_c25store,realw* d_c26store,realw* d_c33store,
+                          realw* d_c34store,realw* d_c35store,realw* d_c36store,
+                          realw* d_c44store,realw* d_c45store,realw* d_c46store,
+                          realw* d_c55store,realw* d_c56store,realw* d_c66store,
                           realw* d_b_epsilondev_xx,
                           realw* d_b_epsilondev_yy,
                           realw* d_b_epsilondev_xy,
@@ -1316,30 +1320,21 @@
                           realw* d_b_R_xy,
                           realw* d_b_R_xz,
                           realw* d_b_R_yz,
-                          realw* d_c11store,realw* d_c12store,realw* d_c13store,
-                          realw* d_c14store,realw* d_c15store,realw* d_c16store,
-                          realw* d_c22store,realw* d_c23store,realw* d_c24store,
-                          realw* d_c25store,realw* d_c26store,realw* d_c33store,
-                          realw* d_c34store,realw* d_c35store,realw* d_c36store,
-                          realw* d_c44store,realw* d_c45store,realw* d_c46store,
-                          realw* d_c55store,realw* d_c56store,realw* d_c66store){
+                          int FORWARD_OR_ADJOINT){
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("before kernel Kernel_2_crust_mantle");
 #endif
 
-  /* if the grid can handle the number of blocks, we let it be 1D */
-  /* grid_2_x = nb_elem_color; */
-  /* nb_elem_color is just how many blocks we are computing now */
+  // if the grid can handle the number of blocks, we let it be 1D
+  // grid_2_x = nb_elem_color;
+  // nb_elem_color is just how many blocks we are computing now
 
-  int num_blocks_x = nb_blocks_to_compute;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
-
   int blocksize = NGLL3_PADDED;
+
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(nb_blocks_to_compute,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
@@ -1350,56 +1345,57 @@
   // cudaEventCreate(&stop);
   // cudaEventRecord( start, 0 );
 
-  Kernel_2_crust_mantle_impl<<<grid,threads>>>(nb_blocks_to_compute,
-                                                mp->NGLOB_CRUST_MANTLE,
-                                                d_ibool,
-                                                d_ispec_is_tiso,
-                                                mp->d_phase_ispec_inner_crust_mantle,
-                                                mp->num_phase_ispec_crust_mantle,
-                                                d_iphase,
-                                                mp->deltat,
-                                                mp->use_mesh_coloring_gpu,
-                                                mp->d_displ_crust_mantle,
-                                                mp->d_veloc_crust_mantle,
-                                                mp->d_accel_crust_mantle,
-                                                d_xix, d_xiy, d_xiz,
-                                                d_etax, d_etay, d_etaz,
-                                                d_gammax, d_gammay, d_gammaz,
-                                                mp->d_hprime_xx,
-                                                mp->d_hprimewgll_xx,
-                                                mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
-                                                d_kappavstore, d_muvstore,
-                                                d_kappahstore, d_muhstore,
-                                                d_eta_anisostore,
-                                                mp->compute_and_store_strain,
-                                                d_epsilondev_xx,d_epsilondev_yy,d_epsilondev_xy,
-                                                d_epsilondev_xz,d_epsilondev_yz,
-                                                d_epsilon_trace_over_3,
-                                                mp->simulation_type,
-                                                mp->attenuation,
-                                                mp->partial_phys_dispersion_only,
-                                                mp->use_3d_attenuation_arrays,
-                                                d_one_minus_sum_beta,d_factor_common,
-                                                d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
-                                                mp->d_alphaval,mp->d_betaval,mp->d_gammaval,
-                                                mp->anisotropic_3D_mantle,
-                                                d_c11store,d_c12store,d_c13store,
-                                                d_c14store,d_c15store,d_c16store,
-                                                d_c22store,d_c23store,d_c24store,
-                                                d_c25store,d_c26store,d_c33store,
-                                                d_c34store,d_c35store,d_c36store,
-                                                d_c44store,d_c45store,d_c46store,
-                                                d_c55store,d_c56store,d_c66store,
-                                                mp->gravity,
-                                                mp->d_xstore_crust_mantle,mp->d_ystore_crust_mantle,mp->d_zstore_crust_mantle,
-                                                mp->d_minus_gravity_table,
-                                                mp->d_minus_deriv_gravity_table,
-                                                mp->d_density_table,
-                                                mp->d_wgll_cube,
-                                                mp->NSPEC_CRUST_MANTLE_STRAIN_ONLY);
+  if( FORWARD_OR_ADJOINT == 1 ){
+    Kernel_2_crust_mantle_impl<<<grid,threads>>>(nb_blocks_to_compute,
+                                                  mp->NGLOB_CRUST_MANTLE,
+                                                  d_ibool,
+                                                  d_ispec_is_tiso,
+                                                  mp->d_phase_ispec_inner_crust_mantle,
+                                                  mp->num_phase_ispec_crust_mantle,
+                                                  d_iphase,
+                                                  mp->deltat,
+                                                  mp->use_mesh_coloring_gpu,
+                                                  mp->d_displ_crust_mantle,
+                                                  mp->d_veloc_crust_mantle,
+                                                  mp->d_accel_crust_mantle,
+                                                  d_xix, d_xiy, d_xiz,
+                                                  d_etax, d_etay, d_etaz,
+                                                  d_gammax, d_gammay, d_gammaz,
+                                                  mp->d_hprime_xx,
+                                                  mp->d_hprimewgll_xx,
+                                                  mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+                                                  d_kappavstore, d_muvstore,
+                                                  d_kappahstore, d_muhstore,
+                                                  d_eta_anisostore,
+                                                  mp->compute_and_store_strain,
+                                                  d_epsilondev_xx,d_epsilondev_yy,d_epsilondev_xy,
+                                                  d_epsilondev_xz,d_epsilondev_yz,
+                                                  d_epsilon_trace_over_3,
+                                                  mp->attenuation,
+                                                  mp->partial_phys_dispersion_only,
+                                                  mp->use_3d_attenuation_arrays,
+                                                  d_one_minus_sum_beta,d_factor_common,
+                                                  d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
+                                                  mp->d_alphaval,mp->d_betaval,mp->d_gammaval,
+                                                  mp->anisotropic_3D_mantle,
+                                                  d_c11store,d_c12store,d_c13store,
+                                                  d_c14store,d_c15store,d_c16store,
+                                                  d_c22store,d_c23store,d_c24store,
+                                                  d_c25store,d_c26store,d_c33store,
+                                                  d_c34store,d_c35store,d_c36store,
+                                                  d_c44store,d_c45store,d_c46store,
+                                                  d_c55store,d_c56store,d_c66store,
+                                                  mp->gravity,
+                                                  mp->d_xstore_crust_mantle,mp->d_ystore_crust_mantle,mp->d_zstore_crust_mantle,
+                                                  mp->d_minus_gravity_table,
+                                                  mp->d_minus_deriv_gravity_table,
+                                                  mp->d_density_table,
+                                                  mp->d_wgll_cube,
+                                                  mp->NSPEC_CRUST_MANTLE_STRAIN_ONLY);
+  }else if( FORWARD_OR_ADJOINT == 3 ){
+    // debug
+    DEBUG_EMPTY_BACKWARD();
 
-
-  if(mp->simulation_type == 3) {
     Kernel_2_crust_mantle_impl<<< grid,threads>>>(nb_blocks_to_compute,
                                                    mp->NGLOB_CRUST_MANTLE,
                                                    d_ibool,
@@ -1425,7 +1421,6 @@
                                                    d_b_epsilondev_xx,d_b_epsilondev_yy,d_b_epsilondev_xy,
                                                    d_b_epsilondev_xz,d_b_epsilondev_yz,
                                                    d_b_epsilon_trace_over_3,
-                                                   mp->simulation_type,
                                                    mp->attenuation,
                                                    mp->partial_phys_dispersion_only,
                                                    mp->use_3d_attenuation_arrays,
@@ -1469,7 +1464,8 @@
 extern "C"
 void FC_FUNC_(compute_forces_crust_mantle_cuda,
               COMPUTE_FORCES_CRUST_MANTLE_CUDA)(long* Mesh_pointer_f,
-                                                int* iphase) {
+                                                int* iphase,
+                                                int* FORWARD_OR_ADJOINT_f) {
 
   TRACE("compute_forces_crust_mantle_cuda");
 
@@ -1479,6 +1475,8 @@
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
 
+  int FORWARD_OR_ADJOINT = *FORWARD_OR_ADJOINT_f;
+
   int num_elements;
 
   if( *iphase == 1 )
@@ -1563,15 +1561,9 @@
                             *iphase,
                             mp->d_ibool_crust_mantle + offset_nonpadded,
                             mp->d_ispec_is_tiso_crust_mantle + offset_ispec,
-                            mp->d_xix_crust_mantle + offset,
-                            mp->d_xiy_crust_mantle + offset,
-                            mp->d_xiz_crust_mantle + offset,
-                            mp->d_etax_crust_mantle + offset,
-                            mp->d_etay_crust_mantle + offset,
-                            mp->d_etaz_crust_mantle + offset,
-                            mp->d_gammax_crust_mantle + offset,
-                            mp->d_gammay_crust_mantle + offset,
-                            mp->d_gammaz_crust_mantle + offset,
+                            mp->d_xix_crust_mantle + offset,mp->d_xiy_crust_mantle + offset,mp->d_xiz_crust_mantle + offset,
+                            mp->d_etax_crust_mantle + offset,mp->d_etay_crust_mantle + offset,mp->d_etaz_crust_mantle + offset,
+                            mp->d_gammax_crust_mantle + offset,mp->d_gammay_crust_mantle + offset,mp->d_gammaz_crust_mantle + offset,
                             mp->d_kappavstore_crust_mantle + offset,
                             mp->d_muvstore_crust_mantle + offset,
                             mp->d_kappahstore_crust_mantle + offset,
@@ -1590,6 +1582,13 @@
                             mp->d_R_xy_crust_mantle + offset_nonpadded_att1,
                             mp->d_R_xz_crust_mantle + offset_nonpadded_att1,
                             mp->d_R_yz_crust_mantle + offset_nonpadded_att1,
+                            mp->d_c11store_crust_mantle + offset,mp->d_c12store_crust_mantle + offset,mp->d_c13store_crust_mantle + offset,
+                            mp->d_c14store_crust_mantle + offset,mp->d_c15store_crust_mantle + offset,mp->d_c16store_crust_mantle + offset,
+                            mp->d_c22store_crust_mantle + offset,mp->d_c23store_crust_mantle + offset,mp->d_c24store_crust_mantle + offset,
+                            mp->d_c25store_crust_mantle + offset,mp->d_c26store_crust_mantle + offset,mp->d_c33store_crust_mantle + offset,
+                            mp->d_c34store_crust_mantle + offset,mp->d_c35store_crust_mantle + offset,mp->d_c36store_crust_mantle + offset,
+                            mp->d_c44store_crust_mantle + offset,mp->d_c45store_crust_mantle + offset,mp->d_c46store_crust_mantle + offset,
+                            mp->d_c55store_crust_mantle + offset,mp->d_c56store_crust_mantle + offset,mp->d_c66store_crust_mantle + offset,
                             mp->d_b_epsilondev_xx_crust_mantle + offset_nonpadded,
                             mp->d_b_epsilondev_yy_crust_mantle + offset_nonpadded,
                             mp->d_b_epsilondev_xy_crust_mantle + offset_nonpadded,
@@ -1601,27 +1600,7 @@
                             mp->d_b_R_xy_crust_mantle + offset_nonpadded_att1,
                             mp->d_b_R_xz_crust_mantle + offset_nonpadded_att1,
                             mp->d_b_R_yz_crust_mantle + offset_nonpadded_att1,
-                            mp->d_c11store_crust_mantle + offset,
-                            mp->d_c12store_crust_mantle + offset,
-                            mp->d_c13store_crust_mantle + offset,
-                            mp->d_c14store_crust_mantle + offset,
-                            mp->d_c15store_crust_mantle + offset,
-                            mp->d_c16store_crust_mantle + offset,
-                            mp->d_c22store_crust_mantle + offset,
-                            mp->d_c23store_crust_mantle + offset,
-                            mp->d_c24store_crust_mantle + offset,
-                            mp->d_c25store_crust_mantle + offset,
-                            mp->d_c26store_crust_mantle + offset,
-                            mp->d_c33store_crust_mantle + offset,
-                            mp->d_c34store_crust_mantle + offset,
-                            mp->d_c35store_crust_mantle + offset,
-                            mp->d_c36store_crust_mantle + offset,
-                            mp->d_c44store_crust_mantle + offset,
-                            mp->d_c45store_crust_mantle + offset,
-                            mp->d_c46store_crust_mantle + offset,
-                            mp->d_c55store_crust_mantle + offset,
-                            mp->d_c56store_crust_mantle + offset,
-                            mp->d_c66store_crust_mantle + offset);
+                            FORWARD_OR_ADJOINT);
 
       // for padded and aligned arrays
       offset += nb_blocks_to_compute * NGLL3_PADDED;
@@ -1650,7 +1629,6 @@
   }else{
 
     // no mesh coloring: uses atomic updates
-
     Kernel_2_crust_mantle(num_elements,mp,
                           *iphase,
                           mp->d_ibool_crust_mantle,
@@ -1674,6 +1652,13 @@
                           mp->d_R_xy_crust_mantle,
                           mp->d_R_xz_crust_mantle,
                           mp->d_R_yz_crust_mantle,
+                          mp->d_c11store_crust_mantle,mp->d_c12store_crust_mantle,mp->d_c13store_crust_mantle,
+                          mp->d_c14store_crust_mantle,mp->d_c15store_crust_mantle,mp->d_c16store_crust_mantle,
+                          mp->d_c22store_crust_mantle,mp->d_c23store_crust_mantle,mp->d_c24store_crust_mantle,
+                          mp->d_c25store_crust_mantle,mp->d_c26store_crust_mantle,mp->d_c33store_crust_mantle,
+                          mp->d_c34store_crust_mantle,mp->d_c35store_crust_mantle,mp->d_c36store_crust_mantle,
+                          mp->d_c44store_crust_mantle,mp->d_c45store_crust_mantle,mp->d_c46store_crust_mantle,
+                          mp->d_c55store_crust_mantle,mp->d_c56store_crust_mantle,mp->d_c66store_crust_mantle,
                           mp->d_b_epsilondev_xx_crust_mantle,
                           mp->d_b_epsilondev_yy_crust_mantle,
                           mp->d_b_epsilondev_xy_crust_mantle,
@@ -1685,13 +1670,7 @@
                           mp->d_b_R_xy_crust_mantle,
                           mp->d_b_R_xz_crust_mantle,
                           mp->d_b_R_yz_crust_mantle,
-                          mp->d_c11store_crust_mantle,mp->d_c12store_crust_mantle,mp->d_c13store_crust_mantle,
-                          mp->d_c14store_crust_mantle,mp->d_c15store_crust_mantle,mp->d_c16store_crust_mantle,
-                          mp->d_c22store_crust_mantle,mp->d_c23store_crust_mantle,mp->d_c24store_crust_mantle,
-                          mp->d_c25store_crust_mantle,mp->d_c26store_crust_mantle,mp->d_c33store_crust_mantle,
-                          mp->d_c34store_crust_mantle,mp->d_c35store_crust_mantle,mp->d_c36store_crust_mantle,
-                          mp->d_c44store_crust_mantle,mp->d_c45store_crust_mantle,mp->d_c46store_crust_mantle,
-                          mp->d_c55store_crust_mantle,mp->d_c56store_crust_mantle,mp->d_c66store_crust_mantle);
+                          FORWARD_OR_ADJOINT);
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu	2013-09-13 11:23:45 UTC (rev 22782)
@@ -326,7 +326,6 @@
                                          realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
                                          realw* epsilondev_xz,realw* epsilondev_yz,
                                          realw* epsilon_trace_over_3,
-                                         int SIMULATION_TYPE,
                                          int ATTENUATION,
                                          int PARTIAL_PHYS_DISPERSION_ONLY,
                                          int USE_3D_ATTENUATION_ARRAYS,
@@ -937,6 +936,8 @@
                          realw* d_R_xy,
                          realw* d_R_xz,
                          realw* d_R_yz,
+                         realw* d_c11store,realw* d_c12store,realw* d_c13store,
+                         realw* d_c33store,realw* d_c44store,
                          realw* d_b_epsilondev_xx,
                          realw* d_b_epsilondev_yy,
                          realw* d_b_epsilondev_xy,
@@ -948,25 +949,21 @@
                          realw* d_b_R_xy,
                          realw* d_b_R_xz,
                          realw* d_b_R_yz,
-                         realw* d_c11store,realw* d_c12store,realw* d_c13store,
-                         realw* d_c33store,realw* d_c44store){
+                         int FORWARD_OR_ADJOINT){
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("before kernel Kernel_2_inner_core");
 #endif
 
-  /* if the grid can handle the number of blocks, we let it be 1D */
-  /* grid_2_x = nb_elem_color; */
-  /* nb_elem_color is just how many blocks we are computing now */
+  // if the grid can handle the number of blocks, we let it be 1D
+  // grid_2_x = nb_elem_color;
+  // nb_elem_color is just how many blocks we are computing now
 
-  int num_blocks_x = nb_blocks_to_compute;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
-
   int blocksize = NGLL3_PADDED;
+
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(nb_blocks_to_compute,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
@@ -976,55 +973,55 @@
   // cudaEventCreate(&start);
   // cudaEventCreate(&stop);
   // cudaEventRecord( start, 0 );
+  if( FORWARD_OR_ADJOINT == 1 ){
+    Kernel_2_inner_core_impl<<<grid,threads>>>(nb_blocks_to_compute,
+                                               mp->NGLOB_INNER_CORE,
+                                               d_ibool,
+                                               d_idoubling,
+                                               mp->d_phase_ispec_inner_inner_core,
+                                               mp->num_phase_ispec_inner_core,
+                                               d_iphase,
+                                               mp->deltat,
+                                               mp->use_mesh_coloring_gpu,
+                                               mp->d_displ_inner_core,
+                                               mp->d_veloc_inner_core,
+                                               mp->d_accel_inner_core,
+                                               d_xix, d_xiy, d_xiz,
+                                               d_etax, d_etay, d_etaz,
+                                               d_gammax, d_gammay, d_gammaz,
+                                               mp->d_hprime_xx,
+                                               mp->d_hprimewgll_xx,
+                                               mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+                                               d_kappav, d_muv,
+                                               mp->compute_and_store_strain,
+                                               d_epsilondev_xx,
+                                               d_epsilondev_yy,
+                                               d_epsilondev_xy,
+                                               d_epsilondev_xz,
+                                               d_epsilondev_yz,
+                                               d_epsilon_trace_over_3,
+                                               mp->attenuation,
+                                               mp->partial_phys_dispersion_only,
+                                               mp->use_3d_attenuation_arrays,
+                                               d_one_minus_sum_beta,
+                                               d_factor_common,
+                                               d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
+                                               mp->d_alphaval,mp->d_betaval,mp->d_gammaval,
+                                               mp->anisotropic_inner_core,
+                                               d_c11store,d_c12store,d_c13store,
+                                               d_c33store,d_c44store,
+                                               mp->gravity,
+                                               mp->d_xstore_inner_core,mp->d_ystore_inner_core,mp->d_zstore_inner_core,
+                                               mp->d_minus_gravity_table,
+                                               mp->d_minus_deriv_gravity_table,
+                                               mp->d_density_table,
+                                               mp->d_wgll_cube,
+                                               mp->NSPEC_INNER_CORE_STRAIN_ONLY,
+                                               mp->NSPEC_INNER_CORE);
+  }else if( FORWARD_OR_ADJOINT == 3 ){
+    // debug
+    DEBUG_EMPTY_BACKWARD();
 
-  Kernel_2_inner_core_impl<<<grid,threads>>>(nb_blocks_to_compute,
-                                             mp->NGLOB_INNER_CORE,
-                                             d_ibool,
-                                             d_idoubling,
-                                             mp->d_phase_ispec_inner_inner_core,
-                                             mp->num_phase_ispec_inner_core,
-                                             d_iphase,
-                                             mp->deltat,
-                                             mp->use_mesh_coloring_gpu,
-                                             mp->d_displ_inner_core,
-                                             mp->d_veloc_inner_core,
-                                             mp->d_accel_inner_core,
-                                             d_xix, d_xiy, d_xiz,
-                                             d_etax, d_etay, d_etaz,
-                                             d_gammax, d_gammay, d_gammaz,
-                                             mp->d_hprime_xx,
-                                             mp->d_hprimewgll_xx,
-                                             mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
-                                             d_kappav, d_muv,
-                                             mp->compute_and_store_strain,
-                                             d_epsilondev_xx,
-                                             d_epsilondev_yy,
-                                             d_epsilondev_xy,
-                                             d_epsilondev_xz,
-                                             d_epsilondev_yz,
-                                             d_epsilon_trace_over_3,
-                                             mp->simulation_type,
-                                             mp->attenuation,
-                                             mp->partial_phys_dispersion_only,
-                                             mp->use_3d_attenuation_arrays,
-                                             d_one_minus_sum_beta,
-                                             d_factor_common,
-                                             d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
-                                             mp->d_alphaval,mp->d_betaval,mp->d_gammaval,
-                                             mp->anisotropic_inner_core,
-                                             d_c11store,d_c12store,d_c13store,
-                                             d_c33store,d_c44store,
-                                             mp->gravity,
-                                             mp->d_xstore_inner_core,mp->d_ystore_inner_core,mp->d_zstore_inner_core,
-                                             mp->d_minus_gravity_table,
-                                             mp->d_minus_deriv_gravity_table,
-                                             mp->d_density_table,
-                                             mp->d_wgll_cube,
-                                             mp->NSPEC_INNER_CORE_STRAIN_ONLY,
-                                             mp->NSPEC_INNER_CORE);
-
-
-  if(mp->simulation_type == 3) {
     Kernel_2_inner_core_impl<<< grid,threads>>>(nb_blocks_to_compute,
                                                 mp->NGLOB_INNER_CORE,
                                                 d_ibool,
@@ -1051,7 +1048,6 @@
                                                 d_b_epsilondev_xz,
                                                 d_b_epsilondev_yz,
                                                 d_b_epsilon_trace_over_3,
-                                                mp->simulation_type,
                                                 mp->attenuation,
                                                 mp->partial_phys_dispersion_only,
                                                 mp->use_3d_attenuation_arrays,
@@ -1092,7 +1088,8 @@
 extern "C"
 void FC_FUNC_(compute_forces_inner_core_cuda,
               COMPUTE_FORCES_INNER_CORE_CUDA)(long* Mesh_pointer_f,
-                                              int* iphase) {
+                                              int* iphase,
+                                              int* FORWARD_OR_ADJOINT_f) {
 
   TRACE("compute_forces_inner_core_cuda");
 
@@ -1102,6 +1099,8 @@
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
 
+  int FORWARD_OR_ADJOINT = *FORWARD_OR_ADJOINT_f;
+
   int num_elements;
 
   if( *iphase == 1 )
@@ -1231,15 +1230,9 @@
                           *iphase,
                           mp->d_ibool_inner_core + offset_nonpadded,
                           mp->d_idoubling_inner_core + offset_ispec,
-                          mp->d_xix_inner_core + offset,
-                          mp->d_xiy_inner_core + offset,
-                          mp->d_xiz_inner_core + offset,
-                          mp->d_etax_inner_core + offset,
-                          mp->d_etay_inner_core + offset,
-                          mp->d_etaz_inner_core + offset,
-                          mp->d_gammax_inner_core + offset,
-                          mp->d_gammay_inner_core + offset,
-                          mp->d_gammaz_inner_core + offset,
+                          mp->d_xix_inner_core + offset,mp->d_xiy_inner_core + offset,mp->d_xiz_inner_core + offset,
+                          mp->d_etax_inner_core + offset,mp->d_etay_inner_core + offset,mp->d_etaz_inner_core + offset,
+                          mp->d_gammax_inner_core + offset,mp->d_gammay_inner_core + offset,mp->d_gammaz_inner_core + offset,
                           mp->d_kappavstore_inner_core + offset,
                           mp->d_muvstore_inner_core + offset,
                           mp->d_epsilondev_xx_inner_core + offset_nonpadded,
@@ -1255,6 +1248,11 @@
                           mp->d_R_xy_inner_core + offset_nonpadded_att1,
                           mp->d_R_xz_inner_core + offset_nonpadded_att1,
                           mp->d_R_yz_inner_core + offset_nonpadded_att1,
+                          mp->d_c11store_inner_core + offset,
+                          mp->d_c12store_inner_core + offset,
+                          mp->d_c13store_inner_core + offset,
+                          mp->d_c33store_inner_core + offset,
+                          mp->d_c44store_inner_core + offset,
                           mp->d_b_epsilondev_xx_inner_core + offset_nonpadded,
                           mp->d_b_epsilondev_yy_inner_core + offset_nonpadded,
                           mp->d_b_epsilondev_xy_inner_core + offset_nonpadded,
@@ -1266,11 +1264,7 @@
                           mp->d_b_R_xy_inner_core + offset_nonpadded_att1,
                           mp->d_b_R_xz_inner_core + offset_nonpadded_att1,
                           mp->d_b_R_yz_inner_core + offset_nonpadded_att1,
-                          mp->d_c11store_inner_core + offset,
-                          mp->d_c12store_inner_core + offset,
-                          mp->d_c13store_inner_core + offset,
-                          mp->d_c33store_inner_core + offset,
-                          mp->d_c44store_inner_core + offset);
+                          FORWARD_OR_ADJOINT);
 
       // for padded and aligned arrays
       offset += nb_blocks_to_compute * NGLL3_PADDED;
@@ -1320,6 +1314,8 @@
                         mp->d_R_xy_inner_core,
                         mp->d_R_xz_inner_core,
                         mp->d_R_yz_inner_core,
+                        mp->d_c11store_inner_core,mp->d_c12store_inner_core,mp->d_c13store_inner_core,
+                        mp->d_c33store_inner_core,mp->d_c44store_inner_core,
                         mp->d_b_epsilondev_xx_inner_core,
                         mp->d_b_epsilondev_yy_inner_core,
                         mp->d_b_epsilondev_xy_inner_core,
@@ -1331,8 +1327,7 @@
                         mp->d_b_R_xy_inner_core,
                         mp->d_b_R_xz_inner_core,
                         mp->d_b_R_yz_inner_core,
-                        mp->d_c11store_inner_core,mp->d_c12store_inner_core,mp->d_c13store_inner_core,
-                        mp->d_c33store_inner_core,mp->d_c44store_inner_core);
+                        FORWARD_OR_ADJOINT);
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu	2013-09-13 11:23:45 UTC (rev 22782)
@@ -474,25 +474,18 @@
   exit_on_cuda_error("before outer_core kernel Kernel_2");
 #endif
 
-  /* if the grid can handle the number of blocks, we let it be 1D */
-  /* grid_2_x = nb_elem_color; */
-  /* nb_elem_color is just how many blocks we are computing now */
+  // if the grid can handle the number of blocks, we let it be 1D
+  // grid_2_x = nb_elem_color;
+  // nb_elem_color is just how many blocks we are computing now
 
-  int num_blocks_x = nb_blocks_to_compute;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
-
   int blocksize = NGLL3_PADDED;
+
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(nb_blocks_to_compute,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
-  // 2d grid
-  //int threads_2 = NGLL3_PADDED;//BLOCK_SIZE_K2;
-  //dim3 grid_2(num_blocks_x,num_blocks_y);
-
   // Cuda timing
   // cudaEvent_t start, stop;
   // realw time;
@@ -502,60 +495,63 @@
 
   if( FORWARD_OR_ADJOINT == 1 ){
     Kernel_2_outer_core_impl<<<grid,threads>>>(nb_blocks_to_compute,
-                                                          mp->NGLOB_OUTER_CORE,
-                                                          d_ibool,
-                                                          mp->d_phase_ispec_inner_outer_core,
-                                                          mp->num_phase_ispec_outer_core,
-                                                          d_iphase,
-                                                          mp->use_mesh_coloring_gpu,
-                                                          mp->d_displ_outer_core,
-                                                          mp->d_accel_outer_core,
-                                                          d_xix, d_xiy, d_xiz,
-                                                          d_etax, d_etay, d_etaz,
-                                                          d_gammax, d_gammay, d_gammaz,
-                                                          mp->d_hprime_xx,
-                                                          mp->d_hprimewgll_xx,
-                                                          mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
-                                                          mp->gravity,
-                                                          mp->d_xstore_outer_core,mp->d_ystore_outer_core,mp->d_zstore_outer_core,
-                                                          mp->d_d_ln_density_dr_table,
-                                                          mp->d_minus_rho_g_over_kappa_fluid,
-                                                          mp->d_wgll_cube,
-                                                          mp->rotation,
-                                                          time,
-                                                          mp->two_omega_earth,
-                                                          mp->deltat,
-                                                          d_A_array_rotation,
-                                                          d_B_array_rotation,
-                                                          mp->NSPEC_OUTER_CORE);
+                                                mp->NGLOB_OUTER_CORE,
+                                                d_ibool,
+                                                mp->d_phase_ispec_inner_outer_core,
+                                                mp->num_phase_ispec_outer_core,
+                                                d_iphase,
+                                                mp->use_mesh_coloring_gpu,
+                                                mp->d_displ_outer_core,
+                                                mp->d_accel_outer_core,
+                                                d_xix, d_xiy, d_xiz,
+                                                d_etax, d_etay, d_etaz,
+                                                d_gammax, d_gammay, d_gammaz,
+                                                mp->d_hprime_xx,
+                                                mp->d_hprimewgll_xx,
+                                                mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+                                                mp->gravity,
+                                                mp->d_xstore_outer_core,mp->d_ystore_outer_core,mp->d_zstore_outer_core,
+                                                mp->d_d_ln_density_dr_table,
+                                                mp->d_minus_rho_g_over_kappa_fluid,
+                                                mp->d_wgll_cube,
+                                                mp->rotation,
+                                                time,
+                                                mp->two_omega_earth,
+                                                mp->deltat,
+                                                d_A_array_rotation,
+                                                d_B_array_rotation,
+                                                mp->NSPEC_OUTER_CORE);
   }else if( FORWARD_OR_ADJOINT == 3 ){
+    // debug
+    DEBUG_EMPTY_BACKWARD();
+
     Kernel_2_outer_core_impl<<<grid,threads>>>(nb_blocks_to_compute,
-                                                            mp->NGLOB_OUTER_CORE,
-                                                            d_ibool,
-                                                            mp->d_phase_ispec_inner_outer_core,
-                                                            mp->num_phase_ispec_outer_core,
-                                                            d_iphase,
-                                                            mp->use_mesh_coloring_gpu,
-                                                            mp->d_b_displ_outer_core,
-                                                            mp->d_b_accel_outer_core,
-                                                            d_xix, d_xiy, d_xiz,
-                                                            d_etax, d_etay, d_etaz,
-                                                            d_gammax, d_gammay, d_gammaz,
-                                                            mp->d_hprime_xx,
-                                                            mp->d_hprimewgll_xx,
-                                                            mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
-                                                            mp->gravity,
-                                                            mp->d_xstore_outer_core,mp->d_ystore_outer_core,mp->d_zstore_outer_core,
-                                                            mp->d_d_ln_density_dr_table,
-                                                            mp->d_minus_rho_g_over_kappa_fluid,
-                                                            mp->d_wgll_cube,
-                                                            mp->rotation,
-                                                            time,
-                                                            mp->b_two_omega_earth,
-                                                            mp->b_deltat,
-                                                            d_b_A_array_rotation,
-                                                            d_b_B_array_rotation,
-                                                            mp->NSPEC_OUTER_CORE);
+                                                mp->NGLOB_OUTER_CORE,
+                                                d_ibool,
+                                                mp->d_phase_ispec_inner_outer_core,
+                                                mp->num_phase_ispec_outer_core,
+                                                d_iphase,
+                                                mp->use_mesh_coloring_gpu,
+                                                mp->d_b_displ_outer_core,
+                                                mp->d_b_accel_outer_core,
+                                                d_xix, d_xiy, d_xiz,
+                                                d_etax, d_etay, d_etaz,
+                                                d_gammax, d_gammay, d_gammaz,
+                                                mp->d_hprime_xx,
+                                                mp->d_hprimewgll_xx,
+                                                mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+                                                mp->gravity,
+                                                mp->d_xstore_outer_core,mp->d_ystore_outer_core,mp->d_zstore_outer_core,
+                                                mp->d_d_ln_density_dr_table,
+                                                mp->d_minus_rho_g_over_kappa_fluid,
+                                                mp->d_wgll_cube,
+                                                mp->rotation,
+                                                time,
+                                                mp->b_two_omega_earth,
+                                                mp->b_deltat,
+                                                d_b_A_array_rotation,
+                                                d_b_B_array_rotation,
+                                                mp->NSPEC_OUTER_CORE);
   }
 
   // cudaEventRecord( stop, 0 );
@@ -583,7 +579,7 @@
               COMPUTE_FORCES_OUTER_CORE_CUDA)(long* Mesh_pointer_f,
                                               int* iphase,
                                               realw* time_f,
-                                              int* FORWARD_OR_ADJOINT) {
+                                              int* FORWARD_OR_ADJOINT_f) {
 
   TRACE("compute_forces_outer_core_cuda");
 
@@ -592,8 +588,11 @@
   //double start_time = get_time();
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
   realw time = *time_f;
 
+  int FORWARD_OR_ADJOINT = *FORWARD_OR_ADJOINT_f;
+
   int num_elements;
 
   if( *iphase == 1 )
@@ -612,7 +611,7 @@
 
     int nb_colors,nb_blocks_to_compute;
     int istart;
-    int color_offset,color_offset_nonpadded;
+    int offset,offset_nonpadded;
 
     // sets up color loop
     if( mp->NSPEC_OUTER_CORE > COLORING_MIN_NSPEC_OUTER_CORE ){
@@ -622,16 +621,16 @@
         istart = 0;
 
         // array offsets
-        color_offset = 0;
-        color_offset_nonpadded = 0;
+        offset = 0;
+        offset_nonpadded = 0;
       }else{
         // inner element colors (start after outer elements)
         nb_colors = mp->num_colors_outer_outer_core + mp->num_colors_inner_outer_core;
         istart = mp->num_colors_outer_outer_core;
 
         // array offsets (inner elements start after outer ones)
-        color_offset = mp->nspec_outer_outer_core * NGLL3_PADDED;
-        color_offset_nonpadded = mp->nspec_outer_outer_core * NGLL3;
+        offset = mp->nspec_outer_outer_core * NGLL3_PADDED;
+        offset_nonpadded = mp->nspec_outer_outer_core * NGLL3;
       }
     }else{
 
@@ -643,16 +642,16 @@
         istart = 0;
 
         // array offsets
-        color_offset = 0;
-        color_offset_nonpadded = 0;
+        offset = 0;
+        offset_nonpadded = 0;
       }else{
         // inner element colors (start after outer elements)
         nb_colors = 1;
         istart = 0;
 
         // array offsets (inner elements start after outer ones)
-        color_offset = mp->nspec_outer_outer_core * NGLL3_PADDED;
-        color_offset_nonpadded = mp->nspec_outer_outer_core * NGLL3;
+        offset = mp->nspec_outer_outer_core * NGLL3_PADDED;
+        offset_nonpadded = mp->nspec_outer_outer_core * NGLL3;
       }
     }
 
@@ -668,27 +667,21 @@
 
       Kernel_2_outer_core(nb_blocks_to_compute,mp,
                           *iphase,
-                          mp->d_ibool_outer_core + color_offset_nonpadded,
-                          mp->d_xix_outer_core + color_offset,
-                          mp->d_xiy_outer_core + color_offset,
-                          mp->d_xiz_outer_core + color_offset,
-                          mp->d_etax_outer_core + color_offset,
-                          mp->d_etay_outer_core + color_offset,
-                          mp->d_etaz_outer_core + color_offset,
-                          mp->d_gammax_outer_core + color_offset,
-                          mp->d_gammay_outer_core + color_offset,
-                          mp->d_gammaz_outer_core + color_offset,
+                          mp->d_ibool_outer_core + offset_nonpadded,
+                          mp->d_xix_outer_core + offset,mp->d_xiy_outer_core + offset,mp->d_xiz_outer_core + offset,
+                          mp->d_etax_outer_core + offset,mp->d_etay_outer_core + offset,mp->d_etaz_outer_core + offset,
+                          mp->d_gammax_outer_core + offset,mp->d_gammay_outer_core + offset,mp->d_gammaz_outer_core + offset,
                           time,
-                          mp->d_A_array_rotation + color_offset_nonpadded,
-                          mp->d_B_array_rotation + color_offset_nonpadded,
-                          mp->d_b_A_array_rotation + color_offset_nonpadded,
-                          mp->d_b_B_array_rotation + color_offset_nonpadded,
-                          *FORWARD_OR_ADJOINT);
+                          mp->d_A_array_rotation + offset_nonpadded,
+                          mp->d_B_array_rotation + offset_nonpadded,
+                          mp->d_b_A_array_rotation + offset_nonpadded,
+                          mp->d_b_B_array_rotation + offset_nonpadded,
+                          FORWARD_OR_ADJOINT);
 
       // for padded and aligned arrays
-      color_offset += nb_blocks_to_compute * NGLL3_PADDED;
+      offset += nb_blocks_to_compute * NGLL3_PADDED;
       // for no-aligned arrays
-      color_offset_nonpadded += nb_blocks_to_compute * NGLL3;
+      offset_nonpadded += nb_blocks_to_compute * NGLL3;
     }
 
   }else{
@@ -705,7 +698,7 @@
                         mp->d_B_array_rotation,
                         mp->d_b_A_array_rotation,
                         mp->d_b_B_array_rotation,
-                        *FORWARD_OR_ADJOINT);
+                        FORWARD_OR_ADJOINT);
 
   }
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu	2013-09-13 11:23:45 UTC (rev 22782)
@@ -223,19 +223,18 @@
 void FC_FUNC_(compute_kernels_cm_cuda,
               COMPUTE_KERNELS_CM_CUDA)(long* Mesh_pointer,realw* deltat_f) {
 
-TRACE("compute_kernels_cm_cuda");
+  TRACE("compute_kernels_cm_cuda");
+  // debug
+  DEBUG_EMPTY_BACKWARD();
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
   int blocksize = NGLL3;
   realw deltat = *deltat_f;
 
-  int num_blocks_x = mp->NSPEC_CRUST_MANTLE;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->NSPEC_CRUST_MANTLE,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
@@ -311,12 +310,9 @@
   int blocksize = NGLL3;
   realw deltat = *deltat_f;
 
-  int num_blocks_x = mp->NSPEC_INNER_CORE;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->NSPEC_INNER_CORE,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
@@ -532,12 +528,8 @@
   int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ
   realw deltat = *deltat_f;
 
-  int num_blocks_x = mp->NSPEC_OUTER_CORE;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->NSPEC_OUTER_CORE,&num_blocks_x,&num_blocks_y);
 
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
@@ -624,12 +616,8 @@
 
   realw deltat = *deltat_f;
 
-  int num_blocks_x = mp->nspec2D_top_crust_mantle;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->nspec2D_top_crust_mantle,&num_blocks_x,&num_blocks_y);
 
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(NGLL2,1,1);
@@ -702,12 +690,8 @@
   int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ
   realw deltat = *deltat_f;
 
-  int num_blocks_x = mp->NSPEC_CRUST_MANTLE;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->NSPEC_CRUST_MANTLE,&num_blocks_x,&num_blocks_y);
 
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu	2013-09-13 11:23:45 UTC (rev 22782)
@@ -245,12 +245,9 @@
   // > NGLLSQUARE==NGLL2==25, no further check inside kernel
   int blocksize = NGLL2;
 
-  int num_blocks_x = num_abs_boundary_faces;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(num_abs_boundary_faces,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
@@ -455,12 +452,9 @@
   // > NGLLSQUARE==NGLL2==25, no further check inside kernel
   int blocksize = NGLL2;
 
-  int num_blocks_x = num_abs_boundary_faces;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(num_abs_boundary_faces,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
@@ -488,3 +482,117 @@
 #endif
 }
 
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// undo_attenuation simulation: stacey for backward/reconstructed wavefield
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(compute_stacey_acoustic_undoatt_cuda,
+              COMPUTE_STACEY_ACOUSTIC_UNDOATT_CUDA)(long* Mesh_pointer_f,
+                                                    int* itype) {
+  TRACE("compute_stacey_acoustic_undoatt_cuda");
+  //double start_time = get_time();
+
+  int num_abs_boundary_faces;
+  int* d_abs_boundary_ispec;
+  realw* d_abs_boundary_jacobian2D;
+  realw* d_wgllwgll;
+  realw* d_b_absorb_potential = NULL;
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  // checks if anything to do: only for backward/reconstructed wavefield runs
+  if( mp->simulation_type != 3 ) return;
+  if( mp->save_forward ) return;
+
+  // absorbing boundary type
+  int interface_type = *itype;
+  switch( interface_type ){
+    case 4:
+      // xmin
+      num_abs_boundary_faces = mp->nspec2D_xmin_outer_core;
+      d_abs_boundary_ispec = mp->d_ibelm_xmin_outer_core;
+      d_abs_boundary_jacobian2D = mp->d_jacobian2D_xmin_outer_core;
+      d_wgllwgll = mp->d_wgllwgll_yz;
+      break;
+
+    case 5:
+      // xmax
+      num_abs_boundary_faces = mp->nspec2D_xmax_outer_core;
+      d_abs_boundary_ispec = mp->d_ibelm_xmax_outer_core;
+      d_abs_boundary_jacobian2D = mp->d_jacobian2D_xmax_outer_core;
+      d_wgllwgll = mp->d_wgllwgll_yz;
+      break;
+
+    case 6:
+      // ymin
+      num_abs_boundary_faces = mp->nspec2D_ymin_outer_core;
+      d_abs_boundary_ispec = mp->d_ibelm_ymin_outer_core;
+      d_abs_boundary_jacobian2D = mp->d_jacobian2D_ymin_outer_core;
+      d_wgllwgll = mp->d_wgllwgll_xz;
+      break;
+
+    case 7:
+      // ymax
+      num_abs_boundary_faces = mp->nspec2D_ymax_outer_core;
+      d_abs_boundary_ispec = mp->d_ibelm_ymax_outer_core;
+      d_abs_boundary_jacobian2D = mp->d_jacobian2D_ymax_outer_core;
+      d_wgllwgll = mp->d_wgllwgll_xz;
+      break;
+
+    case 8:
+      // zmin
+      num_abs_boundary_faces = mp->nspec2D_zmin_outer_core;
+      d_abs_boundary_ispec = mp->d_ibelm_bottom_outer_core;
+      d_abs_boundary_jacobian2D = mp->d_jacobian2D_bottom_outer_core;
+      d_wgllwgll = mp->d_wgllwgll_xy;
+      break;
+
+    default:
+      exit_on_cuda_error("compute_stacey_acoustic_undoatt_cuda: unknown interface type");
+      break;
+  }
+
+  // checks if anything to do on this boundary
+  if( num_abs_boundary_faces == 0 ) return;
+
+  // way 1: Elapsed time: 4.385948e-03
+  // > NGLLSQUARE==NGLL2==25, but we handle this inside kernel
+  //  int blocksize = 32;
+
+  // way 2: Elapsed time: 4.379034e-03
+  // > NGLLSQUARE==NGLL2==25, no further check inside kernel
+  int blocksize = NGLL2;
+
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(num_abs_boundary_faces,&num_blocks_x,&num_blocks_y);
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  compute_stacey_acoustic_kernel<<<grid,threads>>>(mp->d_b_veloc_outer_core,
+                                                   mp->d_b_accel_outer_core,
+                                                   interface_type,
+                                                   num_abs_boundary_faces,
+                                                   d_abs_boundary_ispec,
+                                                   mp->d_nkmin_xi_outer_core,
+                                                   mp->d_nkmin_eta_outer_core,
+                                                   mp->d_njmin_outer_core,
+                                                   mp->d_njmax_outer_core,
+                                                   mp->d_nimin_outer_core,
+                                                   mp->d_nimax_outer_core,
+                                                   d_abs_boundary_jacobian2D,
+                                                   d_wgllwgll,
+                                                   mp->d_ibool_outer_core,
+                                                   mp->d_vp_outer_core,
+                                                   mp->save_forward,
+                                                   d_b_absorb_potential);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_stacey_acoustic_undoatt_cuda");
+#endif
+}
+

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu	2013-09-13 11:23:45 UTC (rev 22782)
@@ -252,12 +252,9 @@
   // > NGLLSQUARE==NGLL2==25, no further check inside kernel
   int blocksize = NGLL2;
 
-  int num_blocks_x = num_abs_boundary_faces;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(num_abs_boundary_faces,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
@@ -401,7 +398,7 @@
                                                     realw* absorb_field,
                                                     int* itype) {
 
-TRACE("compute_stacey_elastic_backward_cuda");
+  TRACE("compute_stacey_elastic_backward_cuda");
 
   int num_abs_boundary_faces;
   int* d_abs_boundary_ispec;
@@ -456,21 +453,16 @@
   // > NGLLSQUARE==NGLL2==25, no further check inside kernel
   int blocksize = NGLL2;
 
-  int num_blocks_x = num_abs_boundary_faces;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(num_abs_boundary_faces,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
   // adjoint simulations: needs absorbing boundary buffer
-  if( num_abs_boundary_faces > 0 ){
-    // copies array to GPU
-    print_CUDA_error_if_any(cudaMemcpy(d_b_absorb_field,absorb_field,
-                                       NDIM*NGLL2*num_abs_boundary_faces*sizeof(realw),cudaMemcpyHostToDevice),7700);
-  }
+  // copies array to GPU
+  print_CUDA_error_if_any(cudaMemcpy(d_b_absorb_field,absorb_field,
+                                     NDIM*NGLL2*num_abs_boundary_faces*sizeof(realw),cudaMemcpyHostToDevice),7700);
 
   // absorbing boundary contributions
   compute_stacey_elastic_backward_kernel<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
@@ -489,3 +481,117 @@
 #endif
 }
 
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// undo_attenuation simulation: stacey for backward/reconstructed wavefield
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(compute_stacey_elastic_undoatt_cuda,
+              COMPUTE_STACEY_ELASTIC_UNDOATT_CUDA)(long* Mesh_pointer_f,
+                                                   int* itype) {
+
+  TRACE("compute_stacey_elastic_undoatt_cuda");
+
+  int num_abs_boundary_faces;
+  int* d_abs_boundary_ispec;
+  realw* d_abs_boundary_normal;
+  realw* d_abs_boundary_jacobian2D;
+  realw* d_wgllwgll;
+  realw* d_b_absorb_field = NULL;
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  // checks if anything to do: only for backward/reconstructed wavefield runs
+  if( mp->simulation_type != 3 ) return;
+  if( mp->save_forward ) return;
+
+  // absorbing boundary type
+  int interface_type = *itype;
+  switch( interface_type ){
+    case 0:
+      // xmin
+      num_abs_boundary_faces = mp->nspec2D_xmin_crust_mantle;
+      d_abs_boundary_ispec = mp->d_ibelm_xmin_crust_mantle;
+      d_abs_boundary_normal = mp->d_normal_xmin_crust_mantle;
+      d_abs_boundary_jacobian2D = mp->d_jacobian2D_xmin_crust_mantle;
+      d_wgllwgll = mp->d_wgllwgll_yz;
+      break;
+
+    case 1:
+      // xmax
+      num_abs_boundary_faces = mp->nspec2D_xmax_crust_mantle;
+      d_abs_boundary_ispec = mp->d_ibelm_xmax_crust_mantle;
+      d_abs_boundary_normal = mp->d_normal_xmax_crust_mantle;
+      d_abs_boundary_jacobian2D = mp->d_jacobian2D_xmax_crust_mantle;
+      d_wgllwgll = mp->d_wgllwgll_yz;
+      break;
+
+    case 2:
+      // ymin
+      num_abs_boundary_faces = mp->nspec2D_ymin_crust_mantle;
+      d_abs_boundary_ispec = mp->d_ibelm_ymin_crust_mantle;
+      d_abs_boundary_normal = mp->d_normal_ymin_crust_mantle;
+      d_abs_boundary_jacobian2D = mp->d_jacobian2D_ymin_crust_mantle;
+      d_wgllwgll = mp->d_wgllwgll_xz;
+      break;
+
+    case 3:
+      // ymax
+      num_abs_boundary_faces = mp->nspec2D_ymax_crust_mantle;
+      d_abs_boundary_ispec = mp->d_ibelm_ymax_crust_mantle;
+      d_abs_boundary_normal = mp->d_normal_ymax_crust_mantle;
+      d_abs_boundary_jacobian2D = mp->d_jacobian2D_ymax_crust_mantle;
+      d_wgllwgll = mp->d_wgllwgll_xz;
+      break;
+
+    default:
+      exit_on_cuda_error("compute_stacey_elastic_undoatt_cuda: unknown interface type");
+      break;
+  }
+
+  // checks if anything to do on this boundary
+  if( num_abs_boundary_faces == 0 ) return;
+
+  // way 1
+  // > NGLLSQUARE==NGLL2==25, but we handle this inside kernel
+  //int blocksize = 32;
+
+  // way 2: seems slightly faster
+  // > NGLLSQUARE==NGLL2==25, no further check inside kernel
+  int blocksize = NGLL2;
+
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(num_abs_boundary_faces,&num_blocks_x,&num_blocks_y);
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  // absorbing boundary contributions
+  compute_stacey_elastic_kernel<<<grid,threads>>>(mp->d_b_veloc_crust_mantle,
+                                                  mp->d_b_accel_crust_mantle,
+                                                  interface_type,
+                                                  num_abs_boundary_faces,
+                                                  d_abs_boundary_ispec,
+                                                  mp->d_nkmin_xi_crust_mantle,
+                                                  mp->d_nkmin_eta_crust_mantle,
+                                                  mp->d_njmin_crust_mantle,
+                                                  mp->d_njmax_crust_mantle,
+                                                  mp->d_nimin_crust_mantle,
+                                                  mp->d_nimax_crust_mantle,
+                                                  d_abs_boundary_normal,
+                                                  d_abs_boundary_jacobian2D,
+                                                  d_wgllwgll,
+                                                  mp->d_ibool_crust_mantle,
+                                                  mp->d_rho_vp_crust_mantle,
+                                                  mp->d_rho_vs_crust_mantle,
+                                                  mp->save_forward,
+                                                  d_b_absorb_field);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_stacey_elastic_undoatt_cuda");
+#endif
+}
+

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu	2013-09-13 11:23:45 UTC (rev 22782)
@@ -88,7 +88,7 @@
   // Gets number of GPU devices
   device_count = 0;
   cudaGetDeviceCount(&device_count);
-  exit_on_cuda_error("CUDA runtime error: cudaGetDeviceCount failed\ncheck if driver and runtime libraries work together\nexiting...\n");
+  exit_on_cuda_error("CUDA runtime error: cudaGetDeviceCount failed\n\nplease check if driver and runtime libraries work together\nor on titan enable environment: CRAY_CUDA_PROXY=1 to use single GPU with multiple MPI processes\n\nexiting...\n");
 
   // returns device count to fortran
   if (device_count == 0) exit_on_error("CUDA runtime error: there is no device supporting CUDA\n");

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2013-09-13 11:23:45 UTC (rev 22782)
@@ -70,20 +70,46 @@
 #define PRINT5(var,offset) // for(i=0;i<10;i++) printf("var(%d)=%f\n",i,var[offset+i]);
 #endif
 
+// daniel debug: run backward simulations with empty arrays to check
+#define DEBUG_BACKWARD_SIMULATIONS 0
+#if DEBUG_BACKWARD_SIMULATIONS == 1
+#define DEBUG_EMPTY_BACKWARD() return;
+#else
+#define DEBUG_EMPTY_BACKWARD()
+#endif
+
+
+// error checking after cuda function calls
+#define ENABLE_VERY_SLOW_ERROR_CHECKING
+
 // maximum function
 #define MAX(x,y)                    (((x) < (y)) ? (y) : (x))
 
+/* ----------------------------------------------------------------------------------------------- */
+
+// type of "working" variables: see also CUSTOM_REAL
+// double precision temporary variables leads to 10% performance decrease
+// in Kernel_2_impl (not very much..)
+typedef float realw;
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
 // utility functions: defined in check_fields_cuda.cu
+
+/* ----------------------------------------------------------------------------------------------- */
+
 double get_time();
 void get_free_memory(double* free_db, double* used_db, double* total_db);
 void print_CUDA_error_if_any(cudaError_t err, int num);
 void pause_for_debugger(int pause);
 void exit_on_cuda_error(char* kernel_name);
 void exit_on_error(char* info);
+void synchronize_cuda();
+void synchronize_mpi();
+void get_blocks_xy(int num_blocks,int* num_blocks_x,int* num_blocks_y);
+realw get_device_array_maximum_value(realw* array,int size);
 
-// error checking after cuda function calls
-#define ENABLE_VERY_SLOW_ERROR_CHECKING
-
 /* ----------------------------------------------------------------------------------------------- */
 
 // cuda constant arrays
@@ -122,14 +148,6 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-// type of "working" variables: see also CUSTOM_REAL
-// double precision temporary variables leads to 10% performance decrease
-// in Kernel_2_impl (not very much..)
-typedef float realw;
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
 // (optional) pre-processing directive used in kernels: if defined check that it is also set in src/shared/constants.h:
 // leads up to ~ 5% performance increase
 //#define USE_MESH_COLORING_GPU
@@ -246,7 +264,6 @@
   realw* d_b_rmassy_crust_mantle;
   realw* d_b_rmassz_crust_mantle;
 
-
   // global indexing
   int* d_ibool_crust_mantle;
   int* d_ispec_is_tiso_crust_mantle;
@@ -283,12 +300,6 @@
   // backward/reconstructed elastic wavefield
   realw* d_b_displ_crust_mantle; realw* d_b_veloc_crust_mantle; realw* d_b_accel_crust_mantle;
 
-//#ifdef USE_TEXTURES_FIELDS
-//  // Texture references for fast non-coalesced scattered access
-//  const textureReference* d_displ_cm_tex_ref_ptr;
-//  const textureReference* d_accel_cm_tex_ref_ptr;
-//#endif
-
   // attenuation
   realw* d_R_xx_crust_mantle;
   realw* d_R_yy_crust_mantle;
@@ -370,12 +381,6 @@
   // backward/reconstructed elastic wavefield
   realw* d_b_displ_outer_core; realw* d_b_veloc_outer_core; realw* d_b_accel_outer_core;
 
-//#ifdef USE_TEXTURES_FIELDS
-//  // Texture references for fast non-coalesced scattered access
-//  const textureReference* d_displ_oc_tex_ref_ptr;
-//  const textureReference* d_accel_oc_tex_ref_ptr;
-//#endif
-
   // kernels
   realw* d_rho_kl_outer_core;
   realw* d_alpha_kl_outer_core;
@@ -447,12 +452,6 @@
   // backward/reconstructed elastic wavefield
   realw* d_b_displ_inner_core; realw* d_b_veloc_inner_core; realw* d_b_accel_inner_core;
 
-//#ifdef USE_TEXTURES_FIELDS
-//  // Texture references for fast non-coalesced scattered access
-//  const textureReference* d_displ_ic_tex_ref_ptr;
-//  const textureReference* d_accel_ic_tex_ref_ptr;
-//#endif
-
   // attenuation
   realw* d_R_xx_inner_core;
   realw* d_R_yy_inner_core;
@@ -466,7 +465,6 @@
   realw* d_b_R_xz_inner_core;
   realw* d_b_R_yz_inner_core;
 
-
   realw* d_factor_common_inner_core;
   realw* d_one_minus_sum_beta_inner_core;
 
@@ -534,11 +532,6 @@
   //realw* d_hprime_yy; // only needed if NGLLX != NGLLY != NGLLZ
   //realw* d_hprime_zz; // only needed if NGLLX != NGLLY != NGLLZ
 
-//#ifdef USE_TEXTURES_CONSTANTS
-//  const textureReference* d_hprime_xx_tex_ptr;
-//  realw* d_hprime_xx_tex;
-//#endif
-
   realw* d_hprimewgll_xx;
   //realw* d_hprimewgll_yy; // only needed if NGLLX != NGLLY != NGLLZ
   //realw* d_hprimewgll_zz; // only needed if NGLLX != NGLLY != NGLLZ
@@ -635,19 +628,25 @@
   int max_nibool_interfaces_cm;
   int* d_nibool_interfaces_crust_mantle;
   int* d_ibool_interfaces_crust_mantle;
+
   realw* d_send_accel_buffer_crust_mantle;
+  realw* d_b_send_accel_buffer_crust_mantle;
 
   int num_interfaces_inner_core;
   int max_nibool_interfaces_ic;
   int* d_nibool_interfaces_inner_core;
   int* d_ibool_interfaces_inner_core;
+
   realw* d_send_accel_buffer_inner_core;
+  realw* d_b_send_accel_buffer_inner_core;
 
   int num_interfaces_outer_core;
   int max_nibool_interfaces_oc;
   int* d_nibool_interfaces_outer_core;
   int* d_ibool_interfaces_outer_core;
+
   realw* d_send_accel_buffer_outer_core;
+  realw* d_b_send_accel_buffer_outer_core;
 
   // ------------------------------------------------------------------ //
   // absorbing boundaries

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu	2013-09-13 11:23:45 UTC (rev 22782)
@@ -155,12 +155,9 @@
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
 
-  int num_blocks_x = mp->nspec2D_top_crust_mantle;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->nspec2D_top_crust_mantle,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y,1);
   dim3 threads(NGLL2,1,1);
 
@@ -305,12 +302,9 @@
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
-  int num_blocks_x = mp->nspec2D_top_crust_mantle;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->nspec2D_top_crust_mantle,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y,1);
   dim3 threads(NGLL2,1,1);
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2013-09-13 11:23:45 UTC (rev 22782)
@@ -912,6 +912,11 @@
     // allocates mpi buffer for exchange with cpu
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_crust_mantle),
                                        NDIM*(mp->max_nibool_interfaces_cm)*(mp->num_interfaces_crust_mantle)*sizeof(realw)),4004);
+    if( mp->simulation_type == 3){
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_send_accel_buffer_crust_mantle),
+                                        NDIM*(mp->max_nibool_interfaces_cm)*(mp->num_interfaces_crust_mantle)*sizeof(realw)),4004);
+    }
+
   }
 
   // inner core mesh
@@ -927,6 +932,11 @@
     // allocates mpi buffer for exchange with cpu
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_inner_core),
                                        NDIM*(mp->max_nibool_interfaces_ic)*(mp->num_interfaces_inner_core)*sizeof(realw)),4004);
+    if( mp->simulation_type == 3){
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_send_accel_buffer_inner_core),
+                                        NDIM*(mp->max_nibool_interfaces_ic)*(mp->num_interfaces_inner_core)*sizeof(realw)),4004);
+    }
+
   }
 
   // outer core mesh
@@ -943,6 +953,10 @@
     // allocates mpi buffer for exchange with cpu
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_outer_core),
                                        (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw)),4004);
+    if( mp->simulation_type == 3){
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_send_accel_buffer_outer_core),
+                                        (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw)),4004);
+    }
   }
 }
 
@@ -1318,6 +1332,13 @@
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ_crust_mantle),sizeof(realw)*size),4011);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc_crust_mantle),sizeof(realw)*size),4012);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel_crust_mantle),sizeof(realw)*size),4013);
+    // debug
+    #if DEBUG_BACKWARD_SIMULATIONS == 1
+    //debugging with empty arrays
+    print_CUDA_error_if_any(cudaMemset(mp->d_b_displ_crust_mantle,0,sizeof(realw)*size),5111);
+    print_CUDA_error_if_any(cudaMemset(mp->d_b_veloc_crust_mantle,0,sizeof(realw)*size),5111);
+    print_CUDA_error_if_any(cudaMemset(mp->d_b_accel_crust_mantle,0,sizeof(realw)*size),5111);
+    #endif
   }
 
   #ifdef USE_TEXTURES_FIELDS
@@ -1522,26 +1543,32 @@
   mp->nspec2D_top_outer_core = *NSPEC2D_TOP_OC;
   mp->nspec2D_bottom_outer_core = *NSPEC2D_BOTTOM_OC;
 
+  copy_todevice_int((void**)&mp->d_ibelm_top_outer_core,h_ibelm_top_outer_core,mp->nspec2D_top_outer_core);
   int size_toc = NGLL2*(mp->nspec2D_top_outer_core);
-  copy_todevice_int((void**)&mp->d_ibelm_top_outer_core,h_ibelm_top_outer_core,mp->nspec2D_top_outer_core);
   copy_todevice_realw((void**)&mp->d_jacobian2D_top_outer_core,h_jacobian2D_top_outer_core,size_toc);
   copy_todevice_realw((void**)&mp->d_normal_top_outer_core,h_normal_top_outer_core,NDIM*size_toc);
 
+  copy_todevice_int((void**)&mp->d_ibelm_bottom_outer_core,h_ibelm_bottom_outer_core,mp->nspec2D_bottom_outer_core);
   int size_boc = NGLL2*(mp->nspec2D_bottom_outer_core);
-  copy_todevice_int((void**)&mp->d_ibelm_bottom_outer_core,h_ibelm_bottom_outer_core,mp->nspec2D_bottom_outer_core);
   copy_todevice_realw((void**)&mp->d_jacobian2D_bottom_outer_core,h_jacobian2D_bottom_outer_core,size_boc);
   copy_todevice_realw((void**)&mp->d_normal_bottom_outer_core,h_normal_bottom_outer_core,NDIM*size_boc);
 
   // wavefield
-  int size = mp->NGLOB_OUTER_CORE;
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ_outer_core),sizeof(realw)*size),5001);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc_outer_core),sizeof(realw)*size),5002);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel_outer_core),sizeof(realw)*size),5003);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ_outer_core),sizeof(realw)*size_glob),5001);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc_outer_core),sizeof(realw)*size_glob),5002);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel_outer_core),sizeof(realw)*size_glob),5003);
   // backward/reconstructed wavefield
   if( mp->simulation_type == 3 ){
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ_outer_core),sizeof(realw)*size),5011);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc_outer_core),sizeof(realw)*size),5022);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel_outer_core),sizeof(realw)*size),5033);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ_outer_core),sizeof(realw)*size_glob),5011);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc_outer_core),sizeof(realw)*size_glob),5022);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel_outer_core),sizeof(realw)*size_glob),5033);
+    // debug
+    #if DEBUG_BACKWARD_SIMULATIONS == 1
+    //debugging with empty arrays
+    print_CUDA_error_if_any(cudaMemset(mp->d_b_displ_outer_core,0,sizeof(realw)*size_glob),5111);
+    print_CUDA_error_if_any(cudaMemset(mp->d_b_veloc_outer_core,0,sizeof(realw)*size_glob),5111);
+    print_CUDA_error_if_any(cudaMemset(mp->d_b_accel_outer_core,0,sizeof(realw)*size_glob),5111);
+    #endif
   }
 
   #ifdef USE_TEXTURES_FIELDS
@@ -1551,21 +1578,21 @@
       print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_displ_oc_tex_ref_ptr, "d_displ_oc_tex"), 5021);
       cudaChannelFormatDesc channelDesc1 = cudaCreateChannelDesc<realw>();
       print_CUDA_error_if_any(cudaBindTexture(0, mp->d_displ_oc_tex_ref_ptr, mp->d_displ_outer_core,
-                                              &channelDesc1, sizeof(realw)*size), 5021);
+                                              &channelDesc1, sizeof(realw)*size_glob), 5021);
 
       const textureReference* d_accel_oc_tex_ref_ptr;
       print_CUDA_error_if_any(cudaGetTextureReference(&d_accel_oc_tex_ref_ptr, "d_accel_oc_tex"), 5023);
       cudaChannelFormatDesc channelDesc2 = cudaCreateChannelDesc<realw>();
       print_CUDA_error_if_any(cudaBindTexture(0, d_accel_oc_tex_ref_ptr, mp->d_accel_outer_core,
-                                              &channelDesc2, sizeof(realw)*size), 5023);
+                                              &channelDesc2, sizeof(realw)*size_glob), 5023);
     #else
       cudaChannelFormatDesc channelDesc1 = cudaCreateChannelDesc<float>();
       print_CUDA_error_if_any(cudaBindTexture(0, &d_displ_oc_tex, mp->d_displ_outer_core,
-                                              &channelDesc1, sizeof(realw)*size), 5021);
+                                              &channelDesc1, sizeof(realw)*size_glob), 5021);
 
       cudaChannelFormatDesc channelDesc2 = cudaCreateChannelDesc<float>();
       print_CUDA_error_if_any(cudaBindTexture(0, &d_accel_oc_tex, mp->d_accel_outer_core,
-                                              &channelDesc2, sizeof(realw)*size), 5023);
+                                              &channelDesc2, sizeof(realw)*size_glob), 5023);
     #endif
   }
   #endif
@@ -1579,7 +1606,7 @@
     mp->d_b_rmass_outer_core = mp->d_rmass_outer_core;
 
     //kernels
-    size = NGLL3*(mp->NSPEC_OUTER_CORE);
+    int size = NGLL3*(mp->NSPEC_OUTER_CORE);
 
     // density kernel
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_kl_outer_core),
@@ -1750,6 +1777,8 @@
 
   mp->nspec_outer_inner_core = *nspec_outer;
   mp->nspec_inner_inner_core = *nspec_inner;
+
+  // boundary elements on top
   mp->nspec2D_top_inner_core = *NSPEC2D_TOP_IC;
   copy_todevice_int((void**)&mp->d_ibelm_top_inner_core,h_ibelm_top_inner_core,mp->nspec2D_top_inner_core);
 
@@ -1763,6 +1792,13 @@
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ_inner_core),sizeof(realw)*size),6011);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc_inner_core),sizeof(realw)*size),6012);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel_inner_core),sizeof(realw)*size),6013);
+    // debug
+    #if DEBUG_BACKWARD_SIMULATIONS == 1
+    // debugging with empty arrays
+    print_CUDA_error_if_any(cudaMemset(mp->d_b_displ_inner_core,0,sizeof(realw)*size),5111);
+    print_CUDA_error_if_any(cudaMemset(mp->d_b_veloc_inner_core,0,sizeof(realw)*size),5111);
+    print_CUDA_error_if_any(cudaMemset(mp->d_b_accel_inner_core,0,sizeof(realw)*size),5111);
+    #endif
   }
 
   #ifdef USE_TEXTURES_FIELDS
@@ -2074,16 +2110,19 @@
     cudaFree(mp->d_nibool_interfaces_crust_mantle);
     cudaFree(mp->d_ibool_interfaces_crust_mantle);
     cudaFree(mp->d_send_accel_buffer_crust_mantle);
+    if( mp->simulation_type == 3 ) cudaFree(mp->d_b_send_accel_buffer_crust_mantle);
   }
   if( mp->num_interfaces_inner_core > 0 ){
     cudaFree(mp->d_nibool_interfaces_inner_core);
     cudaFree(mp->d_ibool_interfaces_inner_core);
     cudaFree(mp->d_send_accel_buffer_inner_core);
+    if( mp->simulation_type == 3 ) cudaFree(mp->d_b_send_accel_buffer_inner_core);
   }
   if( mp->num_interfaces_outer_core > 0 ){
     cudaFree(mp->d_nibool_interfaces_outer_core);
     cudaFree(mp->d_ibool_interfaces_outer_core);
     cudaFree(mp->d_send_accel_buffer_outer_core);
+    if( mp->simulation_type == 3 ) cudaFree(mp->d_b_send_accel_buffer_outer_core);
   }
 
   //------------------------------------------

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2013-09-13 11:23:45 UTC (rev 22782)
@@ -57,9 +57,9 @@
 
 void FC_FUNC_(transfer_boun_accel_from_device,
               TRANSFER_BOUN_ACCEL_FROM_DEVICE)(long* Mesh_pointer_f,
-                                                  realw* send_accel_buffer,
-                                                  int* IREGION,
-                                                  int* FORWARD_OR_ADJOINT){} 
+                                               realw* send_accel_buffer,
+                                               int* IREGION,
+                                               int* FORWARD_OR_ADJOINT){} 
 
 void FC_FUNC_(transfer_asmbl_accel_to_device,
               TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer,
@@ -84,12 +84,12 @@
 void FC_FUNC_(check_norm_acoustic_from_device,
               CHECK_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
                                                   long* Mesh_pointer_f,
-                                                  int* SIMULATION_TYPE) {} 
+                                                  int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(check_norm_elastic_from_device,
               CHECK_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
                                               long* Mesh_pointer_f,
-                                              int* SIMULATION_TYPE) {} 
+                                              int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(check_norm_strain_from_device,
               CHECK_NORM_STRAIN_FROM_DEVICE)(realw* strain_norm,
@@ -131,23 +131,23 @@
 // src/cuda/compute_add_sources_elastic_cuda.cu
 //
 
-void FC_FUNC_(compute_add_sources_el_cuda,
-              COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f,
-                                           int* NSOURCESf,
-                                           double* h_stf_pre_compute) {} 
+void FC_FUNC_(compute_add_sources_cuda,
+              COMPUTE_ADD_SOURCES_CUDA)(long* Mesh_pointer_f,
+                                        int* NSOURCESf,
+                                        double* h_stf_pre_compute) {} 
 
-void FC_FUNC_(compute_add_sources_el_s3_cuda,
-              COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer_f,
-                                              int* NSOURCESf,
-                                              double* h_stf_pre_compute) {} 
+void FC_FUNC_(compute_add_sources_backward_cuda,
+              COMPUTE_ADD_SOURCES_BACKWARD_CUDA)(long* Mesh_pointer_f,
+                                                 int* NSOURCESf,
+                                                 double* h_stf_pre_compute) {} 
 
-void FC_FUNC_(add_sources_el_sim_type_2_or_3,
-              ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
-                                              int* nrec,
-                                              realw* h_adj_sourcearrays,
-                                              int* h_islice_selected_rec,
-                                              int* h_ispec_selected_rec,
-                                              int* time_index) {} 
+void FC_FUNC_(compute_add_sources_adjoint_cuda,
+              COMPUTE_ADD_SOURCES_ADJOINT_CUDA)(long* Mesh_pointer,
+                                                int* nrec,
+                                                realw* h_adj_sourcearrays,
+                                                int* h_islice_selected_rec,
+                                                int* h_ispec_selected_rec,
+                                                int* time_index) {} 
 
 
 //
@@ -172,8 +172,6 @@
 
 void FC_FUNC_(compute_coupling_ocean_cuda,
               COMPUTE_COUPLING_OCEAN_CUDA)(long* Mesh_pointer_f,
-                                           int* NCHUNKS_VAL,
-                                           int* exact_mass_matrix_for_rotation,
                                            int* FORWARD_OR_ADJOINT) {} 
 
 
@@ -183,7 +181,8 @@
 
 void FC_FUNC_(compute_forces_crust_mantle_cuda,
               COMPUTE_FORCES_CRUST_MANTLE_CUDA)(long* Mesh_pointer_f,
-                                                int* iphase) {} 
+                                                int* iphase,
+                                                int* FORWARD_OR_ADJOINT_f) {} 
 
 
 //
@@ -192,7 +191,8 @@
 
 void FC_FUNC_(compute_forces_inner_core_cuda,
               COMPUTE_FORCES_INNER_CORE_CUDA)(long* Mesh_pointer_f,
-                                              int* iphase) {} 
+                                              int* iphase,
+                                              int* FORWARD_OR_ADJOINT_f) {} 
 
 
 //
@@ -203,7 +203,7 @@
               COMPUTE_FORCES_OUTER_CORE_CUDA)(long* Mesh_pointer_f,
                                               int* iphase,
                                               realw* time_f,
-                                              int* FORWARD_OR_ADJOINT) {} 
+                                              int* FORWARD_OR_ADJOINT_f) {} 
 
 
 //
@@ -243,7 +243,11 @@
                                                      realw* absorb_potential,
                                                      int* itype) {} 
 
+void FC_FUNC_(compute_stacey_acoustic_undoatt_cuda,
+              COMPUTE_STACEY_ACOUSTIC_UNDOATT_CUDA)(long* Mesh_pointer_f,
+                                                    int* itype) {} 
 
+
 //
 // src/cuda/compute_stacey_elastic_cuda.cu
 //
@@ -258,7 +262,11 @@
                                                     realw* absorb_field,
                                                     int* itype) {} 
 
+void FC_FUNC_(compute_stacey_elastic_undoatt_cuda,
+              COMPUTE_STACEY_ELASTIC_UNDOATT_CUDA)(long* Mesh_pointer_f,
+                                                   int* itype) {} 
 
+
 //
 // src/cuda/initialize_cuda.cu
 //
@@ -685,19 +693,19 @@
 
 void FC_FUNC_(transfer_b_rmemory_cm_to_device,
               TRANSFER_B_RMEMORY_CM_TO_DEVICE)(long* Mesh_pointer,
-                                              realw* b_R_xx,
-                                              realw* b_R_yy,
-                                              realw* b_R_xy,
-                                              realw* b_R_xz,
-                                              realw* b_R_yz) {} 
+                                               realw* b_R_xx,
+                                               realw* b_R_yy,
+                                               realw* b_R_xy,
+                                               realw* b_R_xz,
+                                               realw* b_R_yz) {} 
 
 void FC_FUNC_(transfer_b_rmemory_ic_to_device,
               TRANSFER_B_RMEMORY_IC_TO_DEVICE)(long* Mesh_pointer,
-                                              realw* b_R_xx,
-                                              realw* b_R_yy,
-                                              realw* b_R_xy,
-                                              realw* b_R_xz,
-                                              realw* b_R_yz) {} 
+                                               realw* b_R_xx,
+                                               realw* b_R_yy,
+                                               realw* b_R_xy,
+                                               realw* b_R_xz,
+                                               realw* b_R_yz) {} 
 
 void FC_FUNC_(transfer_rotation_from_device,
               TRANSFER_ROTATION_FROM_DEVICE)(long* Mesh_pointer,
@@ -747,42 +755,44 @@
 
 void FC_FUNC_(update_displacement_ic_cuda,
               UPDATE_DISPLACMENT_IC_CUDA)(long* Mesh_pointer_f,
-                                          realw* deltat_F,
-                                          realw* deltatsqover2_F,
-                                          realw* deltatover2_F,
+                                          realw* deltat_f,
+                                          realw* deltatsqover2_f,
+                                          realw* deltatover2_f,
                                           int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(update_displacement_cm_cuda,
               UPDATE_DISPLACMENT_CM_CUDA)(long* Mesh_pointer_f,
-                                          realw* deltat_F,
-                                          realw* deltatsqover2_F,
-                                          realw* deltatover2_F,
+                                          realw* deltat_f,
+                                          realw* deltatsqover2_f,
+                                          realw* deltatover2_f,
                                           int* FORWARD_OR_ADJOINT) {} 
 
 void FC_FUNC_(update_displacement_oc_cuda,
               UPDATE_DISPLACEMENT_OC_cuda)(long* Mesh_pointer_f,
-                                           realw* deltat_F,
-                                           realw* deltatsqover2_F,
-                                           realw* deltatover2_F,
+                                           realw* deltat_f,
+                                           realw* deltatsqover2_f,
+                                           realw* deltatover2_f,
                                            int* FORWARD_OR_ADJOINT) {} 
 
-void FC_FUNC_(update_accel_3_a_cuda,
-              UPDATE_ACCEL_3_A_CUDA)(long* Mesh_pointer,
-                                     realw* deltatover2_F,
-                                     int* NCHUNKS_VAL,
-                                     int* FORWARD_OR_ADJOINT) {} 
+void FC_FUNC_(multiply_accel_elastic_cuda,
+              MULTIPLY_ACCEL_ELASTIC_CUDA)(long* Mesh_pointer,
+                                           int* FORWARD_OR_ADJOINT) {} 
 
-void FC_FUNC_(update_veloc_3_b_cuda,
-              UPDATE_VELOC_3_B_CUDA)(long* Mesh_pointer,
-                                     realw* deltatover2_F,
-                                     int* FORWARD_OR_ADJOINT) {} 
+void FC_FUNC_(update_veloc_elastic_cuda,
+              UPDATE_VELOC_ELASTIC_CUDA)(long* Mesh_pointer,
+                                         realw* deltatover2_f,
+                                         int* FORWARD_OR_ADJOINT) {} 
 
-void FC_FUNC_(kernel_3_outer_core_cuda,
-              KERNEL_3_OUTER_CORE_CUDA)(long* Mesh_pointer,
-                                        realw* deltatover2_F,
-                                        int* FORWARD_OR_ADJOINT) {} 
+void FC_FUNC_(multiply_accel_acoustic_cuda,
+              MULTIPLY_ACCEL_ACOUSTIC_CUDA)(long* Mesh_pointer,
+                                            int* FORWARD_OR_ADJOINT) {} 
 
+void FC_FUNC_(update_veloc_acoustic_cuda,
+              UPDATE_VELOC_ACOUSTIC_CUDA)(long* Mesh_pointer,
+                                          realw* deltatover2_f,
+                                          int* FORWARD_OR_ADJOINT) {} 
 
+
 //
 // src/cuda/write_seismograms_cuda.cu
 //

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/transfer_fields_cuda.cu	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/transfer_fields_cuda.cu	2013-09-13 11:23:45 UTC (rev 22782)
@@ -106,6 +106,8 @@
                                               long* Mesh_pointer_f) {
 
   TRACE("transfer_fields_b_cm_to_device");
+  // debug
+  DEBUG_EMPTY_BACKWARD();
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
   print_CUDA_error_if_any(cudaMemcpy(mp->d_b_displ_crust_mantle,b_displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40003);
@@ -121,6 +123,8 @@
                                               long* Mesh_pointer_f) {
 
   TRACE("transfer_fields_b_ic_to_device");
+  // debug
+  DEBUG_EMPTY_BACKWARD();
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
   print_CUDA_error_if_any(cudaMemcpy(mp->d_b_displ_inner_core,b_displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40003);
@@ -136,6 +140,8 @@
                                               long* Mesh_pointer_f) {
 
   TRACE("transfer_fields_b_oc_to_device");
+  // debug
+  DEBUG_EMPTY_BACKWARD();
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
   print_CUDA_error_if_any(cudaMemcpy(mp->d_b_displ_outer_core,b_displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40003);
@@ -483,13 +489,13 @@
 
   int size = NGLL3*mp->NSPEC_CRUST_MANTLE;
 
-  cudaMemcpy(eps_trace_over_3,mp->d_eps_trace_over_3_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost);
+  print_CUDA_error_if_any(cudaMemcpy(eps_trace_over_3,mp->d_eps_trace_over_3_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost),320001);
 
-  cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost);
+  print_CUDA_error_if_any(cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost),320002);
+  print_CUDA_error_if_any(cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost),320003);
+  print_CUDA_error_if_any(cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost),320004);
+  print_CUDA_error_if_any(cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost),320005);
+  print_CUDA_error_if_any(cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost),320006);
 
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -510,16 +516,21 @@
                                               realw* epsilondev_xz,
                                               realw* epsilondev_yz) {
   TRACE("transfer_b_strain_cm_to_device");
+  // debug
+  DEBUG_EMPTY_BACKWARD();
+
   //get mesh pointer out of fortran integer container
   Mesh* mp = (Mesh*)(*Mesh_pointer);
 
   int size = NGLL3*mp->NSPEC_CRUST_MANTLE;
 
-  cudaMemcpy(mp->d_b_epsilondev_xx_crust_mantle,epsilondev_xx,size*sizeof(realw),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_epsilondev_yy_crust_mantle,epsilondev_yy,size*sizeof(realw),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_epsilondev_xy_crust_mantle,epsilondev_xy,size*sizeof(realw),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_epsilondev_xz_crust_mantle,epsilondev_xz,size*sizeof(realw),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_epsilondev_yz_crust_mantle,epsilondev_yz,size*sizeof(realw),cudaMemcpyHostToDevice);
+  if( ! mp->undo_attenuation ){
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xx_crust_mantle,epsilondev_xx,size*sizeof(realw),cudaMemcpyHostToDevice),330001);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yy_crust_mantle,epsilondev_yy,size*sizeof(realw),cudaMemcpyHostToDevice),330002);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xy_crust_mantle,epsilondev_xy,size*sizeof(realw),cudaMemcpyHostToDevice),330003);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xz_crust_mantle,epsilondev_xz,size*sizeof(realw),cudaMemcpyHostToDevice),330004);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yz_crust_mantle,epsilondev_yz,size*sizeof(realw),cudaMemcpyHostToDevice),330005);
+  }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("after transfer_b_strain_cm_to_device");
@@ -545,13 +556,13 @@
 
   int size = NGLL3*mp->NSPEC_INNER_CORE;
 
-  cudaMemcpy(eps_trace_over_3,mp->d_eps_trace_over_3_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost);
+  print_CUDA_error_if_any(cudaMemcpy(eps_trace_over_3,mp->d_eps_trace_over_3_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost),340001);
 
-  cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost);
+  print_CUDA_error_if_any(cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost),340002);
+  print_CUDA_error_if_any(cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost),340003);
+  print_CUDA_error_if_any(cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost),340004);
+  print_CUDA_error_if_any(cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost),340005);
+  print_CUDA_error_if_any(cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost),340006);
 
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -572,16 +583,21 @@
                                               realw* epsilondev_xz,
                                               realw* epsilondev_yz) {
   TRACE("transfer_b_strain_cm_to_device");
+  // debug
+  DEBUG_EMPTY_BACKWARD();
+
   //get mesh pointer out of fortran integer container
   Mesh* mp = (Mesh*)(*Mesh_pointer);
 
   int size = NGLL3*mp->NSPEC_INNER_CORE;
 
-  cudaMemcpy(mp->d_b_epsilondev_xx_inner_core,epsilondev_xx,size*sizeof(realw),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_epsilondev_yy_inner_core,epsilondev_yy,size*sizeof(realw),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_epsilondev_xy_inner_core,epsilondev_xy,size*sizeof(realw),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_epsilondev_xz_inner_core,epsilondev_xz,size*sizeof(realw),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_epsilondev_yz_inner_core,epsilondev_yz,size*sizeof(realw),cudaMemcpyHostToDevice);
+  if( ! mp->undo_attenuation ){
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xx_inner_core,epsilondev_xx,size*sizeof(realw),cudaMemcpyHostToDevice),350001);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yy_inner_core,epsilondev_yy,size*sizeof(realw),cudaMemcpyHostToDevice),350002);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xy_inner_core,epsilondev_xy,size*sizeof(realw),cudaMemcpyHostToDevice),350003);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xz_inner_core,epsilondev_xz,size*sizeof(realw),cudaMemcpyHostToDevice),350004);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yz_inner_core,epsilondev_yz,size*sizeof(realw),cudaMemcpyHostToDevice),350005);
+  }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("after transfer_b_strain_ic_to_device");
@@ -599,25 +615,30 @@
 extern "C"
 void FC_FUNC_(transfer_b_rmemory_cm_to_device,
               TRANSFER_B_RMEMORY_CM_TO_DEVICE)(long* Mesh_pointer,
-                                              realw* b_R_xx,
-                                              realw* b_R_yy,
-                                              realw* b_R_xy,
-                                              realw* b_R_xz,
-                                              realw* b_R_yz) {
+                                               realw* b_R_xx,
+                                               realw* b_R_yy,
+                                               realw* b_R_xy,
+                                               realw* b_R_xz,
+                                               realw* b_R_yz) {
   TRACE("transfer_b_Rmemory_cm_to_device");
+  // debug
+  DEBUG_EMPTY_BACKWARD();
+
   //get mesh pointer out of fortran integer container
   Mesh* mp = (Mesh*)(*Mesh_pointer);
 
   int size = N_SLS*NGLL3*mp->NSPEC_CRUST_MANTLE;
 
-  cudaMemcpy(mp->d_b_R_xx_crust_mantle,b_R_xx,size*sizeof(realw),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_R_yy_crust_mantle,b_R_yy,size*sizeof(realw),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_R_xy_crust_mantle,b_R_xy,size*sizeof(realw),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_R_xz_crust_mantle,b_R_xz,size*sizeof(realw),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_R_yz_crust_mantle,b_R_yz,size*sizeof(realw),cudaMemcpyHostToDevice);
+  if( ! mp->partial_phys_dispersion_only ){
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xx_crust_mantle,b_R_xx,size*sizeof(realw),cudaMemcpyHostToDevice),360001);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yy_crust_mantle,b_R_yy,size*sizeof(realw),cudaMemcpyHostToDevice),360002);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xy_crust_mantle,b_R_xy,size*sizeof(realw),cudaMemcpyHostToDevice),360003);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xz_crust_mantle,b_R_xz,size*sizeof(realw),cudaMemcpyHostToDevice),360004);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yz_crust_mantle,b_R_yz,size*sizeof(realw),cudaMemcpyHostToDevice),360005);
+  }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("after transfer_b_Rmemory_cm_to_device");
+  exit_on_cuda_error("after transfer_b_rmemory_cm_to_device");
 #endif
 }
 
@@ -629,25 +650,30 @@
 extern "C"
 void FC_FUNC_(transfer_b_rmemory_ic_to_device,
               TRANSFER_B_RMEMORY_IC_TO_DEVICE)(long* Mesh_pointer,
-                                              realw* b_R_xx,
-                                              realw* b_R_yy,
-                                              realw* b_R_xy,
-                                              realw* b_R_xz,
-                                              realw* b_R_yz) {
-  TRACE("transfer_b_Rmemory_ic_to_device");
+                                               realw* b_R_xx,
+                                               realw* b_R_yy,
+                                               realw* b_R_xy,
+                                               realw* b_R_xz,
+                                               realw* b_R_yz) {
+  TRACE("transfer_b_rmemory_ic_to_device");
+  // debug
+  DEBUG_EMPTY_BACKWARD();
+
   //get mesh pointer out of fortran integer container
   Mesh* mp = (Mesh*)(*Mesh_pointer);
 
   int size = N_SLS*NGLL3*mp->NSPEC_INNER_CORE;
 
-  cudaMemcpy(mp->d_b_R_xx_inner_core,b_R_xx,size*sizeof(realw),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_R_yy_inner_core,b_R_yy,size*sizeof(realw),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_R_xy_inner_core,b_R_xy,size*sizeof(realw),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_R_xz_inner_core,b_R_xz,size*sizeof(realw),cudaMemcpyHostToDevice);
-  cudaMemcpy(mp->d_b_R_yz_inner_core,b_R_yz,size*sizeof(realw),cudaMemcpyHostToDevice);
+  if( ! mp->partial_phys_dispersion_only ){
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xx_inner_core,b_R_xx,size*sizeof(realw),cudaMemcpyHostToDevice),370001);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yy_inner_core,b_R_yy,size*sizeof(realw),cudaMemcpyHostToDevice),370002);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xy_inner_core,b_R_xy,size*sizeof(realw),cudaMemcpyHostToDevice),370003);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xz_inner_core,b_R_xz,size*sizeof(realw),cudaMemcpyHostToDevice),370004);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yz_inner_core,b_R_yz,size*sizeof(realw),cudaMemcpyHostToDevice),370005);
+  }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("after transfer_b_Rmemory_ic_to_device");
+  exit_on_cuda_error("after transfer_b_rmemory_ic_to_device");
 #endif
 }
 
@@ -688,6 +714,9 @@
                                               realw* A_array_rotation,
                                               realw* B_array_rotation) {
   TRACE("transfer_b_rotation_to_device");
+  // debug
+  DEBUG_EMPTY_BACKWARD();
+
   //get mesh pointer out of fortran integer container
   Mesh* mp = (Mesh*)(*Mesh_pointer);
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/update_displacement_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/update_displacement_cuda.cu	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/update_displacement_cuda.cu	2013-09-13 11:23:45 UTC (rev 22782)
@@ -48,10 +48,11 @@
                                        realw deltat,
                                        realw deltatsqover2,
                                        realw deltatover2) {
+
   int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
 
-  /* because of block and grid sizing problems, there is a small */
-  /* amount of buffer at the end of the calculation */
+  // because of block and grid sizing problems, there is a small
+  // amount of buffer at the end of the calculation
   if(id < size) {
     displ[id] = displ[id] + deltat*veloc[id] + deltatsqover2*accel[id];
     veloc[id] = veloc[id] + deltatover2*accel[id];
@@ -69,9 +70,9 @@
 extern "C"
 void FC_FUNC_(update_displacement_ic_cuda,
               UPDATE_DISPLACMENT_IC_CUDA)(long* Mesh_pointer_f,
-                                          realw* deltat_F,
-                                          realw* deltatsqover2_F,
-                                          realw* deltatover2_F,
+                                          realw* deltat_f,
+                                          realw* deltatsqover2_f,
+                                          realw* deltatover2_f,
                                           int* FORWARD_OR_ADJOINT) {
 
 TRACE("update_displacement_ic_cuda");
@@ -80,23 +81,30 @@
 
   int size = NDIM * mp->NGLOB_INNER_CORE;
 
-  realw deltat = *deltat_F;
-  realw deltatsqover2 = *deltatsqover2_F;
-  realw deltatover2 = *deltatover2_F;
+  // debug
+#if DEBUG_BACKWARD_SIMULATIONS == 1
+  realw max_d,max_v,max_a;
+  max_d = get_device_array_maximum_value(mp->d_b_displ_inner_core, size);
+  max_v = get_device_array_maximum_value(mp->d_b_veloc_inner_core, size);
+  max_a = get_device_array_maximum_value(mp->d_b_accel_inner_core, size);
+  printf("rank %d - max inner_core displ: %f veloc: %f accel: %f\n",mp->myrank,max_d,max_v,max_a);
+  fflush(stdout);
+  synchronize_mpi();
+#endif
 
   int blocksize = BLOCKSIZE_KERNEL1;
   int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
 
-  int num_blocks_x = size_padded/blocksize;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
 
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
+  realw deltat = *deltat_f;
+  realw deltatsqover2 = *deltatsqover2_f;
+  realw deltatover2 = *deltatover2_f;
+
   if( *FORWARD_OR_ADJOINT == 1 ){
     //launch kernel
     UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ_inner_core,
@@ -104,6 +112,9 @@
                                              mp->d_accel_inner_core,
                                              size,deltat,deltatsqover2,deltatover2);
   }else if( *FORWARD_OR_ADJOINT == 3 ){
+    // debug
+    DEBUG_EMPTY_BACKWARD();
+
     // kernel for backward fields
     UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_b_displ_inner_core,
                                              mp->d_b_veloc_inner_core,
@@ -126,9 +137,9 @@
 extern "C"
 void FC_FUNC_(update_displacement_cm_cuda,
               UPDATE_DISPLACMENT_CM_CUDA)(long* Mesh_pointer_f,
-                                          realw* deltat_F,
-                                          realw* deltatsqover2_F,
-                                          realw* deltatover2_F,
+                                          realw* deltat_f,
+                                          realw* deltatsqover2_f,
+                                          realw* deltatover2_f,
                                           int* FORWARD_OR_ADJOINT) {
 
   TRACE("update_displacement_cm_cuda");
@@ -137,23 +148,30 @@
 
   int size = NDIM * mp->NGLOB_CRUST_MANTLE;
 
-  realw deltat = *deltat_F;
-  realw deltatsqover2 = *deltatsqover2_F;
-  realw deltatover2 = *deltatover2_F;
+  // debug
+#if DEBUG_BACKWARD_SIMULATIONS == 1
+  realw max_d,max_v,max_a;
+  max_d = get_device_array_maximum_value(mp->d_b_displ_crust_mantle, size);
+  max_v = get_device_array_maximum_value(mp->d_b_veloc_crust_mantle, size);
+  max_a = get_device_array_maximum_value(mp->d_b_accel_crust_mantle, size);
+  printf("rank %d - max crust_mantle displ: %f veloc: %f accel: %f\n",mp->myrank,max_d,max_v,max_a);
+  fflush(stdout);
+  synchronize_mpi();
+#endif
 
   int blocksize = BLOCKSIZE_KERNEL1;
   int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
 
-  int num_blocks_x = size_padded/blocksize;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
 
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
+  realw deltat = *deltat_f;
+  realw deltatsqover2 = *deltatsqover2_f;
+  realw deltatover2 = *deltatover2_f;
+
   if( *FORWARD_OR_ADJOINT == 1 ){
     //launch kernel
     UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
@@ -161,6 +179,9 @@
                                              mp->d_accel_crust_mantle,
                                              size,deltat,deltatsqover2,deltatover2);
   }else if( *FORWARD_OR_ADJOINT == 3 ){
+    // debug
+    DEBUG_EMPTY_BACKWARD();
+
     // kernel for backward fields
     UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,
                                              mp->d_b_veloc_crust_mantle,
@@ -214,9 +235,9 @@
 extern "C"
 void FC_FUNC_(update_displacement_oc_cuda,
               UPDATE_DISPLACEMENT_OC_cuda)(long* Mesh_pointer_f,
-                                           realw* deltat_F,
-                                           realw* deltatsqover2_F,
-                                           realw* deltatover2_F,
+                                           realw* deltat_f,
+                                           realw* deltatsqover2_f,
+                                           realw* deltatover2_f,
                                            int* FORWARD_OR_ADJOINT) {
 
   TRACE("update_displacement_oc_cuda");
@@ -225,30 +246,40 @@
 
   int size = mp->NGLOB_OUTER_CORE;
 
-  realw deltat = *deltat_F;
-  realw deltatsqover2 = *deltatsqover2_F;
-  realw deltatover2 = *deltatover2_F;
+  // debug
+#if DEBUG_BACKWARD_SIMULATIONS == 1
+  realw max_d,max_v,max_a;
+  max_d = get_device_array_maximum_value(mp->d_b_displ_outer_core, size);
+  max_v = get_device_array_maximum_value(mp->d_b_veloc_outer_core, size);
+  max_a = get_device_array_maximum_value(mp->d_b_accel_outer_core, size);
+  printf("rank %d - max outer_core displ: %f veloc: %f accel: %f\n",mp->myrank,max_d,max_v,max_a);
+  fflush(stdout);
+  synchronize_mpi();
+#endif
 
   int blocksize = BLOCKSIZE_KERNEL1;
   int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
 
-  int num_blocks_x = size_padded/blocksize;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
 
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
+  realw deltat = *deltat_f;
+  realw deltatsqover2 = *deltatsqover2_f;
+  realw deltatover2 = *deltatover2_f;
+
   if( *FORWARD_OR_ADJOINT == 1 ){
     //launch kernel
     UpdatePotential_kernel<<<grid,threads>>>(mp->d_displ_outer_core,
                                              mp->d_veloc_outer_core,
                                              mp->d_accel_outer_core,
                                              size,deltat,deltatsqover2,deltatover2);
-  }else if( *FORWARD_OR_ADJOINT == 1 ){
+  }else if( *FORWARD_OR_ADJOINT == 3 ){
+    // debug
+    DEBUG_EMPTY_BACKWARD();
+
     UpdatePotential_kernel<<<grid,threads>>>(mp->d_b_displ_outer_core,
                                              mp->d_b_veloc_outer_core,
                                              mp->d_b_accel_outer_core,
@@ -265,10 +296,12 @@
 
 // KERNEL 3
 //
-// crust/mantle and inner core regions
+// elastic domains: crust/mantle and inner core regions
 
 /* ----------------------------------------------------------------------------------------------- */
 
+// unused...
+/*
 __global__ void kernel_3_cuda_device(realw* veloc,
                                      realw* accel,
                                      int size,
@@ -279,8 +312,8 @@
                                      realw* rmassz) {
   int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
 
-  /* because of block and grid sizing problems, there is a small */
-  /* amount of buffer at the end of the calculation */
+  // because of block and grid sizing problems, there is a small
+  // amount of buffer at the end of the calculation
   if(id < size) {
     // note: update adds rotational acceleration in case two_omega_earth is non-zero
     accel[3*id] = accel[3*id]*rmassx[id] + two_omega_earth*veloc[3*id+1]; // (2,i);
@@ -292,244 +325,190 @@
     veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2];
   }
 }
+*/
 
 /* ----------------------------------------------------------------------------------------------- */
 
-__global__ void kernel_3_accel_cuda_device(realw* accel,
-                                           realw* veloc,
-                                           int size,
-                                           realw two_omega_earth,
-                                           realw* rmassx,
-                                           realw* rmassy,
-                                           realw* rmassz) {
+__global__ void multiply_accel_elastic_cuda_device(realw* accel,
+                                                   realw* veloc,
+                                                   int size,
+                                                   realw two_omega_earth,
+                                                   realw* rmassx,
+                                                   realw* rmassy,
+                                                   realw* rmassz) {
+
   int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
 
-  /* because of block and grid sizing problems, there is a small */
-  /* amount of buffer at the end of the calculation */
+  // because of block and grid sizing problems, there is a small
+  // amount of buffer at the end of the calculation
   if(id < size) {
     // note: update adds rotational acceleration in case two_omega_earth is non-zero
-    accel[3*id] = accel[3*id]*rmassx[id] + two_omega_earth*veloc[3*id+1]; // (2,i);
-    accel[3*id+1] = accel[3*id+1]*rmassy[id] - two_omega_earth*veloc[3*id]; //(1,i);
-    accel[3*id+2] = accel[3*id+2]*rmassz[id];
+    accel[3*id] = accel[3*id]*rmassx[id] + two_omega_earth*veloc[3*id + 1]; // (2,i);
+    accel[3*id + 1] = accel[3*id + 1]*rmassy[id] - two_omega_earth*veloc[3*id]; //(1,i);
+    accel[3*id + 2] = accel[3*id + 2]*rmassz[id];
   }
 }
 
 /* ----------------------------------------------------------------------------------------------- */
 
-__global__ void kernel_3_veloc_cuda_device(realw* veloc,
-                                           realw* accel,
-                                           int size,
-                                           realw deltatover2) {
-  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+extern "C"
+void FC_FUNC_(multiply_accel_elastic_cuda,
+              MULTIPLY_ACCEL_ELASTIC_CUDA)(long* Mesh_pointer,
+                                           int* FORWARD_OR_ADJOINT) {
+  TRACE("multiply_accel_elastic_cuda");
 
-  /* because of block and grid sizing problems, there is a small */
-  /* amount of buffer at the end of the calculation */
-  if(id < size) {
-    veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
-    veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1];
-    veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2];
-  }
-}
+  int size_padded,num_blocks_x,num_blocks_y;
+  dim3 grid,threads;
 
-/* ----------------------------------------------------------------------------------------------- */
+  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
 
-extern "C"
-void FC_FUNC_(update_accel_3_a_cuda,
-              UPDATE_ACCEL_3_A_CUDA)(long* Mesh_pointer,
-                                     realw* deltatover2_F,
-                                     int* NCHUNKS_VAL,
-                                     int* FORWARD_OR_ADJOINT) {
-  TRACE("update_accel_3_a_cuda");
+  int blocksize = BLOCKSIZE_KERNEL3;
 
-  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+  // multiplies accel with inverse of mass matrix
 
-  realw deltatover2 = *deltatover2_F;
+  // crust/mantle region
+  size_padded = ((int)ceil(((double)mp->NGLOB_CRUST_MANTLE)/((double)blocksize)))*blocksize;
 
-  int blocksize = BLOCKSIZE_KERNEL3;
-  int size_padded = ((int)ceil(((double)mp->NGLOB_CRUST_MANTLE)/((double)blocksize)))*blocksize;
+  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
 
-  int num_blocks_x = size_padded/blocksize;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
+  grid = dim3(num_blocks_x,num_blocks_y);
+  threads = dim3(blocksize,1,1);
+
+  if( *FORWARD_OR_ADJOINT == 1 ){
+    multiply_accel_elastic_cuda_device<<< grid, threads>>>(mp->d_accel_crust_mantle,
+                                                           mp->d_veloc_crust_mantle,
+                                                           mp->NGLOB_CRUST_MANTLE,
+                                                           mp->two_omega_earth,
+                                                           mp->d_rmassx_crust_mantle,
+                                                           mp->d_rmassy_crust_mantle,
+                                                           mp->d_rmassz_crust_mantle);
+  }else if( *FORWARD_OR_ADJOINT == 3 ){
+    // debug
+    DEBUG_EMPTY_BACKWARD();
+
+    multiply_accel_elastic_cuda_device<<< grid, threads>>>(mp->d_b_accel_crust_mantle,
+                                                           mp->d_b_veloc_crust_mantle,
+                                                           mp->NGLOB_CRUST_MANTLE,
+                                                           mp->b_two_omega_earth,
+                                                           mp->d_b_rmassx_crust_mantle,
+                                                           mp->d_b_rmassy_crust_mantle,
+                                                           mp->d_b_rmassz_crust_mantle);
   }
 
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(blocksize,1,1);
+  // inner core region
+  size_padded = ((int)ceil(((double)mp->NGLOB_INNER_CORE)/((double)blocksize)))*blocksize;
 
-  // crust/mantle region only
-  // check whether we can update accel and veloc, or only accel at this point
-  if( mp->oceans == 0 ){
-    // updates both, accel and veloc
+  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
 
-    if( *NCHUNKS_VAL != 6 && mp->absorbing_conditions){
-      // uses corrected mass matrices rmassx,rmassy,rmassz
-      if( *FORWARD_OR_ADJOINT == 1 ){
-        kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_crust_mantle,
-                                                 mp->d_accel_crust_mantle,
-                                                 mp->NGLOB_CRUST_MANTLE,
-                                                 deltatover2,
-                                                 mp->two_omega_earth,
-                                                 mp->d_rmassx_crust_mantle,
-                                                 mp->d_rmassy_crust_mantle,
-                                                 mp->d_rmassz_crust_mantle);
-      }else if( *FORWARD_OR_ADJOINT == 3 ){
-        kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_crust_mantle,
-                                                 mp->d_b_accel_crust_mantle,
-                                                 mp->NGLOB_CRUST_MANTLE,
-                                                 deltatover2,
-                                                 mp->b_two_omega_earth,
-                                                 mp->d_rmassx_crust_mantle,
-                                                 mp->d_rmassy_crust_mantle,
-                                                 mp->d_rmassz_crust_mantle);
-      }
-    }else{
-      // uses only rmassz
-      if( *FORWARD_OR_ADJOINT == 1 ){
-        kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_crust_mantle,
-                                                 mp->d_accel_crust_mantle,
-                                                 mp->NGLOB_CRUST_MANTLE,
-                                                 deltatover2,
-                                                 mp->two_omega_earth,
-                                                 mp->d_rmassz_crust_mantle,
-                                                 mp->d_rmassz_crust_mantle,
-                                                 mp->d_rmassz_crust_mantle);
-      }else if( *FORWARD_OR_ADJOINT == 3 ){
-        kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_crust_mantle,
-                                                 mp->d_b_accel_crust_mantle,
-                                                 mp->NGLOB_CRUST_MANTLE,
-                                                 deltatover2,
-                                                 mp->b_two_omega_earth,
-                                                 mp->d_rmassz_crust_mantle,
-                                                 mp->d_rmassz_crust_mantle,
-                                                 mp->d_rmassz_crust_mantle);
-      }
-    }
+  grid = dim3(num_blocks_x,num_blocks_y);
+  threads = dim3(blocksize,1,1);
 
-  }else{
-    // updates only accel
+  if( *FORWARD_OR_ADJOINT == 1 ){
+    multiply_accel_elastic_cuda_device<<< grid, threads>>>(mp->d_accel_inner_core,
+                                                           mp->d_veloc_inner_core,
+                                                           mp->NGLOB_INNER_CORE,
+                                                           mp->two_omega_earth,
+                                                           mp->d_rmassx_inner_core,
+                                                           mp->d_rmassy_inner_core,
+                                                           mp->d_rmassz_inner_core);
+  }else if( *FORWARD_OR_ADJOINT == 3 ){
+    // debug
+    DEBUG_EMPTY_BACKWARD();
 
-    if( *NCHUNKS_VAL != 6 && mp->absorbing_conditions){
-      // uses corrected mass matrices
-      if( *FORWARD_OR_ADJOINT == 1 ){
-        kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_accel_crust_mantle,
-                                                       mp->d_veloc_crust_mantle,
-                                                       mp->NGLOB_CRUST_MANTLE,
-                                                       mp->two_omega_earth,
-                                                       mp->d_rmassx_crust_mantle,
-                                                       mp->d_rmassy_crust_mantle,
-                                                       mp->d_rmassz_crust_mantle);
-      }else if( *FORWARD_OR_ADJOINT == 3 ){
-        kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_b_accel_crust_mantle,
-                                                       mp->d_b_veloc_crust_mantle,
-                                                       mp->NGLOB_CRUST_MANTLE,
-                                                       mp->b_two_omega_earth,
-                                                       mp->d_rmassx_crust_mantle,
-                                                       mp->d_rmassy_crust_mantle,
-                                                       mp->d_rmassz_crust_mantle);
-      }
-    }else{
-      // uses only rmassz
-      if( *FORWARD_OR_ADJOINT == 1 ){
-        kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_accel_crust_mantle,
-                                                       mp->d_veloc_crust_mantle,
-                                                       mp->NGLOB_CRUST_MANTLE,
-                                                       mp->two_omega_earth,
-                                                       mp->d_rmassz_crust_mantle,
-                                                       mp->d_rmassz_crust_mantle,
-                                                       mp->d_rmassz_crust_mantle);
-      }else if( *FORWARD_OR_ADJOINT == 3 ){
-        kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_b_accel_crust_mantle,
-                                                       mp->d_b_veloc_crust_mantle,
-                                                       mp->NGLOB_CRUST_MANTLE,
-                                                       mp->b_two_omega_earth,
-                                                       mp->d_rmassz_crust_mantle,
-                                                       mp->d_rmassz_crust_mantle,
-                                                       mp->d_rmassz_crust_mantle);
-      }
-    }
+    multiply_accel_elastic_cuda_device<<< grid, threads>>>(mp->d_b_accel_inner_core,
+                                                           mp->d_b_veloc_inner_core,
+                                                           mp->NGLOB_INNER_CORE,
+                                                           mp->b_two_omega_earth,
+                                                           mp->d_b_rmassx_inner_core,
+                                                           mp->d_b_rmassy_inner_core,
+                                                           mp->d_b_rmassz_inner_core);
   }
 
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
-  exit_on_cuda_error("after update_accel_3_a");
+  exit_on_cuda_error("after multiply_accel_elastic_cuda");
 #endif
 }
 
 /* ----------------------------------------------------------------------------------------------- */
 
+__global__ void update_veloc_elastic_cuda_device(realw* veloc,
+                                                 realw* accel,
+                                                 int size,
+                                                 realw deltatover2) {
+
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+  // because of block and grid sizing problems, there is a small
+  // amount of buffer at the end of the calculation
+  if(id < size) {
+    veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
+    veloc[3*id + 1] = veloc[3*id + 1] + deltatover2*accel[3*id + 1];
+    veloc[3*id + 2] = veloc[3*id + 2] + deltatover2*accel[3*id + 2];
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
 extern "C"
-void FC_FUNC_(update_veloc_3_b_cuda,
-              UPDATE_VELOC_3_B_CUDA)(long* Mesh_pointer,
-                                     realw* deltatover2_F,
-                                     int* FORWARD_OR_ADJOINT) {
-  TRACE("update_veloc_3_b_cuda");
+void FC_FUNC_(update_veloc_elastic_cuda,
+              UPDATE_VELOC_ELASTIC_CUDA)(long* Mesh_pointer,
+                                         realw* deltatover2_f,
+                                         int* FORWARD_OR_ADJOINT) {
+
+  TRACE("update_veloc_elastic_cuda");
+
   int size_padded,num_blocks_x,num_blocks_y;
+  dim3 grid,threads;
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
 
-  realw deltatover2 = *deltatover2_F;
+  realw deltatover2 = *deltatover2_f;
 
   int blocksize = BLOCKSIZE_KERNEL3;
 
+  // updates velocity
+
   // crust/mantle region
-  // in case of ocean loads, we still have to update the velocity for crust/mantle region
-  if( mp->oceans ){
-    size_padded = ((int)ceil(((double)mp->NGLOB_CRUST_MANTLE)/((double)blocksize)))*blocksize;
-    num_blocks_x = size_padded/blocksize;
-    num_blocks_y = 1;
-    while(num_blocks_x > MAXIMUM_GRID_DIM) {
-      num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-      num_blocks_y = num_blocks_y*2;
-    }
-    dim3 grid1(num_blocks_x,num_blocks_y);
-    dim3 threads1(blocksize,1,1);
+  size_padded = ((int)ceil(((double)mp->NGLOB_CRUST_MANTLE)/((double)blocksize)))*blocksize;
 
-    // updates only veloc at this point
-    if( *FORWARD_OR_ADJOINT == 1 ){
-      kernel_3_veloc_cuda_device<<< grid1, threads1>>>(mp->d_veloc_crust_mantle,
-                                                       mp->d_accel_crust_mantle,
-                                                       mp->NGLOB_CRUST_MANTLE,
-                                                       deltatover2);
-    }else if( *FORWARD_OR_ADJOINT == 3 ){
-      kernel_3_veloc_cuda_device<<< grid1, threads1>>>(mp->d_b_veloc_crust_mantle,
-                                                       mp->d_b_accel_crust_mantle,
-                                                       mp->NGLOB_CRUST_MANTLE,
-                                                       deltatover2);
-    }
+  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
+  grid = dim3(num_blocks_x,num_blocks_y);
+  threads = dim3(blocksize,1,1);
+
+  if( *FORWARD_OR_ADJOINT == 1 ){
+    update_veloc_elastic_cuda_device<<< grid, threads>>>(mp->d_veloc_crust_mantle,
+                                                         mp->d_accel_crust_mantle,
+                                                         mp->NGLOB_CRUST_MANTLE,
+                                                         deltatover2);
+  }else if( *FORWARD_OR_ADJOINT == 3 ){
+    update_veloc_elastic_cuda_device<<< grid, threads>>>(mp->d_b_veloc_crust_mantle,
+                                                         mp->d_b_accel_crust_mantle,
+                                                         mp->NGLOB_CRUST_MANTLE,
+                                                         deltatover2);
   }
 
   // inner core region
   size_padded = ((int)ceil(((double)mp->NGLOB_INNER_CORE)/((double)blocksize)))*blocksize;
-  num_blocks_x = size_padded/blocksize;
-  num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(blocksize,1,1);
 
-  // updates both, accel and veloc
+  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
+  grid = dim3(num_blocks_x,num_blocks_y);
+  threads = dim3(blocksize,1,1);
+
   if( *FORWARD_OR_ADJOINT == 1 ){
-    kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_inner_core,
-                                             mp->d_accel_inner_core,
-                                             mp->NGLOB_INNER_CORE,
-                                             deltatover2,
-                                             mp->two_omega_earth,
-                                             mp->d_rmassx_inner_core,
-                                             mp->d_rmassy_inner_core,
-                                             mp->d_rmassz_inner_core);
+    update_veloc_elastic_cuda_device<<< grid, threads>>>(mp->d_veloc_inner_core,
+                                                         mp->d_accel_inner_core,
+                                                         mp->NGLOB_INNER_CORE,
+                                                         deltatover2);
   }else if( *FORWARD_OR_ADJOINT == 3 ){
-    kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_inner_core,
-                                             mp->d_b_accel_inner_core,
-                                             mp->NGLOB_INNER_CORE,
-                                             deltatover2,
-                                             mp->b_two_omega_earth,
-                                             mp->d_rmassx_inner_core,
-                                             mp->d_rmassy_inner_core,
-                                             mp->d_rmassz_inner_core);
+    update_veloc_elastic_cuda_device<<< grid, threads>>>(mp->d_b_veloc_inner_core,
+                                                         mp->d_b_accel_inner_core,
+                                                         mp->NGLOB_INNER_CORE,
+                                                         deltatover2);
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -542,23 +521,77 @@
 
 // KERNEL 3
 //
-// for outer_core
+// acoustic domains: for fluid outer_core
 
 /* ----------------------------------------------------------------------------------------------- */
 
 
-__global__ void kernel_3_outer_core_cuda_device(realw* veloc,
-                                                realw* accel,int size,
-                                                realw deltatover2,
-                                                realw* rmass) {
+__global__ void multiply_accel_acoustic_cuda_device(realw* accel,
+                                                    int size,
+                                                    realw* rmass) {
+
   int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
 
-  /* because of block and grid sizing problems, there is a small */
-  /* amount of buffer at the end of the calculation */
+  // because of block and grid sizing problems, there is a small
+  // amount of buffer at the end of the calculation
   if(id < size) {
     // multiplies pressure with the inverse of the mass matrix
     accel[id] = accel[id]*rmass[id];
+  }
+}
 
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(multiply_accel_acoustic_cuda,
+              MULTIPLY_ACCEL_ACOUSTIC_CUDA)(long* Mesh_pointer,
+                                            int* FORWARD_OR_ADJOINT) {
+  TRACE("multiply_accel_acoustic_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+
+  int blocksize = BLOCKSIZE_KERNEL3;
+
+  int size_padded = ((int)ceil(((double)mp->NGLOB_OUTER_CORE)/((double)blocksize)))*blocksize;
+
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  // multiplies accel with inverse of mass matrix
+  if( *FORWARD_OR_ADJOINT == 1 ){
+    multiply_accel_acoustic_cuda_device<<< grid, threads>>>(mp->d_accel_outer_core,
+                                                            mp->NGLOB_OUTER_CORE,
+                                                            mp->d_rmass_outer_core);
+  }else if( *FORWARD_OR_ADJOINT == 3 ){
+    // debug
+    DEBUG_EMPTY_BACKWARD();
+
+    multiply_accel_acoustic_cuda_device<<< grid, threads>>>(mp->d_b_accel_outer_core,
+                                                            mp->NGLOB_OUTER_CORE,
+                                                            mp->d_b_rmass_outer_core);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("after multiply_accel_acoustic_cuda");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+__global__ void update_veloc_acoustic_cuda_device(realw* veloc,
+                                                  realw* accel,
+                                                  int size,
+                                                  realw deltatover2) {
+
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+  // because of block and grid sizing problems, there is a small
+  // amount of buffer at the end of the calculation
+  if(id < size) {
     // Newmark time scheme: corrector term
     veloc[id] = veloc[id] + deltatover2*accel[id];
   }
@@ -568,44 +601,46 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
-void FC_FUNC_(kernel_3_outer_core_cuda,
-              KERNEL_3_OUTER_CORE_CUDA)(long* Mesh_pointer,
-                                        realw* deltatover2_F,
-                                        int* FORWARD_OR_ADJOINT) {
+void FC_FUNC_(update_veloc_acoustic_cuda,
+              UPDATE_VELOC_ACOUSTIC_CUDA)(long* Mesh_pointer,
+                                          realw* deltatover2_f,
+                                          int* FORWARD_OR_ADJOINT) {
 
-  TRACE("kernel_3_outer_core_cuda");
+  TRACE("update_veloc_acoustic_cuda");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
 
-  realw deltatover2 = *deltatover2_F;
+  realw deltatover2 = *deltatover2_f;
 
   int blocksize = BLOCKSIZE_KERNEL3;
 
   int size_padded = ((int)ceil(((double)mp->NGLOB_OUTER_CORE)/((double)blocksize)))*blocksize;
-  int num_blocks_x = size_padded/blocksize;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
+  // updates velocity
   if( *FORWARD_OR_ADJOINT == 1 ){
-    kernel_3_outer_core_cuda_device<<< grid, threads>>>(mp->d_veloc_outer_core,
-                                                        mp->d_accel_outer_core,
-                                                        mp->NGLOB_OUTER_CORE,
-                                                        deltatover2,mp->d_rmass_outer_core);
+    update_veloc_acoustic_cuda_device<<< grid, threads>>>(mp->d_veloc_outer_core,
+                                                          mp->d_accel_outer_core,
+                                                          mp->NGLOB_OUTER_CORE,
+                                                          deltatover2);
   }else if( *FORWARD_OR_ADJOINT == 3){
-    kernel_3_outer_core_cuda_device<<< grid, threads>>>(mp->d_b_veloc_outer_core,
-                                                        mp->d_b_accel_outer_core,
-                                                        mp->NGLOB_OUTER_CORE,
-                                                        deltatover2,mp->d_rmass_outer_core);
+    // debug
+    DEBUG_EMPTY_BACKWARD();
+
+    update_veloc_acoustic_cuda_device<<< grid, threads>>>(mp->d_b_veloc_outer_core,
+                                                          mp->d_b_accel_outer_core,
+                                                          mp->NGLOB_OUTER_CORE,
+                                                          deltatover2);
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
-  exit_on_cuda_error("after kernel_3_outer_core");
+  exit_on_cuda_error("after update_veloc_acoustic_cuda");
 #endif
 }
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/write_seismograms_cuda.cu	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/write_seismograms_cuda.cu	2013-09-13 11:23:45 UTC (rev 22782)
@@ -105,12 +105,9 @@
 
   int blocksize = NGLL3;
 
-  int num_blocks_x = mp->nrec_local;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->nrec_local,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 
@@ -162,12 +159,9 @@
 
   int blocksize = NGLL3;
 
-  int num_blocks_x = mp->nrec_local;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
+  int num_blocks_x, num_blocks_y;
+  get_blocks_xy(mp->nrec_local,&num_blocks_x,&num_blocks_y);
+
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_scalar.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_scalar.f90	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_scalar.f90	2013-09-13 11:23:45 UTC (rev 22782)
@@ -341,9 +341,7 @@
     enddo
 
     ! adding contributions of neighbours
-    call transfer_asmbl_pot_to_device(Mesh_pointer, &
-                                    buffer_recv_scalar, &
-                                    FORWARD_OR_ADJOINT)
+    call transfer_asmbl_pot_to_device(Mesh_pointer,buffer_recv_scalar,FORWARD_OR_ADJOINT)
 
     ! note: adding contributions of neighbours has been done just above for cuda
     !do iinterface = 1, num_interfaces

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90	2013-09-13 11:23:45 UTC (rev 22782)
@@ -78,17 +78,16 @@
      ! send messages
      do iinterface = 1, num_interfaces
         call isend_cr(buffer_send_vector(1,1,iinterface), &
-             NDIM*nibool_interfaces(iinterface), &
-             my_neighbours(iinterface), &
-             itag, &
-             request_send_vector(iinterface) &
-             )
+                      NDIM*nibool_interfaces(iinterface), &
+                      my_neighbours(iinterface), &
+                      itag, &
+                      request_send_vector(iinterface))
+
         call irecv_cr(buffer_recv_vector(1,1,iinterface), &
-             NDIM*nibool_interfaces(iinterface), &
-             my_neighbours(iinterface), &
-             itag, &
-             request_recv_vector(iinterface) &
-             )
+                      NDIM*nibool_interfaces(iinterface), &
+                      my_neighbours(iinterface), &
+                      itag, &
+                      request_recv_vector(iinterface))
      enddo
 
   endif
@@ -136,7 +135,7 @@
   integer :: FORWARD_OR_ADJOINT
 
   ! local parameters
-  integer iinterface
+  integer :: iinterface
 
   ! send only if more than one partition
   if(NPROC > 1) then
@@ -144,25 +143,24 @@
     ! preparation of the contribution between partitions using MPI
     ! transfers mpi buffers to CPU
     call transfer_boun_accel_from_device(Mesh_pointer, &
-                                        buffer_send_vector,&
-                                        IREGION,FORWARD_OR_ADJOINT)
+                                         buffer_send_vector,&
+                                         IREGION,FORWARD_OR_ADJOINT)
 
-     ! send messages
-     do iinterface = 1, num_interfaces
-        call isend_cr(buffer_send_vector(1,1,iinterface), &
-             NDIM*nibool_interfaces(iinterface), &
-             my_neighbours(iinterface), &
-             itag, &
-             request_send_vector(iinterface) &
-             )
-        call irecv_cr(buffer_recv_vector(1,1,iinterface), &
-             NDIM*nibool_interfaces(iinterface), &
-             my_neighbours(iinterface), &
-             itag, &
-             request_recv_vector(iinterface) &
-             )
-     enddo
+    ! send messages
+    do iinterface = 1, num_interfaces
+      call isend_cr(buffer_send_vector(1,1,iinterface), &
+                    NDIM*nibool_interfaces(iinterface), &
+                    my_neighbours(iinterface), &
+                    itag, &
+                    request_send_vector(iinterface))
 
+      call irecv_cr(buffer_recv_vector(1,1,iinterface), &
+                    NDIM*nibool_interfaces(iinterface), &
+                    my_neighbours(iinterface), &
+                    itag, &
+                    request_recv_vector(iinterface))
+    enddo
+
   endif
 
   end subroutine assemble_MPI_vector_send_cuda
@@ -252,8 +250,7 @@
 
   integer :: num_interfaces,max_nibool_interfaces
 
-  real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces,num_interfaces) :: &
-       buffer_recv_vector
+  real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces,num_interfaces) :: buffer_recv_vector
   integer, dimension(num_interfaces) :: request_send_vector,request_recv_vector
 
   integer :: IREGION
@@ -275,8 +272,8 @@
 
     ! adding contributions of neighbours
     call transfer_asmbl_accel_to_device(Mesh_pointer, &
-                                      buffer_recv_vector, &
-                                      IREGION,FORWARD_OR_ADJOINT)
+                                        buffer_recv_vector, &
+                                        IREGION,FORWARD_OR_ADJOINT)
 
     ! This step is done via previous function transfer_and_assemble...
     ! do iinterface = 1, num_interfaces

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_add_sources.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_add_sources.f90	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_add_sources.f90	2013-09-13 11:23:45 UTC (rev 22782)
@@ -133,7 +133,7 @@
       enddo
     endif
     ! adds sources: only implements SIMTYPE=1 and NOISE_TOM=0
-    call compute_add_sources_el_cuda(Mesh_pointer,NSOURCES,stf_pre_compute)
+    call compute_add_sources_cuda(Mesh_pointer,NSOURCES,stf_pre_compute)
   endif
 
 
@@ -288,9 +288,9 @@
 
   else
     ! on GPU
-    call add_sources_el_sim_type_2_or_3(Mesh_pointer,nrec,adj_sourcearrays, &
-                                       islice_selected_rec,ispec_selected_rec, &
-                                       iadj_vec(it))
+    call compute_add_sources_adjoint_cuda(Mesh_pointer,nrec,adj_sourcearrays, &
+                                          islice_selected_rec,ispec_selected_rec, &
+                                          iadj_vec(it))
   endif
 
   end subroutine compute_add_sources_adjoint
@@ -414,7 +414,7 @@
       enddo
     endif
     ! adds sources: only implements SIMTYPE=3 (and NOISE_TOM=0)
-    call compute_add_sources_el_s3_cuda(Mesh_pointer,NSOURCES,stf_pre_compute)
+    call compute_add_sources_backward_cuda(Mesh_pointer,NSOURCES,stf_pre_compute)
   endif
 
   end subroutine compute_add_sources_backward

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_coupling.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_coupling.f90	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_coupling.f90	2013-09-13 11:23:45 UTC (rev 22782)
@@ -333,7 +333,8 @@
         if(GRAVITY_VAL) then
           pressure = RHO_BOTTOM_OC * (- accel_outer_core(iglob) &
              + minus_g_icb *(displ_inner_core(1,iglob_inner_core)*nx &
-                           + displ_inner_core(2,iglob_inner_core)*ny + displ_inner_core(3,iglob_inner_core)*nz))
+                             + displ_inner_core(2,iglob_inner_core)*ny &
+                             + displ_inner_core(3,iglob_inner_core)*nz))
         else
           pressure = - RHO_BOTTOM_OC * accel_outer_core(iglob)
         endif
@@ -425,8 +426,8 @@
           ! we divide by rmass_crust_mantle() which is 1 / M
           ! we use the total force which includes the Coriolis term above
           force_normal_comp = accel_crust_mantle(1,iglob)*nx / rmassx_crust_mantle(iglob) &
-                           + accel_crust_mantle(2,iglob)*ny / rmassy_crust_mantle(iglob) &
-                           + accel_crust_mantle(3,iglob)*nz / rmassz_crust_mantle(iglob)
+                            + accel_crust_mantle(2,iglob)*ny / rmassy_crust_mantle(iglob) &
+                            + accel_crust_mantle(3,iglob)*nz / rmassz_crust_mantle(iglob)
 
           additional_term_x = (rmass_ocean_load(iglob) - rmassx_crust_mantle(iglob)) * force_normal_comp
           additional_term_y = (rmass_ocean_load(iglob) - rmassy_crust_mantle(iglob)) * force_normal_comp

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_acoustic_calling_routine.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_acoustic_calling_routine.F90	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_acoustic_calling_routine.F90	2013-09-13 11:23:45 UTC (rev 22782)
@@ -204,8 +204,17 @@
   enddo ! iphase
 
   ! multiply by the inverse of the mass matrix
-  call multiply_accel_acoustic(NGLOB_OUTER_CORE,accel_outer_core,rmass_outer_core)
 
+  if(.NOT. GPU_MODE) then
+    ! on CPU
+    call multiply_accel_acoustic(NGLOB_OUTER_CORE,accel_outer_core,rmass_outer_core)
+  else
+    ! on GPU
+    ! includes FORWARD_OR_ADJOINT == 1
+    call multiply_accel_acoustic_cuda(Mesh_pointer,1)
+  endif
+
+
   ! time schemes
   if( USE_LDDRK ) then
     ! runge-kutta scheme
@@ -407,25 +416,32 @@
       if(.NOT. GPU_MODE) then
         ! on CPU
         call assemble_MPI_scalar_w(NPROCTOT_VAL,NGLOB_OUTER_CORE, &
-                              b_accel_outer_core, &
-                              b_buffer_recv_scalar_outer_core,num_interfaces_outer_core,&
-                              max_nibool_interfaces_oc, &
-                              nibool_interfaces_outer_core,ibool_interfaces_outer_core, &
-                              b_request_send_scalar_oc,b_request_recv_scalar_oc)
+                                   b_accel_outer_core, &
+                                   b_buffer_recv_scalar_outer_core,num_interfaces_outer_core,&
+                                   max_nibool_interfaces_oc, &
+                                   nibool_interfaces_outer_core,ibool_interfaces_outer_core, &
+                                   b_request_send_scalar_oc,b_request_recv_scalar_oc)
       else
         ! on GPU
         call assemble_MPI_scalar_write_cuda(Mesh_pointer,NPROCTOT_VAL, &
-                              b_buffer_recv_scalar_outer_core, &
-                              num_interfaces_outer_core,max_nibool_interfaces_oc, &
-                              b_request_send_scalar_oc,b_request_recv_scalar_oc, &
-                              3) ! <-- 3 == adjoint b_accel
+                                            b_buffer_recv_scalar_outer_core, &
+                                            num_interfaces_outer_core,max_nibool_interfaces_oc, &
+                                            b_request_send_scalar_oc,b_request_recv_scalar_oc, &
+                                            3) ! <-- 3 == adjoint b_accel
       endif
     endif ! iphase == 1
 
   enddo ! iphase
 
   ! multiply by the inverse of the mass matrix
-  call multiply_accel_acoustic(NGLOB_OUTER_CORE_ADJOINT,b_accel_outer_core,b_rmass_outer_core)
+  if(.NOT. GPU_MODE) then
+    ! on CPU
+    call multiply_accel_acoustic(NGLOB_OUTER_CORE_ADJOINT,b_accel_outer_core,b_rmass_outer_core)
+  else
+    ! on GPU
+    ! includes FORWARD_OR_ADJOINT == 3
+    call multiply_accel_acoustic_cuda(Mesh_pointer,3)
+  endif
 
   ! time schemes
   if( USE_LDDRK ) then

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90	2013-09-13 11:23:45 UTC (rev 22782)
@@ -73,80 +73,79 @@
     ! compute internal forces in the solid regions
     ! note: for anisotropy and gravity, x y and z contain r theta and phi
     if( .NOT. GPU_MODE ) then
-       ! on CPU
-       if( USE_DEVILLE_PRODUCTS_VAL ) then
-          ! uses Deville (2002) optimizations
-          ! crust/mantle region
-          call compute_forces_crust_mantle_Dev( NSPEC_CRUST_MANTLE_STR_OR_ATT,NGLOB_CRUST_MANTLE, &
-               NSPEC_CRUST_MANTLE_ATTENUATION, &
-               deltat, &
-               displ_crust_mantle,veloc_crust_mantle,accel_crust_mantle, &
-               phase_is_inner, &
-               R_xx_crust_mantle,R_yy_crust_mantle,R_xy_crust_mantle, &
-               R_xz_crust_mantle,R_yz_crust_mantle, &
-               R_xx_crust_mantle_lddrk,R_yy_crust_mantle_lddrk,R_xy_crust_mantle_lddrk, &
-               R_xz_crust_mantle_lddrk,R_yz_crust_mantle_lddrk, &
-               epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
-               epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
-               eps_trace_over_3_crust_mantle, &
-               alphaval,betaval,gammaval, &
-               factor_common_crust_mantle,size(factor_common_crust_mantle,5), .false. )
-          ! inner core region
-          call compute_forces_inner_core_Dev( NSPEC_INNER_CORE_STR_OR_ATT,NGLOB_INNER_CORE, &
-               NSPEC_INNER_CORE_ATTENUATION, &
-               deltat, &
-               displ_inner_core,veloc_inner_core,accel_inner_core, &
-               phase_is_inner, &
-               R_xx_inner_core,R_yy_inner_core,R_xy_inner_core, &
-               R_xz_inner_core,R_yz_inner_core, &
-               R_xx_inner_core_lddrk,R_yy_inner_core_lddrk,R_xy_inner_core_lddrk, &
-               R_xz_inner_core_lddrk,R_yz_inner_core_lddrk, &
-               epsilondev_xx_inner_core,epsilondev_yy_inner_core,epsilondev_xy_inner_core, &
-               epsilondev_xz_inner_core,epsilondev_yz_inner_core, &
-               eps_trace_over_3_inner_core,&
-               alphaval,betaval,gammaval, &
-               factor_common_inner_core,size(factor_common_inner_core,5), .false. )
-       else
-          ! no Deville optimization
-          ! crust/mantle region
-          call compute_forces_crust_mantle(  NSPEC_CRUST_MANTLE_STR_OR_ATT,NGLOB_CRUST_MANTLE, &
-               NSPEC_CRUST_MANTLE_ATTENUATION, &
-               deltat, &
-               displ_crust_mantle,veloc_crust_mantle,accel_crust_mantle, &
-               phase_is_inner, &
-               R_xx_crust_mantle,R_yy_crust_mantle,R_xy_crust_mantle, &
-               R_xz_crust_mantle,R_yz_crust_mantle, &
-               R_xx_crust_mantle_lddrk,R_yy_crust_mantle_lddrk,R_xy_crust_mantle_lddrk, &
-               R_xz_crust_mantle_lddrk,R_yz_crust_mantle_lddrk, &
-               epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
-               epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
-               eps_trace_over_3_crust_mantle, &
-               alphaval,betaval,gammaval, &
-               factor_common_crust_mantle,size(factor_common_crust_mantle,5), .false. )
-          ! inner core region
-          call compute_forces_inner_core( NSPEC_INNER_CORE_STR_OR_ATT,NGLOB_INNER_CORE, &
-               NSPEC_INNER_CORE_ATTENUATION, &
-               deltat, &
-               displ_inner_core,veloc_inner_core,accel_inner_core, &
-               phase_is_inner, &
-               R_xx_inner_core,R_yy_inner_core,R_xy_inner_core, &
-               R_xz_inner_core,R_yz_inner_core, &
-               R_xx_inner_core_lddrk,R_yy_inner_core_lddrk,R_xy_inner_core_lddrk, &
-               R_xz_inner_core_lddrk,R_yz_inner_core_lddrk, &
-               epsilondev_xx_inner_core,epsilondev_yy_inner_core,epsilondev_xy_inner_core, &
-               epsilondev_xz_inner_core,epsilondev_yz_inner_core, &
-               eps_trace_over_3_inner_core,&
-               alphaval,betaval,gammaval, &
-               factor_common_inner_core,size(factor_common_inner_core,5), .false. )
-
-       endif
+      ! on CPU
+      if( USE_DEVILLE_PRODUCTS_VAL ) then
+        ! uses Deville (2002) optimizations
+        ! crust/mantle region
+        call compute_forces_crust_mantle_Dev(NSPEC_CRUST_MANTLE_STR_OR_ATT,NGLOB_CRUST_MANTLE, &
+                                             NSPEC_CRUST_MANTLE_ATTENUATION, &
+                                             deltat, &
+                                             displ_crust_mantle,veloc_crust_mantle,accel_crust_mantle, &
+                                             phase_is_inner, &
+                                             R_xx_crust_mantle,R_yy_crust_mantle,R_xy_crust_mantle, &
+                                             R_xz_crust_mantle,R_yz_crust_mantle, &
+                                             R_xx_crust_mantle_lddrk,R_yy_crust_mantle_lddrk,R_xy_crust_mantle_lddrk, &
+                                             R_xz_crust_mantle_lddrk,R_yz_crust_mantle_lddrk, &
+                                             epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+                                             epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
+                                             eps_trace_over_3_crust_mantle, &
+                                             alphaval,betaval,gammaval, &
+                                             factor_common_crust_mantle,size(factor_common_crust_mantle,5), .false. )
+        ! inner core region
+        call compute_forces_inner_core_Dev(NSPEC_INNER_CORE_STR_OR_ATT,NGLOB_INNER_CORE, &
+                                           NSPEC_INNER_CORE_ATTENUATION, &
+                                           deltat, &
+                                           displ_inner_core,veloc_inner_core,accel_inner_core, &
+                                           phase_is_inner, &
+                                           R_xx_inner_core,R_yy_inner_core,R_xy_inner_core, &
+                                           R_xz_inner_core,R_yz_inner_core, &
+                                           R_xx_inner_core_lddrk,R_yy_inner_core_lddrk,R_xy_inner_core_lddrk, &
+                                           R_xz_inner_core_lddrk,R_yz_inner_core_lddrk, &
+                                           epsilondev_xx_inner_core,epsilondev_yy_inner_core,epsilondev_xy_inner_core, &
+                                           epsilondev_xz_inner_core,epsilondev_yz_inner_core, &
+                                           eps_trace_over_3_inner_core,&
+                                           alphaval,betaval,gammaval, &
+                                           factor_common_inner_core,size(factor_common_inner_core,5), .false. )
+      else
+        ! no Deville optimization
+        ! crust/mantle region
+        call compute_forces_crust_mantle(NSPEC_CRUST_MANTLE_STR_OR_ATT,NGLOB_CRUST_MANTLE, &
+                                         NSPEC_CRUST_MANTLE_ATTENUATION, &
+                                         deltat, &
+                                         displ_crust_mantle,veloc_crust_mantle,accel_crust_mantle, &
+                                         phase_is_inner, &
+                                         R_xx_crust_mantle,R_yy_crust_mantle,R_xy_crust_mantle, &
+                                         R_xz_crust_mantle,R_yz_crust_mantle, &
+                                         R_xx_crust_mantle_lddrk,R_yy_crust_mantle_lddrk,R_xy_crust_mantle_lddrk, &
+                                         R_xz_crust_mantle_lddrk,R_yz_crust_mantle_lddrk, &
+                                         epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+                                         epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
+                                         eps_trace_over_3_crust_mantle, &
+                                         alphaval,betaval,gammaval, &
+                                         factor_common_crust_mantle,size(factor_common_crust_mantle,5), .false. )
+        ! inner core region
+        call compute_forces_inner_core(NSPEC_INNER_CORE_STR_OR_ATT,NGLOB_INNER_CORE, &
+                                       NSPEC_INNER_CORE_ATTENUATION, &
+                                       deltat, &
+                                       displ_inner_core,veloc_inner_core,accel_inner_core, &
+                                       phase_is_inner, &
+                                       R_xx_inner_core,R_yy_inner_core,R_xy_inner_core, &
+                                       R_xz_inner_core,R_yz_inner_core, &
+                                       R_xx_inner_core_lddrk,R_yy_inner_core_lddrk,R_xy_inner_core_lddrk, &
+                                       R_xz_inner_core_lddrk,R_yz_inner_core_lddrk, &
+                                       epsilondev_xx_inner_core,epsilondev_yy_inner_core,epsilondev_xy_inner_core, &
+                                       epsilondev_xz_inner_core,epsilondev_yz_inner_core, &
+                                       eps_trace_over_3_inner_core,&
+                                       alphaval,betaval,gammaval, &
+                                       factor_common_inner_core,size(factor_common_inner_core,5), .false. )
+      endif
     else
-       ! on GPU
-       ! contains both forward SIM_TYPE==1 and backward SIM_TYPE==3 simulations
-       ! for crust/mantle
-       call compute_forces_crust_mantle_cuda(Mesh_pointer,iphase)
-       ! for inner core
-       call compute_forces_inner_core_cuda(Mesh_pointer,iphase)
+      ! on GPU
+      ! contains forward FORWARD_OR_ADJOINT == 1
+      ! for crust/mantle
+      call compute_forces_crust_mantle_cuda(Mesh_pointer,iphase,1)
+      ! for inner core
+      call compute_forces_inner_core_cuda(Mesh_pointer,iphase,1)
     endif ! GPU_MODE
 
     ! computes additional contributions to acceleration field
@@ -272,8 +271,7 @@
                       nibool_interfaces_crust_mantle,&
                       my_neighbours_crust_mantle, &
                       request_send_vector_cm,request_recv_vector_cm, &
-                      IREGION_CRUST_MANTLE, &
-                      1) ! <-- 1 == fwd accel
+                      IREGION_CRUST_MANTLE,1) ! <-- 1 == fwd accel
         ! inner core
         call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
                       buffer_send_vector_inner_core,buffer_recv_vector_inner_core, &
@@ -281,8 +279,7 @@
                       nibool_interfaces_inner_core,&
                       my_neighbours_inner_core, &
                       request_send_vector_ic,request_recv_vector_ic, &
-                      IREGION_INNER_CORE, &
-                      1)
+                      IREGION_INNER_CORE,1)
       endif ! GPU_MODE
     else
       ! waits for send/receive requests to be completed and assembles values
@@ -309,15 +306,13 @@
                             buffer_recv_vector_crust_mantle, &
                             num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
                             request_send_vector_cm,request_recv_vector_cm, &
-                            IREGION_CRUST_MANTLE, &
-                            1) ! <-- 1 == fwd accel
+                            IREGION_CRUST_MANTLE,1) ! <-- 1 == fwd accel
         ! inner core
         call assemble_MPI_vector_write_cuda(Mesh_pointer,NPROCTOT_VAL, &
                             buffer_recv_vector_inner_core, &
                             num_interfaces_inner_core,max_nibool_interfaces_ic, &
                             request_send_vector_ic,request_recv_vector_ic, &
-                            IREGION_INNER_CORE, &
-                            1)
+                            IREGION_INNER_CORE,1)
       endif
     endif ! iphase == 1
 
@@ -337,7 +332,7 @@
   else
     ! on GPU
     ! includes FORWARD_OR_ADJOINT == 1
-    call update_accel_3_a_cuda(Mesh_pointer,deltatover2,NCHUNKS_VAL,1)
+    call multiply_accel_elastic_cuda(Mesh_pointer,1)
   endif
 
   ! couples ocean with crust mantle
@@ -353,8 +348,7 @@
                                   NSPEC2D_TOP(IREGION_CRUST_MANTLE) )
     else
       ! on GPU
-      call compute_coupling_ocean_cuda(Mesh_pointer,NCHUNKS_VAL,EXACT_MASS_MATRIX_FOR_ROTATION, &
-                                       1) ! <- 1 == forward arrays
+      call compute_coupling_ocean_cuda(Mesh_pointer,1) ! <- 1 == forward arrays
     endif
   endif
 
@@ -463,78 +457,76 @@
         ! uses Deville (2002) optimizations
         ! crust/mantle region
         call compute_forces_crust_mantle_Dev( NSPEC_CRUST_MANTLE_ADJOINT,NGLOB_CRUST_MANTLE_ADJOINT, &
-              NSPEC_CRUST_MANTLE_STR_AND_ATT, &
-              b_deltat, &
-              b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle, &
-              phase_is_inner, &
-              b_R_xx_crust_mantle,b_R_yy_crust_mantle,b_R_xy_crust_mantle, &
-              b_R_xz_crust_mantle,b_R_yz_crust_mantle, &
-              b_R_xx_crust_mantle_lddrk,b_R_yy_crust_mantle_lddrk,b_R_xy_crust_mantle_lddrk, &
-              b_R_xz_crust_mantle_lddrk,b_R_yz_crust_mantle_lddrk, &
-              b_epsilondev_xx_crust_mantle,b_epsilondev_yy_crust_mantle,&
-              b_epsilondev_xy_crust_mantle, &
-              b_epsilondev_xz_crust_mantle,b_epsilondev_yz_crust_mantle, &
-              b_eps_trace_over_3_crust_mantle, &
-              b_alphaval,b_betaval,b_gammaval, &
-              factor_common_crust_mantle,size(factor_common_crust_mantle,5), .true. )
+                                              NSPEC_CRUST_MANTLE_STR_AND_ATT, &
+                                              b_deltat, &
+                                              b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle, &
+                                              phase_is_inner, &
+                                              b_R_xx_crust_mantle,b_R_yy_crust_mantle,b_R_xy_crust_mantle, &
+                                              b_R_xz_crust_mantle,b_R_yz_crust_mantle, &
+                                              b_R_xx_crust_mantle_lddrk,b_R_yy_crust_mantle_lddrk,b_R_xy_crust_mantle_lddrk, &
+                                              b_R_xz_crust_mantle_lddrk,b_R_yz_crust_mantle_lddrk, &
+                                              b_epsilondev_xx_crust_mantle,b_epsilondev_yy_crust_mantle,&
+                                              b_epsilondev_xy_crust_mantle, &
+                                              b_epsilondev_xz_crust_mantle,b_epsilondev_yz_crust_mantle, &
+                                              b_eps_trace_over_3_crust_mantle, &
+                                              b_alphaval,b_betaval,b_gammaval, &
+                                              factor_common_crust_mantle,size(factor_common_crust_mantle,5), .true. )
         ! inner core region
         call compute_forces_inner_core_Dev( NSPEC_INNER_CORE_ADJOINT,NGLOB_INNER_CORE_ADJOINT, &
-              NSPEC_INNER_CORE_STR_AND_ATT, &
-              b_deltat, &
-              b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core, &
-              phase_is_inner, &
-              b_R_xx_inner_core,b_R_yy_inner_core,b_R_xy_inner_core, &
-              b_R_xz_inner_core,b_R_yz_inner_core, &
-              b_R_xx_inner_core_lddrk,b_R_yy_inner_core_lddrk,b_R_xy_inner_core_lddrk, &
-              b_R_xz_inner_core_lddrk,b_R_yz_inner_core_lddrk, &
-              b_epsilondev_xx_inner_core,b_epsilondev_yy_inner_core,b_epsilondev_xy_inner_core, &
-              b_epsilondev_xz_inner_core,b_epsilondev_yz_inner_core, &
-              b_eps_trace_over_3_inner_core,&
-              b_alphaval,b_betaval,b_gammaval, &
-              factor_common_inner_core,size(factor_common_inner_core,5), .true. )
-
+                                            NSPEC_INNER_CORE_STR_AND_ATT, &
+                                            b_deltat, &
+                                            b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core, &
+                                            phase_is_inner, &
+                                            b_R_xx_inner_core,b_R_yy_inner_core,b_R_xy_inner_core, &
+                                            b_R_xz_inner_core,b_R_yz_inner_core, &
+                                            b_R_xx_inner_core_lddrk,b_R_yy_inner_core_lddrk,b_R_xy_inner_core_lddrk, &
+                                            b_R_xz_inner_core_lddrk,b_R_yz_inner_core_lddrk, &
+                                            b_epsilondev_xx_inner_core,b_epsilondev_yy_inner_core,b_epsilondev_xy_inner_core, &
+                                            b_epsilondev_xz_inner_core,b_epsilondev_yz_inner_core, &
+                                            b_eps_trace_over_3_inner_core,&
+                                            b_alphaval,b_betaval,b_gammaval, &
+                                            factor_common_inner_core,size(factor_common_inner_core,5), .true. )
       else
         ! no Deville optimization
         ! crust/mantle region
         call compute_forces_crust_mantle( NSPEC_CRUST_MANTLE_ADJOINT,NGLOB_CRUST_MANTLE_ADJOINT, &
-              NSPEC_CRUST_MANTLE_STR_AND_ATT, &
-              b_deltat, &
-              b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle, &
-              phase_is_inner, &
-              b_R_xx_crust_mantle,b_R_yy_crust_mantle,b_R_xy_crust_mantle, &
-              b_R_xz_crust_mantle,b_R_yz_crust_mantle, &
-              b_R_xx_crust_mantle_lddrk,b_R_yy_crust_mantle_lddrk,b_R_xy_crust_mantle_lddrk, &
-              b_R_xz_crust_mantle_lddrk,b_R_yz_crust_mantle_lddrk, &
-              b_epsilondev_xx_crust_mantle,b_epsilondev_yy_crust_mantle,&
-              b_epsilondev_xy_crust_mantle, &
-              b_epsilondev_xz_crust_mantle,b_epsilondev_yz_crust_mantle, &
-              b_eps_trace_over_3_crust_mantle, &
-              b_alphaval,b_betaval,b_gammaval, &
-              factor_common_crust_mantle,size(factor_common_crust_mantle,5), .true. )
-
+                                          NSPEC_CRUST_MANTLE_STR_AND_ATT, &
+                                          b_deltat, &
+                                          b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle, &
+                                          phase_is_inner, &
+                                          b_R_xx_crust_mantle,b_R_yy_crust_mantle,b_R_xy_crust_mantle, &
+                                          b_R_xz_crust_mantle,b_R_yz_crust_mantle, &
+                                          b_R_xx_crust_mantle_lddrk,b_R_yy_crust_mantle_lddrk,b_R_xy_crust_mantle_lddrk, &
+                                          b_R_xz_crust_mantle_lddrk,b_R_yz_crust_mantle_lddrk, &
+                                          b_epsilondev_xx_crust_mantle,b_epsilondev_yy_crust_mantle,&
+                                          b_epsilondev_xy_crust_mantle, &
+                                          b_epsilondev_xz_crust_mantle,b_epsilondev_yz_crust_mantle, &
+                                          b_eps_trace_over_3_crust_mantle, &
+                                          b_alphaval,b_betaval,b_gammaval, &
+                                          factor_common_crust_mantle,size(factor_common_crust_mantle,5), .true. )
         ! inner core region
         call compute_forces_inner_core( NSPEC_INNER_CORE_ADJOINT,NGLOB_INNER_CORE_ADJOINT, &
-              NSPEC_INNER_CORE_STR_AND_ATT, &
-              b_deltat, &
-              b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core, &
-              phase_is_inner, &
-              b_R_xx_inner_core,b_R_yy_inner_core,b_R_xy_inner_core, &
-              b_R_xz_inner_core,b_R_yz_inner_core, &
-              b_R_xx_inner_core_lddrk,b_R_yy_inner_core_lddrk,b_R_xy_inner_core_lddrk, &
-              b_R_xz_inner_core_lddrk,b_R_yz_inner_core_lddrk, &
-              b_epsilondev_xx_inner_core,b_epsilondev_yy_inner_core,b_epsilondev_xy_inner_core, &
-              b_epsilondev_xz_inner_core,b_epsilondev_yz_inner_core, &
-              b_eps_trace_over_3_inner_core,&
-              b_alphaval,b_betaval,b_gammaval, &
-              factor_common_inner_core,size(factor_common_inner_core,5), .true. )
+                                        NSPEC_INNER_CORE_STR_AND_ATT, &
+                                        b_deltat, &
+                                        b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core, &
+                                        phase_is_inner, &
+                                        b_R_xx_inner_core,b_R_yy_inner_core,b_R_xy_inner_core, &
+                                        b_R_xz_inner_core,b_R_yz_inner_core, &
+                                        b_R_xx_inner_core_lddrk,b_R_yy_inner_core_lddrk,b_R_xy_inner_core_lddrk, &
+                                        b_R_xz_inner_core_lddrk,b_R_yz_inner_core_lddrk, &
+                                        b_epsilondev_xx_inner_core,b_epsilondev_yy_inner_core,b_epsilondev_xy_inner_core, &
+                                        b_epsilondev_xz_inner_core,b_epsilondev_yz_inner_core, &
+                                        b_eps_trace_over_3_inner_core,&
+                                        b_alphaval,b_betaval,b_gammaval, &
+                                        factor_common_inner_core,size(factor_common_inner_core,5), .true. )
       endif
     else
       ! on GPU
-      ! contains both forward SIM_TYPE==1 and backward SIM_TYPE==3 simulations
+      ! contains backward/adjoint FORWARD_OR_ADJOINT == 3
       ! for crust/mantle
-      call compute_forces_crust_mantle_cuda(Mesh_pointer,iphase)
+      call compute_forces_crust_mantle_cuda(Mesh_pointer,iphase,3)
       ! for inner core
-      call compute_forces_inner_core_cuda(Mesh_pointer,iphase)
+      call compute_forces_inner_core_cuda(Mesh_pointer,iphase,3)
     endif ! GPU_MODE
 
     ! computes additional contributions to acceleration field
@@ -647,8 +639,7 @@
                     nibool_interfaces_crust_mantle,&
                     my_neighbours_crust_mantle, &
                     b_request_send_vector_cm,b_request_recv_vector_cm, &
-                    IREGION_CRUST_MANTLE, &
-                    3) ! <-- 3 == adjoint b_accel
+                    IREGION_CRUST_MANTLE,3) ! <-- 3 == adjoint b_accel
         ! inner core
         call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
                     b_buffer_send_vector_inner_core,b_buffer_recv_vector_inner_core, &
@@ -656,8 +647,7 @@
                     nibool_interfaces_inner_core,&
                     my_neighbours_inner_core, &
                     b_request_send_vector_ic,b_request_recv_vector_ic, &
-                    IREGION_INNER_CORE, &
-                    3)
+                    IREGION_INNER_CORE,3)
       endif ! GPU
     else
       ! waits for send/receive requests to be completed and assembles values
@@ -687,15 +677,13 @@
                           b_buffer_recv_vector_cm, &
                           num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
                           b_request_send_vector_cm,b_request_recv_vector_cm, &
-                          IREGION_CRUST_MANTLE, &
-                          3) ! <-- 3 == adjoint b_accel
+                          IREGION_CRUST_MANTLE,3) ! <-- 3 == adjoint b_accel
         ! inner core
         call assemble_MPI_vector_write_cuda(Mesh_pointer,NPROCTOT_VAL,&
                           b_buffer_recv_vector_inner_core, &
                           num_interfaces_inner_core,max_nibool_interfaces_ic, &
                           b_request_send_vector_ic,b_request_recv_vector_ic, &
-                          IREGION_INNER_CORE, &
-                          3)
+                          IREGION_INNER_CORE,3)
       endif
     endif ! iphase == 1
 
@@ -716,7 +704,7 @@
   else
      ! on GPU
      ! includes FORWARD_OR_ADJOINT == 3
-     call update_accel_3_a_cuda(Mesh_pointer,b_deltatover2,NCHUNKS_VAL,3)
+     call multiply_accel_elastic_cuda(Mesh_pointer,3)
   endif
 
   ! couples ocean with crust mantle
@@ -732,8 +720,7 @@
                                   NSPEC2D_TOP(IREGION_CRUST_MANTLE) )
     else
       ! on GPU
-      call compute_coupling_ocean_cuda(Mesh_pointer,NCHUNKS_VAL,EXACT_MASS_MATRIX_FOR_ROTATION, &
-                                       3) ! <- 3 == backward/reconstructed arrays
+      call compute_coupling_ocean_cuda(Mesh_pointer,3) ! <- 3 == backward/reconstructed arrays
     endif
   endif
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_crust_mantle.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_crust_mantle.f90	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_crust_mantle.f90	2013-09-13 11:23:45 UTC (rev 22782)
@@ -128,8 +128,8 @@
     else
       ! on GPU
       if( nspec2D_xmin_crust_mantle > 0 ) call compute_stacey_elastic_cuda(Mesh_pointer, &
-                                                                absorb_xmin_crust_mantle, &
-                                                                0) ! <= xmin
+                                                                           absorb_xmin_crust_mantle, &
+                                                                           0) ! <= xmin
     endif
 
     ! writes absorbing boundary values
@@ -190,8 +190,8 @@
     else
       ! on GPU
       if( nspec2D_xmax_crust_mantle > 0 ) call compute_stacey_elastic_cuda(Mesh_pointer, &
-                                                                absorb_xmax_crust_mantle, &
-                                                                1) ! <= xmin
+                                                                           absorb_xmax_crust_mantle, &
+                                                                           1) ! <= xmax
     endif
 
 
@@ -251,8 +251,8 @@
   else
     ! on GPU
     if( nspec2D_ymin_crust_mantle > 0 ) call compute_stacey_elastic_cuda(Mesh_pointer, &
-                                                              absorb_ymin_crust_mantle, &
-                                                              2) ! <= ymin
+                                                                         absorb_ymin_crust_mantle, &
+                                                                         2) ! <= ymin
   endif
 
   ! writes absorbing boundary values
@@ -309,8 +309,8 @@
   else
     ! on GPU
     if( nspec2D_ymax_crust_mantle > 0 ) call compute_stacey_elastic_cuda(Mesh_pointer, &
-                                                              absorb_ymax_crust_mantle, &
-                                                              3) ! <= ymax
+                                                                         absorb_ymax_crust_mantle, &
+                                                                         3) ! <= ymax
   endif
 
   ! writes absorbing boundary values
@@ -401,8 +401,8 @@
     else
       ! on GPU
       if( nspec2D_xmin_crust_mantle > 0 ) call compute_stacey_elastic_backward_cuda(Mesh_pointer, &
-                                                                absorb_xmin_crust_mantle, &
-                                                                0) ! <= xmin
+                                                                                    absorb_xmin_crust_mantle, &
+                                                                                    0) ! <= xmin
     endif
 
   endif ! NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC
@@ -438,8 +438,8 @@
     else
       ! on GPU
       if( nspec2D_xmax_crust_mantle > 0 ) call compute_stacey_elastic_backward_cuda(Mesh_pointer, &
-                                                                absorb_xmax_crust_mantle, &
-                                                                1) ! <= xmin
+                                                                                    absorb_xmax_crust_mantle, &
+                                                                                    1) ! <= xmax
     endif
 
   endif ! NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AB
@@ -473,8 +473,8 @@
   else
     ! on GPU
     if( nspec2D_ymin_crust_mantle > 0 ) call compute_stacey_elastic_backward_cuda(Mesh_pointer, &
-                                                              absorb_ymin_crust_mantle, &
-                                                              2) ! <= ymin
+                                                                                  absorb_ymin_crust_mantle, &
+                                                                                  2) ! <= ymin
   endif
 
   !   ymax
@@ -506,8 +506,8 @@
   else
     ! on GPU
     if( nspec2D_ymax_crust_mantle > 0 ) call compute_stacey_elastic_backward_cuda(Mesh_pointer, &
-                                                              absorb_ymax_crust_mantle, &
-                                                              3) ! <= ymax
+                                                                                  absorb_ymax_crust_mantle, &
+                                                                                  3) ! <= ymax
   endif
 
   end subroutine compute_stacey_crust_mantle_backward
@@ -542,9 +542,7 @@
     njmin_crust_mantle,njmax_crust_mantle, &
     nkmin_xi_crust_mantle,nkmin_eta_crust_mantle, &
     nspec2D_xmin_crust_mantle,nspec2D_xmax_crust_mantle, &
-    nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle, &
-    absorb_xmin_crust_mantle,absorb_xmax_crust_mantle, &
-    absorb_ymin_crust_mantle,absorb_ymax_crust_mantle
+    nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle
 
   implicit none
 
@@ -566,9 +564,6 @@
   if( SIMULATION_TYPE /= 3 ) return
   if( SAVE_FORWARD ) return
 
-  ! daniel debug
-  if( GPU_MODE ) stop 'error compute_stacey_crust_mantle_backward_undoatt not implemented yet for GPU simulations'
-
   ! crust & mantle
 
   !   xmin
@@ -615,9 +610,7 @@
 
     else
       ! on GPU
-      if( nspec2D_xmin_crust_mantle > 0 ) call compute_stacey_elastic_cuda(Mesh_pointer, &
-                                                                absorb_xmin_crust_mantle, &
-                                                                0) ! <= xmin
+      if( nspec2D_xmin_crust_mantle > 0 ) call compute_stacey_elastic_undoatt_cuda(Mesh_pointer,0) ! <= xmin
     endif
 
   endif ! NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC
@@ -666,9 +659,7 @@
 
     else
       ! on GPU
-      if( nspec2D_xmax_crust_mantle > 0 ) call compute_stacey_elastic_cuda(Mesh_pointer, &
-                                                                absorb_xmax_crust_mantle, &
-                                                                1) ! <= xmin
+      if( nspec2D_xmax_crust_mantle > 0 ) call compute_stacey_elastic_undoatt_cuda(Mesh_pointer,1) ! <= xmax
     endif
 
   endif ! NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AB
@@ -715,9 +706,7 @@
 
   else
     ! on GPU
-    if( nspec2D_ymin_crust_mantle > 0 ) call compute_stacey_elastic_cuda(Mesh_pointer, &
-                                                              absorb_ymin_crust_mantle, &
-                                                              2) ! <= ymin
+    if( nspec2D_ymin_crust_mantle > 0 ) call compute_stacey_elastic_undoatt_cuda(Mesh_pointer,2) ! <= ymin
   endif
 
   !   ymax
@@ -762,9 +751,7 @@
 
   else
     ! on GPU
-    if( nspec2D_ymax_crust_mantle > 0 ) call compute_stacey_elastic_cuda(Mesh_pointer, &
-                                                              absorb_ymax_crust_mantle, &
-                                                              3) ! <= ymax
+    if( nspec2D_ymax_crust_mantle > 0 ) call compute_stacey_elastic_undoatt_cuda(Mesh_pointer,3) ! <= ymax
   endif
 
   end subroutine compute_stacey_crust_mantle_backward_undoatt

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90	2013-09-13 11:23:45 UTC (rev 22782)
@@ -548,9 +548,6 @@
   if( SIMULATION_TYPE /= 3 ) return
   if( SAVE_FORWARD ) return
 
-  ! daniel debug
-  if( GPU_MODE ) stop 'error compute_stacey_outer_core_backward_undoatt not implemented yet for GPU simulations'
-
   ! outer core
 
   !   xmin
@@ -583,9 +580,7 @@
 
     else
       ! on GPU
-      if( nspec2D_xmin_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
-                                                                absorb_xmin_outer_core, &
-                                                                4) ! <= xmin
+      if( nspec2D_xmin_outer_core > 0 ) call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,4) ! <= xmin
     endif
 
   endif
@@ -620,9 +615,7 @@
 
     else
       ! on GPU
-      if( nspec2D_xmax_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
-                                                                absorb_xmax_outer_core, &
-                                                                5) ! <= xmax
+      if( nspec2D_xmax_outer_core > 0 ) call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,5) ! <= xmax
     endif
 
   endif
@@ -655,9 +648,7 @@
 
   else
     ! on GPU
-    if( nspec2D_ymin_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
-                                                              absorb_ymin_outer_core, &
-                                                              6) ! <= ymin
+    if( nspec2D_ymin_outer_core > 0 ) call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,6) ! <= ymin
   endif
 
   !   ymax
@@ -688,9 +679,7 @@
 
   else
     ! on GPU
-    if( nspec2D_ymax_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
-                                                              absorb_ymax_outer_core, &
-                                                              7) ! <= ymax
+    if( nspec2D_ymax_outer_core > 0 ) call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,7) ! <= ymax
   endif
 
   ! zmin
@@ -720,9 +709,7 @@
 
   else
     ! on GPU
-    if( nspec2D_zmin_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
-                                                              absorb_zmin_outer_core, &
-                                                              8) ! <= zmin
+    if( nspec2D_zmin_outer_core > 0 ) call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,8) ! <= zmin
   endif
 
   end subroutine compute_stacey_outer_core_backward_undoatt

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2013-09-13 11:23:45 UTC (rev 22782)
@@ -2161,23 +2161,23 @@
   ! inner core region
   if(myrank == 0 ) write(IMAIN,*) "  loading inner core region"
   call prepare_inner_core_device(Mesh_pointer, &
-                                xix_inner_core,xiy_inner_core,xiz_inner_core, &
-                                etax_inner_core,etay_inner_core,etaz_inner_core, &
-                                gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
-                                rhostore_inner_core,kappavstore_inner_core,muvstore_inner_core, &
-                                rmassx_inner_core,rmassy_inner_core,rmassz_inner_core, &
-                                b_rmassx_inner_core,b_rmassy_inner_core, &
-                                ibool_inner_core, &
-                                xstore_inner_core,ystore_inner_core,zstore_inner_core, &
-                                c11store_inner_core,c12store_inner_core,c13store_inner_core, &
-                                c33store_inner_core,c44store_inner_core, &
-                                idoubling_inner_core, &
-                                num_phase_ispec_inner_core,phase_ispec_inner_inner_core, &
-                                nspec_outer_inner_core,nspec_inner_inner_core, &
-                                NSPEC2D_TOP(IREGION_INNER_CORE), &
-                                ibelm_top_inner_core, &
-                                num_colors_outer_inner_core,num_colors_inner_inner_core, &
-                                num_elem_colors_inner_core)
+                                 xix_inner_core,xiy_inner_core,xiz_inner_core, &
+                                 etax_inner_core,etay_inner_core,etaz_inner_core, &
+                                 gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
+                                 rhostore_inner_core,kappavstore_inner_core,muvstore_inner_core, &
+                                 rmassx_inner_core,rmassy_inner_core,rmassz_inner_core, &
+                                 b_rmassx_inner_core,b_rmassy_inner_core, &
+                                 ibool_inner_core, &
+                                 xstore_inner_core,ystore_inner_core,zstore_inner_core, &
+                                 c11store_inner_core,c12store_inner_core,c13store_inner_core, &
+                                 c33store_inner_core,c44store_inner_core, &
+                                 idoubling_inner_core, &
+                                 num_phase_ispec_inner_core,phase_ispec_inner_inner_core, &
+                                 nspec_outer_inner_core,nspec_inner_inner_core, &
+                                 NSPEC2D_TOP(IREGION_INNER_CORE), &
+                                 ibelm_top_inner_core, &
+                                 num_colors_outer_inner_core,num_colors_inner_inner_core, &
+                                 num_elem_colors_inner_core)
 
   ! transfer forward and backward fields to device with initial values
   if(myrank == 0 ) write(IMAIN,*) "  transfering initial wavefield"

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_forward_arrays.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_forward_arrays.f90	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_forward_arrays.f90	2013-09-13 11:23:45 UTC (rev 22782)
@@ -198,6 +198,7 @@
 
   ! transfers fields onto GPU
   if(GPU_MODE) then
+    ! transfers initialized wavefields to GPU device
     call transfer_b_fields_cm_to_device(NDIM*NGLOB_CRUST_MANTLE, &
                                     b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle, &
                                     Mesh_pointer)
@@ -209,7 +210,7 @@
     call transfer_b_fields_oc_to_device(NGLOB_OUTER_CORE, &
                                     b_displ_outer_core,b_veloc_outer_core,b_accel_outer_core, &
                                     Mesh_pointer)
-
+    ! strain
     if( .not. UNDO_ATTENUATION ) then
       call transfer_b_strain_cm_to_device(Mesh_pointer, &
                                     b_epsilondev_xx_crust_mantle,b_epsilondev_yy_crust_mantle, &
@@ -221,20 +222,20 @@
                                     b_epsilondev_xy_inner_core,b_epsilondev_xz_inner_core, &
                                     b_epsilondev_yz_inner_core)
     endif
-
+    ! rotation
     if (ROTATION_VAL) then
       call transfer_b_rotation_to_device(Mesh_pointer,b_A_array_rotation,b_B_array_rotation)
     endif
-
+    ! attenuation memory variables
     if (ATTENUATION_VAL) then
       call transfer_b_rmemory_cm_to_device(Mesh_pointer, &
-                                    b_R_xx_crust_mantle,b_R_yy_crust_mantle, &
-                                    b_R_xy_crust_mantle,b_R_xz_crust_mantle, &
-                                    b_R_yz_crust_mantle)
+                                           b_R_xx_crust_mantle,b_R_yy_crust_mantle, &
+                                           b_R_xy_crust_mantle,b_R_xz_crust_mantle, &
+                                           b_R_yz_crust_mantle)
       call transfer_b_rmemory_ic_to_device(Mesh_pointer, &
-                                    b_R_xx_inner_core,b_R_yy_inner_core, &
-                                    b_R_xy_inner_core,b_R_xz_inner_core, &
-                                    b_R_yz_inner_core)
+                                           b_R_xx_inner_core,b_R_yy_inner_core, &
+                                           b_R_xy_inner_core,b_R_xz_inner_core, &
+                                           b_R_yz_inner_core)
     endif
   endif
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.F90	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.F90	2013-09-13 11:23:45 UTC (rev 22782)
@@ -819,18 +819,18 @@
   endif
 
   allocate(buffer_send_vector_crust_mantle(NDIM,max_nibool_interfaces_cm,num_interfaces_crust_mantle), &
-          buffer_recv_vector_crust_mantle(NDIM,max_nibool_interfaces_cm,num_interfaces_crust_mantle), &
-          request_send_vector_cm(num_interfaces_crust_mantle), &
-          request_recv_vector_cm(num_interfaces_crust_mantle), &
-          stat=ier)
+           buffer_recv_vector_crust_mantle(NDIM,max_nibool_interfaces_cm,num_interfaces_crust_mantle), &
+           request_send_vector_cm(num_interfaces_crust_mantle), &
+           request_recv_vector_cm(num_interfaces_crust_mantle), &
+           stat=ier)
   if( ier /= 0 ) call exit_mpi(myrank,'error allocating array buffer_send_vector_crust_mantle etc.')
 
   if( SIMULATION_TYPE == 3 ) then
     allocate(b_buffer_send_vector_cm(NDIM,max_nibool_interfaces_cm,num_interfaces_crust_mantle), &
-            b_buffer_recv_vector_cm(NDIM,max_nibool_interfaces_cm,num_interfaces_crust_mantle), &
-            b_request_send_vector_cm(num_interfaces_crust_mantle), &
-            b_request_recv_vector_cm(num_interfaces_crust_mantle), &
-            stat=ier)
+             b_buffer_recv_vector_cm(NDIM,max_nibool_interfaces_cm,num_interfaces_crust_mantle), &
+             b_request_send_vector_cm(num_interfaces_crust_mantle), &
+             b_request_recv_vector_cm(num_interfaces_crust_mantle), &
+             stat=ier)
     if( ier /= 0 ) call exit_mpi(myrank,'error allocating array b_buffer_send_vector_cm etc.')
   endif
 
@@ -842,18 +842,18 @@
   endif
 
   allocate(buffer_send_scalar_outer_core(max_nibool_interfaces_oc,num_interfaces_outer_core), &
-          buffer_recv_scalar_outer_core(max_nibool_interfaces_oc,num_interfaces_outer_core), &
-          request_send_scalar_oc(num_interfaces_outer_core), &
-          request_recv_scalar_oc(num_interfaces_outer_core), &
-          stat=ier)
+           buffer_recv_scalar_outer_core(max_nibool_interfaces_oc,num_interfaces_outer_core), &
+           request_send_scalar_oc(num_interfaces_outer_core), &
+           request_recv_scalar_oc(num_interfaces_outer_core), &
+           stat=ier)
   if( ier /= 0 ) call exit_mpi(myrank,'error allocating array buffer_send_vector_outer_core etc.')
 
   if( SIMULATION_TYPE == 3 ) then
     allocate(b_buffer_send_scalar_outer_core(max_nibool_interfaces_oc,num_interfaces_outer_core), &
-            b_buffer_recv_scalar_outer_core(max_nibool_interfaces_oc,num_interfaces_outer_core), &
-            b_request_send_scalar_oc(num_interfaces_outer_core), &
-            b_request_recv_scalar_oc(num_interfaces_outer_core), &
-            stat=ier)
+             b_buffer_recv_scalar_outer_core(max_nibool_interfaces_oc,num_interfaces_outer_core), &
+             b_request_send_scalar_oc(num_interfaces_outer_core), &
+             b_request_recv_scalar_oc(num_interfaces_outer_core), &
+             stat=ier)
     if( ier /= 0 ) call exit_mpi(myrank,'error allocating array b_buffer_send_vector_outer_core etc.')
   endif
 
@@ -865,18 +865,18 @@
   endif
 
   allocate(buffer_send_vector_inner_core(NDIM,max_nibool_interfaces_ic,num_interfaces_inner_core), &
-          buffer_recv_vector_inner_core(NDIM,max_nibool_interfaces_ic,num_interfaces_inner_core), &
-          request_send_vector_ic(num_interfaces_inner_core), &
-          request_recv_vector_ic(num_interfaces_inner_core), &
-          stat=ier)
+           buffer_recv_vector_inner_core(NDIM,max_nibool_interfaces_ic,num_interfaces_inner_core), &
+           request_send_vector_ic(num_interfaces_inner_core), &
+           request_recv_vector_ic(num_interfaces_inner_core), &
+           stat=ier)
   if( ier /= 0 ) call exit_mpi(myrank,'error allocating array buffer_send_vector_inner_core etc.')
 
   if( SIMULATION_TYPE == 3 ) then
     allocate(b_buffer_send_vector_inner_core(NDIM,max_nibool_interfaces_ic,num_interfaces_inner_core), &
-            b_buffer_recv_vector_inner_core(NDIM,max_nibool_interfaces_ic,num_interfaces_inner_core), &
-            b_request_send_vector_ic(num_interfaces_inner_core), &
-            b_request_recv_vector_ic(num_interfaces_inner_core), &
-            stat=ier)
+             b_buffer_recv_vector_inner_core(NDIM,max_nibool_interfaces_ic,num_interfaces_inner_core), &
+             b_request_send_vector_ic(num_interfaces_inner_core), &
+             b_request_recv_vector_ic(num_interfaces_inner_core), &
+             stat=ier)
     if( ier /= 0 ) call exit_mpi(myrank,'error allocating array b_buffer_send_vector_inner_core etc.')
   endif
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/update_displacement_Newmark.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/update_displacement_Newmark.f90	2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/update_displacement_Newmark.f90	2013-09-13 11:23:45 UTC (rev 22782)
@@ -86,12 +86,12 @@
   else
     ! on GPU
     ! Includes FORWARD_OR_ADJOINT == 1
+    ! crust/mantle region
+    call update_displacement_cm_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,1)
     ! outer core region
     call update_displacement_oc_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,1)
     ! inner core region
     call update_displacement_ic_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,1)
-    ! crust/mantle region
-    call update_displacement_cm_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,1)
   endif
 
   end subroutine update_displacement_Newmark
@@ -129,12 +129,12 @@
   else
     ! on GPU
     ! Includes FORWARD_OR_ADJOINT == 3
+    ! crust/mantle region
+    call update_displacement_cm_cuda(Mesh_pointer,b_deltat,b_deltatsqover2,b_deltatover2,3)
     ! outer core region
     call update_displacement_oc_cuda(Mesh_pointer,b_deltat,b_deltatsqover2,b_deltatover2,3)
     ! inner core region
     call update_displacement_ic_cuda(Mesh_pointer,b_deltat,b_deltatsqover2,b_deltatover2,3)
-    ! crust/mantle region
-    call update_displacement_cm_cuda(Mesh_pointer,b_deltat,b_deltatsqover2,b_deltatover2,3)
   endif
 
   end subroutine update_displacement_Newmark_backward
@@ -231,7 +231,7 @@
   else
     ! on GPU
     ! includes FORWARD_OR_ADJOINT == 1
-    call kernel_3_outer_core_cuda(Mesh_pointer,deltatover2,1)
+    call update_veloc_acoustic_cuda(Mesh_pointer,deltatover2,1)
   endif
 
   end subroutine update_veloc_acoustic_newmark
@@ -257,7 +257,7 @@
   else
     ! on GPU
     ! includes FORWARD_OR_ADJOINT == 3
-    call kernel_3_outer_core_cuda(Mesh_pointer,b_deltatover2,3)
+    call update_veloc_acoustic_cuda(Mesh_pointer,b_deltatover2,3)
   endif
 
   end subroutine update_veloc_acoustic_newmark_backward
@@ -321,7 +321,7 @@
   else
     ! on GPU
     ! includes FORWARD_OR_ADJOINT == 1
-    call update_veloc_3_b_cuda(Mesh_pointer,deltatover2,1)
+    call update_veloc_elastic_cuda(Mesh_pointer,deltatover2,1)
 
   endif
 
@@ -350,7 +350,7 @@
   else
     ! on GPU
     ! includes FORWARD_OR_ADJOINT == 3
-    call update_veloc_3_b_cuda(Mesh_pointer,b_deltatover2,3)
+    call update_veloc_elastic_cuda(Mesh_pointer,b_deltatover2,3)
 
   endif
 



More information about the CIG-COMMITS mailing list