[cig-commits] r22780 - in seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER: . EXAMPLES/regional_Greece_small src/cuda src/specfem3D utils

Mon Sep 9 08:42:04 PDT 2013

Author: danielpeter
Date: 2013-09-09 08:42:03 -0700 (Mon, 09 Sep 2013)
New Revision: 22780

Added:
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/update_displacement_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/utils/check_cuda_device.cu
Removed:
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
Modified:
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_Greece_small/xcombine_vol_data.sh
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/Makefile.in
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/rules.mk
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_seismograms.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/rules.mk
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/update_displacement_Newmark.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_output_SAC.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_seismograms.f90
Log:
updates cuda routines; renames update_displacement_cuda.cu

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_Greece_small/xcombine_vol_data.sh
===================================================================

--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_Greece_small/xcombine_vol_data.sh	2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_Greece_small/xcombine_vol_data.sh	2013-09-09 15:42:03 UTC (rev 22780)
@@ -14,7 +14,7 @@
 res="1"
 
 # for visualization
-cp ../../UTILS/Visualization/Paraview/AVS_continent_boundaries.inp ./
+cp ../../utils/Visualization/VTK_ParaView/AVS_continent_boundaries.inp ./
 
 echo
 echo "alpha_kernel"

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/Makefile.in
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/Makefile.in	2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/Makefile.in	2013-09-09 15:42:03 UTC (rev 22780)
@@ -99,10 +99,10 @@
 
 # CUDA flags and linking
 @COND_CUDA_TRUE at NVCC_FLAGS_BASE = $(CUDA_INC) $(MPI_INC)
- at COND_CUDA_TRUE@@COND_CUDA5_TRUE at NVCC_FLAGS = $(NVCC_FLAGS_BASE) -dc -DCUDA $(GENCODE)
- at COND_CUDA_TRUE@@COND_CUDA5_FALSE at NVCC_FLAGS = $(NVCC_FLAGS_BASE) -DCUDA -DUSE_OLDER_CUDA4_GPU $(GENCODE)
+ at COND_CUDA_TRUE@@COND_CUDA5_TRUE at NVCC_FLAGS = $(NVCC_FLAGS_BASE) -dc $(GENCODE)
+ at COND_CUDA_TRUE@@COND_CUDA5_FALSE at NVCC_FLAGS = $(NVCC_FLAGS_BASE) -DUSE_OLDER_CUDA4_GPU $(GENCODE)
 
- at COND_CUDA_TRUE@@COND_CUDA5_TRUE at NVCCLINK_BASE = $(NVCC) $(CUDA_INC) $(MPI_INC) -DCUDA
+ at COND_CUDA_TRUE@@COND_CUDA5_TRUE at NVCCLINK_BASE = $(NVCC) $(CUDA_INC) $(MPI_INC)
 @COND_CUDA_TRUE@@COND_CUDA5_TRUE at NVCCLINK = $(NVCCLINK_BASE) -dlink $(GENCODE)
 @COND_CUDA_TRUE@@COND_CUDA5_FALSE at NVCCLINK = $(NVCCLINK_BASE) -DUSE_OLDER_CUDA4_GPU $(GENCODE)
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu	2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu	2013-09-09 15:42:03 UTC (rev 22780)
@@ -715,7 +715,7 @@
                                           realw* epsilon_trace_over_3,
                                           int SIMULATION_TYPE,
                                           int ATTENUATION,
-                                          int USE_ATTENUATION_MIMIC,
+                                          int PARTIAL_PHYS_DISPERSION_ONLY,
                                           int USE_3D_ATTENUATION_ARRAYS,
                                           realw* one_minus_sum_beta,realw* factor_common,
                                           realw* R_xx, realw* R_yy, realw* R_xy, realw* R_xz, realw* R_yz,
@@ -1056,7 +1056,7 @@
       }
     } // ! end of test whether isotropic or anisotropic element
 
-    if(ATTENUATION && (! USE_ATTENUATION_MIMIC ) ){
+    if(ATTENUATION && (! PARTIAL_PHYS_DISPERSION_ONLY ) ){
       // subtracts memory variables if attenuation
       compute_element_cm_att_stress(tx,working_element,
                                     R_xx,R_yy,R_xy,R_xz,R_yz,
@@ -1255,7 +1255,7 @@
 #endif // MESH_COLORING
 
     // update memory variables based upon the Runge-Kutta scheme
-    if( ATTENUATION && ( ! USE_ATTENUATION_MIMIC ) ){
+    if( ATTENUATION && ( ! PARTIAL_PHYS_DISPERSION_ONLY ) ){
 
       compute_element_cm_att_memory(tx,working_element,
                                     d_muvstore,
@@ -1377,7 +1377,7 @@
                                                 d_epsilon_trace_over_3,
                                                 mp->simulation_type,
                                                 mp->attenuation,
-                                                mp->use_attenuation_mimic,
+                                                mp->partial_phys_dispersion_only,
                                                 mp->use_3d_attenuation_arrays,
                                                 d_one_minus_sum_beta,d_factor_common,
                                                 d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
@@ -1427,7 +1427,7 @@
                                                    d_b_epsilon_trace_over_3,
                                                    mp->simulation_type,
                                                    mp->attenuation,
-                                                   mp->use_attenuation_mimic,
+                                                   mp->partial_phys_dispersion_only,
                                                    mp->use_3d_attenuation_arrays,
                                                    d_one_minus_sum_beta,d_factor_common,
                                                    d_b_R_xx,d_b_R_yy,d_b_R_xy,d_b_R_xz,d_b_R_yz,

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu	2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu	2013-09-09 15:42:03 UTC (rev 22780)
@@ -328,7 +328,7 @@
                                          realw* epsilon_trace_over_3,
                                          int SIMULATION_TYPE,
                                          int ATTENUATION,
-                                         int USE_ATTENUATION_MIMIC,
+                                         int PARTIAL_PHYS_DISPERSION_ONLY,
                                          int USE_3D_ATTENUATION_ARRAYS,
                                          realw* one_minus_sum_beta,realw* factor_common,
                                          realw* R_xx, realw* R_yy, realw* R_xy, realw* R_xz, realw* R_yz,
@@ -678,7 +678,7 @@
       sigma_yz = mul*duzdyl_plus_duydzl;
     }
 
-    if(ATTENUATION && ( ! USE_ATTENUATION_MIMIC ) ){
+    if(ATTENUATION && ( ! PARTIAL_PHYS_DISPERSION_ONLY ) ){
       // subtracts memory variables if attenuation
       compute_element_ic_att_stress(tx,working_element,
                                     R_xx,R_yy,R_xy,R_xz,R_yz,
@@ -891,7 +891,7 @@
 #endif // MESH_COLORING
 
     // update memory variables based upon the Runge-Kutta scheme
-    if( ATTENUATION && ! USE_ATTENUATION_MIMIC ){
+    if( ATTENUATION && ! PARTIAL_PHYS_DISPERSION_ONLY ){
       compute_element_ic_att_memory(tx,working_element,
                                 d_muv,
                                 factor_common,alphaval,betaval,gammaval,
@@ -1005,7 +1005,7 @@
                                              d_epsilon_trace_over_3,
                                              mp->simulation_type,
                                              mp->attenuation,
-                                             mp->use_attenuation_mimic,
+                                             mp->partial_phys_dispersion_only,
                                              mp->use_3d_attenuation_arrays,
                                              d_one_minus_sum_beta,
                                              d_factor_common,
@@ -1053,7 +1053,7 @@
                                                 d_b_epsilon_trace_over_3,
                                                 mp->simulation_type,
                                                 mp->attenuation,
-                                                mp->use_attenuation_mimic,
+                                                mp->partial_phys_dispersion_only,
                                                 mp->use_3d_attenuation_arrays,
                                                 d_one_minus_sum_beta,
                                                 d_factor_common,

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu	2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu	2013-09-09 15:42:03 UTC (rev 22780)
@@ -30,10 +30,6 @@
 #include <cuda.h>
 #include <cublas.h>
 
-#ifdef WITH_MPI
-#include <mpi.h>
-#endif
-
 #include <sys/time.h>
 #include <sys/resource.h>
 
@@ -115,7 +111,26 @@
     //MPI_Barrier(MPI_COMM_WORLD);
 
     // sets active device
+#ifdef CUDA_DEVICE_ID
+    // uses fixed device id when compile with e.g.: -DCUDA_DEVICE_ID=1
+    device = CUDA_DEVICE_ID;
+    if(myrank == 0 ) printf("setting cuda devices with id = %d for all processes by -DCUDA_DEVICE_ID\n\n",device);
+
+    cudaSetDevice( device );
+    exit_on_cuda_error("cudaSetDevice has invalid device");
+
+    // double check that device was  properly selected
+    cudaGetDevice(&device);
+    if( device != CUDA_DEVICE_ID ){
+       printf("error rank: %d devices: %d \n",myrank,device_count);
+       printf("  cudaSetDevice()=%d\n  cudaGetDevice()=%d\n",CUDA_DEVICE_ID,device);
+       exit_on_error("CUDA set/get device error: device id conflict \n");
+    }
+#else
+    // device changes for different mpi processes according to number of device per node
+    // (assumes that number of devices per node is the same for different compute nodes)
     device = myrank % device_count;
+
     cudaSetDevice( device );
     exit_on_cuda_error("cudaSetDevice has invalid device");
 
@@ -126,6 +141,7 @@
        printf("  cudaSetDevice()=%d\n  cudaGetDevice()=%d\n",myrank%device_count,device);
        exit_on_error("CUDA set/get device error: device id conflict \n");
     }
+#endif
   }
 
   // returns a handle to the active device

Deleted: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu	2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu	2013-09-09 15:42:03 UTC (rev 22780)
@@ -1,611 +0,0 @@
-/*
- !=====================================================================
- !
- !               S p e c f e m 3 D  V e r s i o n  2 . 0
- !               ---------------------------------------
- !
- !          Main authors: Dimitri Komatitsch and Jeroen Tromp
- !    Princeton University, USA and University of Pau / CNRS / INRIA
- ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
- !                            August 2013
- !
- ! This program is free software; you can redistribute it and/or modify
- ! it under the terms of the GNU General Public License as published by
- ! the Free Software Foundation; either version 2 of the License, or
- ! (at your option) any later version.
- !
- ! This program is distributed in the hope that it will be useful,
- ! but WITHOUT ANY WARRANTY; without even the implied warranty of
- ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- ! GNU General Public License for more details.
- !
- ! You should have received a copy of the GNU General Public License along
- ! with this program; if not, write to the Free Software Foundation, Inc.,
- ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- !
- !=====================================================================
- */
-
-#include <stdio.h>
-#include <cuda.h>
-
-#include "config.h"
-#include "mesh_constants_cuda.h"
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// elastic wavefield
-
-// KERNEL 1
-/* ----------------------------------------------------------------------------------------------- */
-
-
-__global__ void UpdateDispVeloc_kernel(realw* displ,
-                                       realw* veloc,
-                                       realw* accel,
-                                       int size,
-                                       realw deltat,
-                                       realw deltatsqover2,
-                                       realw deltatover2) {
-  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
-
-  /* because of block and grid sizing problems, there is a small */
-  /* amount of buffer at the end of the calculation */
-  if(id < size) {
-    displ[id] = displ[id] + deltat*veloc[id] + deltatsqover2*accel[id];
-    veloc[id] = veloc[id] + deltatover2*accel[id];
-    accel[id] = 0.0f; // can do this using memset...not sure if faster
-  }
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// KERNEL 1
-// inner core
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(it_update_displacement_ic_cuda,
-              IT_UPDATE_DISPLACMENT_IC_CUDA)(long* Mesh_pointer_f,
-                                             realw* deltat_F,
-                                             realw* deltatsqover2_F,
-                                             realw* deltatover2_F,
-                                             int* FORWARD_OR_ADJOINT) {
-
-TRACE("it_update_displacement_ic_cuda");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
-
-  int size = NDIM * mp->NGLOB_INNER_CORE;
-
-  realw deltat = *deltat_F;
-  realw deltatsqover2 = *deltatsqover2_F;
-  realw deltatover2 = *deltatover2_F;
-
-  int blocksize = BLOCKSIZE_KERNEL1;
-  int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
-
-  int num_blocks_x = size_padded/blocksize;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
-
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(blocksize,1,1);
-
-  if( *FORWARD_OR_ADJOINT == 1 ){
-    //launch kernel
-    UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ_inner_core,
-                                             mp->d_veloc_inner_core,
-                                             mp->d_accel_inner_core,
-                                             size,deltat,deltatsqover2,deltatover2);
-  }else if( *FORWARD_OR_ADJOINT == 3 ){
-    // kernel for backward fields
-    UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_b_displ_inner_core,
-                                             mp->d_b_veloc_inner_core,
-                                             mp->d_b_accel_inner_core,
-                                             size,deltat,deltatsqover2,deltatover2);
-  }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("it_update_displacement_ic_cuda");
-#endif
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// KERNEL 1
-// crust/mantle
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(it_update_displacement_cm_cuda,
-              IT_UPDATE_DISPLACMENT_CM_CUDA)(long* Mesh_pointer_f,
-                                             realw* deltat_F,
-                                             realw* deltatsqover2_F,
-                                             realw* deltatover2_F,
-                                             int* FORWARD_OR_ADJOINT) {
-
-  TRACE("it_update_displacement_cm_cuda");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
-
-  int size = NDIM * mp->NGLOB_CRUST_MANTLE;
-
-  realw deltat = *deltat_F;
-  realw deltatsqover2 = *deltatsqover2_F;
-  realw deltatover2 = *deltatover2_F;
-
-  int blocksize = BLOCKSIZE_KERNEL1;
-  int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
-
-  int num_blocks_x = size_padded/blocksize;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
-
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(blocksize,1,1);
-
-  if( *FORWARD_OR_ADJOINT == 1 ){
-    //launch kernel
-    UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
-                                             mp->d_veloc_crust_mantle,
-                                             mp->d_accel_crust_mantle,
-                                             size,deltat,deltatsqover2,deltatover2);
-  }else if( *FORWARD_OR_ADJOINT == 3 ){
-    // kernel for backward fields
-    UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,
-                                             mp->d_b_veloc_crust_mantle,
-                                             mp->d_b_accel_crust_mantle,
-                                             size,deltat,deltatsqover2,deltatover2);
-  }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("it_update_displacement_cm_cuda");
-#endif
-}
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// acoustic wavefield
-
-// KERNEL 1
-/* ----------------------------------------------------------------------------------------------- */
-
-__global__ void UpdatePotential_kernel(realw* potential_acoustic,
-                                       realw* potential_dot_acoustic,
-                                       realw* potential_dot_dot_acoustic,
-                                       int size,
-                                       realw deltat,
-                                       realw deltatsqover2,
-                                       realw deltatover2) {
-  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
-
-  /* because of block and grid sizing problems, there is a small */
-  /* amount of buffer at the end of the calculation */
-  if(id < size) {
-    potential_acoustic[id] = potential_acoustic[id]
-                            + deltat*potential_dot_acoustic[id]
-                            + deltatsqover2*potential_dot_dot_acoustic[id];
-
-    potential_dot_acoustic[id] = potential_dot_acoustic[id]
-                                + deltatover2*potential_dot_dot_acoustic[id];
-
-    potential_dot_dot_acoustic[id] = 0.0f;
-  }
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// KERNEL 1
-// outer core
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(it_update_displacement_oc_cuda,
-              IT_UPDATE_DISPLACEMENT_OC_cuda)(long* Mesh_pointer_f,
-                                              realw* deltat_F,
-                                              realw* deltatsqover2_F,
-                                              realw* deltatover2_F,
-                                              int* FORWARD_OR_ADJOINT) {
-
-  TRACE("it_update_displacement_oc_cuda");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
-
-  int size = mp->NGLOB_OUTER_CORE;
-
-  realw deltat = *deltat_F;
-  realw deltatsqover2 = *deltatsqover2_F;
-  realw deltatover2 = *deltatover2_F;
-
-  int blocksize = BLOCKSIZE_KERNEL1;
-  int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
-
-  int num_blocks_x = size_padded/blocksize;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
-
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(blocksize,1,1);
-
-  if( *FORWARD_OR_ADJOINT == 1 ){
-    //launch kernel
-    UpdatePotential_kernel<<<grid,threads>>>(mp->d_displ_outer_core,
-                                             mp->d_veloc_outer_core,
-                                             mp->d_accel_outer_core,
-                                             size,deltat,deltatsqover2,deltatover2);
-  }else if( *FORWARD_OR_ADJOINT == 1 ){
-    UpdatePotential_kernel<<<grid,threads>>>(mp->d_b_displ_outer_core,
-                                             mp->d_b_veloc_outer_core,
-                                             mp->d_b_accel_outer_core,
-                                             size,deltat,deltatsqover2,deltatover2);
-  }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("it_update_displacement_oc_cuda");
-#endif
-}
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// KERNEL 3
-//
-// crust/mantle and inner core regions
-
-/* ----------------------------------------------------------------------------------------------- */
-
-__global__ void kernel_3_cuda_device(realw* veloc,
-                                     realw* accel,
-                                     int size,
-                                     realw deltatover2,
-                                     realw two_omega_earth,
-                                     realw* rmassx,
-                                     realw* rmassy,
-                                     realw* rmassz) {
-  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
-
-  /* because of block and grid sizing problems, there is a small */
-  /* amount of buffer at the end of the calculation */
-  if(id < size) {
-    // note: update adds rotational acceleration in case two_omega_earth is non-zero
-    accel[3*id] = accel[3*id]*rmassx[id] + two_omega_earth*veloc[3*id+1]; // (2,i);
-    accel[3*id+1] = accel[3*id+1]*rmassy[id] - two_omega_earth*veloc[3*id]; //(1,i);
-    accel[3*id+2] = accel[3*id+2]*rmassz[id];
-
-    veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
-    veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1];
-    veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2];
-  }
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-__global__ void kernel_3_accel_cuda_device(realw* accel,
-                                           realw* veloc,
-                                           int size,
-                                           realw two_omega_earth,
-                                           realw* rmassx,
-                                           realw* rmassy,
-                                           realw* rmassz) {
-  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
-
-  /* because of block and grid sizing problems, there is a small */
-  /* amount of buffer at the end of the calculation */
-  if(id < size) {
-    // note: update adds rotational acceleration in case two_omega_earth is non-zero
-    accel[3*id] = accel[3*id]*rmassx[id] + two_omega_earth*veloc[3*id+1]; // (2,i);
-    accel[3*id+1] = accel[3*id+1]*rmassy[id] - two_omega_earth*veloc[3*id]; //(1,i);
-    accel[3*id+2] = accel[3*id+2]*rmassz[id];
-  }
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-__global__ void kernel_3_veloc_cuda_device(realw* veloc,
-                                           realw* accel,
-                                           int size,
-                                           realw deltatover2) {
-  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
-
-  /* because of block and grid sizing problems, there is a small */
-  /* amount of buffer at the end of the calculation */
-  if(id < size) {
-    veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
-    veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1];
-    veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2];
-  }
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(update_accel_3_a_cuda,
-              UPDATE_ACCEL_3_A_CUDA)(long* Mesh_pointer,
-                                     realw* deltatover2_F,
-                                     int* NCHUNKS_VAL,
-                                     int* FORWARD_OR_ADJOINT) {
-  TRACE("update_accel_3_a_cuda");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
-
-  realw deltatover2 = *deltatover2_F;
-
-  int blocksize = BLOCKSIZE_KERNEL3;
-  int size_padded = ((int)ceil(((double)mp->NGLOB_CRUST_MANTLE)/((double)blocksize)))*blocksize;
-
-  int num_blocks_x = size_padded/blocksize;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
-
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(blocksize,1,1);
-
-  // crust/mantle region only
-  // check whether we can update accel and veloc, or only accel at this point
-  if( mp->oceans == 0 ){
-    // updates both, accel and veloc
-
-    if( *NCHUNKS_VAL != 6 && mp->absorbing_conditions){
-      // uses corrected mass matrices rmassx,rmassy,rmassz
-      if( *FORWARD_OR_ADJOINT == 1 ){
-        kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_crust_mantle,
-                                                 mp->d_accel_crust_mantle,
-                                                 mp->NGLOB_CRUST_MANTLE,
-                                                 deltatover2,
-                                                 mp->two_omega_earth,
-                                                 mp->d_rmassx_crust_mantle,
-                                                 mp->d_rmassy_crust_mantle,
-                                                 mp->d_rmassz_crust_mantle);
-      }else if( *FORWARD_OR_ADJOINT == 3 ){
-        kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_crust_mantle,
-                                                 mp->d_b_accel_crust_mantle,
-                                                 mp->NGLOB_CRUST_MANTLE,
-                                                 deltatover2,
-                                                 mp->b_two_omega_earth,
-                                                 mp->d_rmassx_crust_mantle,
-                                                 mp->d_rmassy_crust_mantle,
-                                                 mp->d_rmassz_crust_mantle);
-      }
-    }else{
-      // uses only rmassz
-      if( *FORWARD_OR_ADJOINT == 1 ){
-        kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_crust_mantle,
-                                                 mp->d_accel_crust_mantle,
-                                                 mp->NGLOB_CRUST_MANTLE,
-                                                 deltatover2,
-                                                 mp->two_omega_earth,
-                                                 mp->d_rmassz_crust_mantle,
-                                                 mp->d_rmassz_crust_mantle,
-                                                 mp->d_rmassz_crust_mantle);
-      }else if( *FORWARD_OR_ADJOINT == 3 ){
-        kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_crust_mantle,
-                                                 mp->d_b_accel_crust_mantle,
-                                                 mp->NGLOB_CRUST_MANTLE,
-                                                 deltatover2,
-                                                 mp->b_two_omega_earth,
-                                                 mp->d_rmassz_crust_mantle,
-                                                 mp->d_rmassz_crust_mantle,
-                                                 mp->d_rmassz_crust_mantle);
-      }
-    }
-
-  }else{
-    // updates only accel
-
-    if( *NCHUNKS_VAL != 6 && mp->absorbing_conditions){
-      // uses corrected mass matrices
-      if( *FORWARD_OR_ADJOINT == 1 ){
-        kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_accel_crust_mantle,
-                                                       mp->d_veloc_crust_mantle,
-                                                       mp->NGLOB_CRUST_MANTLE,
-                                                       mp->two_omega_earth,
-                                                       mp->d_rmassx_crust_mantle,
-                                                       mp->d_rmassy_crust_mantle,
-                                                       mp->d_rmassz_crust_mantle);
-      }else if( *FORWARD_OR_ADJOINT == 3 ){
-        kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_b_accel_crust_mantle,
-                                                       mp->d_b_veloc_crust_mantle,
-                                                       mp->NGLOB_CRUST_MANTLE,
-                                                       mp->b_two_omega_earth,
-                                                       mp->d_rmassx_crust_mantle,
-                                                       mp->d_rmassy_crust_mantle,
-                                                       mp->d_rmassz_crust_mantle);
-      }
-    }else{
-      // uses only rmassz
-      if( *FORWARD_OR_ADJOINT == 1 ){
-        kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_accel_crust_mantle,
-                                                       mp->d_veloc_crust_mantle,
-                                                       mp->NGLOB_CRUST_MANTLE,
-                                                       mp->two_omega_earth,
-                                                       mp->d_rmassz_crust_mantle,
-                                                       mp->d_rmassz_crust_mantle,
-                                                       mp->d_rmassz_crust_mantle);
-      }else if( *FORWARD_OR_ADJOINT == 3 ){
-        kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_b_accel_crust_mantle,
-                                                       mp->d_b_veloc_crust_mantle,
-                                                       mp->NGLOB_CRUST_MANTLE,
-                                                       mp->b_two_omega_earth,
-                                                       mp->d_rmassz_crust_mantle,
-                                                       mp->d_rmassz_crust_mantle,
-                                                       mp->d_rmassz_crust_mantle);
-      }
-    }
-  }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
-  exit_on_cuda_error("after update_accel_3_a");
-#endif
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(update_veloc_3_b_cuda,
-              UPDATE_VELOC_3_B_CUDA)(long* Mesh_pointer,
-                                     realw* deltatover2_F,
-                                     int* FORWARD_OR_ADJOINT) {
-  TRACE("update_veloc_3_b_cuda");
-  int size_padded,num_blocks_x,num_blocks_y;
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
-
-  realw deltatover2 = *deltatover2_F;
-
-  int blocksize = BLOCKSIZE_KERNEL3;
-
-  // crust/mantle region
-  // in case of ocean loads, we still have to update the velocity for crust/mantle region
-  if( mp->oceans ){
-    size_padded = ((int)ceil(((double)mp->NGLOB_CRUST_MANTLE)/((double)blocksize)))*blocksize;
-    num_blocks_x = size_padded/blocksize;
-    num_blocks_y = 1;
-    while(num_blocks_x > MAXIMUM_GRID_DIM) {
-      num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-      num_blocks_y = num_blocks_y*2;
-    }
-    dim3 grid1(num_blocks_x,num_blocks_y);
-    dim3 threads1(blocksize,1,1);
-
-    // updates only veloc at this point
-    if( *FORWARD_OR_ADJOINT == 1 ){
-      kernel_3_veloc_cuda_device<<< grid1, threads1>>>(mp->d_veloc_crust_mantle,
-                                                       mp->d_accel_crust_mantle,
-                                                       mp->NGLOB_CRUST_MANTLE,
-                                                       deltatover2);
-    }else if( *FORWARD_OR_ADJOINT == 3 ){
-      kernel_3_veloc_cuda_device<<< grid1, threads1>>>(mp->d_b_veloc_crust_mantle,
-                                                       mp->d_b_accel_crust_mantle,
-                                                       mp->NGLOB_CRUST_MANTLE,
-                                                       deltatover2);
-    }
-  }
-
-  // inner core region
-  size_padded = ((int)ceil(((double)mp->NGLOB_INNER_CORE)/((double)blocksize)))*blocksize;
-  num_blocks_x = size_padded/blocksize;
-  num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(blocksize,1,1);
-
-  // updates both, accel and veloc
-  if( *FORWARD_OR_ADJOINT == 1 ){
-    kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_inner_core,
-                                             mp->d_accel_inner_core,
-                                             mp->NGLOB_INNER_CORE,
-                                             deltatover2,
-                                             mp->two_omega_earth,
-                                             mp->d_rmassx_inner_core,
-                                             mp->d_rmassy_inner_core,
-                                             mp->d_rmassz_inner_core);
-  }else if( *FORWARD_OR_ADJOINT == 3 ){
-    kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_inner_core,
-                                             mp->d_b_accel_inner_core,
-                                             mp->NGLOB_INNER_CORE,
-                                             deltatover2,
-                                             mp->b_two_omega_earth,
-                                             mp->d_rmassx_inner_core,
-                                             mp->d_rmassy_inner_core,
-                                             mp->d_rmassz_inner_core);
-  }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
-  exit_on_cuda_error("after update_veloc_3_b");
-#endif
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// KERNEL 3
-//
-// for outer_core
-
-/* ----------------------------------------------------------------------------------------------- */
-
-
-__global__ void kernel_3_outer_core_cuda_device(realw* veloc,
-                                                realw* accel,int size,
-                                                realw deltatover2,
-                                                realw* rmass) {
-  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
-
-  /* because of block and grid sizing problems, there is a small */
-  /* amount of buffer at the end of the calculation */
-  if(id < size) {
-    // multiplies pressure with the inverse of the mass matrix
-    accel[id] = accel[id]*rmass[id];
-
-    // Newmark time scheme: corrector term
-    veloc[id] = veloc[id] + deltatover2*accel[id];
-  }
-}
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(kernel_3_outer_core_cuda,
-              KERNEL_3_OUTER_CORE_CUDA)(long* Mesh_pointer,
-                                        realw* deltatover2_F,
-                                        int* FORWARD_OR_ADJOINT) {
-
-  TRACE("kernel_3_outer_core_cuda");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
-
-  realw deltatover2 = *deltatover2_F;
-
-  int blocksize = BLOCKSIZE_KERNEL3;
-
-  int size_padded = ((int)ceil(((double)mp->NGLOB_OUTER_CORE)/((double)blocksize)))*blocksize;
-  int num_blocks_x = size_padded/blocksize;
-  int num_blocks_y = 1;
-  while(num_blocks_x > MAXIMUM_GRID_DIM) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(blocksize,1,1);
-
-  if( *FORWARD_OR_ADJOINT == 1 ){
-    kernel_3_outer_core_cuda_device<<< grid, threads>>>(mp->d_veloc_outer_core,
-                                                        mp->d_accel_outer_core,
-                                                        mp->NGLOB_OUTER_CORE,
-                                                        deltatover2,mp->d_rmass_outer_core);
-  }else if( *FORWARD_OR_ADJOINT == 3){
-    kernel_3_outer_core_cuda_device<<< grid, threads>>>(mp->d_b_veloc_outer_core,
-                                                        mp->d_b_accel_outer_core,
-                                                        mp->NGLOB_OUTER_CORE,
-                                                        deltatover2,mp->d_rmass_outer_core);
-  }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
-  exit_on_cuda_error("after kernel_3_outer_core");
-#endif
-}
-

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2013-09-09 15:42:03 UTC (rev 22780)
@@ -558,7 +558,7 @@
 
   int attenuation;
   int undo_attenuation;
-  int use_attenuation_mimic;
+  int partial_phys_dispersion_only;
   int use_3d_attenuation_arrays;
 
   int compute_and_store_strain;

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu	2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu	2013-09-09 15:42:03 UTC (rev 22780)
@@ -30,9 +30,9 @@
 #include <cuda.h>
 #include <cublas.h>
 
-#ifdef WITH_MPI
-#include <mpi.h>
-#endif
+//#ifdef WITH_MPI
+//#include <mpi.h>
+//#endif
 
 #include <sys/types.h>
 #include <unistd.h>
@@ -42,6 +42,7 @@
 
 
 /* ----------------------------------------------------------------------------------------------- */
+//unused routines...
 /*
 extern "C"
 void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2013-09-09 15:42:03 UTC (rev 22780)
@@ -30,10 +30,6 @@
 #include <cuda.h>
 #include <cublas.h>
 
-#ifdef WITH_MPI
-#include <mpi.h>
-#endif
-
 #include <sys/time.h>
 #include <sys/resource.h>
 
@@ -114,39 +110,31 @@
               PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
                                         int* myrank_f,
                                         int* h_NGLLX,
-                                        realw* h_hprime_xx,
-                                        realw* h_hprimewgll_xx,
+                                        realw* h_hprime_xx,realw* h_hprimewgll_xx,
                                         realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz,
                                         int* NSOURCES,int* nsources_local,
                                         realw* h_sourcearrays,
                                         int* h_islice_selected_source,int* h_ispec_selected_source,
+                                        int* nrec,int* nrec_local, int* nadj_rec_local,
                                         int* h_number_receiver_global,
                                         int* h_islice_selected_rec,int* h_ispec_selected_rec,
-                                        int* nrec,int* nrec_local, int* nadj_rec_local,
                                         int* NSPEC_CRUST_MANTLE, int* NGLOB_CRUST_MANTLE,
                                         int* NSPEC_CRUST_MANTLE_STRAIN_ONLY,
                                         int* NSPEC_OUTER_CORE, int* NGLOB_OUTER_CORE,
                                         int* NSPEC_INNER_CORE, int* NGLOB_INNER_CORE,
                                         int* NSPEC_INNER_CORE_STRAIN_ONLY,
-                                        int* SIMULATION_TYPE,
-                                        int* NOISE_TOMOGRAPHY,
-                                        int* SAVE_FORWARD_f,
-                                        int* ABSORBING_CONDITIONS_f,
-                                        int* OCEANS_f,
-                                        int* GRAVITY_f,
+                                        int* SIMULATION_TYPE,int* NOISE_TOMOGRAPHY,
+                                        int* SAVE_FORWARD_f,int* ABSORBING_CONDITIONS_f,
+                                        int* OCEANS_f,int* GRAVITY_f,
                                         int* ROTATION_f,int* EXACT_MASS_MATRIX_FOR_ROTATION_f,
                                         int* ATTENUATION_f,int* UNDO_ATTENUATION_f,
-                                        int* USE_ATTENUATION_MIMIC_f,
-                                        int* USE_3D_ATTENUATION_ARRAYS_f,
+                                        int* PARTIAL_PHYS_DISPERSION_ONLY_f,int* USE_3D_ATTENUATION_ARRAYS_f,
                                         int* COMPUTE_AND_STORE_STRAIN_f,
-                                        int* ANISOTROPIC_3D_MANTLE_f,
-                                        int* ANISOTROPIC_INNER_CORE_f,
+                                        int* ANISOTROPIC_3D_MANTLE_f,int* ANISOTROPIC_INNER_CORE_f,
                                         int* SAVE_BOUNDARY_MESH_f,
                                         int* USE_MESH_COLORING_GPU_f,
-                                        int* ANISOTROPIC_KL_f,
-                                        int* APPROXIMATE_HESS_KL_f,
-                                        realw* deltat_f,
-                                        realw* b_deltat_f) {
+                                        int* ANISOTROPIC_KL_f,int* APPROXIMATE_HESS_KL_f,
+                                        realw* deltat_f,realw* b_deltat_f) {
 
   TRACE("prepare_constants_device");
 
@@ -240,7 +228,7 @@
 
   mp->attenuation = *ATTENUATION_f;
   mp->undo_attenuation = *UNDO_ATTENUATION_f;
-  mp->use_attenuation_mimic = *USE_ATTENUATION_MIMIC_f;
+  mp->partial_phys_dispersion_only = *PARTIAL_PHYS_DISPERSION_ONLY_f;
   mp->use_3d_attenuation_arrays = *USE_3D_ATTENUATION_ARRAYS_f;
 
   mp->compute_and_store_strain = *COMPUTE_AND_STORE_STRAIN_f;
@@ -503,7 +491,7 @@
 
   copy_todevice_realw((void**)&mp->d_one_minus_sum_beta_crust_mantle,one_minus_sum_beta_crust_mantle,R_size2);
 
-  if( ! mp->use_attenuation_mimic ){
+  if( ! mp->partial_phys_dispersion_only ){
     // common factor
     copy_todevice_realw((void**)&mp->d_factor_common_crust_mantle,factor_common_crust_mantle,R_size3);
     // memory variables
@@ -515,7 +503,7 @@
   }
 
   if(mp->simulation_type == 3 ){
-    if( ! mp->use_attenuation_mimic ){
+    if( ! mp->partial_phys_dispersion_only ){
       // memory variables
       copy_todevice_realw((void**)&mp->d_b_R_xx_crust_mantle,b_R_xx_crust_mantle,R_size1);
       copy_todevice_realw((void**)&mp->d_b_R_yy_crust_mantle,b_R_yy_crust_mantle,R_size1);
@@ -537,7 +525,7 @@
 
   copy_todevice_realw((void**)&mp->d_one_minus_sum_beta_inner_core,one_minus_sum_beta_inner_core,R_size2);
 
-  if( ! mp->use_attenuation_mimic ){
+  if( ! mp->partial_phys_dispersion_only ){
     // common factor
     copy_todevice_realw((void**)&mp->d_factor_common_inner_core,factor_common_inner_core,R_size3);
     // memory variables
@@ -549,7 +537,7 @@
   }
 
   if(mp->simulation_type == 3 ){
-    if( ! mp->use_attenuation_mimic ){
+    if( ! mp->partial_phys_dispersion_only ){
       // memory variables
       copy_todevice_realw((void**)&mp->d_b_R_xx_inner_core,b_R_xx_inner_core,R_size1);
       copy_todevice_realw((void**)&mp->d_b_R_yy_inner_core,b_R_yy_inner_core,R_size1);
@@ -1931,7 +1919,7 @@
   if( mp->attenuation ){
     cudaFree(mp->d_one_minus_sum_beta_crust_mantle);
     cudaFree(mp->d_one_minus_sum_beta_inner_core);
-    if( ! mp->use_attenuation_mimic ){
+    if( ! mp->partial_phys_dispersion_only ){
       cudaFree(mp->d_factor_common_crust_mantle);
       cudaFree(mp->d_R_xx_crust_mantle);
       cudaFree(mp->d_R_yy_crust_mantle);

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/rules.mk
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/rules.mk	2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/rules.mk	2013-09-09 15:42:03 UTC (rev 22780)
@@ -44,10 +44,10 @@
 	$O/compute_stacey_acoustic_cuda.cuda.o \
 	$O/compute_stacey_elastic_cuda.cuda.o \
 	$O/initialize_cuda.cuda.o \
-	$O/it_update_displacement_cuda.cuda.o \
 	$O/noise_tomography_cuda.cuda.o \
 	$O/prepare_mesh_constants_cuda.cuda.o \
 	$O/transfer_fields_cuda.cuda.o \
+	$O/update_displacement_cuda.cuda.o \
 	$O/write_seismograms_cuda.cuda.o \
 	$O/save_and_compare_cpu_vs_gpu.cudacc.o \
 	$(EMPTY_MACRO)

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c	2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c	2013-09-09 15:42:03 UTC (rev 22780)
@@ -49,6 +49,11 @@
 #endif
   sprintf(filename,"maxval_surface_proc_%03d.dat",rank);
   fp = fopen(filename,"a+");
+  if( fp == NULL ){
+    printf("FILE ERROR:%s\n",(char*) strerror(errno));
+    perror("file error\n");
+    exit(1);
+  }
   fprintf(fp,"%e\n",*maxval);
   fclose(fp);
 }
@@ -63,6 +68,11 @@
     sprintf(filename, "debug_output_gpu_%d.dat",*id);
   }
   fp = fopen(filename, "wb");
+  if( fp == NULL ){
+    printf("FILE ERROR:%s\n",(char*) strerror(errno));
+    perror("file error\n");
+    exit(1);
+  }
   printf("writing vector, vector[0]=%e\n",vector[0]);
   fwrite(vector, sizeof(float), *size, fp);
   fclose(fp);
@@ -79,6 +89,11 @@
     sprintf(filename, "debug_output_gpu_%d.dat",*id);
   }
   fp = fopen(filename, "wb");
+  if( fp == NULL ){
+    printf("FILE ERROR:%s\n",(char*) strerror(errno));
+    perror("file error\n");
+    exit(1);
+  }
   fwrite(vector, sizeof(int), *size, fp);
   fclose(fp);
 
@@ -89,6 +104,8 @@
   int nodes_per_iteration = *nodes_per_iterationf;
   char filename[BUFSIZ];
   int procid;
+  size_t res;
+
 #ifdef WITH_MPI
   MPI_Comm_rank(MPI_COMM_WORLD,&procid);
 #else
@@ -99,9 +116,7 @@
   FILE* fp; int it;
   printf("Opening %s for analysis\n",filename);
   fp = fopen(filename,"rb");
-  //char* errorstr;
-  if(fp == 0) {
-    //errorstr = (char*) strerror(errno);
+  if(fp == NULL) {
     printf("FILE ERROR:%s\n",(char*) strerror(errno));
     perror("file error\n");
     exit(1);
@@ -114,7 +129,8 @@
   for(it=0;it<*NSTEP;it++) {
     int pos = (sizeof(float)*nodes_per_iteration)*(it);
     fseek(fp,pos,SEEK_SET);
-    fread(vector,sizeof(float),nodes_per_iteration,fp);
+    res = fread(vector,sizeof(float),nodes_per_iteration,fp);
+    if (res != nodes_per_iteration) {printf ("Reading error:%s\n",(char*) strerror(errno)); exit (1);}
     for(i=0;i<nodes_per_iteration;i++) {
       max_val = MAX(max_val,vector[i]);
     }
@@ -176,19 +192,20 @@
 
   char* cpu_file = "/scratch/eiger/rietmann/SPECFEM3D/in_out_files/DATABASES_MPI/cpu_proc000001_surface_movie";
   char* gpu_file = "/scratch/eiger/rietmann/SPECFEM3D/in_out_files/DATABASES_MPI/cpu_v2_proc000001_surface_movie";
+  size_t res;
 
   FILE* fp_cpu;
   fp_cpu = fopen(cpu_file,"rb");
   //char* errorstr;
-  if(fp_cpu == 0) {
+  if(fp_cpu == NULL) {
     //errorstr = (char*) strerror(errno);
     //printf("CPU FILE ERROR:%s\n",errorstr);
     printf("CPU FILE ERROR:%s\n",(char*) strerror(errno));
     perror("cpu file error\n");
   }
+
   FILE* fp_gpu;
   fp_gpu = fopen(gpu_file,"rb");
-
   if(fp_gpu == NULL) {
     //errorstr = (char*) strerror(errno);
     //printf("GPU FILE ERROR:%s\n",errorstr);
@@ -208,8 +225,12 @@
     fseek(fp_gpu,pos,SEEK_SET);
 
     int number_of_nodes = *bytes_per_iteration/sizeof(float);
-    fread(cpu_vector,sizeof(float),number_of_nodes,fp_cpu);
-    fread(gpu_vector,sizeof(float),number_of_nodes,fp_gpu);
+    res = fread(cpu_vector,sizeof(float),number_of_nodes,fp_cpu);
+    if (res != number_of_nodes) {printf ("Reading error:%s\n",(char*) strerror(errno)); exit (1);}
+
+    res = fread(gpu_vector,sizeof(float),number_of_nodes,fp_gpu);
+    if (res != number_of_nodes) {printf ("Reading error:%s\n",(char*) strerror(errno)); exit (1);}
+
     int size = number_of_nodes;
     float gpu_min_val=10;
     float gpu_max_val=0;
@@ -240,6 +261,8 @@
 void compare_fvector_(float* vector, int* size, int* id, int* cpu_or_gpu) {
   FILE* fp;
   char cmp_filename[BUFSIZ];
+  size_t res;
+
   float* compare_vector = (float*)malloc(*size*sizeof(float));
   if(*cpu_or_gpu == 0) { //swap gpu/cpu for compare
     sprintf(cmp_filename, "debug_output_gpu_%d.dat",*id);
@@ -247,13 +270,15 @@
   else {
     sprintf(cmp_filename, "debug_output_cpu_%d.dat",*id);
   }
-  fopen(cmp_filename, "rb");
+
+  fp = fopen(cmp_filename, "rb");
   /* read the values */
-  if((fp=fopen(cmp_filename, "rb"))==NULL) {
+  if( fp == NULL ) {
     printf("Cannot open comparison file %s.\n",cmp_filename);
     exit(1);
   }
-  if(fread(compare_vector, sizeof(float), *size, fp) != *size) {
+  res = fread(compare_vector, sizeof(float), *size, fp);
+  if(res != *size) {
     if(feof(fp))
        printf("Premature end of file.");
     else
@@ -284,6 +309,8 @@
 void compare_ivector_(int* vector, int* size, int* id, int* cpu_or_gpu) {
   FILE* fp;
   char cmp_filename[BUFSIZ];
+  size_t res;
+
   int* compare_vector = (int*)malloc(*size*sizeof(int));
   if(*cpu_or_gpu == 0) { //swap gpu/cpu for compare
     sprintf(cmp_filename, "debug_output_gpu_%d.dat",*id);
@@ -291,13 +318,14 @@
   else {
     sprintf(cmp_filename, "debug_output_cpu_%d.dat",*id);
   }
-  fopen(cmp_filename, "rb");
+  fp = fopen(cmp_filename, "rb");
   /* read the values */
-  if((fp=fopen(cmp_filename, "rb"))==NULL) {
+  if( fp == NULL ) {
     printf("Cannot open comparison file %s.\n",cmp_filename);
     exit(1);
   }
-  if(fread(compare_vector, sizeof(int), *size, fp) != *size) {
+  res = fread(compare_vector, sizeof(int), *size, fp);
+  if(res != *size) {
     if(feof(fp))
        printf("Premature end of file.");
     else

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2013-09-09 15:42:03 UTC (rev 22780)
@@ -271,48 +271,6 @@
 
 
 //
-// src/cuda/it_update_displacement_cuda.cu
-//
-
-void FC_FUNC_(it_update_displacement_ic_cuda,
-              IT_UPDATE_DISPLACMENT_IC_CUDA)(long* Mesh_pointer_f,
-                                             realw* deltat_F,
-                                             realw* deltatsqover2_F,
-                                             realw* deltatover2_F,
-                                             int* FORWARD_OR_ADJOINT) {} 
-
-void FC_FUNC_(it_update_displacement_cm_cuda,
-              IT_UPDATE_DISPLACMENT_CM_CUDA)(long* Mesh_pointer_f,
-                                             realw* deltat_F,
-                                             realw* deltatsqover2_F,
-                                             realw* deltatover2_F,
-                                             int* FORWARD_OR_ADJOINT) {} 
-
-void FC_FUNC_(it_update_displacement_oc_cuda,
-              IT_UPDATE_DISPLACEMENT_OC_cuda)(long* Mesh_pointer_f,
-                                              realw* deltat_F,
-                                              realw* deltatsqover2_F,
-                                              realw* deltatover2_F,
-                                              int* FORWARD_OR_ADJOINT) {} 
-
-void FC_FUNC_(update_accel_3_a_cuda,
-              UPDATE_ACCEL_3_A_CUDA)(long* Mesh_pointer,
-                                     realw* deltatover2_F,
-                                     int* NCHUNKS_VAL,
-                                     int* FORWARD_OR_ADJOINT) {} 
-
-void FC_FUNC_(update_veloc_3_b_cuda,
-              UPDATE_VELOC_3_B_CUDA)(long* Mesh_pointer,
-                                     realw* deltatover2_F,
-                                     int* FORWARD_OR_ADJOINT) {} 
-
-void FC_FUNC_(kernel_3_outer_core_cuda,
-              KERNEL_3_OUTER_CORE_CUDA)(long* Mesh_pointer,
-                                        realw* deltatover2_F,
-                                        int* FORWARD_OR_ADJOINT) {} 
-
-
-//
 // src/cuda/noise_tomography_cuda.cu
 //
 
@@ -349,39 +307,31 @@
               PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
                                         int* myrank_f,
                                         int* h_NGLLX,
-                                        realw* h_hprime_xx,
-                                        realw* h_hprimewgll_xx,
+                                        realw* h_hprime_xx,realw* h_hprimewgll_xx,
                                         realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz,
                                         int* NSOURCES,int* nsources_local,
                                         realw* h_sourcearrays,
                                         int* h_islice_selected_source,int* h_ispec_selected_source,
+                                        int* nrec,int* nrec_local, int* nadj_rec_local,
                                         int* h_number_receiver_global,
                                         int* h_islice_selected_rec,int* h_ispec_selected_rec,
-                                        int* nrec,int* nrec_local, int* nadj_rec_local,
                                         int* NSPEC_CRUST_MANTLE, int* NGLOB_CRUST_MANTLE,
                                         int* NSPEC_CRUST_MANTLE_STRAIN_ONLY,
                                         int* NSPEC_OUTER_CORE, int* NGLOB_OUTER_CORE,
                                         int* NSPEC_INNER_CORE, int* NGLOB_INNER_CORE,
                                         int* NSPEC_INNER_CORE_STRAIN_ONLY,
-                                        int* SIMULATION_TYPE,
-                                        int* NOISE_TOMOGRAPHY,
-                                        int* SAVE_FORWARD_f,
-                                        int* ABSORBING_CONDITIONS_f,
-                                        int* OCEANS_f,
-                                        int* GRAVITY_f,
+                                        int* SIMULATION_TYPE,int* NOISE_TOMOGRAPHY,
+                                        int* SAVE_FORWARD_f,int* ABSORBING_CONDITIONS_f,
+                                        int* OCEANS_f,int* GRAVITY_f,
                                         int* ROTATION_f,int* EXACT_MASS_MATRIX_FOR_ROTATION_f,
                                         int* ATTENUATION_f,int* UNDO_ATTENUATION_f,
-                                        int* USE_ATTENUATION_MIMIC_f,
-                                        int* USE_3D_ATTENUATION_ARRAYS_f,
+                                        int* PARTIAL_PHYS_DISPERSION_ONLY_f,int* USE_3D_ATTENUATION_ARRAYS_f,
                                         int* COMPUTE_AND_STORE_STRAIN_f,
-                                        int* ANISOTROPIC_3D_MANTLE_f,
-                                        int* ANISOTROPIC_INNER_CORE_f,
+                                        int* ANISOTROPIC_3D_MANTLE_f,int* ANISOTROPIC_INNER_CORE_f,
                                         int* SAVE_BOUNDARY_MESH_f,
                                         int* USE_MESH_COLORING_GPU_f,
-                                        int* ANISOTROPIC_KL_f,
-                                        int* APPROXIMATE_HESS_KL_f,
-                                        realw* deltat_f,
-                                        realw* b_deltat_f) {} 
+                                        int* ANISOTROPIC_KL_f,int* APPROXIMATE_HESS_KL_f,
+                                        realw* deltat_f,realw* b_deltat_f) {} 
 
 void FC_FUNC_(prepare_fields_rotation_device,
               PREPARE_FIELDS_ROTATION_DEVICE)(long* Mesh_pointer_f,
@@ -792,6 +742,48 @@
 
 
 //
+// src/cuda/update_displacement_cuda.cu
+//
+
+void FC_FUNC_(update_displacement_ic_cuda,
+              UPDATE_DISPLACMENT_IC_CUDA)(long* Mesh_pointer_f,
+                                          realw* deltat_F,
+                                          realw* deltatsqover2_F,
+                                          realw* deltatover2_F,
+                                          int* FORWARD_OR_ADJOINT) {} 
+
+void FC_FUNC_(update_displacement_cm_cuda,
+              UPDATE_DISPLACMENT_CM_CUDA)(long* Mesh_pointer_f,
+                                          realw* deltat_F,
+                                          realw* deltatsqover2_F,
+                                          realw* deltatover2_F,
+                                          int* FORWARD_OR_ADJOINT) {} 
+
+void FC_FUNC_(update_displacement_oc_cuda,
+              UPDATE_DISPLACEMENT_OC_cuda)(long* Mesh_pointer_f,
+                                           realw* deltat_F,
+                                           realw* deltatsqover2_F,
+                                           realw* deltatover2_F,
+                                           int* FORWARD_OR_ADJOINT) {} 
+
+void FC_FUNC_(update_accel_3_a_cuda,
+              UPDATE_ACCEL_3_A_CUDA)(long* Mesh_pointer,
+                                     realw* deltatover2_F,
+                                     int* NCHUNKS_VAL,
+                                     int* FORWARD_OR_ADJOINT) {} 
+
+void FC_FUNC_(update_veloc_3_b_cuda,
+              UPDATE_VELOC_3_B_CUDA)(long* Mesh_pointer,
+                                     realw* deltatover2_F,
+                                     int* FORWARD_OR_ADJOINT) {} 
+
+void FC_FUNC_(kernel_3_outer_core_cuda,
+              KERNEL_3_OUTER_CORE_CUDA)(long* Mesh_pointer,
+                                        realw* deltatover2_F,
+                                        int* FORWARD_OR_ADJOINT) {} 
+
+
+//
 // src/cuda/write_seismograms_cuda.cu
 //
 

Copied: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/update_displacement_cuda.cu (from rev 22778, seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu)
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/update_displacement_cuda.cu	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/update_displacement_cuda.cu	2013-09-09 15:42:03 UTC (rev 22780)
@@ -0,0 +1,611 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            August 2013
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// elastic wavefield
+
+// KERNEL 1
+/* ----------------------------------------------------------------------------------------------- */
+
+
+__global__ void UpdateDispVeloc_kernel(realw* displ,
+                                       realw* veloc,
+                                       realw* accel,
+                                       int size,
+                                       realw deltat,
+                                       realw deltatsqover2,
+                                       realw deltatover2) {
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+  /* because of block and grid sizing problems, there is a small */
+  /* amount of buffer at the end of the calculation */
+  if(id < size) {
+    displ[id] = displ[id] + deltat*veloc[id] + deltatsqover2*accel[id];
+    veloc[id] = veloc[id] + deltatover2*accel[id];
+    accel[id] = 0.0f; // can do this using memset...not sure if faster
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// KERNEL 1
+// inner core
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(update_displacement_ic_cuda,
+              UPDATE_DISPLACMENT_IC_CUDA)(long* Mesh_pointer_f,
+                                          realw* deltat_F,
+                                          realw* deltatsqover2_F,
+                                          realw* deltatover2_F,
+                                          int* FORWARD_OR_ADJOINT) {
+
+TRACE("update_displacement_ic_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
+  int size = NDIM * mp->NGLOB_INNER_CORE;
+
+  realw deltat = *deltat_F;
+  realw deltatsqover2 = *deltatsqover2_F;
+  realw deltatover2 = *deltatover2_F;
+
+  int blocksize = BLOCKSIZE_KERNEL1;
+  int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+
+  int num_blocks_x = size_padded/blocksize;
+  int num_blocks_y = 1;
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  if( *FORWARD_OR_ADJOINT == 1 ){
+    //launch kernel
+    UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ_inner_core,
+                                             mp->d_veloc_inner_core,
+                                             mp->d_accel_inner_core,
+                                             size,deltat,deltatsqover2,deltatover2);
+  }else if( *FORWARD_OR_ADJOINT == 3 ){
+    // kernel for backward fields
+    UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_b_displ_inner_core,
+                                             mp->d_b_veloc_inner_core,
+                                             mp->d_b_accel_inner_core,
+                                             size,deltat,deltatsqover2,deltatover2);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("update_displacement_ic_cuda");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// KERNEL 1
+// crust/mantle
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(update_displacement_cm_cuda,
+              UPDATE_DISPLACMENT_CM_CUDA)(long* Mesh_pointer_f,
+                                          realw* deltat_F,
+                                          realw* deltatsqover2_F,
+                                          realw* deltatover2_F,
+                                          int* FORWARD_OR_ADJOINT) {
+
+  TRACE("update_displacement_cm_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
+  int size = NDIM * mp->NGLOB_CRUST_MANTLE;
+
+  realw deltat = *deltat_F;
+  realw deltatsqover2 = *deltatsqover2_F;
+  realw deltatover2 = *deltatover2_F;
+
+  int blocksize = BLOCKSIZE_KERNEL1;
+  int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+
+  int num_blocks_x = size_padded/blocksize;
+  int num_blocks_y = 1;
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  if( *FORWARD_OR_ADJOINT == 1 ){
+    //launch kernel
+    UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
+                                             mp->d_veloc_crust_mantle,
+                                             mp->d_accel_crust_mantle,
+                                             size,deltat,deltatsqover2,deltatover2);
+  }else if( *FORWARD_OR_ADJOINT == 3 ){
+    // kernel for backward fields
+    UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,
+                                             mp->d_b_veloc_crust_mantle,
+                                             mp->d_b_accel_crust_mantle,
+                                             size,deltat,deltatsqover2,deltatover2);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("update_displacement_cm_cuda");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// acoustic wavefield
+
+// KERNEL 1
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void UpdatePotential_kernel(realw* potential_acoustic,
+                                       realw* potential_dot_acoustic,
+                                       realw* potential_dot_dot_acoustic,
+                                       int size,
+                                       realw deltat,
+                                       realw deltatsqover2,
+                                       realw deltatover2) {
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+  /* because of block and grid sizing problems, there is a small */
+  /* amount of buffer at the end of the calculation */
+  if(id < size) {
+    potential_acoustic[id] = potential_acoustic[id]
+                            + deltat*potential_dot_acoustic[id]
+                            + deltatsqover2*potential_dot_dot_acoustic[id];
+
+    potential_dot_acoustic[id] = potential_dot_acoustic[id]
+                                + deltatover2*potential_dot_dot_acoustic[id];
+
+    potential_dot_dot_acoustic[id] = 0.0f;
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// KERNEL 1
+// outer core
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(update_displacement_oc_cuda,
+              UPDATE_DISPLACEMENT_OC_cuda)(long* Mesh_pointer_f,
+                                           realw* deltat_F,
+                                           realw* deltatsqover2_F,
+                                           realw* deltatover2_F,
+                                           int* FORWARD_OR_ADJOINT) {
+
+  TRACE("update_displacement_oc_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
+  int size = mp->NGLOB_OUTER_CORE;
+
+  realw deltat = *deltat_F;
+  realw deltatsqover2 = *deltatsqover2_F;
+  realw deltatover2 = *deltatover2_F;
+
+  int blocksize = BLOCKSIZE_KERNEL1;
+  int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+
+  int num_blocks_x = size_padded/blocksize;
+  int num_blocks_y = 1;
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  if( *FORWARD_OR_ADJOINT == 1 ){
+    //launch kernel
+    UpdatePotential_kernel<<<grid,threads>>>(mp->d_displ_outer_core,
+                                             mp->d_veloc_outer_core,
+                                             mp->d_accel_outer_core,
+                                             size,deltat,deltatsqover2,deltatover2);
+  }else if( *FORWARD_OR_ADJOINT == 1 ){
+    UpdatePotential_kernel<<<grid,threads>>>(mp->d_b_displ_outer_core,
+                                             mp->d_b_veloc_outer_core,
+                                             mp->d_b_accel_outer_core,
+                                             size,deltat,deltatsqover2,deltatover2);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("update_displacement_oc_cuda");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// KERNEL 3
+//
+// crust/mantle and inner core regions
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void kernel_3_cuda_device(realw* veloc,
+                                     realw* accel,
+                                     int size,
+                                     realw deltatover2,
+                                     realw two_omega_earth,
+                                     realw* rmassx,
+                                     realw* rmassy,
+                                     realw* rmassz) {
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+  /* because of block and grid sizing problems, there is a small */
+  /* amount of buffer at the end of the calculation */
+  if(id < size) {
+    // note: update adds rotational acceleration in case two_omega_earth is non-zero
+    accel[3*id] = accel[3*id]*rmassx[id] + two_omega_earth*veloc[3*id+1]; // (2,i);
+    accel[3*id+1] = accel[3*id+1]*rmassy[id] - two_omega_earth*veloc[3*id]; //(1,i);
+    accel[3*id+2] = accel[3*id+2]*rmassz[id];
+
+    veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
+    veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1];
+    veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2];
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void kernel_3_accel_cuda_device(realw* accel,
+                                           realw* veloc,
+                                           int size,
+                                           realw two_omega_earth,
+                                           realw* rmassx,
+                                           realw* rmassy,
+                                           realw* rmassz) {
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+  /* because of block and grid sizing problems, there is a small */
+  /* amount of buffer at the end of the calculation */
+  if(id < size) {
+    // note: update adds rotational acceleration in case two_omega_earth is non-zero
+    accel[3*id] = accel[3*id]*rmassx[id] + two_omega_earth*veloc[3*id+1]; // (2,i);
+    accel[3*id+1] = accel[3*id+1]*rmassy[id] - two_omega_earth*veloc[3*id]; //(1,i);
+    accel[3*id+2] = accel[3*id+2]*rmassz[id];
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void kernel_3_veloc_cuda_device(realw* veloc,
+                                           realw* accel,
+                                           int size,
+                                           realw deltatover2) {
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+  /* because of block and grid sizing problems, there is a small */
+  /* amount of buffer at the end of the calculation */
+  if(id < size) {
+    veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
+    veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1];
+    veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2];
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(update_accel_3_a_cuda,
+              UPDATE_ACCEL_3_A_CUDA)(long* Mesh_pointer,
+                                     realw* deltatover2_F,
+                                     int* NCHUNKS_VAL,
+                                     int* FORWARD_OR_ADJOINT) {
+  TRACE("update_accel_3_a_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+
+  realw deltatover2 = *deltatover2_F;
+
+  int blocksize = BLOCKSIZE_KERNEL3;
+  int size_padded = ((int)ceil(((double)mp->NGLOB_CRUST_MANTLE)/((double)blocksize)))*blocksize;
+
+  int num_blocks_x = size_padded/blocksize;
+  int num_blocks_y = 1;
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  // crust/mantle region only
+  // check whether we can update accel and veloc, or only accel at this point
+  if( mp->oceans == 0 ){
+    // updates both, accel and veloc
+
+    if( *NCHUNKS_VAL != 6 && mp->absorbing_conditions){
+      // uses corrected mass matrices rmassx,rmassy,rmassz
+      if( *FORWARD_OR_ADJOINT == 1 ){
+        kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_crust_mantle,
+                                                 mp->d_accel_crust_mantle,
+                                                 mp->NGLOB_CRUST_MANTLE,
+                                                 deltatover2,
+                                                 mp->two_omega_earth,
+                                                 mp->d_rmassx_crust_mantle,
+                                                 mp->d_rmassy_crust_mantle,
+                                                 mp->d_rmassz_crust_mantle);
+      }else if( *FORWARD_OR_ADJOINT == 3 ){
+        kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_crust_mantle,
+                                                 mp->d_b_accel_crust_mantle,
+                                                 mp->NGLOB_CRUST_MANTLE,
+                                                 deltatover2,
+                                                 mp->b_two_omega_earth,
+                                                 mp->d_rmassx_crust_mantle,
+                                                 mp->d_rmassy_crust_mantle,
+                                                 mp->d_rmassz_crust_mantle);
+      }
+    }else{
+      // uses only rmassz
+      if( *FORWARD_OR_ADJOINT == 1 ){
+        kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_crust_mantle,
+                                                 mp->d_accel_crust_mantle,
+                                                 mp->NGLOB_CRUST_MANTLE,
+                                                 deltatover2,
+                                                 mp->two_omega_earth,
+                                                 mp->d_rmassz_crust_mantle,
+                                                 mp->d_rmassz_crust_mantle,
+                                                 mp->d_rmassz_crust_mantle);
+      }else if( *FORWARD_OR_ADJOINT == 3 ){
+        kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_crust_mantle,
+                                                 mp->d_b_accel_crust_mantle,
+                                                 mp->NGLOB_CRUST_MANTLE,
+                                                 deltatover2,
+                                                 mp->b_two_omega_earth,
+                                                 mp->d_rmassz_crust_mantle,
+                                                 mp->d_rmassz_crust_mantle,
+                                                 mp->d_rmassz_crust_mantle);
+      }
+    }
+
+  }else{
+    // updates only accel
+
+    if( *NCHUNKS_VAL != 6 && mp->absorbing_conditions){
+      // uses corrected mass matrices
+      if( *FORWARD_OR_ADJOINT == 1 ){
+        kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_accel_crust_mantle,
+                                                       mp->d_veloc_crust_mantle,
+                                                       mp->NGLOB_CRUST_MANTLE,
+                                                       mp->two_omega_earth,
+                                                       mp->d_rmassx_crust_mantle,
+                                                       mp->d_rmassy_crust_mantle,
+                                                       mp->d_rmassz_crust_mantle);
+      }else if( *FORWARD_OR_ADJOINT == 3 ){
+        kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_b_accel_crust_mantle,
+                                                       mp->d_b_veloc_crust_mantle,
+                                                       mp->NGLOB_CRUST_MANTLE,
+                                                       mp->b_two_omega_earth,
+                                                       mp->d_rmassx_crust_mantle,
+                                                       mp->d_rmassy_crust_mantle,
+                                                       mp->d_rmassz_crust_mantle);
+      }
+    }else{
+      // uses only rmassz
+      if( *FORWARD_OR_ADJOINT == 1 ){
+        kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_accel_crust_mantle,
+                                                       mp->d_veloc_crust_mantle,
+                                                       mp->NGLOB_CRUST_MANTLE,
+                                                       mp->two_omega_earth,
+                                                       mp->d_rmassz_crust_mantle,
+                                                       mp->d_rmassz_crust_mantle,
+                                                       mp->d_rmassz_crust_mantle);
+      }else if( *FORWARD_OR_ADJOINT == 3 ){
+        kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_b_accel_crust_mantle,
+                                                       mp->d_b_veloc_crust_mantle,
+                                                       mp->NGLOB_CRUST_MANTLE,
+                                                       mp->b_two_omega_earth,
+                                                       mp->d_rmassz_crust_mantle,
+                                                       mp->d_rmassz_crust_mantle,
+                                                       mp->d_rmassz_crust_mantle);
+      }
+    }
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
+  exit_on_cuda_error("after update_accel_3_a");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(update_veloc_3_b_cuda,
+              UPDATE_VELOC_3_B_CUDA)(long* Mesh_pointer,
+                                     realw* deltatover2_F,
+                                     int* FORWARD_OR_ADJOINT) {
+  TRACE("update_veloc_3_b_cuda");
+  int size_padded,num_blocks_x,num_blocks_y;
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+
+  realw deltatover2 = *deltatover2_F;
+
+  int blocksize = BLOCKSIZE_KERNEL3;
+
+  // crust/mantle region
+  // in case of ocean loads, we still have to update the velocity for crust/mantle region
+  if( mp->oceans ){
+    size_padded = ((int)ceil(((double)mp->NGLOB_CRUST_MANTLE)/((double)blocksize)))*blocksize;
+    num_blocks_x = size_padded/blocksize;
+    num_blocks_y = 1;
+    while(num_blocks_x > MAXIMUM_GRID_DIM) {
+      num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+      num_blocks_y = num_blocks_y*2;
+    }
+    dim3 grid1(num_blocks_x,num_blocks_y);
+    dim3 threads1(blocksize,1,1);
+
+    // updates only veloc at this point
+    if( *FORWARD_OR_ADJOINT == 1 ){
+      kernel_3_veloc_cuda_device<<< grid1, threads1>>>(mp->d_veloc_crust_mantle,
+                                                       mp->d_accel_crust_mantle,
+                                                       mp->NGLOB_CRUST_MANTLE,
+                                                       deltatover2);
+    }else if( *FORWARD_OR_ADJOINT == 3 ){
+      kernel_3_veloc_cuda_device<<< grid1, threads1>>>(mp->d_b_veloc_crust_mantle,
+                                                       mp->d_b_accel_crust_mantle,
+                                                       mp->NGLOB_CRUST_MANTLE,
+                                                       deltatover2);
+    }
+  }
+
+  // inner core region
+  size_padded = ((int)ceil(((double)mp->NGLOB_INNER_CORE)/((double)blocksize)))*blocksize;
+  num_blocks_x = size_padded/blocksize;
+  num_blocks_y = 1;
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  // updates both, accel and veloc
+  if( *FORWARD_OR_ADJOINT == 1 ){
+    kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_inner_core,
+                                             mp->d_accel_inner_core,
+                                             mp->NGLOB_INNER_CORE,
+                                             deltatover2,
+                                             mp->two_omega_earth,
+                                             mp->d_rmassx_inner_core,
+                                             mp->d_rmassy_inner_core,
+                                             mp->d_rmassz_inner_core);
+  }else if( *FORWARD_OR_ADJOINT == 3 ){
+    kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_inner_core,
+                                             mp->d_b_accel_inner_core,
+                                             mp->NGLOB_INNER_CORE,
+                                             deltatover2,
+                                             mp->b_two_omega_earth,
+                                             mp->d_rmassx_inner_core,
+                                             mp->d_rmassy_inner_core,
+                                             mp->d_rmassz_inner_core);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
+  exit_on_cuda_error("after update_veloc_3_b");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// KERNEL 3
+//
+// for outer_core
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+__global__ void kernel_3_outer_core_cuda_device(realw* veloc,
+                                                realw* accel,int size,
+                                                realw deltatover2,
+                                                realw* rmass) {
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+  /* because of block and grid sizing problems, there is a small */
+  /* amount of buffer at the end of the calculation */
+  if(id < size) {
+    // multiplies pressure with the inverse of the mass matrix
+    accel[id] = accel[id]*rmass[id];
+
+    // Newmark time scheme: corrector term
+    veloc[id] = veloc[id] + deltatover2*accel[id];
+  }
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(kernel_3_outer_core_cuda,
+              KERNEL_3_OUTER_CORE_CUDA)(long* Mesh_pointer,
+                                        realw* deltatover2_F,
+                                        int* FORWARD_OR_ADJOINT) {
+
+  TRACE("kernel_3_outer_core_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+
+  realw deltatover2 = *deltatover2_F;
+
+  int blocksize = BLOCKSIZE_KERNEL3;
+
+  int size_padded = ((int)ceil(((double)mp->NGLOB_OUTER_CORE)/((double)blocksize)))*blocksize;
+  int num_blocks_x = size_padded/blocksize;
+  int num_blocks_y = 1;
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+    num_blocks_y = num_blocks_y*2;
+  }
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  if( *FORWARD_OR_ADJOINT == 1 ){
+    kernel_3_outer_core_cuda_device<<< grid, threads>>>(mp->d_veloc_outer_core,
+                                                        mp->d_accel_outer_core,
+                                                        mp->NGLOB_OUTER_CORE,
+                                                        deltatover2,mp->d_rmass_outer_core);
+  }else if( *FORWARD_OR_ADJOINT == 3){
+    kernel_3_outer_core_cuda_device<<< grid, threads>>>(mp->d_b_veloc_outer_core,
+                                                        mp->d_b_accel_outer_core,
+                                                        mp->NGLOB_OUTER_CORE,
+                                                        deltatover2,mp->d_rmass_outer_core);
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
+  exit_on_cuda_error("after kernel_3_outer_core");
+#endif
+}
+

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_seismograms.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_seismograms.f90	2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_seismograms.f90	2013-09-09 15:42:03 UTC (rev 22780)
@@ -34,7 +34,7 @@
 
   use specfem_par,only: &
     NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
-    nrec,nrec_local, &
+    nrec_local, &
     nu,hxir_store,hetar_store,hgammar_store, &
     ispec_selected_rec,number_receiver_global, &
     scale_displ
@@ -117,7 +117,7 @@
 
   use specfem_par,only: &
     NSTEP,NTSTEP_BETWEEN_OUTPUT_SEISMOS,UNDO_ATTENUATION, &
-    NSOURCES,nrec_local, &
+    nrec_local, &
     nu_source,Mxx,Myy,Mzz,Mxy,Mxz,Myz, &
     hxir_store,hpxir_store,hetar_store,hpetar_store,hgammar_store,hpgammar_store, &
     tshift_cmt,hdur_gaussian, &

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2013-09-09 15:42:03 UTC (rev 22780)
@@ -1829,7 +1829,7 @@
 
   ! user output
   if(myrank == 0 ) then
-    write(IMAIN,*) "preparing Fields and Constants on GPU Device"
+    write(IMAIN,*) "preparing fields and constants on GPU devices:"
     call flush_IMAIN()
   endif
 
@@ -1841,33 +1841,31 @@
 
   ! prepares general fields on GPU
   call prepare_constants_device(Mesh_pointer,myrank,NGLLX, &
-                                  hprime_xx, &
-                                  hprimewgll_xx, &
-                                  wgllwgll_xy, wgllwgll_xz, wgllwgll_yz, &
-                                  NSOURCES, nsources_local, &
-                                  sourcearrays, &
-                                  islice_selected_source,ispec_selected_source, &
-                                  number_receiver_global, &
-                                  islice_selected_rec,ispec_selected_rec, &
-                                  nrec, nrec_local, nadj_rec_local, &
-                                  NSPEC_CRUST_MANTLE,NGLOB_CRUST_MANTLE, &
-                                  NSPEC_CRUST_MANTLE_STRAIN_ONLY, &
-                                  NSPEC_OUTER_CORE,NGLOB_OUTER_CORE, &
-                                  NSPEC_INNER_CORE,NGLOB_INNER_CORE, &
-                                  NSPEC_INNER_CORE_STRAIN_ONLY, &
-                                  SIMULATION_TYPE,NOISE_TOMOGRAPHY, &
-                                  SAVE_FORWARD,ABSORBING_CONDITIONS, &
-                                  OCEANS_VAL, &
-                                  GRAVITY_VAL, &
-                                  ROTATION_VAL,EXACT_MASS_MATRIX_FOR_ROTATION, &
-                                  ATTENUATION_VAL,UNDO_ATTENUATION, &
-                                  PARTIAL_PHYS_DISPERSION_ONLY,USE_3D_ATTENUATION_ARRAYS, &
-                                  COMPUTE_AND_STORE_STRAIN, &
-                                  ANISOTROPIC_3D_MANTLE_VAL,ANISOTROPIC_INNER_CORE_VAL, &
-                                  SAVE_BOUNDARY_MESH, &
-                                  USE_MESH_COLORING_GPU, &
-                                  ANISOTROPIC_KL,APPROXIMATE_HESS_KL, &
-                                  deltat,b_deltat)
+                                hprime_xx,hprimewgll_xx, &
+                                wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+                                NSOURCES, nsources_local, &
+                                sourcearrays, &
+                                islice_selected_source,ispec_selected_source, &
+                                nrec, nrec_local, nadj_rec_local, &
+                                number_receiver_global, &
+                                islice_selected_rec,ispec_selected_rec, &
+                                NSPEC_CRUST_MANTLE,NGLOB_CRUST_MANTLE, &
+                                NSPEC_CRUST_MANTLE_STRAIN_ONLY, &
+                                NSPEC_OUTER_CORE,NGLOB_OUTER_CORE, &
+                                NSPEC_INNER_CORE,NGLOB_INNER_CORE, &
+                                NSPEC_INNER_CORE_STRAIN_ONLY, &
+                                SIMULATION_TYPE,NOISE_TOMOGRAPHY, &
+                                SAVE_FORWARD,ABSORBING_CONDITIONS, &
+                                OCEANS_VAL,GRAVITY_VAL, &
+                                ROTATION_VAL,EXACT_MASS_MATRIX_FOR_ROTATION, &
+                                ATTENUATION_VAL,UNDO_ATTENUATION, &
+                                PARTIAL_PHYS_DISPERSION_ONLY,USE_3D_ATTENUATION_ARRAYS, &
+                                COMPUTE_AND_STORE_STRAIN, &
+                                ANISOTROPIC_3D_MANTLE_VAL,ANISOTROPIC_INNER_CORE_VAL, &
+                                SAVE_BOUNDARY_MESH, &
+                                USE_MESH_COLORING_GPU, &
+                                ANISOTROPIC_KL,APPROXIMATE_HESS_KL, &
+                                deltat,b_deltat)
 
   ! prepares rotation arrays
   if( ROTATION_VAL ) then

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/rules.mk
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/rules.mk	2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/rules.mk	2013-09-09 15:42:03 UTC (rev 22780)
@@ -155,10 +155,10 @@
 	$O/compute_stacey_acoustic_cuda.cuda.o \
 	$O/compute_stacey_elastic_cuda.cuda.o \
 	$O/initialize_cuda.cuda.o \
-	$O/it_update_displacement_cuda.cuda.o \
 	$O/noise_tomography_cuda.cuda.o \
 	$O/prepare_mesh_constants_cuda.cuda.o \
 	$O/transfer_fields_cuda.cuda.o \
+	$O/update_displacement_cuda.cuda.o \
 	$O/write_seismograms_cuda.cuda.o \
 	$O/save_and_compare_cpu_vs_gpu.cudacc.o \
 	$(EMPTY_MACRO)
@@ -243,6 +243,8 @@
 	@echo ""
 	${NVCCLINK} -o $(cuda_DEVICE_OBJ) $(cuda_OBJECTS)
 	${FCLINK} -o ${E}/xspecfem3D $(XSPECFEM_OBJECTS) $(cuda_DEVICE_OBJ) $(MPILIBS) $(CUDA_LINK)
+	@echo ""
+
 else
 
 ## cuda 4 version
@@ -251,6 +253,8 @@
 	@echo "building xspecfem3D with CUDA 4 support"
 	@echo ""
 	${FCLINK} -o ${E}/xspecfem3D $(XSPECFEM_OBJECTS) $(MPILIBS) $(CUDA_LINK)
+	@echo ""
+
 endif
 
 else
@@ -264,6 +268,7 @@
 ## DK DK add OpenMP compiler flag here if needed
 #	${MPIFCCOMPILE_CHECK} -qsmp=omp -o ${E}/xspecfem3D $(XSPECFEM_OBJECTS) $(MPILIBS)
 	${MPIFCCOMPILE_CHECK} -o ${E}/xspecfem3D $(XSPECFEM_OBJECTS) $(MPILIBS)
+	@echo ""
 
 endif
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/update_displacement_Newmark.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/update_displacement_Newmark.f90	2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/update_displacement_Newmark.f90	2013-09-09 15:42:03 UTC (rev 22780)
@@ -87,11 +87,11 @@
     ! on GPU
     ! Includes FORWARD_OR_ADJOINT == 1
     ! outer core region
-    call it_update_displacement_oc_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,1)
+    call update_displacement_oc_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,1)
     ! inner core region
-    call it_update_displacement_ic_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,1)
+    call update_displacement_ic_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,1)
     ! crust/mantle region
-    call it_update_displacement_cm_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,1)
+    call update_displacement_cm_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,1)
   endif
 
   end subroutine update_displacement_Newmark
@@ -130,11 +130,11 @@
     ! on GPU
     ! Includes FORWARD_OR_ADJOINT == 3
     ! outer core region
-    call it_update_displacement_oc_cuda(Mesh_pointer,b_deltat,b_deltatsqover2,b_deltatover2,3)
+    call update_displacement_oc_cuda(Mesh_pointer,b_deltat,b_deltatsqover2,b_deltatover2,3)
     ! inner core region
-    call it_update_displacement_ic_cuda(Mesh_pointer,b_deltat,b_deltatsqover2,b_deltatover2,3)
+    call update_displacement_ic_cuda(Mesh_pointer,b_deltat,b_deltatsqover2,b_deltatover2,3)
     ! crust/mantle region
-    call it_update_displacement_cm_cuda(Mesh_pointer,b_deltat,b_deltatsqover2,b_deltatover2,3)
+    call update_displacement_cm_cuda(Mesh_pointer,b_deltat,b_deltatsqover2,b_deltatover2,3)
   endif
 
   end subroutine update_displacement_Newmark_backward

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_output_SAC.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_output_SAC.f90	2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_output_SAC.f90	2013-09-09 15:42:03 UTC (rev 22780)
@@ -34,7 +34,6 @@
 
   use specfem_par,only: &
           ANGULAR_WIDTH_XI_IN_DEGREES,NEX_XI, &
-          myrank,nrec, &
           station_name,network_name,stlat,stlon,stele,stbur, &
           DT,t0, &
           seismo_offset,seismo_current,it_end, &

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_seismograms.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_seismograms.f90	2013-09-09 11:33:19 UTC (rev 22779)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_seismograms.f90	2013-09-09 15:42:03 UTC (rev 22780)
@@ -334,15 +334,12 @@
 
   use constants_solver
   use specfem_par,only: &
-          ANGULAR_WIDTH_XI_IN_DEGREES,NEX_XI, &
-          myrank,nrec, &
-          station_name,network_name,stlat,stlon,stele,stbur, &
-          DT,t0, &
-          seismo_offset,seismo_current,it_end, &
+          myrank, &
+          station_name,network_name,stlat,stlon, &
+          DT, &
+          seismo_current, &
           OUTPUT_SEISMOS_ASCII_TEXT,OUTPUT_SEISMOS_SAC_ALPHANUM, &
-          OUTPUT_SEISMOS_SAC_BINARY,ROTATE_SEISMOGRAMS_RT,NTSTEP_BETWEEN_OUTPUT_SEISMOS, &
-          SAVE_ALL_SEISMOS_IN_ONE_FILE,USE_BINARY_FOR_LARGE_FILE, &
-          MODEL,OUTPUT_FILES
+          OUTPUT_SEISMOS_SAC_BINARY,ROTATE_SEISMOGRAMS_RT,NTSTEP_BETWEEN_OUTPUT_SEISMOS
 
   use specfem_par,only: &
           cmt_lat=>cmt_lat_SAC,cmt_lon=>cmt_lon_SAC

Added: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/utils/check_cuda_device.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/utils/check_cuda_device.cu	                        (rev 0)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/utils/check_cuda_device.cu	2013-09-09 15:42:03 UTC (rev 22780)
@@ -0,0 +1,288 @@
+/* 
+**************************
+
+check_cuda_device utility
+
+**************************
+
+this utility program will output GPU device informations helpful for debugging CUDA.
+
+
+for compilation, see the command-line examples given here:
+
+- example without MPI support:
+
+nvcc -o check_cuda_device check_cuda_device.cu
+./check_cuda_device
+
+- example with MPI support:
+
+nvcc -DWITH_MPI -I/usr/lib/openmpi/include -o check_cuda_device check_cuda_device.cu -lmpi -L/usr/lib/openmpi/lib
+mpirun -np 2 ./check_cuda_device
+
+
+*/
+
+#include <stdio.h>
+#include <cuda.h>
+
+#ifdef WITH_MPI
+#include <mpi.h>
+#endif
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void get_free_memory(double* free_db, double* used_db, double* total_db) {
+
+  // gets memory usage in byte
+  size_t free_byte ;
+  size_t total_byte ;
+  cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ;
+  if ( cudaSuccess != cuda_status ){
+    printf("Error: cudaMemGetInfo fails, %s \n", cudaGetErrorString(cuda_status) );
+    exit(EXIT_FAILURE);
+  }
+
+  *free_db = (double)free_byte ;
+  *total_db = (double)total_byte ;
+  *used_db = *total_db - *free_db ;
+  return;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void exit_on_error(char* info) {
+  printf("\nERROR: %s\n",info);
+  fflush(stdout);
+  
+  // stops program
+#ifdef WITH_MPI
+  MPI_Abort(MPI_COMM_WORLD,1);
+#endif
+  //free(info);
+  exit(EXIT_FAILURE);
+  return;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void exit_on_cuda_error(char* kernel_name) {
+  // sync and check to catch errors from previous async operations
+  cudaThreadSynchronize();
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess){
+    printf("Error after %s: %s\n", kernel_name, cudaGetErrorString(err));
+
+    // releases previous contexts
+#if CUDA_VERSION < 4000
+    cudaThreadExit();
+#else
+    cudaDeviceReset();
+#endif
+    
+    // stops program
+    //free(kernel_name);
+#ifdef WITH_MPI
+    MPI_Abort(MPI_COMM_WORLD,1);
+#endif
+    exit(EXIT_FAILURE);
+  }
+}
+
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// GPU initialization
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+void initialize_cuda_device(int* myrank_f,int* ncuda_devices) {
+
+  int device;
+  int device_count;
+
+  // Gets rank number of MPI process
+  int myrank = *myrank_f;
+
+  /*
+  // cuda initialization (needs -lcuda library)
+  // note:   cuInit initializes the driver API.
+  //             it is needed for any following CUDA driver API function call (format cuFUNCTION(..) )
+  //             however, for the CUDA runtime API functions (format cudaFUNCTION(..) )
+  //             the initialization is implicit, thus cuInit() here would not be needed...
+  CUresult status = cuInit(0);
+  if ( CUDA_SUCCESS != status ) exit_on_error("CUDA driver API device initialization failed\n");
+
+  // returns a handle to the first cuda compute device
+  CUdevice dev;
+  status = cuDeviceGet(&dev, 0);
+  if ( CUDA_SUCCESS != status ) exit_on_error("CUDA device not found\n");
+
+  // gets device properties
+  int major,minor;
+  status = cuDeviceComputeCapability(&major,&minor,dev);
+  if ( CUDA_SUCCESS != status ) exit_on_error("CUDA device information not found\n");
+
+  // make sure that the device has compute capability >= 1.3
+  if (major < 1){
+    fprintf(stderr,"Compute capability major number should be at least 1, got: %d \nexiting...\n",major);
+    exit_on_error("CUDA Compute capability major number should be at least 1\n");
+  }
+  if (major == 1 && minor < 3){
+    fprintf(stderr,"Compute capability should be at least 1.3, got: %d.%d \nexiting...\n",major,minor);
+    exit_on_error("CUDA Compute capability major number should be at least 1.3\n");
+  }
+   */
+
+  // note: from here on we use the runtime API  ...
+
+  // Gets number of GPU devices
+  device_count = 0;
+  cudaGetDeviceCount(&device_count);
+  exit_on_cuda_error("CUDA runtime error: cudaGetDeviceCount failed\ncheck if driver and runtime libraries work together\nexiting...\n");
+
+  // returns device count to fortran
+  if (device_count == 0) exit_on_error("CUDA runtime error: there is no device supporting CUDA\n");
+  *ncuda_devices = device_count;
+
+  // Sets the active device
+  if(device_count >= 1) {
+    // generalized for more GPUs per node
+    // note: without previous context release, cudaSetDevice will complain with the cuda error
+    //         "setting the device when a process is active is not allowed"
+
+    // releases previous contexts
+#if CUDA_VERSION < 4000
+    cudaThreadExit();
+#else
+    cudaDeviceReset();
+#endif
+
+    //printf("rank %d: cuda device count = %d sets device = %d \n",myrank,device_count,myrank % device_count);
+    //MPI_Barrier(MPI_COMM_WORLD);
+
+    // sets active device
+    device = myrank % device_count;
+    cudaSetDevice( device );
+    exit_on_cuda_error("cudaSetDevice has invalid device");
+
+    // double check that device was  properly selected
+    cudaGetDevice(&device);
+    if( device != (myrank % device_count) ){
+       printf("error rank: %d devices: %d \n",myrank,device_count);
+       printf("  cudaSetDevice()=%d\n  cudaGetDevice()=%d\n",myrank%device_count,device);
+       exit_on_error("CUDA set/get device error: device id conflict \n");
+    }
+  }
+
+  // returns a handle to the active device
+  cudaGetDevice(&device);
+
+  // get device properties
+  struct cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp,device);
+
+  // exit if the machine has no CUDA-enabled device
+  if (deviceProp.major == 9999 && deviceProp.minor == 9999){
+    printf("No CUDA-enabled device found, exiting...\n\n");
+    exit_on_error("CUDA runtime error: there is no CUDA-enabled device found\n");
+  }
+
+  // outputs device infos to file
+
+  printf("GPU device for rank: %d\n\n",myrank);
+
+  // display device properties
+  printf("Device Name = %s\n",deviceProp.name);
+  printf("multiProcessorCount: %d\n",deviceProp.multiProcessorCount);
+  printf("totalGlobalMem (in MB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f));
+  printf("totalGlobalMem (in GB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f * 1024.f));
+  printf("sharedMemPerBlock (in bytes): %lu\n",(unsigned long) deviceProp.sharedMemPerBlock);
+  printf("Maximum number of threads per block: %d\n",deviceProp.maxThreadsPerBlock);
+  printf("Maximum size of each dimension of a block: %d x %d x %d\n",
+          deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]);
+  printf("Maximum sizes of each dimension of a grid: %d x %d x %d\n",
+          deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]);
+  printf("Compute capability of the device = %d.%d\n", deviceProp.major, deviceProp.minor);
+  if(deviceProp.canMapHostMemory){
+    printf("canMapHostMemory: TRUE\n");
+  }else{
+    printf("canMapHostMemory: FALSE\n");
+  }
+  if(deviceProp.deviceOverlap){
+    printf("deviceOverlap: TRUE\n");
+  }else{
+    printf("deviceOverlap: FALSE\n");
+  }
+
+  // outputs initial memory infos via cudaMemGetInfo()
+  double free_db,used_db,total_db;
+  get_free_memory(&free_db,&used_db,&total_db);
+  printf("\n%d: GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n\n",myrank,
+          used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
+
+
+  // make sure that the device has compute capability >= 1.3
+  if (deviceProp.major < 1){
+    printf("Compute capability major number should be at least 1, exiting...\n\n");
+    exit_on_error("CUDA Compute capability major number should be at least 1\n");
+  }
+  if (deviceProp.major == 1 && deviceProp.minor < 3){
+    printf("Compute capability should be at least 1.3, exiting...\n");
+    exit_on_error("CUDA Compute capability major number should be at least 1.3\n");
+  }
+  // we use pinned memory for asynchronous copy
+  if( ! deviceProp.canMapHostMemory){
+    printf("Device capability should allow to map host memory, exiting...\n");
+    exit_on_error("CUDA Device capability canMapHostMemory should be TRUE\n");
+  }
+}
+
+
+int main(int argc, char **argv)
+{
+
+  int myrank,ndevices;
+
+  // initialize
+#ifdef WITH_MPI
+  int size;
+  int rc;
+  rc = MPI_Init(&argc,&argv);
+  if (rc != MPI_SUCCESS) {
+    printf ("Error starting MPI program. Terminating.\n");
+    MPI_Abort(MPI_COMM_WORLD, rc);
+  }
+  MPI_Comm_size(MPI_COMM_WORLD,&size);
+  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+  if( myrank == 0 ){ printf ("Number of MPI processes = %d \n",size); }
+#else
+  myrank = 0;
+#endif
+
+  ndevices = 0;
+  
+  // initializes cuda devices
+  initialize_cuda_device(&myrank,&ndevices);
+
+  // releases previous contexts
+#if CUDA_VERSION < 4000
+  cudaThreadExit();
+#else
+  cudaDeviceReset();
+#endif
+  
+  printf("number of total devices: %d\n\n",ndevices);
+
+#ifdef WITH_MPI
+  MPI_Finalize();
+#endif
+  return 0;
+}
+