[cig-commits] r20570 - in seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER: EXAMPLES/regional_Greece_small/DATA setup src/cuda src/meshfem3D src/shared src/specfem3D

danielpeter at geodynamics.org danielpeter at geodynamics.org
Wed Aug 15 12:55:45 PDT 2012


Author: danielpeter
Date: 2012-08-15 12:55:44 -0700 (Wed, 15 Aug 2012)
New Revision: 20570

Modified:
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_Greece_small/DATA/Par_file
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/setup/constants.h.in
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_constants_cuda.h
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/create_MPI_interfaces.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/create_mass_matrices.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/create_regions_mesh.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/get_perm_color.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/model_atten3D_QRFSI12.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/model_attenuation.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/model_s362ani.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/save_arrays_solver.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/setup_MPI_interfaces.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/setup_color_perm.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/setup_model.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/exit_mpi.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/get_model_parameters.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/read_compute_parameters.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/check_simulation_stability.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_coupling.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/get_attenuation.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D_par.F90
Log:
updates ocean coupling; updates mesh coloring for attenuation arrays; adds flag in constants.h.in to use 3D attenuation by default

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_Greece_small/DATA/Par_file
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_Greece_small/DATA/Par_file	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/regional_Greece_small/DATA/Par_file	2012-08-15 19:55:44 UTC (rev 20570)
@@ -37,15 +37,15 @@
 
 # parameters describing the Earth model
 OCEANS                          = .true.
-ELLIPTICITY                     = .false.
+ELLIPTICITY                     = .true.
 TOPOGRAPHY                      = .true.
-GRAVITY                         = .false.
-ROTATION                        = .false.
-ATTENUATION                     = .false.
+GRAVITY                         = .true.
+ROTATION                        = .true.
+ATTENUATION                     = .true.
 ATTENUATION_NEW                 = .false.
 
 # absorbing boundary conditions for a regional simulation
-ABSORBING_CONDITIONS            = .false.
+ABSORBING_CONDITIONS            = .true.
 
 # record length in minutes
 RECORD_LENGTH_IN_MINUTES        = 15.0d0

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/setup/constants.h.in
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/setup/constants.h.in	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/setup/constants.h.in	2012-08-15 19:55:44 UTC (rev 20570)
@@ -231,17 +231,18 @@
 ! the mesher) and use them for the computation of boundary kernel (in the solver)
   logical, parameter :: SAVE_BOUNDARY_MESH = .false.
 
+! mesh coloring:
 ! add mesh coloring for the GPU + MPI implementation
   logical, parameter :: USE_MESH_COLORING_GPU = .false.
   integer, parameter :: MAX_NUMBER_OF_COLORS = 1000
 
 ! enhanced coloring:
-!
 ! using Droux algorithm
 ! try several times with one more color before giving up
   logical, parameter :: USE_DROUX_OPTIMIZATION = .false.
   integer, parameter :: MAX_NB_TRIES_OF_DROUX_1993 = 15
-!
+
+! using balancing algorithm
 ! postprocess the colors to balance them if Droux (1993) algorithm is not used
   logical, parameter :: BALANCE_COLORS_SIMPLE_ALGO = .false.
 
@@ -270,6 +271,12 @@
 ! (default is to use transverse isotropy only between MOHO and 220 )
   logical, parameter :: USE_FULL_TISO_MANTLE = .false.
 
+! forces attenuation arrays to be fully 3D and defined on all GLL points
+! (useful for more accurate attenuation profiles and adjoint inversions)
+! (if set to .false., 3D attenuation is only used for models with 3D attenuation distributions)
+  logical, parameter :: USE_3D_ATTENUATION = .true.
+
+
 !!-----------------------------------------------------------
 !!
 !! time stamp information

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu	2012-08-15 19:55:44 UTC (rev 20570)
@@ -266,444 +266,11 @@
 
 
 /* ----------------------------------------------------------------------------------------------- */
-//daniel: helper function
-/*
- __global__ void check_phase_ispec_kernel(int num_phase_ispec,
- int* phase_ispec,
- int NSPEC_AB,
- int* ier) {
 
- int i,ispec,iphase,count0,count1;
- *ier = 0;
+// scalar arrays (acoustic/fluid outer core)
 
- for(iphase=0; iphase < 2; iphase++){
- count0 = 0;
- count1 = 0;
-
- for(i=0; i < num_phase_ispec; i++){
- ispec = phase_ispec[iphase*num_phase_ispec + i] - 1;
- if( ispec < -1 || ispec >= NSPEC_AB ){
- printf("Error in d_phase_ispec_inner_elastic %d %d\n",i,ispec);
- *ier = 1;
- return;
- }
- if( ispec >= 0 ){ count0++;}
- if( ispec < 0 ){ count1++;}
- }
-
- printf("check_phase_ispec done: phase %d, count = %d %d \n",iphase,count0,count1);
-
- }
- }
-
- void check_phase_ispec(long* Mesh_pointer_f,int type){
-
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
- printf("check phase_ispec for type=%d\n",type);
-
- dim3 grid(1,1);
- dim3 threads(1,1,1);
-
- int* h_debug = (int*) calloc(1,sizeof(int));
- int* d_debug;
- cudaMalloc((void**)&d_debug,sizeof(int));
-
- if( type == 1 ){
- check_phase_ispec_kernel<<<grid,threads>>>(mp->num_phase_ispec_elastic,
- mp->d_phase_ispec_inner_elastic,
- mp->NSPEC_AB,
- d_debug);
- }else if( type == 2 ){
- check_phase_ispec_kernel<<<grid,threads>>>(mp->num_phase_ispec_acoustic,
- mp->d_phase_ispec_inner_acoustic,
- mp->NSPEC_AB,
- d_debug);
- }
-
- cudaMemcpy(h_debug,d_debug,1*sizeof(int),cudaMemcpyDeviceToHost);
- cudaFree(d_debug);
- if( *h_debug != 0 ){printf("error for type=%d\n",type); exit(1);}
- free(h_debug);
- fflush(stdout);
-
- #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("check_phase_ispec");
- #endif
-
- }
- */
-
 /* ----------------------------------------------------------------------------------------------- */
-//daniel: helper function
-/*
- __global__ void check_ispec_is_kernel(int NSPEC_AB,
- int* ispec_is,
- int* ier) {
 
- int ispec,count0,count1;
-
- *ier = 0;
- count0 = 0;
- count1 = 0;
- for(ispec=0; ispec < NSPEC_AB; ispec++){
- if( ispec_is[ispec] < -1 || ispec_is[ispec] > 1 ){
- printf("Error in ispec_is %d %d\n",ispec,ispec_is[ispec]);
- *ier = 1;
- return;
- //exit(1);
- }
- if( ispec_is[ispec] == 0 ){count0++;}
- if( ispec_is[ispec] != 0 ){count1++;}
- }
- printf("check_ispec_is done: count = %d %d\n",count0,count1);
- }
-
- void check_ispec_is(long* Mesh_pointer_f,int type){
-
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
- printf("check ispec_is for type=%d\n",type);
-
- dim3 grid(1,1);
- dim3 threads(1,1,1);
-
- int* h_debug = (int*) calloc(1,sizeof(int));
- int* d_debug;
- cudaMalloc((void**)&d_debug,sizeof(int));
-
- if( type == 0 ){
- check_ispec_is_kernel<<<grid,threads>>>(mp->NSPEC_AB,
- mp->d_ispec_is_inner,
- d_debug);
- }else if( type == 1 ){
- check_ispec_is_kernel<<<grid,threads>>>(mp->NSPEC_AB,
- mp->d_ispec_is_elastic,
- d_debug);
- }else if( type == 2 ){
- check_ispec_is_kernel<<<grid,threads>>>(mp->NSPEC_AB,
- mp->d_ispec_is_acoustic,
- d_debug);
- }
-
- cudaMemcpy(h_debug,d_debug,1*sizeof(int),cudaMemcpyDeviceToHost);
- cudaFree(d_debug);
- if( *h_debug != 0 ){printf("error for type=%d\n",type); exit(1);}
- free(h_debug);
- fflush(stdout);
-
- #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("check_ispec_is");
- #endif
- }
- */
-/* ----------------------------------------------------------------------------------------------- */
-//daniel: helper function
-/*
- __global__ void check_array_ispec_kernel(int num_array_ispec,
- int* array_ispec,
- int NSPEC_AB,
- int* ier) {
-
- int i,ispec,count0,count1;
-
- *ier = 0;
- count0 = 0;
- count1 = 0;
-
- for(i=0; i < num_array_ispec; i++){
- ispec = array_ispec[i] - 1;
- if( ispec < -1 || ispec >= NSPEC_AB ){
- printf("Error in d_array_ispec %d %d\n",i,ispec);
- *ier = 1;
- return;
- }
- if( ispec >= 0 ){ count0++;}
- if( ispec < 0 ){ count1++;}
- }
-
- printf("check_array_ispec done: count = %d %d \n",count0,count1);
- }
-
- void check_array_ispec(long* Mesh_pointer_f,int type){
-
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
- printf("check array_ispec for type=%d\n",type);
-
- dim3 grid(1,1);
- dim3 threads(1,1,1);
-
- int* h_debug = (int*) calloc(1,sizeof(int));
- int* d_debug;
- cudaMalloc((void**)&d_debug,sizeof(int));
-
- if( type == 1 ){
- check_array_ispec_kernel<<<grid,threads>>>(mp->d_num_abs_boundary_faces,
- mp->d_abs_boundary_ispec,
- mp->NSPEC_AB,
- d_debug);
- }
-
- cudaMemcpy(h_debug,d_debug,1*sizeof(int),cudaMemcpyDeviceToHost);
- cudaFree(d_debug);
- if( *h_debug != 0 ){printf("error for type=%d\n",type); exit(1);}
- free(h_debug);
- fflush(stdout);
-
- #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("check_array_ispec");
- #endif
-
- }
- */
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// Check functions
-
-/* ----------------------------------------------------------------------------------------------- */
-/*
-extern "C"
-void FC_FUNC_(check_max_norm_displ_gpu,
-              CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {
-
-TRACE("check_max_norm_displ_gpu");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
-  cudaMemcpy(displ, mp->d_displ,*size*sizeof(realw),cudaMemcpyDeviceToHost);
-  realw maxnorm=0;
-
-  for(int i=0;i<*size;i++) {
-    maxnorm = MAX(maxnorm,fabsf(displ[i]));
-  }
-  printf("%d: maxnorm of forward displ = %e\n",*announceID,maxnorm);
-}
-*/
-/* ----------------------------------------------------------------------------------------------- */
-/*
-extern "C"
-void FC_FUNC_(check_max_norm_vector,
-              CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {
-
-TRACE("check_max_norm_vector");
-
-  int procid;
-#ifdef WITH_MPI
-  MPI_Comm_rank(MPI_COMM_WORLD,&procid);
-#else
-  procid = 0;
-#endif
-  realw maxnorm=0;
-  int maxloc;
-  for(int i=0;i<*size;i++) {
-    if(maxnorm<fabsf(vector1[i])) {
-      maxnorm = vector1[i];
-      maxloc = i;
-    }
-  }
-  printf("%d:maxnorm of vector %d [%d] = %e\n",procid,*announceID,maxloc,maxnorm);
-}
-*/
-/* ----------------------------------------------------------------------------------------------- */
-/*
-extern "C"
-void FC_FUNC_(check_max_norm_displ,
-              CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {
-
-TRACE("check_max_norm_displ");
-
-  realw maxnorm=0;
-
-  for(int i=0;i<*size;i++) {
-    maxnorm = MAX(maxnorm,fabsf(displ[i]));
-  }
-  printf("%d: maxnorm of forward displ = %e\n",*announceID,maxnorm);
-}
-*/
-/* ----------------------------------------------------------------------------------------------- */
-/*
-extern "C"
-void FC_FUNC_(check_max_norm_b_displ_gpu,
-              CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {
-
-TRACE("check_max_norm_b_displ_gpu");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
-  realw* b_accel = (realw*)malloc(*size*sizeof(realw));
-
-  cudaMemcpy(b_displ, mp->d_b_displ,*size*sizeof(realw),cudaMemcpyDeviceToHost);
-  cudaMemcpy(b_accel, mp->d_b_accel,*size*sizeof(realw),cudaMemcpyDeviceToHost);
-
-  realw maxnorm=0;
-  realw maxnorm_accel=0;
-
-  for(int i=0;i<*size;i++) {
-    maxnorm = MAX(maxnorm,fabsf(b_displ[i]));
-    maxnorm_accel = MAX(maxnorm,fabsf(b_accel[i]));
-  }
-  free(b_accel);
-  printf("%d: maxnorm of backward displ = %e\n",*announceID,maxnorm);
-  printf("%d: maxnorm of backward accel = %e\n",*announceID,maxnorm_accel);
-}
-*/
-/* ----------------------------------------------------------------------------------------------- */
-/*
-extern "C"
-void FC_FUNC_(check_max_norm_b_accel_gpu,
-              CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {
-
-TRACE("check_max_norm_b_accel_gpu");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
-  cudaMemcpy(b_accel, mp->d_b_accel,*size*sizeof(realw),cudaMemcpyDeviceToHost);
-
-  realw maxnorm=0;
-
-  for(int i=0;i<*size;i++) {
-    maxnorm = MAX(maxnorm,fabsf(b_accel[i]));
-  }
-  printf("%d: maxnorm of backward accel = %e\n",*announceID,maxnorm);
-}
-*/
-/* ----------------------------------------------------------------------------------------------- */
-/*
-extern "C"
-void FC_FUNC_(check_max_norm_b_veloc_gpu,
-              CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {
-
-TRACE("check_max_norm_b_veloc_gpu");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
-  cudaMemcpy(b_veloc, mp->d_b_veloc,*size*sizeof(realw),cudaMemcpyDeviceToHost);
-
-  realw maxnorm=0;
-
-  for(int i=0;i<*size;i++) {
-    maxnorm = MAX(maxnorm,fabsf(b_veloc[i]));
-  }
-  printf("%d: maxnorm of backward veloc = %e\n",*announceID,maxnorm);
-}
-*/
-/* ----------------------------------------------------------------------------------------------- */
-/*
-extern "C"
-void FC_FUNC_(check_max_norm_b_displ,
-              CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {
-
-TRACE("check_max_norm_b_displ");
-
-  realw maxnorm=0;
-
-  for(int i=0;i<*size;i++) {
-    maxnorm = MAX(maxnorm,fabsf(b_displ[i]));
-  }
-  printf("%d:maxnorm of backward displ = %e\n",*announceID,maxnorm);
-}
-*/
-/* ----------------------------------------------------------------------------------------------- */
-/*
-extern "C"
-void FC_FUNC_(check_max_norm_b_accel,
-              CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {
-
-TRACE("check_max_norm_b_accel");
-
-  realw maxnorm=0;
-
-  for(int i=0;i<*size;i++) {
-    maxnorm = MAX(maxnorm,fabsf(b_accel[i]));
-  }
-  printf("%d:maxnorm of backward accel = %e\n",*announceID,maxnorm);
-}
-*/
-/* ----------------------------------------------------------------------------------------------- */
-/*
-extern "C"
-void FC_FUNC_(check_error_vectors,
-              CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {
-
-TRACE("check_error_vectors");
-
-  int size = *sizef;
-
-  double diff2 = 0;
-  double sum = 0;
-  double temp;
-  double maxerr=0;
-  int maxerrorloc;
-
-  for(int i=0;i<size;++i) {
-    temp = vector1[i]-vector2[i];
-    diff2 += temp*temp;
-    sum += vector1[i]*vector1[i];
-    if(maxerr < fabsf(temp)) {
-      maxerr = abs(temp);
-      maxerrorloc = i;
-    }
-  }
-
-  printf("rel error = %f, maxerr = %e @ %d\n",diff2/sum,maxerr,maxerrorloc);
-  int myrank;
-#ifdef WITH_MPI
-  MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
-#else
-  myrank = 0;
-#endif
-  if(myrank == 0) {
-    for(int i=maxerrorloc;i>maxerrorloc-5;i--) {
-      printf("[%d]: %e vs. %e\n",i,vector1[i],vector2[i]);
-    }
-  }
-
-}
-*/
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// Auxiliary functions
-
-/* ----------------------------------------------------------------------------------------------- */
-
-
-/* ----------------------------------------------------------------------------------------------- */
-/*
-extern "C"
-void FC_FUNC_(get_max_accel,
-              GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {
-
-TRACE("get_max_accel");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer);
-  int procid;
-#ifdef WITH_MPI
-  MPI_Comm_rank(MPI_COMM_WORLD,&procid);
-#else
-  procid = 0;
-#endif
-  int size = *sizef;
-  int it = *itf;
-  realw* accel_cpy = (realw*)malloc(size*sizeof(realw));
-  cudaMemcpy(accel_cpy,mp->d_accel,size*sizeof(realw),cudaMemcpyDeviceToHost);
-  realw maxval=0;
-  for(int i=0;i<size;++i) {
-    maxval = MAX(maxval,accel_cpy[i]);
-  }
-  printf("%d/%d: max=%e\n",it,procid,maxval);
-  free(accel_cpy);
-}
-*/
-/* ----------------------------------------------------------------------------------------------- */
-
-// ACOUSTIC simulations
-
-/* ----------------------------------------------------------------------------------------------- */
-
 __global__ void get_maximum_scalar_kernel(realw* array, int size, realw* d_max){
 
   /* simplest version: uses only 1 thread
@@ -728,7 +295,7 @@
   unsigned int i = tid + bx*blockDim.x;
 
   // loads absolute values into shared memory
-  sdata[tid] = (i < size) ? fabs(array[i]) : 0.0 ;
+  sdata[tid] = (i < size) ? fabs(array[i]) : 0.0f ;
 
   __syncthreads();
 
@@ -884,7 +451,7 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-// ELASTIC simulations
+// vector arrays (elastic solid part)
 
 /* ----------------------------------------------------------------------------------------------- */
 
@@ -901,7 +468,7 @@
   // loads values into shared memory: assume array is a vector array
   sdata[tid] = (i < size) ? sqrt(array[i*3]*array[i*3]
                                  + array[i*3+1]*array[i*3+1]
-                                 + array[i*3+2]*array[i*3+2]) : 0.0 ;
+                                 + array[i*3+2]*array[i*3+2]) : 0.0f ;
 
   __syncthreads();
 
@@ -1181,3 +748,440 @@
 }
 
 
+// unused....
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Auxiliary functions
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------------------------------- */
+//daniel: helper function
+/*
+ __global__ void check_phase_ispec_kernel(int num_phase_ispec,
+ int* phase_ispec,
+ int NSPEC_AB,
+ int* ier) {
+
+ int i,ispec,iphase,count0,count1;
+ *ier = 0;
+
+ for(iphase=0; iphase < 2; iphase++){
+ count0 = 0;
+ count1 = 0;
+
+ for(i=0; i < num_phase_ispec; i++){
+ ispec = phase_ispec[iphase*num_phase_ispec + i] - 1;
+ if( ispec < -1 || ispec >= NSPEC_AB ){
+ printf("Error in d_phase_ispec_inner_elastic %d %d\n",i,ispec);
+ *ier = 1;
+ return;
+ }
+ if( ispec >= 0 ){ count0++;}
+ if( ispec < 0 ){ count1++;}
+ }
+
+ printf("check_phase_ispec done: phase %d, count = %d %d \n",iphase,count0,count1);
+
+ }
+ }
+
+ void check_phase_ispec(long* Mesh_pointer_f,int type){
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+ printf("check phase_ispec for type=%d\n",type);
+
+ dim3 grid(1,1);
+ dim3 threads(1,1,1);
+
+ int* h_debug = (int*) calloc(1,sizeof(int));
+ int* d_debug;
+ cudaMalloc((void**)&d_debug,sizeof(int));
+
+ if( type == 1 ){
+ check_phase_ispec_kernel<<<grid,threads>>>(mp->num_phase_ispec_elastic,
+ mp->d_phase_ispec_inner_elastic,
+ mp->NSPEC_AB,
+ d_debug);
+ }else if( type == 2 ){
+ check_phase_ispec_kernel<<<grid,threads>>>(mp->num_phase_ispec_acoustic,
+ mp->d_phase_ispec_inner_acoustic,
+ mp->NSPEC_AB,
+ d_debug);
+ }
+
+ cudaMemcpy(h_debug,d_debug,1*sizeof(int),cudaMemcpyDeviceToHost);
+ cudaFree(d_debug);
+ if( *h_debug != 0 ){printf("error for type=%d\n",type); exit(1);}
+ free(h_debug);
+ fflush(stdout);
+
+ #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("check_phase_ispec");
+ #endif
+
+ }
+ */
+
+/* ----------------------------------------------------------------------------------------------- */
+//daniel: helper function
+/*
+ __global__ void check_ispec_is_kernel(int NSPEC_AB,
+ int* ispec_is,
+ int* ier) {
+
+ int ispec,count0,count1;
+
+ *ier = 0;
+ count0 = 0;
+ count1 = 0;
+ for(ispec=0; ispec < NSPEC_AB; ispec++){
+ if( ispec_is[ispec] < -1 || ispec_is[ispec] > 1 ){
+ printf("Error in ispec_is %d %d\n",ispec,ispec_is[ispec]);
+ *ier = 1;
+ return;
+ //exit(1);
+ }
+ if( ispec_is[ispec] == 0 ){count0++;}
+ if( ispec_is[ispec] != 0 ){count1++;}
+ }
+ printf("check_ispec_is done: count = %d %d\n",count0,count1);
+ }
+
+ void check_ispec_is(long* Mesh_pointer_f,int type){
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+ printf("check ispec_is for type=%d\n",type);
+
+ dim3 grid(1,1);
+ dim3 threads(1,1,1);
+
+ int* h_debug = (int*) calloc(1,sizeof(int));
+ int* d_debug;
+ cudaMalloc((void**)&d_debug,sizeof(int));
+
+ if( type == 0 ){
+ check_ispec_is_kernel<<<grid,threads>>>(mp->NSPEC_AB,
+ mp->d_ispec_is_inner,
+ d_debug);
+ }else if( type == 1 ){
+ check_ispec_is_kernel<<<grid,threads>>>(mp->NSPEC_AB,
+ mp->d_ispec_is_elastic,
+ d_debug);
+ }else if( type == 2 ){
+ check_ispec_is_kernel<<<grid,threads>>>(mp->NSPEC_AB,
+ mp->d_ispec_is_acoustic,
+ d_debug);
+ }
+
+ cudaMemcpy(h_debug,d_debug,1*sizeof(int),cudaMemcpyDeviceToHost);
+ cudaFree(d_debug);
+ if( *h_debug != 0 ){printf("error for type=%d\n",type); exit(1);}
+ free(h_debug);
+ fflush(stdout);
+
+ #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("check_ispec_is");
+ #endif
+ }
+ */
+/* ----------------------------------------------------------------------------------------------- */
+//daniel: helper function
+/*
+ __global__ void check_array_ispec_kernel(int num_array_ispec,
+ int* array_ispec,
+ int NSPEC_AB,
+ int* ier) {
+
+ int i,ispec,count0,count1;
+
+ *ier = 0;
+ count0 = 0;
+ count1 = 0;
+
+ for(i=0; i < num_array_ispec; i++){
+ ispec = array_ispec[i] - 1;
+ if( ispec < -1 || ispec >= NSPEC_AB ){
+ printf("Error in d_array_ispec %d %d\n",i,ispec);
+ *ier = 1;
+ return;
+ }
+ if( ispec >= 0 ){ count0++;}
+ if( ispec < 0 ){ count1++;}
+ }
+
+ printf("check_array_ispec done: count = %d %d \n",count0,count1);
+ }
+
+ void check_array_ispec(long* Mesh_pointer_f,int type){
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+ printf("check array_ispec for type=%d\n",type);
+
+ dim3 grid(1,1);
+ dim3 threads(1,1,1);
+
+ int* h_debug = (int*) calloc(1,sizeof(int));
+ int* d_debug;
+ cudaMalloc((void**)&d_debug,sizeof(int));
+
+ if( type == 1 ){
+ check_array_ispec_kernel<<<grid,threads>>>(mp->d_num_abs_boundary_faces,
+ mp->d_abs_boundary_ispec,
+ mp->NSPEC_AB,
+ d_debug);
+ }
+
+ cudaMemcpy(h_debug,d_debug,1*sizeof(int),cudaMemcpyDeviceToHost);
+ cudaFree(d_debug);
+ if( *h_debug != 0 ){printf("error for type=%d\n",type); exit(1);}
+ free(h_debug);
+ fflush(stdout);
+
+ #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("check_array_ispec");
+ #endif
+
+ }
+ */
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Check functions
+
+/* ----------------------------------------------------------------------------------------------- */
+/*
+ extern "C"
+ void FC_FUNC_(check_max_norm_displ_gpu,
+ CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {
+
+ TRACE("check_max_norm_displ_gpu");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+ cudaMemcpy(displ, mp->d_displ,*size*sizeof(realw),cudaMemcpyDeviceToHost);
+ realw maxnorm=0;
+
+ for(int i=0;i<*size;i++) {
+ maxnorm = MAX(maxnorm,fabsf(displ[i]));
+ }
+ printf("%d: maxnorm of forward displ = %e\n",*announceID,maxnorm);
+ }
+ */
+/* ----------------------------------------------------------------------------------------------- */
+/*
+ extern "C"
+ void FC_FUNC_(check_max_norm_vector,
+ CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {
+
+ TRACE("check_max_norm_vector");
+
+ int procid;
+ #ifdef WITH_MPI
+ MPI_Comm_rank(MPI_COMM_WORLD,&procid);
+ #else
+ procid = 0;
+ #endif
+ realw maxnorm=0;
+ int maxloc;
+ for(int i=0;i<*size;i++) {
+ if(maxnorm<fabsf(vector1[i])) {
+ maxnorm = vector1[i];
+ maxloc = i;
+ }
+ }
+ printf("%d:maxnorm of vector %d [%d] = %e\n",procid,*announceID,maxloc,maxnorm);
+ }
+ */
+/* ----------------------------------------------------------------------------------------------- */
+/*
+ extern "C"
+ void FC_FUNC_(check_max_norm_displ,
+ CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {
+
+ TRACE("check_max_norm_displ");
+
+ realw maxnorm=0;
+
+ for(int i=0;i<*size;i++) {
+ maxnorm = MAX(maxnorm,fabsf(displ[i]));
+ }
+ printf("%d: maxnorm of forward displ = %e\n",*announceID,maxnorm);
+ }
+ */
+/* ----------------------------------------------------------------------------------------------- */
+/*
+ extern "C"
+ void FC_FUNC_(check_max_norm_b_displ_gpu,
+ CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {
+
+ TRACE("check_max_norm_b_displ_gpu");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+ realw* b_accel = (realw*)malloc(*size*sizeof(realw));
+
+ cudaMemcpy(b_displ, mp->d_b_displ,*size*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(b_accel, mp->d_b_accel,*size*sizeof(realw),cudaMemcpyDeviceToHost);
+
+ realw maxnorm=0;
+ realw maxnorm_accel=0;
+
+ for(int i=0;i<*size;i++) {
+ maxnorm = MAX(maxnorm,fabsf(b_displ[i]));
+ maxnorm_accel = MAX(maxnorm,fabsf(b_accel[i]));
+ }
+ free(b_accel);
+ printf("%d: maxnorm of backward displ = %e\n",*announceID,maxnorm);
+ printf("%d: maxnorm of backward accel = %e\n",*announceID,maxnorm_accel);
+ }
+ */
+/* ----------------------------------------------------------------------------------------------- */
+/*
+ extern "C"
+ void FC_FUNC_(check_max_norm_b_accel_gpu,
+ CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {
+
+ TRACE("check_max_norm_b_accel_gpu");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+ cudaMemcpy(b_accel, mp->d_b_accel,*size*sizeof(realw),cudaMemcpyDeviceToHost);
+
+ realw maxnorm=0;
+
+ for(int i=0;i<*size;i++) {
+ maxnorm = MAX(maxnorm,fabsf(b_accel[i]));
+ }
+ printf("%d: maxnorm of backward accel = %e\n",*announceID,maxnorm);
+ }
+ */
+/* ----------------------------------------------------------------------------------------------- */
+/*
+ extern "C"
+ void FC_FUNC_(check_max_norm_b_veloc_gpu,
+ CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {
+
+ TRACE("check_max_norm_b_veloc_gpu");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+ cudaMemcpy(b_veloc, mp->d_b_veloc,*size*sizeof(realw),cudaMemcpyDeviceToHost);
+
+ realw maxnorm=0;
+
+ for(int i=0;i<*size;i++) {
+ maxnorm = MAX(maxnorm,fabsf(b_veloc[i]));
+ }
+ printf("%d: maxnorm of backward veloc = %e\n",*announceID,maxnorm);
+ }
+ */
+/* ----------------------------------------------------------------------------------------------- */
+/*
+ extern "C"
+ void FC_FUNC_(check_max_norm_b_displ,
+ CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {
+
+ TRACE("check_max_norm_b_displ");
+
+ realw maxnorm=0;
+
+ for(int i=0;i<*size;i++) {
+ maxnorm = MAX(maxnorm,fabsf(b_displ[i]));
+ }
+ printf("%d:maxnorm of backward displ = %e\n",*announceID,maxnorm);
+ }
+ */
+/* ----------------------------------------------------------------------------------------------- */
+/*
+ extern "C"
+ void FC_FUNC_(check_max_norm_b_accel,
+ CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {
+
+ TRACE("check_max_norm_b_accel");
+
+ realw maxnorm=0;
+
+ for(int i=0;i<*size;i++) {
+ maxnorm = MAX(maxnorm,fabsf(b_accel[i]));
+ }
+ printf("%d:maxnorm of backward accel = %e\n",*announceID,maxnorm);
+ }
+ */
+/* ----------------------------------------------------------------------------------------------- */
+/*
+ extern "C"
+ void FC_FUNC_(check_error_vectors,
+ CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {
+
+ TRACE("check_error_vectors");
+
+ int size = *sizef;
+
+ double diff2 = 0;
+ double sum = 0;
+ double temp;
+ double maxerr=0;
+ int maxerrorloc;
+
+ for(int i=0;i<size;++i) {
+ temp = vector1[i]-vector2[i];
+ diff2 += temp*temp;
+ sum += vector1[i]*vector1[i];
+ if(maxerr < fabsf(temp)) {
+ maxerr = abs(temp);
+ maxerrorloc = i;
+ }
+ }
+
+ printf("rel error = %f, maxerr = %e @ %d\n",diff2/sum,maxerr,maxerrorloc);
+ int myrank;
+ #ifdef WITH_MPI
+ MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+ #else
+ myrank = 0;
+ #endif
+ if(myrank == 0) {
+ for(int i=maxerrorloc;i>maxerrorloc-5;i--) {
+ printf("[%d]: %e vs. %e\n",i,vector1[i],vector2[i]);
+ }
+ }
+
+ }
+ */
+
+
+/* ----------------------------------------------------------------------------------------------- */
+/*
+ extern "C"
+ void FC_FUNC_(get_max_accel,
+ GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {
+
+ TRACE("get_max_accel");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer);
+ int procid;
+ #ifdef WITH_MPI
+ MPI_Comm_rank(MPI_COMM_WORLD,&procid);
+ #else
+ procid = 0;
+ #endif
+ int size = *sizef;
+ int it = *itf;
+ realw* accel_cpy = (realw*)malloc(size*sizeof(realw));
+ cudaMemcpy(accel_cpy,mp->d_accel,size*sizeof(realw),cudaMemcpyDeviceToHost);
+ realw maxval=0;
+ for(int i=0;i<size;++i) {
+ maxval = MAX(maxval,accel_cpy[i]);
+ }
+ printf("%d/%d: max=%e\n",it,procid,maxval);
+ free(accel_cpy);
+ }
+ */
+

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_coupling_cuda.cu	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_coupling_cuda.cu	2012-08-15 19:55:44 UTC (rev 20570)
@@ -558,75 +558,58 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-
 __global__ void compute_coupling_ocean_cuda_kernel(realw* accel_crust_mantle,
-                                                   realw* rmassx_crust_mantle,
-                                                   realw* rmassy_crust_mantle,
-                                                   realw* rmassz_crust_mantle,
-                                                   realw* rmass_ocean_load,
-                                                   realw* normal_top_crust_mantle,
-                                                   int* ibool_crust_mantle,
-                                                   int* ibelm_top_crust_mantle,
-                                                   int* updated_dof_ocean_load,
-                                                   int NSPEC2D_TOP_CM) {
+                                                       realw* rmassx_crust_mantle,
+                                                       realw* rmassy_crust_mantle,
+                                                       realw* rmassz_crust_mantle,
+                                                       realw* rmass_ocean_load,
+                                                       int npoin_ocean_load,
+                                                       int* iglob_ocean_load,
+                                                       realw* normal_ocean_load) {
 
-  int i = threadIdx.x;
-  int j = threadIdx.y;
-  int iface = blockIdx.x + gridDim.x*blockIdx.y;
+  int ipoin = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
 
-  int k,iglob,ispec;
-  realw nx,ny,nz;
+  int iglob;
+  realw nx,ny,nz,rmass;
   realw force_normal_comp;
   realw additional_term_x,additional_term_y,additional_term_z;
 
-  // for surfaces elements exactly at the top of the crust mantle (ocean bottom)
-  if( iface < NSPEC2D_TOP_CM ){
+  // for global points exactly at the top of the crust mantle (ocean bottom)
+  if(ipoin < npoin_ocean_load) {
 
+    // get global point number
     // "-1" from index values to convert from Fortran-> C indexing
-    ispec = ibelm_top_crust_mantle[iface] - 1;
+    iglob = iglob_ocean_load[ipoin] - 1;
 
-    // only for DOFs exactly on the CMB (top of these elements)
-    k = NGLLX - 1;
+    // get normal
+    nx = normal_ocean_load[INDEX2(NDIM,0,ipoin)]; // (1,ipoin)
+    ny = normal_ocean_load[INDEX2(NDIM,1,ipoin)]; // (2,ipoin)
+    nz = normal_ocean_load[INDEX2(NDIM,2,ipoin)]; // (3,ipoin)
 
-    // get global point number
-    iglob = ibool_crust_mantle[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)] - 1;
+    // make updated component of right-hand side
+    // we divide by rmass() which is 1 / M
+    // we use the total force which includes the Coriolis term above
+    force_normal_comp = accel_crust_mantle[iglob*3]*nx / rmassx_crust_mantle[iglob]
+                      + accel_crust_mantle[iglob*3+1]*ny / rmassy_crust_mantle[iglob]
+                      + accel_crust_mantle[iglob*3+2]*nz / rmassz_crust_mantle[iglob];
 
-    // only update this global point once
+    rmass = rmass_ocean_load[ipoin];
 
-    // daniel: TODO - there might be better ways to implement a mutex like below,
-    //            and find a workaround to not use the temporary update array.
-    //            As a reminder, atomicExch atomically sets the value of the memory
-    //            location stored in address to val and returns the old value.
-    //            0 indicates that we still have to do this point
+    additional_term_x = (rmass - rmassx_crust_mantle[iglob]) * force_normal_comp;
+    additional_term_y = (rmass - rmassy_crust_mantle[iglob]) * force_normal_comp;
+    additional_term_z = (rmass - rmassz_crust_mantle[iglob]) * force_normal_comp;
 
-    if( atomicExch(&updated_dof_ocean_load[iglob],1) == 0){
-
-      // get normal on the CMB
-      nx = normal_top_crust_mantle[INDEX4(NDIM,NGLLX,NGLLX,0,i,j,iface)]; // (1,i,j,iface)
-      ny = normal_top_crust_mantle[INDEX4(NDIM,NGLLX,NGLLX,1,i,j,iface)]; // (2,i,j,iface)
-      nz = normal_top_crust_mantle[INDEX4(NDIM,NGLLX,NGLLX,2,i,j,iface)]; // (3,i,j,iface)
-
-      // make updated component of right-hand side
-      // we divide by rmass() which is 1 / M
-      // we use the total force which includes the Coriolis term above
-      force_normal_comp = accel_crust_mantle[iglob*3]*nx / rmassx_crust_mantle[iglob]
-                        + accel_crust_mantle[iglob*3+1]*ny / rmassy_crust_mantle[iglob]
-                        + accel_crust_mantle[iglob*3+2]*nz / rmassz_crust_mantle[iglob];
-
-      additional_term_x = (rmass_ocean_load[iglob] - rmassx_crust_mantle[iglob]) * force_normal_comp;
-      additional_term_y = (rmass_ocean_load[iglob] - rmassy_crust_mantle[iglob]) * force_normal_comp;
-      additional_term_z = (rmass_ocean_load[iglob] - rmassz_crust_mantle[iglob]) * force_normal_comp;
-
-      // probably wouldn't need atomicAdd anymore, but just to be sure...
-      atomicAdd(&accel_crust_mantle[iglob*3], + additional_term_x * nx);
-      atomicAdd(&accel_crust_mantle[iglob*3+1], + additional_term_y * ny);
-      atomicAdd(&accel_crust_mantle[iglob*3+2], + additional_term_z * nz);
-    }
+    // since we access this global point only once...
+    accel_crust_mantle[iglob*3] += additional_term_x * nx;
+    accel_crust_mantle[iglob*3+1] += additional_term_y * ny;
+    accel_crust_mantle[iglob*3+2] += additional_term_z * nz;
   }
 }
 
+
 /* ----------------------------------------------------------------------------------------------- */
 
+
 extern "C"
 void FC_FUNC_(compute_coupling_ocean_cuda,
               COMPUTE_COUPLING_OCEAN_CUDA)(long* Mesh_pointer_f,
@@ -636,7 +619,10 @@
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
-  int num_blocks_x = mp->nspec2D_top_crust_mantle;
+  int blocksize = BLOCKSIZE_TRANSFER;
+  int size_padded = ((int)ceil(((double)mp->npoin_oceans)/((double)blocksize)))*blocksize;
+
+  int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
@@ -644,40 +630,28 @@
   }
 
   dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(5,5,1);
+  dim3 threads(blocksize,1,1);
 
-  // initializes temporary array to zero
-  print_CUDA_error_if_any(cudaMemset(mp->d_updated_dof_ocean_load,0,
-                                     sizeof(int)*(mp->NGLOB_CRUST_MANTLE_OCEANS)),88501);
-
   if( *NCHUNKS_VAL != 6 && mp->absorbing_conditions){
     compute_coupling_ocean_cuda_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
                                                          mp->d_rmassx_crust_mantle,
                                                          mp->d_rmassy_crust_mantle,
                                                          mp->d_rmassz_crust_mantle,
                                                          mp->d_rmass_ocean_load,
-                                                         mp->d_normal_top_crust_mantle,
-                                                         mp->d_ibool_crust_mantle,
-                                                         mp->d_ibelm_top_crust_mantle,
-                                                         mp->d_updated_dof_ocean_load,
-                                                         mp->nspec2D_top_crust_mantle);
+                                                         mp->npoin_oceans,
+                                                         mp->d_iglob_ocean_load,
+                                                         mp->d_normal_ocean_load);
 
     // for backward/reconstructed potentials
     if( mp->simulation_type == 3 ) {
-      // re-initializes array
-      print_CUDA_error_if_any(cudaMemset(mp->d_b_updated_dof_ocean_load,0,
-                                         sizeof(int)*(mp->NGLOB_CRUST_MANTLE_OCEANS)),88502);
-
       compute_coupling_ocean_cuda_kernel<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
                                                            mp->d_rmassx_crust_mantle,
                                                            mp->d_rmassy_crust_mantle,
                                                            mp->d_rmassz_crust_mantle,
                                                            mp->d_rmass_ocean_load,
-                                                           mp->d_normal_top_crust_mantle,
-                                                           mp->d_ibool_crust_mantle,
-                                                           mp->d_ibelm_top_crust_mantle,
-                                                           mp->d_b_updated_dof_ocean_load,
-                                                           mp->nspec2D_top_crust_mantle);
+                                                           mp->npoin_oceans,
+                                                           mp->d_iglob_ocean_load,
+                                                           mp->d_normal_ocean_load);
     }
   }else{
     compute_coupling_ocean_cuda_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
@@ -685,28 +659,20 @@
                                                          mp->d_rmassz_crust_mantle,
                                                          mp->d_rmassz_crust_mantle,
                                                          mp->d_rmass_ocean_load,
-                                                         mp->d_normal_top_crust_mantle,
-                                                         mp->d_ibool_crust_mantle,
-                                                         mp->d_ibelm_top_crust_mantle,
-                                                         mp->d_updated_dof_ocean_load,
-                                                         mp->nspec2D_top_crust_mantle);
+                                                         mp->npoin_oceans,
+                                                         mp->d_iglob_ocean_load,
+                                                         mp->d_normal_ocean_load);
 
     // for backward/reconstructed potentials
     if( mp->simulation_type == 3 ) {
-      // re-initializes array
-      print_CUDA_error_if_any(cudaMemset(mp->d_b_updated_dof_ocean_load,0,
-                                         sizeof(int)*(mp->NGLOB_CRUST_MANTLE_OCEANS)),88502);
-
       compute_coupling_ocean_cuda_kernel<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
                                                            mp->d_rmassz_crust_mantle,
                                                            mp->d_rmassz_crust_mantle,
                                                            mp->d_rmassz_crust_mantle,
                                                            mp->d_rmass_ocean_load,
-                                                           mp->d_normal_top_crust_mantle,
-                                                           mp->d_ibool_crust_mantle,
-                                                           mp->d_ibelm_top_crust_mantle,
-                                                           mp->d_b_updated_dof_ocean_load,
-                                                           mp->nspec2D_top_crust_mantle);
+                                                           mp->npoin_oceans,
+                                                           mp->d_iglob_ocean_load,
+                                                           mp->d_normal_ocean_load);
     }
   }
 
@@ -714,3 +680,4 @@
   exit_on_cuda_error("compute_coupling_ocean_cuda");
 #endif
 }
+

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu	2012-08-15 19:55:44 UTC (rev 20570)
@@ -305,7 +305,7 @@
                                          realw* d_c44store,realw* d_c45store,realw* d_c46store,
                                          realw* d_c55store,realw* d_c56store,realw* d_c66store,
                                          int ATTENUATION,
-                                         realw minus_sum_beta,
+                                         realw one_minus_sum_beta_use,
                                          realw duxdxl,realw duxdyl,realw duxdzl,
                                          realw duydxl,realw duydyl,realw duydzl,
                                          realw duzdxl,realw duzdyl,realw duzdzl,
@@ -315,7 +315,7 @@
                                          ){
 
   realw c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66;
-  realw mul;
+  realw mul,minus_sum_beta;
 
   c11 = d_c11store[offset];
   c12 = d_c12store[offset];
@@ -341,7 +341,9 @@
 
   // use unrelaxed parameters if attenuation
   if( ATTENUATION){
+    minus_sum_beta = one_minus_sum_beta_use - 1.0f;
     mul = c44;
+
     c11 = c11 + 1.33333333333333333333f * minus_sum_beta * mul;
     c12 = c12 - 0.66666666666666666666f * minus_sum_beta * mul;
     c13 = c13 - 0.66666666666666666666f * minus_sum_beta * mul;
@@ -760,7 +762,7 @@
   realw templ;
 
   realw fac1,fac2,fac3;
-  realw minus_sum_beta,one_minus_sum_beta_use;
+  realw one_minus_sum_beta_use;
 
   realw sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz;
   realw epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc;
@@ -785,9 +787,11 @@
   __shared__ realw s_tempx1[NGLL3];
   __shared__ realw s_tempx2[NGLL3];
   __shared__ realw s_tempx3[NGLL3];
+
   __shared__ realw s_tempy1[NGLL3];
   __shared__ realw s_tempy2[NGLL3];
   __shared__ realw s_tempy3[NGLL3];
+
   __shared__ realw s_tempz1[NGLL3];
   __shared__ realw s_tempz2[NGLL3];
   __shared__ realw s_tempz3[NGLL3];
@@ -842,8 +846,7 @@
         s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * d_veloc[iglob*3 + 1];
         s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * d_veloc[iglob*3 + 2];
 #endif
-      }
-      else{
+      }else{
         // takes old routines
         s_dummyx_loc_att[tx] = s_dummyx_loc[tx];
         s_dummyy_loc_att[tx] = s_dummyy_loc[tx];
@@ -1141,7 +1144,6 @@
       }else{
         one_minus_sum_beta_use = one_minus_sum_beta[working_element]; // (1,1,1,ispec)
       }
-      minus_sum_beta = one_minus_sum_beta_use - 1.0f;
     }
 
     // computes stresses
@@ -1152,7 +1154,7 @@
                             d_c23store,d_c24store,d_c25store,d_c26store,d_c33store,d_c34store,d_c35store,
                             d_c36store,d_c44store,d_c45store,d_c46store,d_c55store,d_c56store,d_c66store,
                             ATTENUATION,
-                            minus_sum_beta,
+                            one_minus_sum_beta_use,
                             duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl,
                             duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl,
                             &sigma_xx,&sigma_yy,&sigma_zz,
@@ -1202,8 +1204,8 @@
 
     // jacobian
     jacobianl = 1.0f / (xixl*(etayl*gammazl-etazl*gammayl)
-                        -xiyl*(etaxl*gammazl-etazl*gammaxl)
-                        +xizl*(etaxl*gammayl-etayl*gammaxl));
+                     - xiyl*(etaxl*gammazl-etazl*gammaxl)
+                     + xizl*(etaxl*gammayl-etayl*gammaxl));
 
     if( GRAVITY ){
       //  computes non-symmetric terms for gravity
@@ -1390,13 +1392,16 @@
 
     // update memory variables based upon the Runge-Kutta scheme
     if( ATTENUATION && ( ! USE_ATTENUATION_MIMIC ) ){
+
       compute_element_cm_att_memory(tx,working_element,
-                                d_muvstore,
-                                factor_common,alphaval,betaval,gammaval,
-                                R_xx,R_yy,R_xy,R_xz,R_yz,
-                                epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz,
-                                epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc,
-                                d_c44store,ANISOTROPY,ATTENUATION_3D);
+                                    d_muvstore,
+                                    factor_common,alphaval,betaval,gammaval,
+                                    R_xx,R_yy,R_xy,R_xz,R_yz,
+                                    epsilondev_xx,epsilondev_yy,epsilondev_xy,
+                                    epsilondev_xz,epsilondev_yz,
+                                    epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,
+                                    epsilondev_xz_loc,epsilondev_yz_loc,
+                                    d_c44store,ANISOTROPY,ATTENUATION_3D);
     }
 
     // save deviatoric strain for Runge-Kutta scheme
@@ -1635,7 +1640,7 @@
     int nb_colors,nb_blocks_to_compute;
     int istart;
     int color_offset,color_offset_nonpadded;
-    int color_offset_nonpadded_att2,color_offset_nonpadded_att3;
+    int color_offset_nonpadded_att1,color_offset_nonpadded_att2,color_offset_nonpadded_att3;
     int color_offset_nonpadded_strain;
     int color_offset_ispec;
 
@@ -1648,6 +1653,7 @@
       // array offsets
       color_offset = 0;
       color_offset_nonpadded = 0;
+      color_offset_nonpadded_att1 = 0;
       color_offset_nonpadded_att2 = 0;
       color_offset_nonpadded_att3 = 0;
       color_offset_nonpadded_strain = 0;
@@ -1660,6 +1666,8 @@
       // array offsets
       color_offset = (mp->nspec_outer_crust_mantle) * NGLL3_PADDED;
       color_offset_nonpadded = (mp->nspec_outer_crust_mantle) * NGLL3;
+      color_offset_nonpadded_att1 = (mp->nspec_outer_crust_mantle) * NGLL3 * N_SLS;
+
       // for factor_common array
       if( mp->attenuation_3D ){
         color_offset_nonpadded_att2 = (mp->nspec_outer_crust_mantle) * NGLL3;
@@ -1673,7 +1681,7 @@
         color_offset_ispec = mp->nspec_outer_crust_mantle;
       }
       // for strain
-      if( ! mp->NSPEC_CRUST_MANTLE_STRAIN_ONLY == 1 ){
+      if( ! ( mp->NSPEC_CRUST_MANTLE_STRAIN_ONLY == 1 ) ){
         color_offset_nonpadded_strain = (mp->nspec_outer_crust_mantle) * NGLL3;
       }
     }
@@ -1719,22 +1727,22 @@
                             mp->d_eps_trace_over_3_crust_mantle + color_offset_nonpadded_strain,
                             mp->d_one_minus_sum_beta_crust_mantle + color_offset_nonpadded_att2,
                             mp->d_factor_common_crust_mantle + color_offset_nonpadded_att3,
-                            mp->d_R_xx_crust_mantle + color_offset_nonpadded,
-                            mp->d_R_yy_crust_mantle + color_offset_nonpadded,
-                            mp->d_R_xy_crust_mantle + color_offset_nonpadded,
-                            mp->d_R_xz_crust_mantle + color_offset_nonpadded,
-                            mp->d_R_yz_crust_mantle + color_offset_nonpadded,
+                            mp->d_R_xx_crust_mantle + color_offset_nonpadded_att1,
+                            mp->d_R_yy_crust_mantle + color_offset_nonpadded_att1,
+                            mp->d_R_xy_crust_mantle + color_offset_nonpadded_att1,
+                            mp->d_R_xz_crust_mantle + color_offset_nonpadded_att1,
+                            mp->d_R_yz_crust_mantle + color_offset_nonpadded_att1,
                             mp->d_b_epsilondev_xx_crust_mantle + color_offset_nonpadded,
                             mp->d_b_epsilondev_yy_crust_mantle + color_offset_nonpadded,
                             mp->d_b_epsilondev_xy_crust_mantle + color_offset_nonpadded,
                             mp->d_b_epsilondev_xz_crust_mantle + color_offset_nonpadded,
                             mp->d_b_epsilondev_yz_crust_mantle + color_offset_nonpadded,
                             mp->d_b_eps_trace_over_3_crust_mantle + color_offset_nonpadded,
-                            mp->d_b_R_xx_crust_mantle + color_offset_nonpadded,
-                            mp->d_b_R_yy_crust_mantle + color_offset_nonpadded,
-                            mp->d_b_R_xy_crust_mantle + color_offset_nonpadded,
-                            mp->d_b_R_xz_crust_mantle + color_offset_nonpadded,
-                            mp->d_b_R_yz_crust_mantle + color_offset_nonpadded,
+                            mp->d_b_R_xx_crust_mantle + color_offset_nonpadded_att1,
+                            mp->d_b_R_yy_crust_mantle + color_offset_nonpadded_att1,
+                            mp->d_b_R_xy_crust_mantle + color_offset_nonpadded_att1,
+                            mp->d_b_R_xz_crust_mantle + color_offset_nonpadded_att1,
+                            mp->d_b_R_yz_crust_mantle + color_offset_nonpadded_att1,
                             mp->d_c11store_crust_mantle + color_offset,
                             mp->d_c12store_crust_mantle + color_offset,
                             mp->d_c13store_crust_mantle + color_offset,
@@ -1755,13 +1763,13 @@
                             mp->d_c46store_crust_mantle + color_offset,
                             mp->d_c55store_crust_mantle + color_offset,
                             mp->d_c56store_crust_mantle + color_offset,
-                            mp->d_c66store_crust_mantle + color_offset
-                            );
+                            mp->d_c66store_crust_mantle + color_offset);
 
       // for padded and aligned arrays
       color_offset += nb_blocks_to_compute * NGLL3_PADDED;
       // for no-aligned arrays
       color_offset_nonpadded += nb_blocks_to_compute * NGLL3;
+      color_offset_nonpadded_att1 += nb_blocks_to_compute * NGLL3 * N_SLS;
       // for factor_common array
       if( mp->attenuation_3D ){
         color_offset_nonpadded_att2 += nb_blocks_to_compute * NGLL3;
@@ -1775,7 +1783,7 @@
         color_offset_ispec += nb_blocks_to_compute;
       }
       // for strain
-      if( ! mp->NSPEC_CRUST_MANTLE_STRAIN_ONLY == 1 ){
+      if( ! ( mp->NSPEC_CRUST_MANTLE_STRAIN_ONLY == 1 ) ){
         color_offset_nonpadded_strain += nb_blocks_to_compute * NGLL3;
       }
 
@@ -1826,8 +1834,7 @@
                           mp->d_c25store_crust_mantle,mp->d_c26store_crust_mantle,mp->d_c33store_crust_mantle,
                           mp->d_c34store_crust_mantle,mp->d_c35store_crust_mantle,mp->d_c36store_crust_mantle,
                           mp->d_c44store_crust_mantle,mp->d_c45store_crust_mantle,mp->d_c46store_crust_mantle,
-                          mp->d_c55store_crust_mantle,mp->d_c56store_crust_mantle,mp->d_c66store_crust_mantle
-                          );
+                          mp->d_c55store_crust_mantle,mp->d_c56store_crust_mantle,mp->d_c66store_crust_mantle);
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu	2012-08-15 19:55:44 UTC (rev 20570)
@@ -340,7 +340,8 @@
                                          realw* d_minus_deriv_gravity_table,
                                          realw* d_density_table,
                                          realw* wgll_cube,
-                                         int NSPEC_INNER_CORE_STRAIN_ONLY){
+                                         int NSPEC_INNER_CORE_STRAIN_ONLY,
+                                         int NSPEC_INNER_CORE){
 
   /* int bx = blockIdx.y*blockDim.x+blockIdx.x; //possible bug in original code*/
   int bx = blockIdx.y*gridDim.x+blockIdx.x;
@@ -1010,16 +1011,24 @@
     //mesh coloring
     if( use_mesh_coloring_gpu ){
 
-     // no atomic operation needed, colors don't share global points between elements
+      if( NSPEC_INNER_CORE > COLORING_MIN_NSPEC_INNER_CORE ){
+        // no atomic operation needed, colors don't share global points between elements
 #ifdef USE_TEXTURES_FIELDS
-      d_accel[iglob*3]     = tex1Dfetch(d_accel_ic_tex, iglob*3) + sum_terms1;
-      d_accel[iglob*3 + 1] = tex1Dfetch(d_accel_ic_tex, iglob*3 + 1) + sum_terms2;
-      d_accel[iglob*3 + 2] = tex1Dfetch(d_accel_ic_tex, iglob*3 + 2) + sum_terms3;
+        d_accel[iglob*3]     = tex1Dfetch(d_accel_ic_tex, iglob*3) + sum_terms1;
+        d_accel[iglob*3 + 1] = tex1Dfetch(d_accel_ic_tex, iglob*3 + 1) + sum_terms2;
+        d_accel[iglob*3 + 2] = tex1Dfetch(d_accel_ic_tex, iglob*3 + 2) + sum_terms3;
 #else
-      d_accel[iglob*3]     += sum_terms1;
-      d_accel[iglob*3 + 1] += sum_terms2;
-      d_accel[iglob*3 + 2] += sum_terms3;
+        d_accel[iglob*3]     += sum_terms1;
+        d_accel[iglob*3 + 1] += sum_terms2;
+        d_accel[iglob*3 + 2] += sum_terms3;
 #endif // USE_TEXTURES_FIELDS
+      }else{
+        // poor element count, only use 1 color per inner/outer run
+        // forces atomic operations
+        atomicAdd(&d_accel[iglob*3], sum_terms1);
+        atomicAdd(&d_accel[iglob*3+1], sum_terms2);
+        atomicAdd(&d_accel[iglob*3+2], sum_terms3);
+      }
 
     }else{
 
@@ -1095,8 +1104,7 @@
                          realw* d_b_R_xz,
                          realw* d_b_R_yz,
                          realw* d_c11store,realw* d_c12store,realw* d_c13store,
-                         realw* d_c33store,realw* d_c44store
-                         ){
+                         realw* d_c33store,realw* d_c44store){
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("before kernel Kernel_2_inner_core");
@@ -1168,7 +1176,8 @@
                                              mp->d_minus_deriv_gravity_table,
                                              mp->d_density_table,
                                              mp->d_wgll_cube,
-                                             mp->NSPEC_INNER_CORE_STRAIN_ONLY);
+                                             mp->NSPEC_INNER_CORE_STRAIN_ONLY,
+                                             mp->NSPEC_INNER_CORE);
 
 
   if(mp->simulation_type == 3) {
@@ -1216,7 +1225,8 @@
                                                 mp->d_minus_deriv_gravity_table,
                                                 mp->d_density_table,
                                                 mp->d_wgll_cube,
-                                                mp->NSPEC_INNER_CORE_STRAIN_ONLY);
+                                                mp->NSPEC_INNER_CORE_STRAIN_ONLY,
+                                                mp->NSPEC_INNER_CORE);
   }
 
   // cudaEventRecord( stop, 0 );
@@ -1270,51 +1280,102 @@
     int nb_colors,nb_blocks_to_compute;
     int istart;
     int color_offset,color_offset_nonpadded;
-    int color_offset_nonpadded_att2,color_offset_nonpadded_att3;
+    int color_offset_nonpadded_att1,color_offset_nonpadded_att2,color_offset_nonpadded_att3;
     int color_offset_nonpadded_strain;
     int color_offset_ispec;
 
     // sets up color loop
-    if( *iphase == 1 ){
-      // outer elements
-      nb_colors = mp->num_colors_outer_inner_core;
-      istart = 0;
+    if( mp->NSPEC_INNER_CORE > COLORING_MIN_NSPEC_INNER_CORE ){
+      if( *iphase == 1 ){
+        // outer elements
+        nb_colors = mp->num_colors_outer_inner_core;
+        istart = 0;
 
-      // array offsets
-      color_offset = 0;
-      color_offset_nonpadded = 0;
-      color_offset_nonpadded_att2 = 0;
-      color_offset_nonpadded_att3 = 0;
-      color_offset_nonpadded_strain = 0;
-      color_offset_ispec = 0;
+        // array offsets
+        color_offset = 0;
+        color_offset_nonpadded = 0;
+        color_offset_nonpadded_att1 = 0;
+        color_offset_nonpadded_att2 = 0;
+        color_offset_nonpadded_att3 = 0;
+        color_offset_nonpadded_strain = 0;
+        color_offset_ispec = 0;
+      }else{
+        // inner elements (start after outer elements)
+        nb_colors = mp->num_colors_outer_inner_core + mp->num_colors_inner_inner_core;
+        istart = mp->num_colors_outer_inner_core;
+
+        // array offsets
+        color_offset = (mp->nspec_outer_inner_core) * NGLL3_PADDED;
+        color_offset_nonpadded = (mp->nspec_outer_inner_core) * NGLL3;
+        color_offset_nonpadded_att1 = (mp->nspec_outer_inner_core) * NGLL3 * N_SLS;
+        // for factor_common array
+        if( mp->attenuation_3D ){
+          color_offset_nonpadded_att2 = (mp->nspec_outer_inner_core) * NGLL3;
+          color_offset_nonpadded_att3 = (mp->nspec_outer_inner_core) * NGLL3 * N_SLS;
+        }else{
+          color_offset_nonpadded_att2 = (mp->nspec_outer_inner_core) * 1;
+          color_offset_nonpadded_att3 = (mp->nspec_outer_inner_core) * 1 * N_SLS;
+        }
+        // for idoubling array
+        color_offset_ispec = mp->nspec_outer_inner_core;
+        // for strain
+        if( ! ( mp->NSPEC_INNER_CORE_STRAIN_ONLY == 1 ) ){
+          color_offset_nonpadded_strain = (mp->nspec_outer_inner_core) * NGLL3;
+        }
+      }
     }else{
-      // inner elements (start after outer elements)
-      nb_colors = mp->num_colors_outer_inner_core + mp->num_colors_inner_inner_core;
-      istart = mp->num_colors_outer_inner_core;
 
-      // array offsets
-      color_offset = (mp->nspec_outer_inner_core) * NGLL3_PADDED;
-      color_offset_nonpadded = (mp->nspec_outer_inner_core) * NGLL3;
-      // for factor_common array
-      if( mp->attenuation_3D ){
-        color_offset_nonpadded_att2 = (mp->nspec_outer_inner_core) * NGLL3;
-        color_offset_nonpadded_att3 = (mp->nspec_outer_inner_core) * NGLL3 * N_SLS;
+      // poor element count, only use 1 color per inner/outer run
+
+      if( *iphase == 1 ){
+        // outer elements
+        nb_colors = 1;
+        istart = 0;
+
+        // array offsets
+        color_offset = 0;
+        color_offset_nonpadded = 0;
+        color_offset_nonpadded_att1 = 0;
+        color_offset_nonpadded_att2 = 0;
+        color_offset_nonpadded_att3 = 0;
+        color_offset_nonpadded_strain = 0;
+        color_offset_ispec = 0;
       }else{
-        color_offset_nonpadded_att2 = (mp->nspec_outer_inner_core) * 1;
-        color_offset_nonpadded_att3 = (mp->nspec_outer_inner_core) * 1 * N_SLS;
+        // inner element colors (start after outer elements)
+        nb_colors = 1;
+        istart = 0;
+
+        // array offsets
+        color_offset = (mp->nspec_outer_inner_core) * NGLL3_PADDED;
+        color_offset_nonpadded = (mp->nspec_outer_inner_core) * NGLL3;
+        color_offset_nonpadded_att1 = (mp->nspec_outer_inner_core) * NGLL3 * N_SLS;
+        // for factor_common array
+        if( mp->attenuation_3D ){
+          color_offset_nonpadded_att2 = (mp->nspec_outer_inner_core) * NGLL3;
+          color_offset_nonpadded_att3 = (mp->nspec_outer_inner_core) * NGLL3 * N_SLS;
+        }else{
+          color_offset_nonpadded_att2 = (mp->nspec_outer_inner_core) * 1;
+          color_offset_nonpadded_att3 = (mp->nspec_outer_inner_core) * 1 * N_SLS;
+        }
+        // for idoubling array
+        color_offset_ispec = mp->nspec_outer_inner_core;
+        // for strain
+        if( ! ( mp->NSPEC_INNER_CORE_STRAIN_ONLY == 1 ) ){
+          color_offset_nonpadded_strain = (mp->nspec_outer_inner_core) * NGLL3;
+        }
       }
-      // for idoubling array
-      color_offset_ispec = mp->nspec_outer_inner_core;
-      // for strain
-      if( ! mp->NSPEC_INNER_CORE_STRAIN_ONLY == 1 ){
-        color_offset_nonpadded_strain = (mp->nspec_outer_inner_core) * NGLL3;
-      }
     }
 
+
     // loops over colors
     for(int icolor = istart; icolor < nb_colors; icolor++){
 
-      nb_blocks_to_compute = mp->h_num_elem_colors_inner_core[icolor];
+      // gets number of elements for this color
+      if( mp->NSPEC_INNER_CORE > COLORING_MIN_NSPEC_INNER_CORE ){
+        nb_blocks_to_compute = mp->h_num_elem_colors_inner_core[icolor];
+      }else{
+        nb_blocks_to_compute = num_elements;
+      }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
       // checks
@@ -1348,22 +1409,22 @@
                           mp->d_eps_trace_over_3_inner_core + color_offset_nonpadded_strain,
                           mp->d_one_minus_sum_beta_inner_core + color_offset_nonpadded_att2,
                           mp->d_factor_common_inner_core + color_offset_nonpadded_att3,
-                          mp->d_R_xx_inner_core + color_offset_nonpadded,
-                          mp->d_R_yy_inner_core + color_offset_nonpadded,
-                          mp->d_R_xy_inner_core + color_offset_nonpadded,
-                          mp->d_R_xz_inner_core + color_offset_nonpadded,
-                          mp->d_R_yz_inner_core + color_offset_nonpadded,
+                          mp->d_R_xx_inner_core + color_offset_nonpadded_att1,
+                          mp->d_R_yy_inner_core + color_offset_nonpadded_att1,
+                          mp->d_R_xy_inner_core + color_offset_nonpadded_att1,
+                          mp->d_R_xz_inner_core + color_offset_nonpadded_att1,
+                          mp->d_R_yz_inner_core + color_offset_nonpadded_att1,
                           mp->d_b_epsilondev_xx_inner_core + color_offset_nonpadded,
                           mp->d_b_epsilondev_yy_inner_core + color_offset_nonpadded,
                           mp->d_b_epsilondev_xy_inner_core + color_offset_nonpadded,
                           mp->d_b_epsilondev_xz_inner_core + color_offset_nonpadded,
                           mp->d_b_epsilondev_yz_inner_core + color_offset_nonpadded,
                           mp->d_b_eps_trace_over_3_inner_core + color_offset_nonpadded,
-                          mp->d_b_R_xx_inner_core + color_offset_nonpadded,
-                          mp->d_b_R_yy_inner_core + color_offset_nonpadded,
-                          mp->d_b_R_xy_inner_core + color_offset_nonpadded,
-                          mp->d_b_R_xz_inner_core + color_offset_nonpadded,
-                          mp->d_b_R_yz_inner_core + color_offset_nonpadded,
+                          mp->d_b_R_xx_inner_core + color_offset_nonpadded_att1,
+                          mp->d_b_R_yy_inner_core + color_offset_nonpadded_att1,
+                          mp->d_b_R_xy_inner_core + color_offset_nonpadded_att1,
+                          mp->d_b_R_xz_inner_core + color_offset_nonpadded_att1,
+                          mp->d_b_R_yz_inner_core + color_offset_nonpadded_att1,
                           mp->d_c11store_inner_core + color_offset,
                           mp->d_c12store_inner_core + color_offset,
                           mp->d_c13store_inner_core + color_offset,
@@ -1375,6 +1436,7 @@
       color_offset += nb_blocks_to_compute * NGLL3_PADDED;
       // for no-aligned arrays
       color_offset_nonpadded += nb_blocks_to_compute * NGLL3;
+      color_offset_nonpadded_att1 += nb_blocks_to_compute * NGLL3 * N_SLS;
       // for factor_common array
       if( mp->attenuation_3D ){
         color_offset_nonpadded_att2 += nb_blocks_to_compute * NGLL3;

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu	2012-08-15 19:55:44 UTC (rev 20570)
@@ -126,7 +126,8 @@
                                        realw time,
                                        realw two_omega_earth,
                                        realw deltat,
-                                       realw* d_A_array_rotation,realw* d_B_array_rotation){
+                                       realw* d_A_array_rotation,realw* d_B_array_rotation,
+                                       int NSPEC_OUTER_CORE){
 
   int bx = blockIdx.y*gridDim.x+blockIdx.x;
   int tx = threadIdx.x;
@@ -440,13 +441,20 @@
     //mesh coloring
     if( use_mesh_coloring_gpu ){
 
-      // no atomic operation needed, colors don't share global points between elements
+      if( NSPEC_OUTER_CORE > COLORING_MIN_NSPEC_OUTER_CORE ){
+        // no atomic operation needed, colors don't share global points between elements
 #ifdef USE_TEXTURES_FIELDS
-      d_potential_dot_dot[iglob] = tex1Dfetch(d_accel_oc_tex, iglob) + sum_terms;
+        d_potential_dot_dot[iglob] = tex1Dfetch(d_accel_oc_tex, iglob) + sum_terms;
 #else
-      d_potential_dot_dot[iglob] += sum_terms;
+        d_potential_dot_dot[iglob] += sum_terms;
 #endif // USE_TEXTURES_FIELDS
+      }else{
+        // poor element count, only use 1 color per inner/outer run
+        // forces atomic operations
+        atomicAdd(&d_potential_dot_dot[iglob],sum_terms);
+      }
 
+
     }else{
 
       atomicAdd(&d_potential_dot_dot[iglob],sum_terms);
@@ -466,8 +474,7 @@
                          realw* d_gammax,realw* d_gammay,realw* d_gammaz,
                          realw time, realw b_time,
                          realw* d_A_array_rotation,realw* d_B_array_rotation,
-                         realw* d_b_A_array_rotation,realw* d_b_B_array_rotation
-                         ){
+                         realw* d_b_A_array_rotation,realw* d_b_B_array_rotation){
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("before outer_core kernel Kernel_2");
@@ -518,7 +525,8 @@
                                                           time,
                                                           mp->d_two_omega_earth,
                                                           mp->d_deltat,
-                                                          d_A_array_rotation,d_B_array_rotation);
+                                                          d_A_array_rotation,d_B_array_rotation,
+                                                          mp->NSPEC_OUTER_CORE);
 
   if(mp->simulation_type == 3) {
     Kernel_2_outer_core_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,
@@ -545,7 +553,8 @@
                                                             b_time,
                                                             mp->d_b_two_omega_earth,
                                                             mp->d_b_deltat,
-                                                            d_b_A_array_rotation,d_b_B_array_rotation);
+                                                            d_b_A_array_rotation,d_b_B_array_rotation,
+                                                            mp->NSPEC_OUTER_CORE);
   }
 
   // cudaEventRecord( stop, 0 );
@@ -596,7 +605,7 @@
   if( num_elements == 0 ) return;
 
   // mesh coloring
-  if( mp->use_mesh_coloring_gpu ){
+  if( mp->use_mesh_coloring_gpu){
 
     // note: array offsets require sorted arrays, such that e.g. ibool starts with elastic elements
     //         and followed by acoustic ones.
@@ -607,28 +616,56 @@
     int color_offset,color_offset_nonpadded;
 
     // sets up color loop
-    if( *iphase == 1 ){
-      // outer elements
-      nb_colors = mp->num_colors_outer_outer_core;
-      istart = 0;
+    if( mp->NSPEC_OUTER_CORE > COLORING_MIN_NSPEC_OUTER_CORE ){
+      if( *iphase == 1 ){
+        // outer elements
+        nb_colors = mp->num_colors_outer_outer_core;
+        istart = 0;
 
-      // array offsets
-      color_offset = 0;
-      color_offset_nonpadded = 0;
+        // array offsets
+        color_offset = 0;
+        color_offset_nonpadded = 0;
+      }else{
+        // inner element colors (start after outer elements)
+        nb_colors = mp->num_colors_outer_outer_core + mp->num_colors_inner_outer_core;
+        istart = mp->num_colors_outer_outer_core;
+
+        // array offsets (inner elements start after outer ones)
+        color_offset = mp->nspec_outer_outer_core * NGLL3_PADDED;
+        color_offset_nonpadded = mp->nspec_outer_outer_core * NGLL3;
+      }
     }else{
-      // inner element colors (start after outer elements)
-      nb_colors = mp->num_colors_outer_outer_core + mp->num_colors_inner_outer_core;
-      istart = mp->num_colors_outer_outer_core;
 
-      // array offsets (inner elements start after outer ones)
-      color_offset = mp->nspec_outer_outer_core * NGLL3_PADDED;
-      color_offset_nonpadded = mp->nspec_outer_outer_core * NGLL3;
+      // poor element count, only use 1 color per inner/outer run
+
+      if( *iphase == 1 ){
+        // outer elements
+        nb_colors = 1;
+        istart = 0;
+
+        // array offsets
+        color_offset = 0;
+        color_offset_nonpadded = 0;
+      }else{
+        // inner element colors (start after outer elements)
+        nb_colors = 1;
+        istart = 0;
+
+        // array offsets (inner elements start after outer ones)
+        color_offset = mp->nspec_outer_outer_core * NGLL3_PADDED;
+        color_offset_nonpadded = mp->nspec_outer_outer_core * NGLL3;
+      }
     }
 
     // loops over colors
     for(int icolor = istart; icolor < nb_colors; icolor++){
 
-      nb_blocks_to_compute = mp->h_num_elem_colors_outer_core[icolor];
+      // gets number of elements for this color
+      if( mp->NSPEC_OUTER_CORE > COLORING_MIN_NSPEC_OUTER_CORE ){
+        nb_blocks_to_compute = mp->h_num_elem_colors_outer_core[icolor];
+      }else{
+        nb_blocks_to_compute = num_elements;
+      }
 
       Kernel_2_outer_core(nb_blocks_to_compute,mp,
                           *iphase,

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu	2012-08-15 19:55:44 UTC (rev 20570)
@@ -601,13 +601,9 @@
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
 
-  // copies surface buffer to GPU
-  cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie,
-             NDIM*NGLL2*(mp->nspec_top)*sizeof(realw),cudaMemcpyHostToDevice);
-
-  int num_blocks_x = mp->nspec_top;
   realw deltat = *deltat_f;
 
+  int num_blocks_x = mp->nspec2D_top_crust_mantle;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
@@ -617,6 +613,12 @@
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(NGLL2,1,1);
 
+  // copies surface buffer to GPU
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie,
+                                     NDIM*NGLL2*(mp->nspec2D_top_crust_mantle)*sizeof(realw),
+                                     cudaMemcpyHostToDevice),90900);
+
+  // calculates noise strength kernel
   compute_kernels_strength_noise_cuda_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
                                                                mp->d_ibelm_top_crust_mantle,
                                                                mp->d_ibool_crust_mantle,
@@ -626,7 +628,7 @@
                                                                mp->d_normal_z_noise,
                                                                mp->d_Sigma_kl,
                                                                deltat,
-                                                               mp->nspec_top);
+                                                               mp->nspec2D_top_crust_mantle);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("compute_kernels_strength_noise_cuda_kernel");

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2012-08-15 19:55:44 UTC (rev 20570)
@@ -40,8 +40,8 @@
 
 */
 
-#ifndef GPU_MESH_
-#define GPU_MESH_
+#ifndef CUDA_MESH_H
+#define CUDA_MESH_H
 
 #include <sys/types.h>
 #include <unistd.h>
@@ -137,6 +137,18 @@
 #pragma message ("\nCompiling with: USE_MESH_COLORING_GPU enabled\n")
 #endif
 
+// note: mesh coloring has a tradeoff between extra costs for looping over colors
 +//          and slowness of atomic updates;
+//          in general, the more elements per color the more efficient
+//
+// thresholds for coloring :
+//   we assume that the crust/mantle region has enough elements for coloring
+//
+// minimum number of elements required in inner core before mesh coloring becomes attractive
+#define COLORING_MIN_NSPEC_INNER_CORE 1000
+// minimum number of elements required in outer core before mesh coloring becomes attractive
+#define COLORING_MIN_NSPEC_OUTER_CORE 1000
+
 /* ----------------------------------------------------------------------------------------------- */
 
 // Texture memory usage:
@@ -311,19 +323,14 @@
 
   int nspec_outer_crust_mantle;
   int nspec_inner_crust_mantle;
-  int nspec2D_top_crust_mantle;
   int nspec2D_bottom_crust_mantle;
 
   int num_colors_inner_crust_mantle;
   int num_colors_outer_crust_mantle;
   int* h_num_elem_colors_crust_mantle;
 
-  int* d_ibelm_top_crust_mantle;
   int* d_ibelm_bottom_crust_mantle;
 
-  // normal definition for coupling regions
-  realw* d_normal_top_crust_mantle;
-
   // ------------------------------------------------------------------ //
   // outer_core
   // ------------------------------------------------------------------ //
@@ -482,15 +489,13 @@
   // ------------------------------------------------------------------ //
   // oceans
   // ------------------------------------------------------------------ //
-  int NGLOB_CRUST_MANTLE_OCEANS;
+  int npoin_oceans;
 
   // model parameter
   realw* d_rmass_ocean_load;
+  int* d_iglob_ocean_load;
+  realw* d_normal_ocean_load;
 
-  // temporary global array: used to synchronize updates on global accel array
-  int* d_updated_dof_ocean_load;
-  int* d_b_updated_dof_ocean_load;
-
   // ------------------------------------------------------------------ //
   // attenuation
   // ------------------------------------------------------------------ //
@@ -668,8 +673,6 @@
   // ------------------------------------------------------------------ //
   int noise_tomography;
 
-  int nspec_top;
-
   realw* d_noise_surface_movie;
   realw* d_noise_sourcearray;
 
@@ -677,6 +680,10 @@
   realw* d_normal_y_noise;
   realw* d_normal_z_noise;
   realw* d_mask_noise;
+
+  // free surface
+  int nspec2D_top_crust_mantle;
+  int* d_ibelm_top_crust_mantle;
   realw* d_jacobian2D_top_crust_mantle;
 
   // noise sensitivity kernel
@@ -685,4 +692,4 @@
 } Mesh;
 
 
-#endif
+#endif // CUDA_MESH_H

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu	2012-08-15 19:55:44 UTC (rev 20570)
@@ -154,7 +154,7 @@
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
 
-  int num_blocks_x = mp->nspec_top;
+  int num_blocks_x = mp->nspec2D_top_crust_mantle;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
@@ -164,14 +164,14 @@
   dim3 threads(NGLL2,1,1);
 
   noise_transfer_surface_to_host_kernel<<<grid,threads>>>(mp->d_ibelm_top_crust_mantle,
-                                                          mp->nspec_top,
+                                                          mp->nspec2D_top_crust_mantle,
                                                           mp->d_ibool_crust_mantle,
                                                           mp->d_displ_crust_mantle,
                                                           mp->d_noise_surface_movie);
 
   // copies noise array to CPU
   cudaMemcpy(h_noise_surface_movie,mp->d_noise_surface_movie,
-             NDIM*NGLL2*(mp->nspec_top)*sizeof(realw),cudaMemcpyDeviceToHost);
+             NDIM*NGLL2*(mp->nspec2D_top_crust_mantle)*sizeof(realw),cudaMemcpyDeviceToHost);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("noise_transfer_surface_to_host");
@@ -304,7 +304,7 @@
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
 
-  int num_blocks_x = mp->nspec_top;
+  int num_blocks_x = mp->nspec2D_top_crust_mantle;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
@@ -315,7 +315,7 @@
 
   // copies surface movie to GPU
   cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie,
-             NDIM*NGLL2*(mp->nspec_top)*sizeof(realw),cudaMemcpyHostToDevice);
+             NDIM*NGLL2*(mp->nspec2D_top_crust_mantle)*sizeof(realw),cudaMemcpyHostToDevice);
 
   switch(mp->noise_tomography) {
   case 2:
@@ -323,7 +323,7 @@
     noise_add_surface_movie_cuda_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
                                                           mp->d_ibool_crust_mantle,
                                                           mp->d_ibelm_top_crust_mantle,
-                                                          mp->nspec_top,
+                                                          mp->nspec2D_top_crust_mantle,
                                                           mp->d_noise_surface_movie,
                                                           mp->d_normal_x_noise,
                                                           mp->d_normal_y_noise,
@@ -338,7 +338,7 @@
     noise_add_surface_movie_cuda_kernel<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
                                                           mp->d_ibool_crust_mantle,
                                                           mp->d_ibelm_top_crust_mantle,
-                                                          mp->nspec_top,
+                                                          mp->nspec2D_top_crust_mantle,
                                                           mp->d_noise_surface_movie,
                                                           mp->d_normal_x_noise,
                                                           mp->d_normal_y_noise,

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_constants_cuda.h	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_constants_cuda.h	2012-08-15 19:55:44 UTC (rev 20570)
@@ -26,8 +26,8 @@
  !=====================================================================
  */
 
-#ifndef CUDA_HEADER_H
-#define CUDA_HEADER_H
+#ifndef CUDA_PREPARE_H
+#define CUDA_PREPARE_H
 
 typedef float realw;  // type of "working" variables
 
@@ -260,4 +260,4 @@
 }
 
 
-#endif //CUDA_HEADER_H
+#endif //CUDA_PREPARE_H

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2012-08-15 19:55:44 UTC (rev 20570)
@@ -41,9 +41,50 @@
 #include "mesh_constants_cuda.h"
 #include "prepare_constants_cuda.h"
 
+/* ----------------------------------------------------------------------------------------------- */
 
+// helper functions
+
 /* ----------------------------------------------------------------------------------------------- */
 
+
+// copies integer array from CPU host to GPU device
+void copy_todevice_int(void** d_array_addr_ptr,int* h_array,int size){
+  TRACE("copy_todevice_int");
+
+  // allocates memory on GPU
+  //
+  // note: cudaMalloc uses a double-pointer, such that it can return an error code in case it fails
+  //          we thus pass the address to the pointer above (as void double-pointer) to have it
+  //          pointing to the correct pointer of the array here
+  print_CUDA_error_if_any(cudaMalloc((void**)d_array_addr_ptr,size*sizeof(int)),
+                          10000);
+
+  // copies values onto GPU
+  //
+  // note: cudaMemcpy uses the pointer to the array, we thus re-cast the value of
+  //          the double-pointer above to have the correct pointer to the array
+  print_CUDA_error_if_any(cudaMemcpy((int*) *d_array_addr_ptr,h_array,size*sizeof(int),cudaMemcpyHostToDevice),
+                          10001);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
 +// copies realw array from CPU host to GPU device
+void copy_todevice_realw(void** d_array_addr_ptr,realw* h_array,int size){
+  TRACE("copy_todevice_realw");
+
+  // allocates memory on GPU
+  print_CUDA_error_if_any(cudaMalloc((void**)d_array_addr_ptr,size*sizeof(realw)),
+                          20000);
+  // copies values onto GPU
+  print_CUDA_error_if_any(cudaMemcpy((realw*) *d_array_addr_ptr,h_array,size*sizeof(realw),cudaMemcpyHostToDevice),
+                          20001);
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
 // GPU preparation
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -64,7 +105,6 @@
                                         int* nrec,int* nrec_local, int* nadj_rec_local,
                                         int* NSPEC_CRUST_MANTLE, int* NGLOB_CRUST_MANTLE,
                                         int* NSPEC_CRUST_MANTLE_STRAIN_ONLY,
-                                        int* NGLOB_CRUST_MANTLE_OCEANS,
                                         int* NSPEC_OUTER_CORE, int* NGLOB_OUTER_CORE,
                                         int* NSPEC_INNER_CORE, int* NGLOB_INNER_CORE,
                                         int* NSPEC_INNER_CORE_STRAIN_ONLY,
@@ -87,7 +127,7 @@
                                         int* ANISOTROPIC_KL_f,
                                         int* APPROXIMATE_HESS_KL_f) {
 
-TRACE("prepare_constants_device");
+  TRACE("prepare_constants_device");
 
   // allocates mesh parameter structure
   Mesh* mp = (Mesh*) malloc( sizeof(Mesh) );
@@ -140,7 +180,6 @@
   mp->NSPEC_CRUST_MANTLE = *NSPEC_CRUST_MANTLE;
   mp->NGLOB_CRUST_MANTLE = *NGLOB_CRUST_MANTLE;
   mp->NSPEC_CRUST_MANTLE_STRAIN_ONLY = *NSPEC_CRUST_MANTLE_STRAIN_ONLY;
-  mp->NGLOB_CRUST_MANTLE_OCEANS = *NGLOB_CRUST_MANTLE_OCEANS;
 
   mp->NSPEC_OUTER_CORE = *NSPEC_OUTER_CORE;
   mp->NGLOB_OUTER_CORE = *NGLOB_OUTER_CORE;
@@ -191,59 +230,42 @@
   mp->nsources_local = *nsources_local;
   if( mp->simulation_type == 1  || mp->simulation_type == 3 ){
     // not needed in case of pure adjoint simulations (SIMULATION_TYPE == 2)
-    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_sourcearrays,
-                                       sizeof(realw)* *NSOURCES*3*NGLL3),1301);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_sourcearrays, h_sourcearrays,
-                                       sizeof(realw)* *NSOURCES*3*NGLL3,cudaMemcpyHostToDevice),1302);
+    copy_todevice_realw((void**)&mp->d_sourcearrays,h_sourcearrays,(*NSOURCES)*NDIM*NGLL3);
+
     // buffer for source time function values
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_stf_pre_compute,
-                                       *NSOURCES*sizeof(double)),1303);
+                                       (*NSOURCES)*sizeof(double)),1303);
   }
+  copy_todevice_int((void**)&mp->d_islice_selected_source,h_islice_selected_source,(*NSOURCES));
+  copy_todevice_int((void**)&mp->d_ispec_selected_source,h_ispec_selected_source,(*NSOURCES));
 
-  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_islice_selected_source,
-                                     sizeof(int) * *NSOURCES),1401);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_islice_selected_source, h_islice_selected_source,
-                                     sizeof(int)* *NSOURCES,cudaMemcpyHostToDevice),1402);
-
-  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_ispec_selected_source,
-                                     sizeof(int)* *NSOURCES),1403);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_selected_source, h_ispec_selected_source,
-                                     sizeof(int)* *NSOURCES,cudaMemcpyHostToDevice),1404);
-
-
   // receiver stations
   // note that:   size(number_receiver_global) = nrec_local
   //                   size(ispec_selected_rec) = nrec
   // number of receiver located in this partition
   mp->nrec_local = *nrec_local;
   if( mp->nrec_local > 0 ){
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_number_receiver_global),mp->nrec_local*sizeof(int)),1);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_number_receiver_global,h_number_receiver_global,
-                                     mp->nrec_local*sizeof(int),cudaMemcpyHostToDevice),1512);
+    copy_todevice_int((void**)&mp->d_number_receiver_global,h_number_receiver_global,mp->nrec_local);
 
     // for seismograms
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_field),
-                                       3*NGLL3*(mp->nrec_local)*sizeof(realw)),4015);
+                                       NDIM*NGLL3*(mp->nrec_local)*sizeof(realw)),4015);
 
-    mp->h_station_seismo_field = (realw*) malloc( 3*NGLL3*(mp->nrec_local)*sizeof(realw) );
+    mp->h_station_seismo_field = (realw*) malloc( NDIM*NGLL3*(mp->nrec_local)*sizeof(realw) );
     if( mp->h_station_seismo_field == NULL) exit_on_error("h_station_seismo_field not allocated \n");
 
   }
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_selected_rec),(*nrec)*sizeof(int)),1513);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_selected_rec,h_ispec_selected_rec,
-                                     (*nrec)*sizeof(int),cudaMemcpyHostToDevice),1514);
+  copy_todevice_int((void**)&mp->d_ispec_selected_rec,h_ispec_selected_rec,(*nrec));
 
   // receiver adjoint source arrays only used for noise and adjoint simulations
   // adjoint source arrays
   mp->nadj_rec_local = *nadj_rec_local;
   if( mp->nadj_rec_local > 0 ){
-    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_adj_sourcearrays,
-                                       (mp->nadj_rec_local)*3*NGLL3*sizeof(realw)),6003);
 
+    // prepares local irec array:
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_pre_computed_irec,
                                        (mp->nadj_rec_local)*sizeof(int)),6004);
 
-    // prepares local irec array:
     // the irec_local variable needs to be precomputed (as
     // h_pre_comp..), because normally it is in the loop updating accel,
     // and due to how it's incremented, it cannot be parallelized
@@ -264,8 +286,12 @@
     free(h_pre_computed_irec);
 
     // temporary array to prepare extracted source array values
-    mp->h_adj_sourcearrays_slice = (realw*) malloc( (mp->nadj_rec_local)*3*NGLL3*sizeof(realw) );
+    mp->h_adj_sourcearrays_slice = (realw*) malloc( (mp->nadj_rec_local)*NDIM*NGLL3*sizeof(realw) );
     if( mp->h_adj_sourcearrays_slice == NULL ) exit_on_error("h_adj_sourcearrays_slice not allocated\n");
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_adj_sourcearrays,
+                                       (mp->nadj_rec_local)*NDIM*NGLL3*sizeof(realw)),6003);
+
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -290,8 +316,7 @@
                                               realw* b_deltat,
                                               realw* b_A_array_rotation,
                                               realw* b_B_array_rotation,
-                                              int* NSPEC_OUTER_CORE_ROTATION
-                                              ) {
+                                              int* NSPEC_OUTER_CORE_ROTATION) {
 
   TRACE("prepare_fields_rotation_device");
 
@@ -304,30 +329,16 @@
   mp->d_two_omega_earth = *two_omega_earth;
   mp->d_deltat = *deltat;
 
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_A_array_rotation,
-                                     NGLL3*(*NSPEC_OUTER_CORE_ROTATION)*sizeof(realw)),9000);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_A_array_rotation, A_array_rotation,
-                                     NGLL3*(*NSPEC_OUTER_CORE_ROTATION)*sizeof(realw),cudaMemcpyHostToDevice),9001);
+  copy_todevice_realw((void**)&mp->d_A_array_rotation,A_array_rotation,NGLL3*(*NSPEC_OUTER_CORE_ROTATION));
+  copy_todevice_realw((void**)&mp->d_B_array_rotation,B_array_rotation,NGLL3*(*NSPEC_OUTER_CORE_ROTATION));
 
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_B_array_rotation,
-                                     NGLL3*(*NSPEC_OUTER_CORE_ROTATION)*sizeof(realw)),9002);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_B_array_rotation, B_array_rotation,
-                                     NGLL3*(*NSPEC_OUTER_CORE_ROTATION)*sizeof(realw),cudaMemcpyHostToDevice),9003);
-
   // backward/reconstructed fields
   if( mp->simulation_type == 3 ){
     mp->d_b_two_omega_earth = *b_two_omega_earth;
     mp->d_b_deltat = *b_deltat;
 
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_A_array_rotation,
-                                       NGLL3*(*NSPEC_OUTER_CORE_ROTATION)*sizeof(realw)),9000);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_A_array_rotation, b_A_array_rotation,
-                                       NGLL3*(*NSPEC_OUTER_CORE_ROTATION)*sizeof(realw),cudaMemcpyHostToDevice),9001);
-
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_B_array_rotation,
-                                       NGLL3*(*NSPEC_OUTER_CORE_ROTATION)*sizeof(realw)),9002);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_B_array_rotation, b_B_array_rotation,
-                                       NGLL3*(*NSPEC_OUTER_CORE_ROTATION)*sizeof(realw),cudaMemcpyHostToDevice),9003);
+    copy_todevice_realw((void**)&mp->d_b_A_array_rotation,b_A_array_rotation,NGLL3*(*NSPEC_OUTER_CORE_ROTATION));
+    copy_todevice_realw((void**)&mp->d_b_B_array_rotation,b_B_array_rotation,NGLL3*(*NSPEC_OUTER_CORE_ROTATION));
   }
 }
 
@@ -347,8 +358,7 @@
                                              realw* minus_deriv_gravity_table,
                                              realw* density_table,
                                              realw* h_wgll_cube,
-                                             int* NRAD_GRAVITY
-                                             ) {
+                                             int* NRAD_GRAVITY) {
 
   TRACE("prepare_fields_gravity_device");
 
@@ -358,10 +368,7 @@
     // no gravity case
 
     // d ln(rho)/dr needed for the no gravity fluid potential
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_d_ln_density_dr_table,
-                                       (*NRAD_GRAVITY)*sizeof(realw)),8000);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_d_ln_density_dr_table, d_ln_density_dr_table,
-                                       (*NRAD_GRAVITY)*sizeof(realw),cudaMemcpyHostToDevice),8001);
+    copy_todevice_realw((void**)&mp->d_d_ln_density_dr_table,d_ln_density_dr_table,(*NRAD_GRAVITY));
 
   }else{
     // gravity case
@@ -370,25 +377,10 @@
     setConst_wgll_cube(h_wgll_cube,mp);
 
     // prepares gravity arrays
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_minus_rho_g_over_kappa_fluid,
-                                       (*NRAD_GRAVITY)*sizeof(realw)),8000);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_minus_rho_g_over_kappa_fluid, minus_rho_g_over_kappa_fluid,
-                                       (*NRAD_GRAVITY)*sizeof(realw),cudaMemcpyHostToDevice),8001);
-
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_minus_gravity_table,
-                                       (*NRAD_GRAVITY)*sizeof(realw)),8000);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_minus_gravity_table, minus_gravity_table,
-                                       (*NRAD_GRAVITY)*sizeof(realw),cudaMemcpyHostToDevice),8001);
-
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_minus_deriv_gravity_table,
-                                       (*NRAD_GRAVITY)*sizeof(realw)),8000);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_minus_deriv_gravity_table, minus_deriv_gravity_table,
-                                       (*NRAD_GRAVITY)*sizeof(realw),cudaMemcpyHostToDevice),8001);
-
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_density_table,
-                                       (*NRAD_GRAVITY)*sizeof(realw)),8000);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_density_table, density_table,
-                                       (*NRAD_GRAVITY)*sizeof(realw),cudaMemcpyHostToDevice),8001);
+    copy_todevice_realw((void**)&mp->d_minus_rho_g_over_kappa_fluid,minus_rho_g_over_kappa_fluid,(*NRAD_GRAVITY));
+    copy_todevice_realw((void**)&mp->d_minus_gravity_table,minus_gravity_table,(*NRAD_GRAVITY));
+    copy_todevice_realw((void**)&mp->d_minus_deriv_gravity_table,minus_deriv_gravity_table,(*NRAD_GRAVITY));
+    copy_todevice_realw((void**)&mp->d_density_table,density_table,(*NRAD_GRAVITY));
   }
 }
 
@@ -418,8 +410,7 @@
                                                  realw* factor_common_inner_core,
                                                  realw* one_minus_sum_beta_inner_core,
                                                  realw* alphaval,realw* betaval,realw* gammaval,
-                                                 realw* b_alphaval,realw* b_betaval,realw* b_gammaval
-                                                 ) {
+                                                 realw* b_alphaval,realw* b_betaval,realw* b_gammaval) {
 
   TRACE("prepare_fields_attenuat_device");
   int R_size1,R_size2,R_size3;
@@ -430,131 +421,61 @@
   if( ! mp->attenuation ){ exit_on_cuda_error("prepare_fields_attenuat_device attenuation not properly initialized"); }
 
   // crust_mantle
+  R_size1 = N_SLS*NGLL3*mp->NSPEC_CRUST_MANTLE;
   if( mp->attenuation_3D ){
-    R_size1 = N_SLS*NGLL3*mp->NSPEC_CRUST_MANTLE;
     R_size2 = NGLL3*mp->NSPEC_CRUST_MANTLE;
     R_size3 = N_SLS*NGLL3*mp->NSPEC_CRUST_MANTLE;
   }else{
-    R_size1 = N_SLS*NGLL3*mp->NSPEC_CRUST_MANTLE;
     R_size2 = 1*mp->NSPEC_CRUST_MANTLE;
     R_size3 = N_SLS*1*mp->NSPEC_CRUST_MANTLE;
   }
 
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_one_minus_sum_beta_crust_mantle,
-                                     R_size2*sizeof(realw)),4430);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_one_minus_sum_beta_crust_mantle,one_minus_sum_beta_crust_mantle,
-                                     R_size2*sizeof(realw),cudaMemcpyHostToDevice),4431);
+  copy_todevice_realw((void**)&mp->d_one_minus_sum_beta_crust_mantle,one_minus_sum_beta_crust_mantle,R_size2);
 
   if( ! mp->use_attenuation_mimic ){
     // common factor
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_factor_common_crust_mantle,
-                                         R_size3*sizeof(realw)),4432);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_factor_common_crust_mantle,factor_common_crust_mantle,
-                                         R_size3*sizeof(realw),cudaMemcpyHostToDevice),4433);
-
+    copy_todevice_realw((void**)&mp->d_factor_common_crust_mantle,factor_common_crust_mantle,R_size3);
     // memory variables
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_R_xx_crust_mantle,
-                                       R_size1*sizeof(realw)),4401);
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_R_yy_crust_mantle,
-                                       R_size1*sizeof(realw)),4401);
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_R_xy_crust_mantle,
-                                       R_size1*sizeof(realw)),4401);
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_R_xz_crust_mantle,
-                                       R_size1*sizeof(realw)),4401);
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_R_yz_crust_mantle,
-                                       R_size1*sizeof(realw)),4401);
-
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xx_crust_mantle,R_xx_crust_mantle,
-                                         R_size1*sizeof(realw),cudaMemcpyHostToDevice),4800);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yy_crust_mantle,R_yy_crust_mantle,
-                                         R_size1*sizeof(realw),cudaMemcpyHostToDevice),4800);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xy_crust_mantle,R_xy_crust_mantle,
-                                         R_size1*sizeof(realw),cudaMemcpyHostToDevice),4800);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xz_crust_mantle,R_xz_crust_mantle,
-                                         R_size1*sizeof(realw),cudaMemcpyHostToDevice),4800);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yz_crust_mantle,R_yz_crust_mantle,
-                                         R_size1*sizeof(realw),cudaMemcpyHostToDevice),4800);
+    copy_todevice_realw((void**)&mp->d_R_xx_crust_mantle,R_xx_crust_mantle,R_size1);
+    copy_todevice_realw((void**)&mp->d_R_yy_crust_mantle,R_yy_crust_mantle,R_size1);
+    copy_todevice_realw((void**)&mp->d_R_xy_crust_mantle,R_xy_crust_mantle,R_size1);
+    copy_todevice_realw((void**)&mp->d_R_xz_crust_mantle,R_xz_crust_mantle,R_size1);
+    copy_todevice_realw((void**)&mp->d_R_yz_crust_mantle,R_yz_crust_mantle,R_size1);
   }
 
   // inner_core
+  R_size1 = N_SLS*NGLL3*mp->NSPEC_INNER_CORE;
   if( mp->attenuation_3D ){
-    R_size1 = N_SLS*NGLL3*mp->NSPEC_INNER_CORE;
     R_size2 = NGLL3*mp->NSPEC_INNER_CORE;
     R_size3 = N_SLS*NGLL3*mp->NSPEC_INNER_CORE;
   }else{
-    R_size1 = N_SLS*NGLL3*mp->NSPEC_INNER_CORE;
     R_size2 = 1*mp->NSPEC_INNER_CORE;
     R_size3 = N_SLS*1*mp->NSPEC_INNER_CORE;
   }
 
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_one_minus_sum_beta_inner_core,
-                                     R_size2*sizeof(realw)),4430);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_one_minus_sum_beta_inner_core,one_minus_sum_beta_inner_core,
-                                     R_size2*sizeof(realw),cudaMemcpyHostToDevice),4431);
+  copy_todevice_realw((void**)&mp->d_one_minus_sum_beta_inner_core,one_minus_sum_beta_inner_core,R_size2);
 
   if( ! mp->use_attenuation_mimic ){
     // common factor
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_factor_common_inner_core,
-                                       R_size3*sizeof(realw)),4432);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_factor_common_inner_core,factor_common_inner_core,
-                                       R_size3*sizeof(realw),cudaMemcpyHostToDevice),4433);
-
+    copy_todevice_realw((void**)&mp->d_factor_common_inner_core,factor_common_inner_core,R_size3);
     // memory variables
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_R_xx_inner_core,
-                                       R_size1*sizeof(realw)),4401);
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_R_yy_inner_core,
-                                       R_size1*sizeof(realw)),4401);
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_R_xy_inner_core,
-                                       R_size1*sizeof(realw)),4401);
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_R_xz_inner_core,
-                                       R_size1*sizeof(realw)),4401);
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_R_yz_inner_core,
-                                       R_size1*sizeof(realw)),4401);
-
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xx_inner_core,R_xx_inner_core,
-                                       R_size1*sizeof(realw),cudaMemcpyHostToDevice),4402);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yy_inner_core,R_yy_inner_core,
-                                       R_size1*sizeof(realw),cudaMemcpyHostToDevice),4402);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xy_inner_core,R_xy_inner_core,
-                                       R_size1*sizeof(realw),cudaMemcpyHostToDevice),4402);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xz_inner_core,R_xz_inner_core,
-                                       R_size1*sizeof(realw),cudaMemcpyHostToDevice),4402);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yz_inner_core,R_yz_inner_core,
-                                       R_size1*sizeof(realw),cudaMemcpyHostToDevice),4402);
+    copy_todevice_realw((void**)&mp->d_R_xx_inner_core,R_xx_inner_core,R_size1);
+    copy_todevice_realw((void**)&mp->d_R_yy_inner_core,R_yy_inner_core,R_size1);
+    copy_todevice_realw((void**)&mp->d_R_xy_inner_core,R_xy_inner_core,R_size1);
+    copy_todevice_realw((void**)&mp->d_R_xz_inner_core,R_xz_inner_core,R_size1);
+    copy_todevice_realw((void**)&mp->d_R_yz_inner_core,R_yz_inner_core,R_size1);
   }
 
   // alpha,beta,gamma factors
-  print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_alphaval),
-                                     N_SLS*sizeof(realw)),4434);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_alphaval ,alphaval,
-                                     N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4435);
+  copy_todevice_realw((void**)&mp->d_alphaval,alphaval,N_SLS);
+  copy_todevice_realw((void**)&mp->d_betaval,betaval,N_SLS);
+  copy_todevice_realw((void**)&mp->d_gammaval,gammaval,N_SLS);
 
-  print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_betaval),
-                                     N_SLS*sizeof(realw)),4436);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_betaval ,betaval,
-                                     N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4437);
-
-  print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_gammaval),
-                                     N_SLS*sizeof(realw)),4438);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaval ,gammaval,
-                                     N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4439);
-
   if( mp->simulation_type == 3 ){
     // alpha,beta,gamma factors for backward fields
-    print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_b_alphaval),
-                                       N_SLS*sizeof(realw)),5434);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_alphaval ,b_alphaval,
-                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5435);
-
-    print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_b_betaval),
-                                       N_SLS*sizeof(realw)),5436);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_betaval ,b_betaval,
-                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5437);
-
-    print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_b_gammaval),
-                                       N_SLS*sizeof(realw)),5438);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_gammaval ,b_gammaval,
-                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5439);
+    copy_todevice_realw((void**)&mp->d_b_alphaval,b_alphaval,N_SLS);
+    copy_todevice_realw((void**)&mp->d_b_betaval,b_betaval,N_SLS);
+    copy_todevice_realw((void**)&mp->d_b_gammaval,b_gammaval,N_SLS);
   }
 }
 
@@ -590,8 +511,7 @@
                                             realw* b_epsilondev_xz_inner_core,
                                             realw* b_epsilondev_yz_inner_core,
                                             realw* eps_trace_over_3_inner_core,
-                                            realw* b_eps_trace_over_3_inner_core
-                                            ) {
+                                            realw* b_eps_trace_over_3_inner_core) {
 
   TRACE("prepare_fields_strain_device");
   int R_size,size_strain_only;
@@ -603,125 +523,48 @@
 
   // crust_mantle
   R_size = NGLL3*mp->NSPEC_CRUST_MANTLE;
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_epsilondev_xx_crust_mantle,
-                                     R_size*sizeof(realw)),4432);
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_epsilondev_yy_crust_mantle,
-                                     R_size*sizeof(realw)),4432);
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_epsilondev_xy_crust_mantle,
-                                     R_size*sizeof(realw)),4432);
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_epsilondev_xz_crust_mantle,
-                                     R_size*sizeof(realw)),4432);
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_epsilondev_yz_crust_mantle,
-                                     R_size*sizeof(realw)),4432);
+  copy_todevice_realw((void**)&mp->d_epsilondev_xx_crust_mantle,epsilondev_xx_crust_mantle,R_size);
+  copy_todevice_realw((void**)&mp->d_epsilondev_yy_crust_mantle,epsilondev_yy_crust_mantle,R_size);
+  copy_todevice_realw((void**)&mp->d_epsilondev_xy_crust_mantle,epsilondev_xy_crust_mantle,R_size);
+  copy_todevice_realw((void**)&mp->d_epsilondev_xz_crust_mantle,epsilondev_xz_crust_mantle,R_size);
+  copy_todevice_realw((void**)&mp->d_epsilondev_yz_crust_mantle,epsilondev_yz_crust_mantle,R_size);
 
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xx_crust_mantle,epsilondev_xx_crust_mantle,
-                                     R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yy_crust_mantle,epsilondev_yy_crust_mantle,
-                                     R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xy_crust_mantle,epsilondev_xy_crust_mantle,
-                                     R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xz_crust_mantle,epsilondev_xz_crust_mantle,
-                                     R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yz_crust_mantle,epsilondev_yz_crust_mantle,
-                                     R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
-
   // strain
   size_strain_only = NGLL3*(mp->NSPEC_CRUST_MANTLE_STRAIN_ONLY);
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_eps_trace_over_3_crust_mantle,
-                                      size_strain_only*sizeof(realw)),4401);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_eps_trace_over_3_crust_mantle,eps_trace_over_3_crust_mantle,
-                                      size_strain_only*sizeof(realw),cudaMemcpyHostToDevice),4402);
+  copy_todevice_realw((void**)&mp->d_eps_trace_over_3_crust_mantle,eps_trace_over_3_crust_mantle,size_strain_only);
 
   // backward/reconstructed fields
   if( mp->simulation_type == 3 ){
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_epsilondev_xx_crust_mantle,
-                                       R_size*sizeof(realw)),4432);
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_epsilondev_yy_crust_mantle,
-                                       R_size*sizeof(realw)),4432);
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_epsilondev_xy_crust_mantle,
-                                       R_size*sizeof(realw)),4432);
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_epsilondev_xz_crust_mantle,
-                                       R_size*sizeof(realw)),4432);
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_epsilondev_yz_crust_mantle,
-                                       R_size*sizeof(realw)),4432);
-
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xx_crust_mantle,b_epsilondev_xx_crust_mantle,
-                                       R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yy_crust_mantle,b_epsilondev_yy_crust_mantle,
-                                       R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xy_crust_mantle,b_epsilondev_xy_crust_mantle,
-                                       R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xz_crust_mantle,b_epsilondev_xz_crust_mantle,
-                                       R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yz_crust_mantle,b_epsilondev_yz_crust_mantle,
-                                       R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
-
-
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_eps_trace_over_3_crust_mantle,
-                                         R_size*sizeof(realw)),4401);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_eps_trace_over_3_crust_mantle,b_eps_trace_over_3_crust_mantle,
-                                         R_size*sizeof(realw),cudaMemcpyHostToDevice),4402);
+    copy_todevice_realw((void**)&mp->d_b_epsilondev_xx_crust_mantle,b_epsilondev_xx_crust_mantle,R_size);
+    copy_todevice_realw((void**)&mp->d_b_epsilondev_yy_crust_mantle,b_epsilondev_yy_crust_mantle,R_size);
+    copy_todevice_realw((void**)&mp->d_b_epsilondev_xy_crust_mantle,b_epsilondev_xy_crust_mantle,R_size);
+    copy_todevice_realw((void**)&mp->d_b_epsilondev_xz_crust_mantle,b_epsilondev_xz_crust_mantle,R_size);
+    copy_todevice_realw((void**)&mp->d_b_epsilondev_yz_crust_mantle,b_epsilondev_yz_crust_mantle,R_size);
+    // strain
+    copy_todevice_realw((void**)&mp->d_b_eps_trace_over_3_crust_mantle,b_eps_trace_over_3_crust_mantle,R_size);
   }
 
   // inner_core
   R_size = NGLL3*mp->NSPEC_INNER_CORE;
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_epsilondev_xx_inner_core,
-                                     R_size*sizeof(realw)),4432);
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_epsilondev_yy_inner_core,
-                                     R_size*sizeof(realw)),4432);
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_epsilondev_xy_inner_core,
-                                     R_size*sizeof(realw)),4432);
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_epsilondev_xz_inner_core,
-                                     R_size*sizeof(realw)),4432);
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_epsilondev_yz_inner_core,
-                                     R_size*sizeof(realw)),4432);
+  copy_todevice_realw((void**)&mp->d_epsilondev_xx_inner_core,epsilondev_xx_inner_core,R_size);
+  copy_todevice_realw((void**)&mp->d_epsilondev_yy_inner_core,epsilondev_yy_inner_core,R_size);
+  copy_todevice_realw((void**)&mp->d_epsilondev_xy_inner_core,epsilondev_xy_inner_core,R_size);
+  copy_todevice_realw((void**)&mp->d_epsilondev_xz_inner_core,epsilondev_xz_inner_core,R_size);
+  copy_todevice_realw((void**)&mp->d_epsilondev_yz_inner_core,epsilondev_yz_inner_core,R_size);
 
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xx_inner_core,epsilondev_xx_inner_core,
-                                     R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yy_inner_core,epsilondev_yy_inner_core,
-                                     R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xy_inner_core,epsilondev_xy_inner_core,
-                                     R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xz_inner_core,epsilondev_xz_inner_core,
-                                     R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yz_inner_core,epsilondev_yz_inner_core,
-                                     R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
-
   // strain
   size_strain_only = NGLL3*(mp->NSPEC_INNER_CORE_STRAIN_ONLY);
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_eps_trace_over_3_inner_core,
-                                     size_strain_only*sizeof(realw)),4401);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_eps_trace_over_3_inner_core,eps_trace_over_3_inner_core,
-                                     size_strain_only*sizeof(realw),cudaMemcpyHostToDevice),4402);
+  copy_todevice_realw((void**)&mp->d_eps_trace_over_3_inner_core,eps_trace_over_3_inner_core,size_strain_only);
+
   // backward/reconstructed fields
   if( mp->simulation_type == 3 ){
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_epsilondev_xx_inner_core,
-                                       R_size*sizeof(realw)),4432);
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_epsilondev_yy_inner_core,
-                                       R_size*sizeof(realw)),4432);
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_epsilondev_xy_inner_core,
-                                       R_size*sizeof(realw)),4432);
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_epsilondev_xz_inner_core,
-                                       R_size*sizeof(realw)),4432);
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_epsilondev_yz_inner_core,
-                                       R_size*sizeof(realw)),4432);
-
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xx_inner_core,b_epsilondev_xx_inner_core,
-                                       R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yy_inner_core,b_epsilondev_yy_inner_core,
-                                       R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xy_inner_core,b_epsilondev_xy_inner_core,
-                                       R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xz_inner_core,b_epsilondev_xz_inner_core,
-                                       R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yz_inner_core,b_epsilondev_yz_inner_core,
-                                       R_size*sizeof(realw),cudaMemcpyHostToDevice),4433);
-
-
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_eps_trace_over_3_inner_core,
-                                       R_size*sizeof(realw)),4401);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_eps_trace_over_3_inner_core,b_eps_trace_over_3_inner_core,
-                                       R_size*sizeof(realw),cudaMemcpyHostToDevice),4402);
+    copy_todevice_realw((void**)&mp->d_b_epsilondev_xx_inner_core,b_epsilondev_xx_inner_core,R_size);
+    copy_todevice_realw((void**)&mp->d_b_epsilondev_yy_inner_core,b_epsilondev_yy_inner_core,R_size);
+    copy_todevice_realw((void**)&mp->d_b_epsilondev_xy_inner_core,b_epsilondev_xy_inner_core,R_size);
+    copy_todevice_realw((void**)&mp->d_b_epsilondev_xz_inner_core,b_epsilondev_xz_inner_core,R_size);
+    copy_todevice_realw((void**)&mp->d_b_epsilondev_yz_inner_core,b_epsilondev_yz_inner_core,R_size);
+    // strain
+    copy_todevice_realw((void**)&mp->d_b_eps_trace_over_3_inner_core,b_eps_trace_over_3_inner_core,R_size);
   }
 }
 
@@ -761,8 +604,7 @@
                                             realw* jacobian2D_xmin_outer_core, realw* jacobian2D_xmax_outer_core,
                                             realw* jacobian2D_ymin_outer_core, realw* jacobian2D_ymax_outer_core,
                                             realw* jacobian2D_bottom_outer_core,
-                                            realw* vp_outer_core
-                                            ) {
+                                            realw* vp_outer_core) {
 
   TRACE("prepare_fields_absorb_device");
   int size;
@@ -780,65 +622,24 @@
 
   // vp & vs
   size = NGLL3*(mp->NSPEC_CRUST_MANTLE);
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_rho_vp_crust_mantle,
-                                     size*sizeof(realw)),2201);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vp_crust_mantle,rho_vp_crust_mantle,
-                                     size*sizeof(realw),cudaMemcpyHostToDevice),2202);
+  copy_todevice_realw((void**)&mp->d_rho_vp_crust_mantle,rho_vp_crust_mantle,size);
+  copy_todevice_realw((void**)&mp->d_rho_vs_crust_mantle,rho_vs_crust_mantle,size);
 
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_rho_vs_crust_mantle,
-                                     size*sizeof(realw)),2201);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vs_crust_mantle,rho_vs_crust_mantle,
-                                     size*sizeof(realw),cudaMemcpyHostToDevice),2202);
-
   // ijk index arrays
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nkmin_xi_crust_mantle,
-                                     2*(*NSPEC2DMAX_XMIN_XMAX_CM)*sizeof(int)),1201);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_nkmin_xi_crust_mantle,nkmin_xi_crust_mantle,
-                                     2*(*NSPEC2DMAX_XMIN_XMAX_CM)*sizeof(int),cudaMemcpyHostToDevice),1202);
+  copy_todevice_int((void**)&mp->d_nkmin_xi_crust_mantle,nkmin_xi_crust_mantle,2*(*NSPEC2DMAX_XMIN_XMAX_CM));
+  copy_todevice_int((void**)&mp->d_nkmin_eta_crust_mantle,nkmin_eta_crust_mantle,2*(*NSPEC2DMAX_YMIN_YMAX_CM));
+  copy_todevice_int((void**)&mp->d_njmin_crust_mantle,njmin_crust_mantle,2*(*NSPEC2DMAX_XMIN_XMAX_CM));
+  copy_todevice_int((void**)&mp->d_njmax_crust_mantle,njmax_crust_mantle,2*(*NSPEC2DMAX_XMIN_XMAX_CM));
+  copy_todevice_int((void**)&mp->d_nimin_crust_mantle,nimin_crust_mantle,2*(*NSPEC2DMAX_YMIN_YMAX_CM));
+  copy_todevice_int((void**)&mp->d_nimax_crust_mantle,nimax_crust_mantle,2*(*NSPEC2DMAX_YMIN_YMAX_CM));
 
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nkmin_eta_crust_mantle,
-                                     2*(*NSPEC2DMAX_YMIN_YMAX_CM)*sizeof(int)),1201);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_nkmin_eta_crust_mantle,nkmin_eta_crust_mantle,
-                                     2*(*NSPEC2DMAX_YMIN_YMAX_CM)*sizeof(int),cudaMemcpyHostToDevice),1202);
-
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_njmin_crust_mantle,
-                                     2*(*NSPEC2DMAX_XMIN_XMAX_CM)*sizeof(int)),1201);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_njmin_crust_mantle,njmin_crust_mantle,
-                                     2*(*NSPEC2DMAX_XMIN_XMAX_CM)*sizeof(int),cudaMemcpyHostToDevice),1202);
-
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_njmax_crust_mantle,
-                                     2*(*NSPEC2DMAX_XMIN_XMAX_CM)*sizeof(int)),1201);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_njmax_crust_mantle,njmax_crust_mantle,
-                                     2*(*NSPEC2DMAX_XMIN_XMAX_CM)*sizeof(int),cudaMemcpyHostToDevice),1202);
-
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nimin_crust_mantle,
-                                     2*(*NSPEC2DMAX_YMIN_YMAX_CM)*sizeof(int)),1201);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_nimin_crust_mantle,nimin_crust_mantle,
-                                     2*(*NSPEC2DMAX_YMIN_YMAX_CM)*sizeof(int),cudaMemcpyHostToDevice),1202);
-
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nimax_crust_mantle,
-                                     2*(*NSPEC2DMAX_YMIN_YMAX_CM)*sizeof(int)),1201);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_nimax_crust_mantle,nimax_crust_mantle,
-                                     2*(*NSPEC2DMAX_YMIN_YMAX_CM)*sizeof(int),cudaMemcpyHostToDevice),1202);
-
-
   // xmin
   if( mp->nspec2D_xmin_crust_mantle > 0 ){
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibelm_xmin_crust_mantle,
-                                       (mp->nspec2D_xmin_crust_mantle)*sizeof(int)),1201);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_xmin_crust_mantle,ibelm_xmin_crust_mantle,
-                                       (mp->nspec2D_xmin_crust_mantle)*sizeof(int),cudaMemcpyHostToDevice),1202);
-
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_normal_xmin_crust_mantle,
-                            NDIM*NGLL2*(mp->nspec2D_xmin_crust_mantle)*sizeof(realw)),1201);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_xmin_crust_mantle,normal_xmin_crust_mantle,
-                            NDIM*NGLL2*(mp->nspec2D_xmin_crust_mantle)*sizeof(realw),cudaMemcpyHostToDevice),1202);
-
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_xmin_crust_mantle,
-                            NGLL2*(mp->nspec2D_xmin_crust_mantle)*sizeof(realw)),1201);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_xmin_crust_mantle,jacobian2D_xmin_crust_mantle,
-                            NGLL2*(mp->nspec2D_xmin_crust_mantle)*sizeof(realw),cudaMemcpyHostToDevice),1202);
-
+    copy_todevice_int((void**)&mp->d_ibelm_xmin_crust_mantle,ibelm_xmin_crust_mantle,mp->nspec2D_xmin_crust_mantle);
+    copy_todevice_realw((void**)&mp->d_normal_xmin_crust_mantle,normal_xmin_crust_mantle,
+                        NDIM*NGLL2*(mp->nspec2D_xmin_crust_mantle));
+    copy_todevice_realw((void**)&mp->d_jacobian2D_xmin_crust_mantle,jacobian2D_xmin_crust_mantle,
+                        NGLL2*(mp->nspec2D_xmin_crust_mantle));
     // boundary buffer
     if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){
       print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_xmin_crust_mantle,
@@ -848,21 +649,11 @@
 
   // xmax
   if( mp->nspec2D_xmax_crust_mantle > 0 ){
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibelm_xmax_crust_mantle,
-                                       (mp->nspec2D_xmax_crust_mantle)*sizeof(int)),1201);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_xmax_crust_mantle,ibelm_xmax_crust_mantle,
-                                       (mp->nspec2D_xmax_crust_mantle)*sizeof(int),cudaMemcpyHostToDevice),1202);
-
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_normal_xmax_crust_mantle,
-                                       NDIM*NGLL2*(mp->nspec2D_xmax_crust_mantle)*sizeof(realw)),1201);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_xmax_crust_mantle,normal_xmax_crust_mantle,
-                                       NDIM*NGLL2*(mp->nspec2D_xmax_crust_mantle)*sizeof(realw),cudaMemcpyHostToDevice),1202);
-
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_xmax_crust_mantle,
-                                       NGLL2*(mp->nspec2D_xmax_crust_mantle)*sizeof(realw)),1201);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_xmax_crust_mantle,jacobian2D_xmax_crust_mantle,
-                                       NGLL2*(mp->nspec2D_xmax_crust_mantle)*sizeof(realw),cudaMemcpyHostToDevice),1202);
-
+    copy_todevice_int((void**)&mp->d_ibelm_xmax_crust_mantle,ibelm_xmax_crust_mantle,mp->nspec2D_xmax_crust_mantle);
+    copy_todevice_realw((void**)&mp->d_normal_xmax_crust_mantle,normal_xmax_crust_mantle,
+                        NDIM*NGLL2*(mp->nspec2D_xmax_crust_mantle));
+    copy_todevice_realw((void**)&mp->d_jacobian2D_xmax_crust_mantle,jacobian2D_xmax_crust_mantle,
+                        NGLL2*(mp->nspec2D_xmax_crust_mantle));
     // boundary buffer
     if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){
       print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_xmax_crust_mantle,
@@ -872,21 +663,11 @@
 
   // ymin
   if( mp->nspec2D_ymin_crust_mantle > 0 ){
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibelm_ymin_crust_mantle,
-                                       (mp->nspec2D_ymin_crust_mantle)*sizeof(int)),1201);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_ymin_crust_mantle,ibelm_ymin_crust_mantle,
-                                       (mp->nspec2D_ymin_crust_mantle)*sizeof(int),cudaMemcpyHostToDevice),1202);
-
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_normal_ymin_crust_mantle,
-                                       NDIM*NGLL2*(mp->nspec2D_ymin_crust_mantle)*sizeof(realw)),1201);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_ymin_crust_mantle,normal_ymin_crust_mantle,
-                                       NDIM*NGLL2*(mp->nspec2D_ymin_crust_mantle)*sizeof(realw),cudaMemcpyHostToDevice),1202);
-
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_ymin_crust_mantle,
-                                       NGLL2*(mp->nspec2D_ymin_crust_mantle)*sizeof(realw)),1201);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_ymin_crust_mantle,jacobian2D_ymin_crust_mantle,
-                                       NGLL2*(mp->nspec2D_ymin_crust_mantle)*sizeof(realw),cudaMemcpyHostToDevice),1202);
-
+    copy_todevice_int((void**)&mp->d_ibelm_ymin_crust_mantle,ibelm_ymin_crust_mantle,mp->nspec2D_ymin_crust_mantle);
+    copy_todevice_realw((void**)&mp->d_normal_ymin_crust_mantle,normal_ymin_crust_mantle,
+                        NDIM*NGLL2*(mp->nspec2D_ymin_crust_mantle));
+    copy_todevice_realw((void**)&mp->d_jacobian2D_ymin_crust_mantle,jacobian2D_ymin_crust_mantle,
+                        NGLL2*(mp->nspec2D_ymin_crust_mantle));
     // boundary buffer
     if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){
       print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_ymin_crust_mantle,
@@ -896,21 +677,11 @@
 
   // ymax
   if( mp->nspec2D_ymax_crust_mantle > 0 ){
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibelm_ymax_crust_mantle,
-                                       (mp->nspec2D_ymax_crust_mantle)*sizeof(int)),1201);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_ymax_crust_mantle,ibelm_ymax_crust_mantle,
-                                       (mp->nspec2D_ymax_crust_mantle)*sizeof(int),cudaMemcpyHostToDevice),1202);
-
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_normal_ymax_crust_mantle,
-                                       NDIM*NGLL2*(mp->nspec2D_ymax_crust_mantle)*sizeof(realw)),1201);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_ymax_crust_mantle,normal_ymax_crust_mantle,
-                                       NDIM*NGLL2*(mp->nspec2D_ymax_crust_mantle)*sizeof(realw),cudaMemcpyHostToDevice),1202);
-
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_ymax_crust_mantle,
-                                       NGLL2*(mp->nspec2D_ymax_crust_mantle)*sizeof(realw)),1201);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_ymax_crust_mantle,jacobian2D_ymax_crust_mantle,
-                                       NGLL2*(mp->nspec2D_ymax_crust_mantle)*sizeof(realw),cudaMemcpyHostToDevice),1202);
-
+    copy_todevice_int((void**)&mp->d_ibelm_ymax_crust_mantle,ibelm_ymax_crust_mantle,mp->nspec2D_ymax_crust_mantle);
+    copy_todevice_realw((void**)&mp->d_normal_ymax_crust_mantle,normal_ymax_crust_mantle,
+                        NDIM*NGLL2*(mp->nspec2D_ymax_crust_mantle));
+    copy_todevice_realw((void**)&mp->d_jacobian2D_ymax_crust_mantle,jacobian2D_ymax_crust_mantle,
+                        NGLL2*(mp->nspec2D_ymax_crust_mantle));
     // boundary buffer
     if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){
       print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_ymax_crust_mantle,
@@ -928,54 +699,21 @@
 
   // vp
   size = NGLL3*(mp->NSPEC_OUTER_CORE);
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_vp_outer_core,
-                                     size*sizeof(realw)),2201);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_vp_outer_core,vp_outer_core,
-                                     size*sizeof(realw),cudaMemcpyHostToDevice),2202);
+  copy_todevice_realw((void**)&mp->d_vp_outer_core,vp_outer_core,size);
 
   // ijk index arrays
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nkmin_xi_outer_core,
-                                     2*(*NSPEC2DMAX_XMIN_XMAX_OC)*sizeof(int)),1201);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_nkmin_xi_outer_core,nkmin_xi_outer_core,
-                                     2*(*NSPEC2DMAX_XMIN_XMAX_OC)*sizeof(int),cudaMemcpyHostToDevice),1202);
+  copy_todevice_int((void**)&mp->d_nkmin_xi_outer_core,nkmin_xi_outer_core,2*(*NSPEC2DMAX_XMIN_XMAX_OC));
+  copy_todevice_int((void**)&mp->d_nkmin_eta_outer_core,nkmin_eta_outer_core,2*(*NSPEC2DMAX_YMIN_YMAX_OC));
+  copy_todevice_int((void**)&mp->d_njmin_outer_core,njmin_outer_core,2*(*NSPEC2DMAX_XMIN_XMAX_OC));
+  copy_todevice_int((void**)&mp->d_njmax_outer_core,njmax_outer_core,2*(*NSPEC2DMAX_XMIN_XMAX_OC));
+  copy_todevice_int((void**)&mp->d_nimin_outer_core,nimin_outer_core,2*(*NSPEC2DMAX_YMIN_YMAX_OC));
+  copy_todevice_int((void**)&mp->d_nimax_outer_core,nimax_outer_core,2*(*NSPEC2DMAX_YMIN_YMAX_OC));
 
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nkmin_eta_outer_core,
-                                     2*(*NSPEC2DMAX_YMIN_YMAX_OC)*sizeof(int)),1201);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_nkmin_eta_outer_core,nkmin_eta_outer_core,
-                                     2*(*NSPEC2DMAX_YMIN_YMAX_OC)*sizeof(int),cudaMemcpyHostToDevice),1202);
-
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_njmin_outer_core,
-                                     2*(*NSPEC2DMAX_XMIN_XMAX_OC)*sizeof(int)),1201);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_njmin_outer_core,njmin_outer_core,
-                                     2*(*NSPEC2DMAX_XMIN_XMAX_OC)*sizeof(int),cudaMemcpyHostToDevice),1202);
-
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_njmax_outer_core,
-                                     2*(*NSPEC2DMAX_XMIN_XMAX_OC)*sizeof(int)),1201);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_njmax_outer_core,njmax_outer_core,
-                                     2*(*NSPEC2DMAX_XMIN_XMAX_OC)*sizeof(int),cudaMemcpyHostToDevice),1202);
-
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nimin_outer_core,
-                                     2*(*NSPEC2DMAX_YMIN_YMAX_OC)*sizeof(int)),1201);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_nimin_outer_core,nimin_outer_core,
-                                     2*(*NSPEC2DMAX_YMIN_YMAX_OC)*sizeof(int),cudaMemcpyHostToDevice),1202);
-
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nimax_outer_core,
-                                     2*(*NSPEC2DMAX_YMIN_YMAX_OC)*sizeof(int)),1201);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_nimax_outer_core,nimax_outer_core,
-                                     2*(*NSPEC2DMAX_YMIN_YMAX_OC)*sizeof(int),cudaMemcpyHostToDevice),1202);
-
   // xmin
   if( mp->nspec2D_xmin_outer_core > 0 ){
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibelm_xmin_outer_core,
-                                       (mp->nspec2D_xmin_outer_core)*sizeof(int)),1201);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_xmin_outer_core,ibelm_xmin_outer_core,
-                                       (mp->nspec2D_xmin_outer_core)*sizeof(int),cudaMemcpyHostToDevice),1202);
-
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_xmin_outer_core,
-                                       NGLL2*(mp->nspec2D_xmin_outer_core)*sizeof(realw)),1201);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_xmin_outer_core,jacobian2D_xmin_outer_core,
-                                       NGLL2*(mp->nspec2D_xmin_outer_core)*sizeof(realw),cudaMemcpyHostToDevice),1202);
-
+    copy_todevice_int((void**)&mp->d_ibelm_xmin_outer_core,ibelm_xmin_outer_core,mp->nspec2D_xmin_outer_core);
+    copy_todevice_realw((void**)&mp->d_jacobian2D_xmin_outer_core,jacobian2D_xmin_outer_core,
+                        NGLL2*(mp->nspec2D_xmin_outer_core));
     // boundary buffer
     if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){
       print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_xmin_outer_core,
@@ -985,16 +723,9 @@
 
   // xmax
   if( mp->nspec2D_xmax_outer_core > 0 ){
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibelm_xmax_outer_core,
-                                       (mp->nspec2D_xmax_outer_core)*sizeof(int)),1201);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_xmax_outer_core,ibelm_xmax_outer_core,
-                                       (mp->nspec2D_xmax_outer_core)*sizeof(int),cudaMemcpyHostToDevice),1202);
-
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_xmax_outer_core,
-                                       NGLL2*(mp->nspec2D_xmax_outer_core)*sizeof(realw)),1201);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_xmax_outer_core,jacobian2D_xmax_outer_core,
-                                       NGLL2*(mp->nspec2D_xmax_outer_core)*sizeof(realw),cudaMemcpyHostToDevice),1202);
-
+    copy_todevice_int((void**)&mp->d_ibelm_xmax_outer_core,ibelm_xmax_outer_core,mp->nspec2D_xmax_outer_core);
+    copy_todevice_realw((void**)&mp->d_jacobian2D_xmax_outer_core,jacobian2D_xmax_outer_core,
+                        NGLL2*(mp->nspec2D_xmax_outer_core));
     // boundary buffer
     if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){
       print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_xmax_outer_core,
@@ -1004,16 +735,9 @@
 
   // ymin
   if( mp->nspec2D_ymin_outer_core > 0 ){
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibelm_ymin_outer_core,
-                                       (mp->nspec2D_ymin_outer_core)*sizeof(int)),1201);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_ymin_outer_core,ibelm_ymin_outer_core,
-                                       (mp->nspec2D_ymin_outer_core)*sizeof(int),cudaMemcpyHostToDevice),1202);
-
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_ymin_outer_core,
-                                       NGLL2*(mp->nspec2D_ymin_outer_core)*sizeof(realw)),1201);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_ymin_outer_core,jacobian2D_ymin_outer_core,
-                                       NGLL2*(mp->nspec2D_ymin_outer_core)*sizeof(realw),cudaMemcpyHostToDevice),1202);
-
+    copy_todevice_int((void**)&mp->d_ibelm_ymin_outer_core,ibelm_ymin_outer_core,mp->nspec2D_ymin_outer_core);
+    copy_todevice_realw((void**)&mp->d_jacobian2D_ymin_outer_core,jacobian2D_ymin_outer_core,
+                        NGLL2*(mp->nspec2D_ymin_outer_core));
     // boundary buffer
     if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){
       print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_ymin_outer_core,
@@ -1023,16 +747,9 @@
 
   // ymax
   if( mp->nspec2D_ymax_outer_core > 0 ){
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibelm_ymax_outer_core,
-                                       (mp->nspec2D_ymax_outer_core)*sizeof(int)),1201);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_ymax_outer_core,ibelm_ymax_outer_core,
-                                       (mp->nspec2D_ymax_outer_core)*sizeof(int),cudaMemcpyHostToDevice),1202);
-
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_ymax_outer_core,
-                                       NGLL2*(mp->nspec2D_ymax_outer_core)*sizeof(realw)),1201);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_ymax_outer_core,jacobian2D_ymax_outer_core,
-                                       NGLL2*(mp->nspec2D_ymax_outer_core)*sizeof(realw),cudaMemcpyHostToDevice),1202);
-
+    copy_todevice_int((void**)&mp->d_ibelm_ymax_outer_core,ibelm_ymax_outer_core,mp->nspec2D_ymax_outer_core);
+    copy_todevice_realw((void**)&mp->d_jacobian2D_ymax_outer_core,jacobian2D_ymax_outer_core,
+                        NGLL2*(mp->nspec2D_ymax_outer_core));
     // boundary buffer
     if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){
       print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_ymax_outer_core,
@@ -1042,16 +759,9 @@
 
   // zmin
   if( mp->nspec2D_zmin_outer_core > 0 ){
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibelm_zmin_outer_core,
-                                       (mp->nspec2D_zmin_outer_core)*sizeof(int)),1201);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_zmin_outer_core,ibelm_bottom_outer_core,
-                                       (mp->nspec2D_zmin_outer_core)*sizeof(int),cudaMemcpyHostToDevice),1202);
-
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_jacobian2D_zmin_outer_core,
-                                       NGLL2*(mp->nspec2D_zmin_outer_core)*sizeof(realw)),1201);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_zmin_outer_core,jacobian2D_bottom_outer_core,
-                                       NGLL2*(mp->nspec2D_zmin_outer_core)*sizeof(realw),cudaMemcpyHostToDevice),1202);
-
+    copy_todevice_int((void**)&mp->d_ibelm_zmin_outer_core,ibelm_bottom_outer_core,mp->nspec2D_zmin_outer_core);
+    copy_todevice_realw((void**)&mp->d_jacobian2D_zmin_outer_core,jacobian2D_bottom_outer_core,
+                        NGLL2*(mp->nspec2D_zmin_outer_core));
     // boundary buffer
     if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){
       print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_zmin_outer_core,
@@ -1081,8 +791,7 @@
                                           int* num_interfaces_outer_core,
                                           int* max_nibool_interfaces_outer_core,
                                           int* nibool_interfaces_outer_core,
-                                          int* ibool_interfaces_outer_core
-                                          ){
+                                          int* ibool_interfaces_outer_core){
 
   TRACE("prepare_mpi_buffers_device");
 
@@ -1095,16 +804,11 @@
   mp->max_nibool_interfaces_crust_mantle = *max_nibool_interfaces_crust_mantle;
   if( mp->num_interfaces_crust_mantle > 0 ){
     // number of ibool entries array
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nibool_interfaces_crust_mantle,
-                                       (mp->num_interfaces_crust_mantle)*sizeof(int)),1201);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_nibool_interfaces_crust_mantle,nibool_interfaces_crust_mantle,
-                                       (mp->num_interfaces_crust_mantle)*sizeof(int),cudaMemcpyHostToDevice),1202);
+    copy_todevice_int((void**)&mp->d_nibool_interfaces_crust_mantle,nibool_interfaces_crust_mantle,
+                      mp->num_interfaces_crust_mantle);
     // ibool entries (iglob indices) values on interface
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_interfaces_crust_mantle,
-                                       (mp->num_interfaces_crust_mantle)*(mp->max_nibool_interfaces_crust_mantle)*sizeof(int)),1203);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_interfaces_crust_mantle,ibool_interfaces_crust_mantle,
-                                       (mp->num_interfaces_crust_mantle)*(mp->max_nibool_interfaces_crust_mantle)*sizeof(int),
-                                       cudaMemcpyHostToDevice),1204);
+    copy_todevice_int((void**)&mp->d_ibool_interfaces_crust_mantle,ibool_interfaces_crust_mantle,
+                      (mp->num_interfaces_crust_mantle)*(mp->max_nibool_interfaces_crust_mantle));
     // allocates mpi buffer for exchange with cpu
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_crust_mantle),
                                        NDIM*(mp->max_nibool_interfaces_crust_mantle)*(mp->num_interfaces_crust_mantle)*sizeof(realw)),4004);
@@ -1115,16 +819,11 @@
   mp->max_nibool_interfaces_inner_core = *max_nibool_interfaces_inner_core;
   if( mp->num_interfaces_inner_core > 0 ){
     // number of ibool entries array
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nibool_interfaces_inner_core,
-                                       (mp->num_interfaces_inner_core)*sizeof(int)),1201);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_nibool_interfaces_inner_core,nibool_interfaces_inner_core,
-                                       (mp->num_interfaces_inner_core)*sizeof(int),cudaMemcpyHostToDevice),1202);
+    copy_todevice_int((void**)&mp->d_nibool_interfaces_inner_core,nibool_interfaces_inner_core,
+                      mp->num_interfaces_inner_core);
     // ibool entries (iglob indices) values on interface
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_interfaces_inner_core,
-                                       (mp->num_interfaces_inner_core)*(mp->max_nibool_interfaces_inner_core)*sizeof(int)),1203);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_interfaces_inner_core,ibool_interfaces_inner_core,
-                                       (mp->num_interfaces_inner_core)*(mp->max_nibool_interfaces_inner_core)*sizeof(int),
-                                       cudaMemcpyHostToDevice),1204);
+    copy_todevice_int((void**)&mp->d_ibool_interfaces_inner_core,ibool_interfaces_inner_core,
+                      (mp->num_interfaces_inner_core)*(mp->max_nibool_interfaces_inner_core));
     // allocates mpi buffer for exchange with cpu
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_inner_core),
                                        NDIM*(mp->max_nibool_interfaces_inner_core)*(mp->num_interfaces_inner_core)*sizeof(realw)),4004);
@@ -1136,22 +835,15 @@
   mp->max_nibool_interfaces_outer_core = *max_nibool_interfaces_outer_core;
   if( mp->num_interfaces_outer_core > 0 ){
     // number of ibool entries array
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nibool_interfaces_outer_core,
-                                       (mp->num_interfaces_outer_core)*sizeof(int)),1201);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_nibool_interfaces_outer_core,nibool_interfaces_outer_core,
-                                       (mp->num_interfaces_outer_core)*sizeof(int),cudaMemcpyHostToDevice),1202);
+    copy_todevice_int((void**)&mp->d_nibool_interfaces_outer_core,nibool_interfaces_outer_core,
+                      mp->num_interfaces_outer_core);
     // ibool entries (iglob indices) values on interface
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_interfaces_outer_core,
-                                       (mp->num_interfaces_outer_core)*(mp->max_nibool_interfaces_outer_core)*sizeof(int)),1203);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_interfaces_outer_core,ibool_interfaces_outer_core,
-                                       (mp->num_interfaces_outer_core)*(mp->max_nibool_interfaces_outer_core)*sizeof(int),
-                                       cudaMemcpyHostToDevice),1204);
+    copy_todevice_int((void**)&mp->d_ibool_interfaces_outer_core,ibool_interfaces_outer_core,
+                      (mp->num_interfaces_outer_core)*(mp->max_nibool_interfaces_outer_core));
     // allocates mpi buffer for exchange with cpu
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_outer_core),
                                        (mp->max_nibool_interfaces_outer_core)*(mp->num_interfaces_outer_core)*sizeof(realw)),4004);
   }
-
-
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -1164,9 +856,9 @@
 extern "C"
 void FC_FUNC_(prepare_fields_noise_device,
               PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f,
-                                           int* nspec_top,
-                                           int* ibelm_top_crust_mantle,
+                                           int* NSPEC_TOP,
                                            int* NSTEP,
+                                           int* h_ibelm_top_crust_mantle,
                                            realw* noise_sourcearray,
                                            realw* normal_x_noise,
                                            realw* normal_y_noise,
@@ -1179,58 +871,34 @@
   Mesh* mp = (Mesh*)(*Mesh_pointer_f);
 
   // free surface
-  mp->nspec_top = *nspec_top;
-  if( mp->nspec_top > 0 ){
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibelm_top_crust_mantle,
-                                       mp->nspec_top*sizeof(int)),7001);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_top_crust_mantle,ibelm_top_crust_mantle,
-                                       mp->nspec_top*sizeof(int),cudaMemcpyHostToDevice),7002);
-
+  mp->nspec2D_top_crust_mantle = *NSPEC_TOP;
+  if( mp->nspec2D_top_crust_mantle > 0 ){
+    // note: d_ibelm_top_crust_mantle will only be needed for noise computations
+    copy_todevice_int((void**)&mp->d_ibelm_top_crust_mantle,h_ibelm_top_crust_mantle,mp->nspec2D_top_crust_mantle);
     // alloc storage for the surface buffer to be copied
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_noise_surface_movie,
-                                       NDIM*NGLL2*(mp->nspec_top)*sizeof(realw)),7005);
+                                       NDIM*NGLL2*(mp->nspec2D_top_crust_mantle)*sizeof(realw)),7005);
   }else{
     // for global mesh: each crust/mantle slice should have at top a free surface
-    exit_on_cuda_error("prepare_fields_noise_device nspec_top not properly initialized");
+    exit_on_cuda_error("prepare_fields_noise_device NSPEC_TOP not properly initialized");
   }
 
 
   // prepares noise source array
   if( mp->noise_tomography == 1 ){
-    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_noise_sourcearray,
-                                       NDIM*NGLL3*(*NSTEP)*sizeof(realw)),7101);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_noise_sourcearray,noise_sourcearray,
-                                       NDIM*NGLL3*(*NSTEP)*sizeof(realw),cudaMemcpyHostToDevice),7102);
+    copy_todevice_realw((void**)&mp->d_noise_sourcearray,noise_sourcearray,NDIM*NGLL3*(*NSTEP));
   }
 
   // prepares noise directions
   if( mp->noise_tomography > 1 ){
-    int nface_size = NGLL2*(mp->nspec_top);
+    int nface_size = NGLL2*(mp->nspec2D_top_crust_mantle);
     // allocates memory on GPU
-    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_x_noise,
-                                       nface_size*sizeof(realw)),7301);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_x_noise, normal_x_noise,
-                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),7306);
+    copy_todevice_realw((void**)&mp->d_normal_x_noise,normal_x_noise,nface_size);
+    copy_todevice_realw((void**)&mp->d_normal_y_noise,normal_y_noise,nface_size);
+    copy_todevice_realw((void**)&mp->d_normal_z_noise,normal_z_noise,nface_size);
 
-    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_y_noise,
-                                       nface_size*sizeof(realw)),7302);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_y_noise, normal_y_noise,
-                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),7307);
-
-    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_z_noise,
-                                       nface_size*sizeof(realw)),7303);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_z_noise, normal_z_noise,
-                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),7308);
-
-    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_mask_noise,
-                                       nface_size*sizeof(realw)),7304);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_mask_noise, mask_noise,
-                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),7309);
-
-    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_jacobian2D_top_crust_mantle,
-                                       nface_size*sizeof(realw)),7305);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_top_crust_mantle, jacobian2D_top_crust_mantle,
-                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),7310);
+    copy_todevice_realw((void**)&mp->d_mask_noise,mask_noise,nface_size);
+    copy_todevice_realw((void**)&mp->d_jacobian2D_top_crust_mantle,jacobian2D_top_crust_mantle,nface_size);
   }
 
   // prepares noise strength kernel
@@ -1251,6 +919,46 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+// OCEANS
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(prepare_oceans_device,
+              PREPARE_OCEANS_DEVICE)(long* Mesh_pointer_f,
+                                     int* npoin_oceans,
+                                     realw* h_rmass_ocean_load_selected,
+                                     int* h_iglob_ocean_load,
+                                     realw* h_normal_ocean_load) {
+
+  TRACE("prepare_oceans_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  // arrays with global points on ocean surface
+  mp->npoin_oceans = *npoin_oceans;
+
+  // checks for global partitions, each slice must have a top surface with points on it
+  if( mp->npoin_oceans == 0 ){ exit_on_cuda_error("prepare_oceans_device has zero npoin_oceans"); }
+
+  // mass matrix
+  copy_todevice_realw((void**)&mp->d_rmass_ocean_load,h_rmass_ocean_load_selected,mp->npoin_oceans);
+
+  // global point indices
+  copy_todevice_int((void**)&mp->d_iglob_ocean_load,h_iglob_ocean_load,mp->npoin_oceans);
+
+  // normals
+  copy_todevice_realw((void**)&mp->d_normal_ocean_load,h_normal_ocean_load,NDIM*mp->npoin_oceans);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("prepare_oceans_device");
+#endif
+}
+
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
 // Earth regions
 
 // CRUST / MANTLE
@@ -1270,9 +978,6 @@
                                           realw* h_rmassx,
                                           realw* h_rmassy,
                                           realw* h_rmassz,
-                                          realw* h_normal_top_crust_mantle,
-                                          int* h_ibelm_top_crust_mantle,
-                                          int* h_ibelm_bottom_crust_mantle,
                                           int* h_ibool,
                                           realw* h_xstore, realw* h_ystore, realw* h_zstore,
                                           int* h_ispec_is_tiso,
@@ -1287,8 +992,8 @@
                                           int* phase_ispec_inner,
                                           int* nspec_outer,
                                           int* nspec_inner,
-                                          int* NSPEC2D_TOP_CM,
                                           int* NSPEC2D_BOTTOM_CM,
+                                          int* h_ibelm_bottom_crust_mantle,
                                           int* NCHUNKS_VAL,
                                           int* num_colors_outer,
                                           int* num_colors_inner,
@@ -1336,20 +1041,19 @@
   }
 
   // global indexing
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_crust_mantle, size_padded*sizeof(int)),1021);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_crust_mantle, h_ibool,
-                                     NGLL3*(mp->NSPEC_CRUST_MANTLE)*sizeof(int),cudaMemcpyHostToDevice),1022);
+  copy_todevice_int((void**)&mp->d_ibool_crust_mantle,h_ibool,NGLL3*(mp->NSPEC_CRUST_MANTLE));
 
+//  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_crust_mantle, size_padded*sizeof(int)),1021);
+//  print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_crust_mantle, h_ibool,
+//                                     NGLL3*(mp->NSPEC_CRUST_MANTLE)*sizeof(int),cudaMemcpyHostToDevice),1022);
+
   // transverse isotropic elements
   // only needed if not anisotropic 3D mantle
   if( ! mp->anisotropic_3D_mantle ){
     // no anisotropy
 
     // transverse isotropy flag
-    print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ispec_is_tiso_crust_mantle, 
-                                       (mp->NSPEC_CRUST_MANTLE)*sizeof(int)),1025);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_tiso_crust_mantle, h_ispec_is_tiso,
-                                       (mp->NSPEC_CRUST_MANTLE)*sizeof(int),cudaMemcpyHostToDevice),1025);
+    copy_todevice_int((void**)&mp->d_ispec_is_tiso_crust_mantle,h_ispec_is_tiso,mp->NSPEC_CRUST_MANTLE);
 
     // kappavstore, kappahstore
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_kappavstore_crust_mantle, size_padded*sizeof(realw)),1010);
@@ -1482,49 +1186,27 @@
 
   // mesh locations
   // ystore & zstore needed for tiso elements
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ystore_crust_mantle),sizeof(realw)*size_glob),2005);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_ystore_crust_mantle,h_ystore,
-                                     sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_zstore_crust_mantle),sizeof(realw)*size_glob),2005);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_zstore_crust_mantle,h_zstore,
-                                     sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
+  copy_todevice_realw((void**)&mp->d_ystore_crust_mantle,h_ystore,size_glob);
+  copy_todevice_realw((void**)&mp->d_zstore_crust_mantle,h_zstore,size_glob);
 
+
   // xstore only needed when gravity is on
   if( mp->gravity ){
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_xstore_crust_mantle),sizeof(realw)*size_glob),2005);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_xstore_crust_mantle,h_xstore,
-                                     sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
+    copy_todevice_realw((void**)&mp->d_xstore_crust_mantle,h_xstore,size_glob);
   }
 
   // inner/outer elements
   mp->num_phase_ispec_crust_mantle = *num_phase_ispec;
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_crust_mantle),
-                                     mp->num_phase_ispec_crust_mantle*2*sizeof(int)),2008);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_crust_mantle,phase_ispec_inner,
-                                     mp->num_phase_ispec_crust_mantle*2*sizeof(int),cudaMemcpyHostToDevice),2101);
+  copy_todevice_int((void**)&mp->d_phase_ispec_inner_crust_mantle,phase_ispec_inner,
+                    mp->num_phase_ispec_crust_mantle*2);
 
   mp->nspec_outer_crust_mantle = *nspec_outer;
   mp->nspec_inner_crust_mantle = *nspec_inner;
 
-  // CMB/ocean coupling
-  mp->nspec2D_top_crust_mantle = *NSPEC2D_TOP_CM;
+  // CMB/fluid outer core coupling
   mp->nspec2D_bottom_crust_mantle = *NSPEC2D_BOTTOM_CM;
-  int size_tcm = NGLL2*(mp->nspec2D_top_crust_mantle);
+  copy_todevice_int((void**)&mp->d_ibelm_bottom_crust_mantle,h_ibelm_bottom_crust_mantle,mp->nspec2D_bottom_crust_mantle);
 
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_normal_top_crust_mantle),
-                                     sizeof(realw)*NDIM*size_tcm),40020);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_top_crust_mantle,h_normal_top_crust_mantle,
-                                     sizeof(realw)*NDIM*size_tcm,cudaMemcpyHostToDevice),40030);
-
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ibelm_top_crust_mantle),
-                                     sizeof(int)*(mp->nspec2D_top_crust_mantle)),40021);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ibelm_bottom_crust_mantle),
-                                     sizeof(int)*(mp->nspec2D_bottom_crust_mantle)),40021);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_top_crust_mantle,h_ibelm_top_crust_mantle,
-                                     sizeof(int)*(mp->nspec2D_top_crust_mantle),cudaMemcpyHostToDevice),40031);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_bottom_crust_mantle,h_ibelm_bottom_crust_mantle,
-                                     sizeof(int)*(mp->nspec2D_bottom_crust_mantle),cudaMemcpyHostToDevice),40031);
-
   // wavefield
   int size = NDIM * mp->NGLOB_CRUST_MANTLE;
 
@@ -1555,21 +1237,13 @@
 
   // mass matrices
   if( *NCHUNKS_VAL != 6 && mp->absorbing_conditions){
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmassx_crust_mantle),sizeof(realw)*size_glob),2005);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_rmassx_crust_mantle,h_rmassx,
-               sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmassy_crust_mantle),sizeof(realw)*size_glob),2005);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_rmassy_crust_mantle,h_rmassy,
-               sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
+    copy_todevice_realw((void**)&mp->d_rmassx_crust_mantle,h_rmassx,size_glob);
+    copy_todevice_realw((void**)&mp->d_rmassy_crust_mantle,h_rmassy,size_glob);
   }
+  copy_todevice_realw((void**)&mp->d_rmassz_crust_mantle,h_rmassz,size_glob);
 
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmassz_crust_mantle),sizeof(realw)*size_glob),2005);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_rmassz_crust_mantle,h_rmassz,
-                                     sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
-
   // kernels
   if( mp->simulation_type == 3 ){
-
     size = NGLL3*(mp->NSPEC_CRUST_MANTLE);
 
     // density kernel
@@ -1626,12 +1300,6 @@
                                          realw* h_gammax, realw* h_gammay, realw* h_gammaz,
                                          realw* h_rho, realw* h_kappav,
                                          realw* h_rmass,
-                                         realw* h_normal_top_outer_core,
-                                         realw* h_normal_bottom_outer_core,
-                                         realw* h_jacobian2D_top_outer_core,
-                                         realw* h_jacobian2D_bottom_outer_core,
-                                         int* h_ibelm_top_outer_core,
-                                         int* h_ibelm_bottom_outer_core,
                                          int* h_ibool,
                                          realw* h_xstore, realw* h_ystore, realw* h_zstore,
                                          int* num_phase_ispec,
@@ -1640,6 +1308,12 @@
                                          int* nspec_inner,
                                          int* NSPEC2D_TOP_OC,
                                          int* NSPEC2D_BOTTOM_OC,
+                                         realw* h_normal_top_outer_core,
+                                         realw* h_normal_bottom_outer_core,
+                                         realw* h_jacobian2D_top_outer_core,
+                                         realw* h_jacobian2D_bottom_outer_core,
+                                         int* h_ibelm_top_outer_core,
+                                         int* h_ibelm_bottom_outer_core,
                                          int* num_colors_outer,
                                          int* num_colors_inner,
                                          int* num_elem_colors) {
@@ -1700,28 +1374,21 @@
   }
 
   // global indexing
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_outer_core, size_padded*sizeof(int)),1021);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_outer_core, h_ibool,
-                                     NGLL3*(mp->NSPEC_OUTER_CORE)*sizeof(int),cudaMemcpyHostToDevice),1022);
+  copy_todevice_int((void**)&mp->d_ibool_outer_core,h_ibool,NGLL3*(mp->NSPEC_OUTER_CORE));
 
+//  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_outer_core, size_padded*sizeof(int)),1021);
+//  print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_outer_core, h_ibool,
+//                                     NGLL3*(mp->NSPEC_OUTER_CORE)*sizeof(int),cudaMemcpyHostToDevice),1022);
+
   // mesh locations
   // always needed
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_xstore_outer_core),sizeof(realw)*size_glob),2005);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_xstore_outer_core,h_xstore,
-                                     sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ystore_outer_core),sizeof(realw)*size_glob),2005);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_ystore_outer_core,h_ystore,
-                                     sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_zstore_outer_core),sizeof(realw)*size_glob),2005);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_zstore_outer_core,h_zstore,
-                                     sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
+  copy_todevice_realw((void**)&mp->d_xstore_outer_core,h_xstore,size_glob);
+  copy_todevice_realw((void**)&mp->d_ystore_outer_core,h_ystore,size_glob);
+  copy_todevice_realw((void**)&mp->d_zstore_outer_core,h_zstore,size_glob);
 
   // inner/outer elements
   mp->num_phase_ispec_outer_core = *num_phase_ispec;
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_outer_core),
-                                     mp->num_phase_ispec_outer_core*2*sizeof(int)),2008);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_outer_core,phase_ispec_inner,
-                                     mp->num_phase_ispec_outer_core*2*sizeof(int),cudaMemcpyHostToDevice),2101);
+  copy_todevice_int((void**)&mp->d_phase_ispec_inner_outer_core,phase_ispec_inner,mp->num_phase_ispec_outer_core*2);
 
   mp->nspec_outer_outer_core = *nspec_outer;
   mp->nspec_inner_outer_core = *nspec_inner;
@@ -1732,20 +1399,14 @@
   int size_toc = NGLL2*(mp->nspec2D_top_outer_core);
   int size_boc = NGLL2*(mp->nspec2D_bottom_outer_core);
 
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_normal_top_outer_core),sizeof(realw)*NDIM*size_toc),40020);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_normal_bottom_outer_core),sizeof(realw)*NDIM*size_boc),40021);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_top_outer_core,h_normal_top_outer_core,sizeof(realw)*NDIM*size_toc,cudaMemcpyHostToDevice),40030);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_bottom_outer_core,h_normal_bottom_outer_core,sizeof(realw)*NDIM*size_boc,cudaMemcpyHostToDevice),40031);
+  copy_todevice_realw((void**)&mp->d_normal_top_outer_core,h_normal_top_outer_core,NDIM*size_toc);
+  copy_todevice_realw((void**)&mp->d_normal_bottom_outer_core,h_normal_bottom_outer_core,NDIM*size_boc);
 
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_jacobian2D_top_outer_core),sizeof(realw)*size_toc),40022);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_jacobian2D_bottom_outer_core),sizeof(realw)*size_boc),40023);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_top_outer_core,h_jacobian2D_top_outer_core,sizeof(realw)*size_toc,cudaMemcpyHostToDevice),40032);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_jacobian2D_bottom_outer_core,h_jacobian2D_bottom_outer_core,sizeof(realw)*size_boc,cudaMemcpyHostToDevice),40033);
+  copy_todevice_realw((void**)&mp->d_jacobian2D_top_outer_core,h_jacobian2D_top_outer_core,size_toc);
+  copy_todevice_realw((void**)&mp->d_jacobian2D_bottom_outer_core,h_jacobian2D_bottom_outer_core,size_boc);
 
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ibelm_top_outer_core),sizeof(int)*(mp->nspec2D_top_outer_core)),40024);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ibelm_bottom_outer_core),sizeof(int)*(mp->nspec2D_bottom_outer_core)),40025);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_top_outer_core,h_ibelm_top_outer_core,sizeof(int)*(mp->nspec2D_top_outer_core),cudaMemcpyHostToDevice),40034);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_bottom_outer_core,h_ibelm_bottom_outer_core,sizeof(int)*(mp->nspec2D_bottom_outer_core),cudaMemcpyHostToDevice),40035);
+  copy_todevice_int((void**)&mp->d_ibelm_top_outer_core,h_ibelm_top_outer_core,mp->nspec2D_top_outer_core);
+  copy_todevice_int((void**)&mp->d_ibelm_bottom_outer_core,h_ibelm_bottom_outer_core,mp->nspec2D_bottom_outer_core);
 
   // wavefield
   int size = mp->NGLOB_OUTER_CORE;
@@ -1773,15 +1434,11 @@
   }
   #endif
 
-
   // mass matrix
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_outer_core),sizeof(realw)*size_glob),2005);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_outer_core,h_rmass,
-                                     sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
+  copy_todevice_realw((void**)&mp->d_rmass_outer_core,h_rmass,size_glob);
 
   // kernels
   if( mp->simulation_type == 3 ){
-
     size = NGLL3*(mp->NSPEC_OUTER_CORE);
 
     // density kernel
@@ -1820,7 +1477,6 @@
                                          realw* h_gammax, realw* h_gammay, realw* h_gammaz,
                                          realw* h_rho, realw* h_kappav, realw* h_muv,
                                          realw* h_rmass,
-                                         int* h_ibelm_top_inner_core,
                                          int* h_ibool,
                                          realw* h_xstore, realw* h_ystore, realw* h_zstore,
                                          realw *c11store,realw *c12store,realw *c13store,
@@ -1831,6 +1487,7 @@
                                          int* nspec_outer,
                                          int* nspec_inner,
                                          int* NSPEC2D_TOP_IC,
+                                         int* h_ibelm_top_inner_core,
                                          int* num_colors_outer,
                                          int* num_colors_inner,
                                          int* num_elem_colors) {
@@ -1929,44 +1586,32 @@
   }
 
   // global indexing
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_inner_core, size_padded*sizeof(int)),1021);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_inner_core, h_ibool,
-                                     NGLL3*(mp->NSPEC_INNER_CORE)*sizeof(int),cudaMemcpyHostToDevice),1022);
+  copy_todevice_int((void**)&mp->d_ibool_inner_core,h_ibool,NGLL3*(mp->NSPEC_INNER_CORE));
 
+//  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_inner_core, size_padded*sizeof(int)),1021);
+//  print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_inner_core, h_ibool,
+//                                     NGLL3*(mp->NSPEC_INNER_CORE)*sizeof(int),cudaMemcpyHostToDevice),1022);
+
   // fictious element flags
-  print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_idoubling_inner_core,
-                                     mp->NSPEC_INNER_CORE*sizeof(int)),2010);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_idoubling_inner_core, h_idoubling_inner_core,
-                                     mp->NSPEC_INNER_CORE*sizeof(int),cudaMemcpyHostToDevice),2011);
+  copy_todevice_int((void**)&mp->d_idoubling_inner_core,h_idoubling_inner_core,mp->NSPEC_INNER_CORE);
 
   // mesh locations
   // only needed when gravity is on
   if( mp->gravity ){
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_xstore_inner_core),sizeof(realw)*size_glob),2005);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_xstore_inner_core,h_xstore,
-                                       sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ystore_inner_core),sizeof(realw)*size_glob),2005);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_ystore_inner_core,h_ystore,
-                                       sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_zstore_inner_core),sizeof(realw)*size_glob),2005);
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_zstore_inner_core,h_zstore,
-                                       sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
+    copy_todevice_realw((void**)&mp->d_xstore_inner_core,h_xstore,size_glob);
+    copy_todevice_realw((void**)&mp->d_ystore_inner_core,h_ystore,size_glob);
+    copy_todevice_realw((void**)&mp->d_zstore_inner_core,h_zstore,size_glob);
   }
 
   // inner/outer elements
   mp->num_phase_ispec_inner_core = *num_phase_ispec;
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_inner_core),
-                                     mp->num_phase_ispec_inner_core*2*sizeof(int)),2008);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_inner_core,phase_ispec_inner,
-                                     mp->num_phase_ispec_inner_core*2*sizeof(int),cudaMemcpyHostToDevice),2101);
+  copy_todevice_int((void**)&mp->d_phase_ispec_inner_inner_core,phase_ispec_inner,mp->num_phase_ispec_inner_core*2);
 
   mp->nspec_outer_inner_core = *nspec_outer;
   mp->nspec_inner_inner_core = *nspec_inner;
   mp->nspec2D_top_inner_core = *NSPEC2D_TOP_IC;
+  copy_todevice_int((void**)&mp->d_ibelm_top_inner_core,h_ibelm_top_inner_core,mp->nspec2D_top_inner_core);
 
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ibelm_top_inner_core),sizeof(int)*(mp->nspec2D_top_inner_core)),40021);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_ibelm_top_inner_core,h_ibelm_top_inner_core,sizeof(int)*(mp->nspec2D_top_inner_core),cudaMemcpyHostToDevice),40031);
-
   // wavefield
   int size = NDIM * mp->NGLOB_INNER_CORE;
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ_inner_core),sizeof(realw)*size),4001);
@@ -1993,15 +1638,11 @@
   }
   #endif
 
-
   // mass matrix
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_inner_core),sizeof(realw)*size_glob),2005);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_inner_core,h_rmass,
-                                     sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
+  copy_todevice_realw((void**)&mp->d_rmass_inner_core,h_rmass,size_glob);
 
   // kernels
   if( mp->simulation_type == 3 ){
-
     size = NGLL3*(mp->NSPEC_INNER_CORE);
 
     // density kernel
@@ -2028,44 +1669,10 @@
 #endif
 }
 
-/* ----------------------------------------------------------------------------------------------- */
 
-// OCEANS
 
 /* ----------------------------------------------------------------------------------------------- */
 
-extern "C"
-void FC_FUNC_(prepare_oceans_device,
-              PREPARE_OCEANS_DEVICE)(long* Mesh_pointer_f,
-                                     realw* h_rmass_ocean_load) {
-
-  TRACE("prepare_oceans_device");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
-
-  // mass matrix
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_ocean_load),
-                                     sizeof(realw)*mp->NGLOB_CRUST_MANTLE_OCEANS),4501);
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_ocean_load,h_rmass_ocean_load,
-                                     sizeof(realw)*mp->NGLOB_CRUST_MANTLE_OCEANS,cudaMemcpyHostToDevice),4502);
-
-  // temporary global array: used to synchronize updates on global accel array
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_updated_dof_ocean_load),
-                                     sizeof(int)*mp->NGLOB_CRUST_MANTLE_OCEANS),4503);
-  if( mp->simulation_type == 3 ){
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_updated_dof_ocean_load),
-                                       sizeof(int)*mp->NGLOB_CRUST_MANTLE_OCEANS),4504);
-  }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("prepare_oceans_device");
-#endif
-}
-
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
 // cleanup
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -2380,10 +1987,8 @@
   if( mp->gravity ){
     cudaFree(mp->d_xstore_crust_mantle);
   }
-  cudaFree(mp->d_phase_ispec_inner_crust_mantle);
 
-  cudaFree(mp->d_normal_top_crust_mantle);
-  cudaFree(mp->d_ibelm_top_crust_mantle);
+  cudaFree(mp->d_phase_ispec_inner_crust_mantle);
   cudaFree(mp->d_ibelm_bottom_crust_mantle);
 
   cudaFree(mp->d_displ_crust_mantle);
@@ -2467,18 +2072,9 @@
   cudaFree(mp->d_gammay_inner_core);
   cudaFree(mp->d_gammaz_inner_core);
 
-
   cudaFree(mp->d_muvstore_inner_core);
   cudaFree(mp->d_ibool_inner_core);
 
-  if( mp->oceans ){
-    cudaFree(mp->d_rmass_ocean_load);
-    cudaFree(mp->d_updated_dof_ocean_load);
-    if( mp->simulation_type == 3 ){
-      cudaFree(mp->d_b_updated_dof_ocean_load);    
-    }
-  }
-
   if( mp->gravity ){
     cudaFree(mp->d_xstore_inner_core);
     cudaFree(mp->d_ystore_inner_core);
@@ -2522,6 +2118,12 @@
   }
   cudaFree(mp->d_rmass_inner_core);
 
+  if( mp->oceans ){
+    cudaFree(mp->d_rmass_ocean_load);
+    cudaFree(mp->d_iglob_ocean_load);
+    cudaFree(mp->d_normal_ocean_load);
+  }
+
   // releases previous contexts
   cudaThreadExit();
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2012-08-15 19:55:44 UTC (rev 20570)
@@ -81,36 +81,6 @@
 void FC_FUNC_(get_free_device_memory,
               get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {} 
 
-void FC_FUNC_(check_max_norm_displ_gpu,
-              CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {} 
-
-void FC_FUNC_(check_max_norm_vector,
-              CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {} 
-
-void FC_FUNC_(check_max_norm_displ,
-              CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {} 
-
-void FC_FUNC_(check_max_norm_b_displ_gpu,
-              CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {} 
-
-void FC_FUNC_(check_max_norm_b_accel_gpu,
-              CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {} 
-
-void FC_FUNC_(check_max_norm_b_veloc_gpu,
-              CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {} 
-
-void FC_FUNC_(check_max_norm_b_displ,
-              CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {} 
-
-void FC_FUNC_(check_max_norm_b_accel,
-              CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {} 
-
-void FC_FUNC_(check_error_vectors,
-              CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {} 
-
-void FC_FUNC_(get_max_accel,
-              GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {} 
-
 void FC_FUNC_(check_norm_acoustic_from_device,
               CHECK_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
                                                   long* Mesh_pointer_f,
@@ -126,7 +96,37 @@
                                              realw* strain_norm2,
                                              long* Mesh_pointer_f) {} 
 
+ void FC_FUNC_(check_max_norm_displ_gpu,
+ CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {} 
 
+ void FC_FUNC_(check_max_norm_vector,
+ CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {} 
+
+ void FC_FUNC_(check_max_norm_displ,
+ CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {} 
+
+ void FC_FUNC_(check_max_norm_b_displ_gpu,
+ CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {} 
+
+ void FC_FUNC_(check_max_norm_b_accel_gpu,
+ CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {} 
+
+ void FC_FUNC_(check_max_norm_b_veloc_gpu,
+ CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {} 
+
+ void FC_FUNC_(check_max_norm_b_displ,
+ CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {} 
+
+ void FC_FUNC_(check_max_norm_b_accel,
+ CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {} 
+
+ void FC_FUNC_(check_error_vectors,
+ CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {} 
+
+ void FC_FUNC_(get_max_accel,
+ GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {} 
+
+
 //
 // src/cuda/compute_add_sources_elastic_cuda.cu
 //
@@ -299,7 +299,7 @@
                                int* SIMULATION_TYPE_f,
                                realw* b_deltatover2_F,
                                int* OCEANS,
-             int* NCHUNKS_VAL) {} 
+                               int* NCHUNKS_VAL) {} 
 
 void FC_FUNC_(kernel_3_b_cuda,
               KERNEL_3_B_CUDA)(long* Mesh_pointer,
@@ -363,7 +363,6 @@
                                         int* nrec,int* nrec_local, int* nadj_rec_local,
                                         int* NSPEC_CRUST_MANTLE, int* NGLOB_CRUST_MANTLE,
                                         int* NSPEC_CRUST_MANTLE_STRAIN_ONLY,
-                                        int* NGLOB_CRUST_MANTLE_OCEANS,
                                         int* NSPEC_OUTER_CORE, int* NGLOB_OUTER_CORE,
                                         int* NSPEC_INNER_CORE, int* NGLOB_INNER_CORE,
                                         int* NSPEC_INNER_CORE_STRAIN_ONLY,
@@ -396,8 +395,7 @@
                                               realw* b_deltat,
                                               realw* b_A_array_rotation,
                                               realw* b_B_array_rotation,
-                                              int* NSPEC_OUTER_CORE_ROTATION
-                                              ) {} 
+                                              int* NSPEC_OUTER_CORE_ROTATION) {} 
 
 void FC_FUNC_(prepare_fields_gravity_device,
               PREPARE_FIELDS_gravity_DEVICE)(long* Mesh_pointer_f,
@@ -407,8 +405,7 @@
                                              realw* minus_deriv_gravity_table,
                                              realw* density_table,
                                              realw* h_wgll_cube,
-                                             int* NRAD_GRAVITY
-                                             ) {} 
+                                             int* NRAD_GRAVITY) {} 
 
 void FC_FUNC_(prepare_fields_attenuat_device,
               PREPARE_FIELDS_ATTENUAT_DEVICE)(long* Mesh_pointer_f,
@@ -427,8 +424,7 @@
                                                  realw* factor_common_inner_core,
                                                  realw* one_minus_sum_beta_inner_core,
                                                  realw* alphaval,realw* betaval,realw* gammaval,
-                                                 realw* b_alphaval,realw* b_betaval,realw* b_gammaval
-                                                 ) {} 
+                                                 realw* b_alphaval,realw* b_betaval,realw* b_gammaval) {} 
 
 void FC_FUNC_(prepare_fields_strain_device,
               PREPARE_FIELDS_STRAIN_DEVICE)(long* Mesh_pointer_f,
@@ -455,8 +451,7 @@
                                             realw* b_epsilondev_xz_inner_core,
                                             realw* b_epsilondev_yz_inner_core,
                                             realw* eps_trace_over_3_inner_core,
-                                            realw* b_eps_trace_over_3_inner_core
-                                            ) {} 
+                                            realw* b_eps_trace_over_3_inner_core) {} 
 
 void FC_FUNC_(prepare_fields_absorb_device,
               PREPARE_FIELDS_ABSORB_DEVICE)(long* Mesh_pointer_f,
@@ -487,8 +482,7 @@
                                             realw* jacobian2D_xmin_outer_core, realw* jacobian2D_xmax_outer_core,
                                             realw* jacobian2D_ymin_outer_core, realw* jacobian2D_ymax_outer_core,
                                             realw* jacobian2D_bottom_outer_core,
-                                            realw* vp_outer_core
-                                            ) {} 
+                                            realw* vp_outer_core) {} 
 
 void FC_FUNC_(prepare_mpi_buffers_device,
               PREPARE_MPI_BUFFERS_DEVICE)(long* Mesh_pointer_f,
@@ -503,14 +497,13 @@
                                           int* num_interfaces_outer_core,
                                           int* max_nibool_interfaces_outer_core,
                                           int* nibool_interfaces_outer_core,
-                                          int* ibool_interfaces_outer_core
-                                          ){} 
+                                          int* ibool_interfaces_outer_core){} 
 
 void FC_FUNC_(prepare_fields_noise_device,
               PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f,
-                                           int* nspec_top,
-                                           int* ibelm_top_crust_mantle,
+                                           int* NSPEC_TOP,
                                            int* NSTEP,
+                                           int* h_ibelm_top_crust_mantle,
                                            realw* noise_sourcearray,
                                            realw* normal_x_noise,
                                            realw* normal_y_noise,
@@ -518,6 +511,13 @@
                                            realw* mask_noise,
                                            realw* jacobian2D_top_crust_mantle) {} 
 
+void FC_FUNC_(prepare_oceans_device,
+              PREPARE_OCEANS_DEVICE)(long* Mesh_pointer_f,
+                                     int* npoin_oceans,
+                                     realw* h_rmass_ocean_load_selected,
+                                     int* h_iglob_ocean_load,
+                                     realw* h_normal_ocean_load) {} 
+
 void FC_FUNC_(prepare_crust_mantle_device,
              PREPARE_CRUST_MANTLE_DEVICE)(long* Mesh_pointer_f,
                                           realw* h_xix, realw* h_xiy, realw* h_xiz,
@@ -530,9 +530,6 @@
                                           realw* h_rmassx,
                                           realw* h_rmassy,
                                           realw* h_rmassz,
-                                          realw* h_normal_top_crust_mantle,
-                                          int* h_ibelm_top_crust_mantle,
-                                          int* h_ibelm_bottom_crust_mantle,
                                           int* h_ibool,
                                           realw* h_xstore, realw* h_ystore, realw* h_zstore,
                                           int* h_ispec_is_tiso,
@@ -547,8 +544,8 @@
                                           int* phase_ispec_inner,
                                           int* nspec_outer,
                                           int* nspec_inner,
-                                          int* NSPEC2D_TOP_CM,
                                           int* NSPEC2D_BOTTOM_CM,
+                                          int* h_ibelm_bottom_crust_mantle,
                                           int* NCHUNKS_VAL,
                                           int* num_colors_outer,
                                           int* num_colors_inner,
@@ -561,12 +558,6 @@
                                          realw* h_gammax, realw* h_gammay, realw* h_gammaz,
                                          realw* h_rho, realw* h_kappav,
                                          realw* h_rmass,
-                                         realw* h_normal_top_outer_core,
-                                         realw* h_normal_bottom_outer_core,
-                                         realw* h_jacobian2D_top_outer_core,
-                                         realw* h_jacobian2D_bottom_outer_core,
-                                         int* h_ibelm_top_outer_core,
-                                         int* h_ibelm_bottom_outer_core,
                                          int* h_ibool,
                                          realw* h_xstore, realw* h_ystore, realw* h_zstore,
                                          int* num_phase_ispec,
@@ -575,6 +566,12 @@
                                          int* nspec_inner,
                                          int* NSPEC2D_TOP_OC,
                                          int* NSPEC2D_BOTTOM_OC,
+                                         realw* h_normal_top_outer_core,
+                                         realw* h_normal_bottom_outer_core,
+                                         realw* h_jacobian2D_top_outer_core,
+                                         realw* h_jacobian2D_bottom_outer_core,
+                                         int* h_ibelm_top_outer_core,
+                                         int* h_ibelm_bottom_outer_core,
                                          int* num_colors_outer,
                                          int* num_colors_inner,
                                          int* num_elem_colors) {} 
@@ -586,7 +583,6 @@
                                          realw* h_gammax, realw* h_gammay, realw* h_gammaz,
                                          realw* h_rho, realw* h_kappav, realw* h_muv,
                                          realw* h_rmass,
-                                         int* h_ibelm_top_inner_core,
                                          int* h_ibool,
                                          realw* h_xstore, realw* h_ystore, realw* h_zstore,
                                          realw *c11store,realw *c12store,realw *c13store,
@@ -597,14 +593,11 @@
                                          int* nspec_outer,
                                          int* nspec_inner,
                                          int* NSPEC2D_TOP_IC,
+                                         int* h_ibelm_top_inner_core,
                                          int* num_colors_outer,
                                          int* num_colors_inner,
                                          int* num_elem_colors) {} 
 
-void FC_FUNC_(prepare_oceans_device,
-              PREPARE_OCEANS_DEVICE)(long* Mesh_pointer_f,
-             realw* h_rmass_ocean_load) {} 
-
 void FC_FUNC_(prepare_cleanup_device,
               PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f,
               int* NCHUNKS_VAL) {} 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/create_MPI_interfaces.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/create_MPI_interfaces.f90	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/create_MPI_interfaces.f90	2012-08-15 19:55:44 UTC (rev 20570)
@@ -284,7 +284,7 @@
   integer :: ier
   ! for central cube buffers
   integer :: nspec2D_xmin_inner_core,nspec2D_xmax_inner_core, &
-            nspec2D_ymin_inner_core,nspec2D_ymax_inner_core  
+            nspec2D_ymin_inner_core,nspec2D_ymax_inner_core
   integer, dimension(:),allocatable :: ibelm_xmin_inner_core,ibelm_xmax_inner_core
   integer, dimension(:),allocatable :: ibelm_ymin_inner_core,ibelm_ymax_inner_core
   integer, dimension(:),allocatable :: ibelm_top_inner_core
@@ -418,7 +418,7 @@
               ibelm_bottom_inner_core(NSPEC2D_BOTTOM_IC), &
               stat=ier)
       if( ier /= 0 ) call exit_MPI(myrank,'error allocating central cube index arrays')
-      
+
       ! gets coupling arrays for inner core
       nspec2D_xmin_inner_core = nspec2D_xmin
       nspec2D_xmax_inner_core = nspec2D_xmax

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/create_mass_matrices.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/create_mass_matrices.f90	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/create_mass_matrices.f90	2012-08-15 19:55:44 UTC (rev 20570)
@@ -106,7 +106,7 @@
 
           iglob = ibool(i,j,k,ispec)
           weight = wxgll(i)*wygll(j)*wzgll(k)
-          
+
           ! definition depends if region is fluid or solid
           select case( iregion_code)
           case( IREGION_CRUST_MANTLE, IREGION_INNER_CORE )
@@ -161,7 +161,7 @@
       ! loops over surface points
       do j = 1,NGLLY
         do i = 1,NGLLX
-          
+
           ! if 3D Earth with topography, compute local height of oceans
           if( TOPOGRAPHY ) then
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/create_regions_mesh.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/create_regions_mesh.f90	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/create_regions_mesh.f90	2012-08-15 19:55:44 UTC (rev 20570)
@@ -104,7 +104,7 @@
     case default
       call exit_MPI(myrank,'error ipass value in create_regions_mesh')
     end select
-    call flush_IMAIN()    
+    call flush_IMAIN()
   endif
 
   ! create the name for the database of the current slide and region
@@ -115,7 +115,7 @@
   if( myrank == 0) then
     write(IMAIN,*)
     write(IMAIN,*) '  ...allocating arrays '
-    call flush_IMAIN()    
+    call flush_IMAIN()
   endif
   call crm_allocate_arrays(iregion_code,nspec,ipass, &
                           NSPEC2DMAX_XMIN_XMAX,NSPEC2DMAX_YMIN_YMAX, &
@@ -127,7 +127,7 @@
   if( myrank == 0) then
     write(IMAIN,*)
     write(IMAIN,*) '  ...setting up layers '
-    call flush_IMAIN()    
+    call flush_IMAIN()
   endif
   call crm_setup_layers(iregion_code,nspec,ipass,NEX_PER_PROC_ETA)
 
@@ -136,7 +136,7 @@
   if( myrank == 0) then
     write(IMAIN,*)
     write(IMAIN,*) '  ...creating mesh elements '
-    call flush_IMAIN()    
+    call flush_IMAIN()
   endif
   call crm_create_elements(iregion_code,nspec,ipass, &
                           NEX_PER_PROC_XI,NEX_PER_PROC_ETA, &
@@ -151,7 +151,7 @@
     if( myrank == 0) then
       write(IMAIN,*)
       write(IMAIN,*) '  ...creating global addressing'
-      call flush_IMAIN()      
+      call flush_IMAIN()
     endif
     call crm_setup_indexing(nspec,nglob_theor,npointot)
 
@@ -161,7 +161,7 @@
     if( myrank == 0) then
       write(IMAIN,*)
       write(IMAIN,*) '  ...creating MPI buffers'
-      call flush_IMAIN()      
+      call flush_IMAIN()
     endif
     call crm_setup_mpi_buffers(npointot,nspec,iregion_code)
 
@@ -179,7 +179,7 @@
     if( myrank == 0) then
       write(IMAIN,*)
       write(IMAIN,*) '  ...precomputing jacobian'
-      call flush_IMAIN()      
+      call flush_IMAIN()
     endif
     call get_jacobian_boundaries(myrank,iboun,nspec,xstore,ystore,zstore, &
               dershape2D_x,dershape2D_y,dershape2D_bottom,dershape2D_top, &
@@ -200,7 +200,7 @@
     if( myrank == 0) then
       write(IMAIN,*)
       write(IMAIN,*) '  ...creating chunk buffers'
-      call flush_IMAIN()      
+      call flush_IMAIN()
     endif
     call create_chunk_buffers(iregion_code,nspec,ibool,idoubling, &
                               xstore,ystore,zstore,nglob_theor, &
@@ -218,7 +218,7 @@
     if( myrank == 0) then
       write(IMAIN,*)
       write(IMAIN,*) '  ...preparing MPI interfaces'
-      call flush_IMAIN()      
+      call flush_IMAIN()
     endif
     ! creates MPI interface arrays
     call create_MPI_interfaces(iregion_code)
@@ -245,7 +245,7 @@
     if( myrank == 0) then
       write(IMAIN,*)
       write(IMAIN,*) '  ...element mesh coloring '
-      call flush_IMAIN()      
+      call flush_IMAIN()
     endif
     call setup_color_perm(iregion_code)
 
@@ -277,7 +277,7 @@
     if( myrank == 0) then
       write(IMAIN,*)
       write(IMAIN,*) '  ...creating mass matrix'
-      call flush_IMAIN()      
+      call flush_IMAIN()
     endif
 
     ! allocates mass matrices in this slice (will be fully assembled in the solver)
@@ -331,7 +331,7 @@
     if( myrank == 0) then
       write(IMAIN,*)
       write(IMAIN,*) '  ...saving binary files'
-      call flush_IMAIN()      
+      call flush_IMAIN()
     endif
     ! saves mesh and model parameters
     call save_arrays_solver(myrank,nspec,nglob,idoubling,ibool, &
@@ -357,11 +357,11 @@
       if( myrank == 0) then
         write(IMAIN,*)
         write(IMAIN,*) '  ...saving boundary mesh files'
-        call flush_IMAIN()        
+        call flush_IMAIN()
       endif
       ! saves boundary file
       call save_arrays_solver_boundary()
-      
+
     endif
 
     ! compute volume, bottom and top area of that part of the slice
@@ -383,7 +383,7 @@
       if( myrank == 0) then
         write(IMAIN,*)
         write(IMAIN,*) '  ...saving AVS mesh files'
-        call flush_IMAIN()        
+        call flush_IMAIN()
       endif
       call crm_save_mesh_files(nspec,npointot,iregion_code)
     endif
@@ -915,17 +915,18 @@
       ! time estimate
       tCPU = wtime() - time_start
 
-      ! outputs remaining time (poor estimation)
-      tCPU = (1.0-(ilayer_loop-ifirst_region+1.0)/(ilast_region-ifirst_region+1.0)) &
-                /(ilayer_loop-ifirst_region+1.0)/(ilast_region-ifirst_region+1.0)*tCPU*10.0
-      
-      write(IMAIN,*) "    ",(ilayer_loop-ifirst_region+1.0)/(ilast_region-ifirst_region+1.0) * 100.0," %", &
-                    " time remaining:", tCPU,"s"
-      
       ! outputs current time on system
       call date_and_time(VALUES=tval)
-      write(IMAIN,*) "    ",tval(5),"h",tval(6),"min",tval(7),".",tval(8),"sec"
 
+      ! debug: outputs remaining time (poor estimation)
+      !tCPU = (1.0-(ilayer_loop-ifirst_region+1.0)/(ilast_region-ifirst_region+1.0)) &
+      !          /(ilayer_loop-ifirst_region+1.0)/(ilast_region-ifirst_region+1.0)*tCPU*10.0
+
+      ! user output
+      write(IMAIN,'(a,f5.1,a,a,i2.2,a,i2.2,a,i2.2,a)') &
+        "    ",(ilayer_loop-ifirst_region+1.0)/(ilast_region-ifirst_region+1.0) * 100.0,"%", &
+        "    time: ",tval(5),"h ",tval(6),"min ",tval(7),"sec"
+
       ! flushes I/O buffer
       call flush_IMAIN()
     endif

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/get_perm_color.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/get_perm_color.f90	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/get_perm_color.f90	2012-08-15 19:55:44 UTC (rev 20570)
@@ -75,13 +75,13 @@
   !user output
   if(myrank == 0) then
     write(IMAIN,*) '     colors:'
-    write(IMAIN,*) '     number of colors for inner elements = ',nb_colors_inner_elements
-    write(IMAIN,*) '     number of colors for outer elements = ',nb_colors_outer_elements
-    write(IMAIN,*) '     total number of colors (sum of both) = ', nb_colors_inner_elements + nb_colors_outer_elements
+    write(IMAIN,*) '       number of colors for inner elements = ',nb_colors_inner_elements
+    write(IMAIN,*) '       number of colors for outer elements = ',nb_colors_outer_elements
+    write(IMAIN,*) '       total number of colors (sum of both) = ', nb_colors_inner_elements + nb_colors_outer_elements
     write(IMAIN,*) '     elements:'
-    write(IMAIN,*) '     number of elements for outer elements  = ',nspec_outer
-    write(IMAIN,*) '     number of elements for inner elements  = ',nspec_inner
-    write(IMAIN,*) '     total number of elements for domain elements  = ',nspec_domain
+    write(IMAIN,*) '       number of elements for outer elements  = ',nspec_outer
+    write(IMAIN,*) '       number of elements for inner elements  = ',nspec_inner
+    write(IMAIN,*) '       total number of elements for domain elements  = ',nspec_domain
   endif
 
   ! total number of colors used

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/model_atten3D_QRFSI12.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/model_atten3D_QRFSI12.f90	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/model_atten3D_QRFSI12.f90	2012-08-15 19:55:44 UTC (rev 20570)
@@ -52,12 +52,12 @@
   ! QRFSI12 constants
   integer,parameter :: NKQ=8,MAXL_Q=12
   integer,parameter :: NSQ=(MAXL_Q+1)**2,NDEPTHS_REFQ=913
-  
+
   ! model_atten3D_QRFSI12_variables
   double precision,dimension(:,:),allocatable :: QRFSI12_Q_dqmu
   double precision,dimension(:),allocatable :: QRFSI12_Q_spknt
   double precision,dimension(:),allocatable :: QRFSI12_Q_refdepth,QRFSI12_Q_refqmu
-  
+
   end module model_atten3D_QRFSI12_par
 
 !
@@ -69,7 +69,7 @@
 ! standard routine to setup model
 
   use model_atten3D_QRFSI12_par
-  
+
   implicit none
 
   include "constants.h"
@@ -77,7 +77,7 @@
   include 'mpif.h'
 
   integer :: myrank
-  
+
   ! local parameters
   integer :: ier
 
@@ -129,7 +129,7 @@
 ! get the dq model coefficients
   open(unit=10,file=QRFSI12,status='old',action='read',iostat=ier)
   if( ier /= 0 ) call exit_MPI(0,'error opening model file QRFSI12.dat')
-  
+
   do k=1,NKQ
     read(10,*)index
     j=0
@@ -164,7 +164,7 @@
 ! get the depths and 1/Q values of the reference model
   open(11,file=QRFSI12_ref,status='old',action='read',iostat=ier)
   if( ier /= 0 ) call exit_MPI(0,'error opening model file ref_QRFSI12')
-  
+
   do j=1,NDEPTHS_REFQ
     read(11,*)QRFSI12_Q_refdepth(j),QRFSI12_Q_refqmu(j)
   enddo
@@ -201,14 +201,14 @@
 
   ! in Colleen's original code theta refers to the latitude.  Here we have redefined theta to be colatitude
   ! to agree with the rest of specfem
- 
+
   ! debug
   !  print *,'entering QRFSI12 subroutine'
 
   ylat=90.0d0-theta
   xlon=phi
 
-  ! only checks radius for crust, idoubling is missleading for oceanic crust 
+  ! only checks radius for crust, idoubling is misleading for oceanic crust
   ! when we want to expand mantle up to surface...
 
 !  !if(idoubling == IFLAG_CRUST .or. radius >= rmoho) then
@@ -222,27 +222,27 @@
 
     !debug
     !   print *,'QRFSI12: we are in the inner core'
-    
+
     Qmu = 84.6d0
-    
+
   else if(idoubling == IFLAG_OUTER_CORE_NORMAL) then
-  
+
     ! we are in the outer core
-    
+
     !debug
     !   print *,'QRFSI12: we are in the outer core'
-    
+
     Qmu = 0.0d0
-    
-  else 
-    
+
+  else
+
     ! we are in the mantle
-    
+
     depth = 6371.-radius
-    
+
     !debug
     !   print *,'QRFSI12: we are in the mantle at depth',depth
-    
+
     ifnd=0
     do i=2,NDEPTHS_REFQ
       if(depth >= QRFSI12_Q_refdepth(i-1) .and. depth < QRFSI12_Q_refdepth(i))then
@@ -282,7 +282,7 @@
 
     Qmu = 1/smallq
 
-    ! if Qmu is larger than MAX_ATTENUATION_VALUE, set it to ATTENUATION_COMP_MAXIMUM.  
+    ! if Qmu is larger than ATTENUATION_COMP_MAXIMUM, set it to 0.99*ATTENUATION_COMP_MAXIMUM.
     ! This assumes that this value is high enough that at this point there is almost no attenuation at all.
     if(Qmu >= ATTENUATION_COMP_MAXIMUM) Qmu = 0.99d0*ATTENUATION_COMP_MAXIMUM
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/model_attenuation.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/model_attenuation.f90	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/model_attenuation.f90	2012-08-15 19:55:44 UTC (rev 20570)
@@ -230,7 +230,7 @@
   integer :: myrank,REFERENCE_1D_MODEL
   double precision :: RICB, RCMB, R670, R220, R80
   logical :: CRUSTAL
-  
+
   ! local parameters
   double precision :: tau_e(N_SLS)
   double precision :: Qb
@@ -248,21 +248,21 @@
   else if(REFERENCE_1D_MODEL == REFERENCE_MODEL_IASP91) then
     AM_V%Qn = 12
   else if(REFERENCE_1D_MODEL == REFERENCE_MODEL_AK135) then
-    ! redefines "pure" 1D model without crustal modification 
+    ! redefines "pure" 1D model without crustal modification
     call define_model_ak135(.FALSE.)
     AM_V%Qn = NR_AK135
   else if(REFERENCE_1D_MODEL == REFERENCE_MODEL_1066A) then
-    ! redefines "pure" 1D model without crustal modification 
+    ! redefines "pure" 1D model without crustal modification
     call define_model_1066a(.FALSE.)
     AM_V%Qn = NR_1066A
   else if(REFERENCE_1D_MODEL == REFERENCE_MODEL_1DREF) then
-    ! redefines "pure" 1D model without crustal modification 
+    ! redefines "pure" 1D model without crustal modification
     call define_model_1dref(.FALSE.)
     AM_V%Qn = NR_REF
   else if(REFERENCE_1D_MODEL == REFERENCE_MODEL_JP1D) then
     AM_V%Qn = 12
   else if(REFERENCE_1D_MODEL == REFERENCE_MODEL_SEA1D) then
-    ! redefines "pure" 1D model without crustal modification 
+    ! redefines "pure" 1D model without crustal modification
     call define_model_sea1d(.FALSE.)
     AM_V%Qn = NR_SEA1D
   else
@@ -308,20 +308,20 @@
   ! re-defines 1D models with crustal modification if necessary
   if( CRUSTAL ) then
     if(REFERENCE_1D_MODEL == REFERENCE_MODEL_AK135) then
-      ! redefines 1D model with crustal modification 
+      ! redefines 1D model with crustal modification
       call define_model_ak135(CRUSTAL)
     else if(REFERENCE_1D_MODEL == REFERENCE_MODEL_1066A) then
-      ! redefines 1D model with crustal modification 
+      ! redefines 1D model with crustal modification
       call define_model_1066a(CRUSTAL)
     else if(REFERENCE_1D_MODEL == REFERENCE_MODEL_1DREF) then
-      ! redefines 1D model with crustal modification 
+      ! redefines 1D model with crustal modification
       call define_model_1dref(CRUSTAL)
     else if(REFERENCE_1D_MODEL == REFERENCE_MODEL_SEA1D) then
-      ! redefines 1D model with crustal modification 
+      ! redefines 1D model with crustal modification
       call define_model_sea1d(CRUSTAL)
     endif
   endif
-  
+
   end subroutine model_attenuation_setup
 
 !

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/model_s362ani.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/model_s362ani.f90	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/model_s362ani.f90	2012-08-15 19:55:44 UTC (rev 20570)
@@ -58,7 +58,7 @@
   real(kind=4),dimension(:),allocatable :: wk1,wk2,wk3
 
   integer, dimension(maxhpa) :: lmxhpa,itypehpa,numcoe,nconpt
-  
+
   integer,dimension(:,:),allocatable :: itpspl,iconpt
   integer,dimension(:),allocatable :: ihpakern,ivarkern
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/save_arrays_solver.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/save_arrays_solver.f90	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/save_arrays_solver.f90	2012-08-15 19:55:44 UTC (rev 20570)
@@ -79,7 +79,7 @@
   open(unit=27,file=prname(1:len_trim(prname))//'array_dims.txt', &
         status='unknown',action='write',iostat=ier)
   if( ier /= 0 ) call exit_mpi(myrank,'error opening array_dims file')
-  
+
   write(27,*) nspec
   write(27,*) nglob
   close(27)
@@ -320,7 +320,7 @@
   ! uncomment for vp & vs model storage
   if( SAVE_MESH_FILES ) then
     ! outputs model files in binary format
-    call save_arrays_solver_meshfiles(myrank,nspec)    
+    call save_arrays_solver_meshfiles(myrank,nspec)
   endif
 
   end subroutine save_arrays_solver
@@ -340,7 +340,7 @@
 
   use create_regions_mesh_par2,only: &
     rhostore,kappavstore,kappahstore,muvstore,muhstore,eta_anisostore, &
-    Qmu_store, &    
+    Qmu_store, &
     prname
 
   implicit none
@@ -352,7 +352,7 @@
   integer :: i,j,k,ispec,ier
   real(kind=CUSTOM_REAL) :: scaleval1,scaleval2
   real(kind=CUSTOM_REAL),dimension(:,:,:,:),allocatable :: temp_store
-  
+
   ! scaling factors to re-dimensionalize units
   scaleval1 = sngl( sqrt(PI*GRAV*RHOAV)*(R_EARTH/1000.0d0) )
   scaleval2 = sngl( RHOAV/1000.0d0 )
@@ -425,12 +425,12 @@
     write(27) eta_anisostore
     close(27)
   endif ! TRANSVERSE_ISOTROPY
-  
+
   ! shear attenuation
   if( ATTENUATION ) then
     ! saves Qmu_store to full custom_real array
     ! uses temporary array
-    allocate(temp_store(NGLLX,NGLLY,NGLLZ,nspec))            
+    allocate(temp_store(NGLLX,NGLLY,NGLLZ,nspec))
     if (ATTENUATION_3D) then
       ! attenuation arrays are fully 3D
       if(CUSTOM_REAL == SIZE_REAL) then
@@ -453,7 +453,7 @@
             enddo
           enddo
         enddo
-      enddo        
+      enddo
     endif
 
     ! Qmu
@@ -465,8 +465,8 @@
     close(27)
 
     ! frees temporary memory
-    deallocate(temp_store)      
-  endif ! ATTENUATION    
+    deallocate(temp_store)
+  endif ! ATTENUATION
 
   end subroutine save_arrays_solver_meshfiles
 !

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/setup_MPI_interfaces.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/setup_MPI_interfaces.f90	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/setup_MPI_interfaces.f90	2012-08-15 19:55:44 UTC (rev 20570)
@@ -32,11 +32,11 @@
     INCLUDE_CENTRAL_CUBE,myrank,NUMFACES_SHARED
 
   use create_MPI_interfaces_par
-  
+
   use MPI_crust_mantle_par
   use MPI_outer_core_par
-  use MPI_inner_core_par    
-    
+  use MPI_inner_core_par
+
   implicit none
 
   integer,intent(in):: iregion_code
@@ -89,7 +89,7 @@
 
   ! frees arrays not needed any further
   deallocate(iprocfrom_faces,iprocto_faces,imsg_type)
-  deallocate(iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners)  
+  deallocate(iproc_master_corners,iproc_worker1_corners,iproc_worker2_corners)
   deallocate(buffer_send_chunkcorn_scalar,buffer_recv_chunkcorn_scalar)
   deallocate(buffer_send_chunkcorn_vector,buffer_recv_chunkcorn_vector)
   select case( iregion_code )
@@ -112,7 +112,7 @@
     deallocate(iboolleft_eta_inner_core,iboolright_eta_inner_core)
     deallocate(iboolfaces_inner_core)
   end select
-  
+
   ! synchronizes MPI processes
   call sync_all()
 
@@ -393,7 +393,7 @@
 
   use create_MPI_interfaces_par
   use MPI_inner_core_par
-  
+
   implicit none
 
   integer :: MAX_NEIGHBOURS,max_nibool
@@ -478,7 +478,7 @@
                  NGLOB_INNER_CORE, &
                  test_flag,ndim_assemble, &
                  iproc_eta,addressing,NCHUNKS,NPROC_XI,NPROC_ETA)
-    
+
     ! frees array not needed anymore
     deallocate(ibelm_bottom_inner_core)
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/setup_color_perm.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/setup_color_perm.f90	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/setup_color_perm.f90	2012-08-15 19:55:44 UTC (rev 20570)
@@ -51,7 +51,7 @@
 
   ! user output
   if(myrank == 0) then
-    write(IMAIN,*) 'mesh coloring: ',USE_MESH_COLORING_GPU
+    write(IMAIN,*) '  mesh coloring: ',USE_MESH_COLORING_GPU
   endif
 
   select case( iregion_code )
@@ -272,8 +272,12 @@
 
   integer :: nspec_outer,nspec_inner,nspec_domain
   integer :: nspec_outer_min_global,nspec_outer_max_global
-  integer :: nb_colors,nb_colors_min,nb_colors_max
+  integer :: nspec_inner_min_global,nspec_inner_max_global
+  integer :: min_elem_global,max_elem_global
 
+  integer :: nb_colors
+  integer :: nb_colors_min,nb_colors_max
+
   integer :: icolor,ispec,ispec_counter
   integer :: ispec_inner,ispec_outer
   integer :: ier
@@ -379,8 +383,6 @@
     call exit_MPI(myrank, 'incorrect total number of elements in all the colors of the mesh for outer elements')
   endif
 
-
-
   ! debug: no mesh coloring, only creates dummy coloring arrays
   if( DEBUG_COLOR ) then
     nb_colors_outer_elements = 0
@@ -455,6 +457,20 @@
     close(99)
   endif
 
+  ! checks non-zero elements in colors
+  do icolor = 1,nb_colors_outer_elements + nb_colors_inner_elements
+    ! checks
+    if( num_of_elems_in_this_color(icolor) == 0 ) then
+      print *,'rank: ',myrank,'domain:',idomain,' nspec = ',nspec_domain
+      print *,'error zero elements in this color:',icolor
+      print *,'total number of elements in all the colors of the mesh = ', &
+        sum(num_of_elems_in_this_color)
+      call exit_MPI(myrank, 'zero elements in a color of the mesh')
+    endif
+  enddo
+
+
+
   ! sets up domain coloring arrays
   select case(idomain)
   case( IREGION_CRUST_MANTLE )
@@ -536,16 +552,23 @@
   call min_all_i(nb_colors,nb_colors_min)
   call max_all_i(nb_colors,nb_colors_max)
 
-  ! user output
+  ! min/max of elements per color
+  call min_all_i(minval(num_of_elems_in_this_color(:)),min_elem_global)
+  call max_all_i(maxval(num_of_elems_in_this_color(:)),max_elem_global)
+
+  ! min/max of inner/outer elements
+  call min_all_i(nspec_inner,nspec_inner_min_global)
+  call max_all_i(nspec_inner,nspec_inner_max_global)
   call min_all_i(nspec_outer,nspec_outer_min_global)
   call max_all_i(nspec_outer,nspec_outer_max_global)
-  call min_all_i(nspec_outer,nspec_outer_min_global)
-  call max_all_i(nspec_outer,nspec_outer_max_global)
+
+  ! user output
   if(myrank == 0) then
-    write(IMAIN,*) '       colors min = ',nb_colors_min
-    write(IMAIN,*) '       colors max = ',nb_colors_max
-    write(IMAIN,*) '       outer elements: min = ',nspec_outer_min_global
-    write(IMAIN,*) '       outer elements: max = ',nspec_outer_max_global
+    write(IMAIN,*) '     total colors:'
+    write(IMAIN,*) '       total colors min/max = ',nb_colors_min,nb_colors_max
+    write(IMAIN,*) '       elements per color min/max = ',min_elem_global,max_elem_global
+    write(IMAIN,*) '       inner elements min/max = ',nspec_inner_min_global,nspec_inner_max_global
+    write(IMAIN,*) '       outer elements min/max = ',nspec_outer_min_global,nspec_outer_max_global
   endif
 
   ! debug: outputs permutation array as vtk file

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/setup_model.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/setup_model.f90	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/setup_model.f90	2012-08-15 19:55:44 UTC (rev 20570)
@@ -185,5 +185,5 @@
 
   ! flushes I/O buffer
   call flush_IMAIN()
-  
+
   end subroutine sm_output_info

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/exit_mpi.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/exit_mpi.f90	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/exit_mpi.f90	2012-08-15 19:55:44 UTC (rev 20570)
@@ -116,19 +116,19 @@
   implicit none
 
   include "constants.h"
-  
+
   ! only master process writes out to main output file
   ! file I/O in fortran is buffered by default
   !
-  ! note: Fortran2003 includes a FLUSH statement 
+  ! note: Fortran2003 includes a FLUSH statement
   !          which is implemented by most compilers by now
   !
   ! otherwise:
-  !   a) comment out the line below 
+  !   a) comment out the line below
   !   b) try to use instead: call flush(IMAIN)
-    
+
   flush(IMAIN)
-      
+
   end subroutine flush_IMAIN
 
 !-------------------------------------------------------------------------------------------------

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/get_model_parameters.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/get_model_parameters.f90	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/get_model_parameters.f90	2012-08-15 19:55:44 UTC (rev 20570)
@@ -158,19 +158,35 @@
 !
 !---
 
+  ! default values
 
   ! uses PREM as the 1D reference model by default
-  ! uses no mantle heterogeneities by default
-  ! uses no 3D model by default
+  REFERENCE_1D_MODEL = REFERENCE_MODEL_PREM
+
+  ! uses no anisotropic 3D model by default
   ANISOTROPIC_3D_MANTLE = .false.
-  ATTENUATION_3D = .false.
+
+  ! attenuation
+  ! note: to save memory, one can set ATTENUATION_3D to .false. which
+  !          will create attenuation arrays storing only 1 point per element
+  !          (otherwise, 3D attenuation arrays will be defined for all GLL points)
+  if( USE_3D_ATTENUATION ) then
+    ATTENUATION_3D = .true.
+  else
+    ATTENUATION_3D = .false.
+  endif
+
+  ! no crustal mesh stretching and 3D crust models by default
   CASE_3D = .false.
   CRUSTAL = .false.
+  ONE_CRUST = .false.
+
+  ! uses no 3D heterogeneity mantle by default
   HETEROGEN_3D_MANTLE = .false.
+  ISOTROPIC_3D_MANTLE = .false.
   HONOR_1D_SPHERICAL_MOHO = .false.
-  ISOTROPIC_3D_MANTLE = .false.
-  ONE_CRUST = .false.
-  REFERENCE_1D_MODEL = REFERENCE_MODEL_PREM
+
+  ! no 3D model by default
   THREE_D_MODEL = 0
   TRANSVERSE_ISOTROPY = .false.
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/read_compute_parameters.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/read_compute_parameters.f90	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/read_compute_parameters.f90	2012-08-15 19:55:44 UTC (rev 20570)
@@ -396,9 +396,6 @@
   if(ABSORBING_CONDITIONS .and. NCHUNKS == 3) &
     stop 'absorbing conditions not supported for three chunks yet'
 
-  if(ATTENUATION_3D .and. .not. ATTENUATION) &
-    stop 'need ATTENUATION to use ATTENUATION_3D'
-
   if(ATTENUATION_NEW .and. .not. ATTENUATION) &
     stop 'need ATTENUATION to use ATTENUATION_NEW'
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/check_simulation_stability.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/check_simulation_stability.f90	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/check_simulation_stability.f90	2012-08-15 19:55:44 UTC (rev 20570)
@@ -294,10 +294,10 @@
     endif
 
     write(IMAIN,*)
-    
+
     ! flushes file buffer for main output file (IMAIN)
     call flush_IMAIN()
-    
+
     ! write time stamp file to give information about progression of simulation
     write(outputname,"('/timestamp',i6.6)") it
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_coupling.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_coupling.f90	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_coupling.f90	2012-08-15 19:55:44 UTC (rev 20570)
@@ -30,7 +30,7 @@
                             accel_outer_core,b_accel_outer_core, &
                             normal_top_outer_core,jacobian2D_top_outer_core, &
                             wgllwgll_xy,ibool_outer_core,ibelm_top_outer_core, &
-                            SIMULATION_TYPE,nspec_top)
+                            SIMULATION_TYPE,nspec2D_top)
 
   implicit none
 
@@ -56,7 +56,7 @@
   integer, dimension(NSPEC2D_TOP_OC) :: ibelm_top_outer_core
 
   integer SIMULATION_TYPE
-  integer nspec_top
+  integer nspec2D_top
 
   ! local parameters
   real(kind=CUSTOM_REAL) :: displ_x,displ_y,displ_z,displ_n,nx,ny,nz,weight
@@ -64,16 +64,16 @@
 
 
   ! for surface elements exactly on the CMB
-  do ispec2D = 1,nspec_top !NSPEC2D_TOP(IREGION_OUTER_CORE)
-  
+  do ispec2D = 1,nspec2D_top !NSPEC2D_TOP(IREGION_OUTER_CORE)
+
     ispec = ibelm_top_outer_core(ispec2D)
     ispec_selected = ibelm_bottom_crust_mantle(ispec2D)
 
     ! only for DOFs exactly on the CMB (top of these elements)
-    k = NGLLZ    
+    k = NGLLZ
     ! get displacement on the solid side using pointwise matching
     k_corresp = 1
-    
+
     do j = 1,NGLLY
       do i = 1,NGLLX
         ! corresponding points are located at the bottom of the mantle
@@ -339,7 +339,7 @@
                             normal_bottom_outer_core,jacobian2D_bottom_outer_core, &
                             wgllwgll_xy,ibool_outer_core,ibelm_bottom_outer_core, &
                             RHO_BOTTOM_OC,minus_g_icb, &
-                            SIMULATION_TYPE,nspec_top)
+                            SIMULATION_TYPE,nspec2D_top)
 
   implicit none
 
@@ -368,14 +368,14 @@
   real(kind=CUSTOM_REAL) minus_g_icb
 
   integer SIMULATION_TYPE
-  integer nspec_top
+  integer nspec2D_top
 
   ! local parameters
   real(kind=CUSTOM_REAL) :: pressure,nx,ny,nz,weight
   integer :: i,j,k,k_corresp,ispec,ispec2D,iglob,iglob_inner_core,ispec_selected
 
   ! for surface elements exactly on the ICB
-  do ispec2D = 1,nspec_top ! NSPEC2D_TOP(IREGION_INNER_CORE)
+  do ispec2D = 1,nspec2D_top ! NSPEC2D_TOP(IREGION_INNER_CORE)
 
     ispec = ibelm_top_inner_core(ispec2D)
     ispec_selected = ibelm_bottom_outer_core(ispec2D)

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90	2012-08-15 19:55:44 UTC (rev 20570)
@@ -225,7 +225,7 @@
   endif
 
   !   ymax
-  
+
   if (SIMULATION_TYPE == 3 .and. nspec2D_ymax_outer_core > 0)  then
     call read_abs(7,absorb_ymax_outer_core,reclen_ymax_outer_core,NSTEP-it+1)
   endif
@@ -273,7 +273,7 @@
   endif
 
   ! zmin
-  
+
   ! for surface elements exactly on the ICB
   if (SIMULATION_TYPE == 3 .and. nspec2D_zmin_outer_core > 0)  then
     call read_abs(8,absorb_zmin_outer_core,reclen_zmin,NSTEP-it+1)

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/get_attenuation.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/get_attenuation.f90	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/get_attenuation.f90	2012-08-15 19:55:44 UTC (rev 20570)
@@ -26,9 +26,11 @@
 !=====================================================================
 
 
-  subroutine get_attenuation_model_3D_or_1D(myrank, prname, one_minus_sum_beta, &
-                                factor_common, scale_factor, tau_s, &
-                                vx, vy, vz, vnspec)
+  subroutine get_attenuation_model_3D_or_1D(myrank, prname, &
+                                           one_minus_sum_beta, &
+                                           factor_common, &
+                                           scale_factor, tau_s, &
+                                           vx, vy, vz, vnspec)
 
   use specfem_par,only: ATTENUATION_VAL, ATTENUATION_3D_VAL
 
@@ -36,12 +38,15 @@
 
   include 'constants.h'
 
-  integer :: myrank,vx,vy,vz,vnspec
-  character(len=150) :: prname
+  integer :: myrank
+
+  integer :: vx,vy,vz,vnspec
   double precision, dimension(vx,vy,vz,vnspec)       :: one_minus_sum_beta, scale_factor
   double precision, dimension(N_SLS,vx,vy,vz,vnspec) :: factor_common
   double precision, dimension(N_SLS)                 :: tau_s
 
+  character(len=150) :: prname
+
   ! local parameters
   integer :: i,j,k,ispec,ier
   double precision, dimension(N_SLS) :: tau_e, fc
@@ -57,8 +62,8 @@
   if( ier /= 0 ) call exit_MPI(myrank,'error opening file attenuation.bin')
 
   read(27) tau_s
-  read(27) factor_common
-  read(27) scale_factor
+  read(27) factor_common ! tau_e_store
+  read(27) scale_factor  ! Qmu_store
   read(27) T_c_source
   close(27)
 
@@ -118,10 +123,11 @@
 
   include 'constants.h'
 
-  double precision, dimension(N_SLS) :: tau_s, tau_e, beta, factor_common
-  double precision  one_minus_sum_beta
+  double precision, dimension(N_SLS),intent(in) :: tau_s, tau_e
+  double precision, dimension(N_SLS),intent(out) :: factor_common
+  double precision,intent(out) ::  one_minus_sum_beta
 
-  double precision, dimension(N_SLS) :: tauinv
+  double precision, dimension(N_SLS) :: tauinv,beta
   integer i
 
   tauinv(:) = -1.0d0 / tau_s(:)
@@ -215,8 +221,9 @@
 
   include 'constants.h'
 
-  double precision, dimension(N_SLS) :: tau_s, alphaval, betaval,gammaval
-  real(kind=CUSTOM_REAL) deltat
+  double precision, dimension(N_SLS), intent(in) :: tau_s
+  double precision, dimension(N_SLS), intent(out) :: alphaval, betaval,gammaval
+  real(kind=CUSTOM_REAL), intent(in) :: deltat
 
   double precision, dimension(N_SLS) :: tauinv
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2012-08-15 19:55:44 UTC (rev 20570)
@@ -705,14 +705,12 @@
   allocate(omsb_crust_mantle_dble(ATT1,ATT2,ATT3,ATT4), &
           factor_scale_crust_mantle_dble(ATT1,ATT2,ATT3,ATT4),stat=ier)
   if( ier /= 0 ) call exit_MPI(myrank,'error allocating omsb crust_mantle arrays')
+  allocate(factor_common_crust_mantle_dble(N_SLS,ATT1,ATT2,ATT3,ATT4),stat=ier)
+  if( ier /= 0 ) call exit_MPI(myrank,'error allocating factor_common crust_mantle array')
 
   allocate(omsb_inner_core_dble(ATT1,ATT2,ATT3,ATT5), &
           factor_scale_inner_core_dble(ATT1,ATT2,ATT3,ATT5),stat=ier)
   if( ier /= 0 ) call exit_MPI(myrank,'error allocating omsb inner_core arrays')
-
-  allocate(factor_common_crust_mantle_dble(N_SLS,ATT1,ATT2,ATT3,ATT4),stat=ier)
-  if( ier /= 0 ) call exit_MPI(myrank,'error allocating factor_common crust_mantle array')
-
   allocate(factor_common_inner_core_dble(N_SLS,ATT1,ATT2,ATT3,ATT5),stat=ier)
   if( ier /= 0 ) call exit_MPI(myrank,'error allocating factor_common inner_core array')
 
@@ -725,9 +723,11 @@
 
   ! reads in attenuation values
   call create_name_database(prnamel, myrank, IREGION_CRUST_MANTLE, LOCAL_PATH)
-  call get_attenuation_model_3D_or_1D(myrank, prnamel, omsb_crust_mantle_dble, &
-           factor_common_crust_mantle_dble,factor_scale_crust_mantle_dble,tau_sigma_dble, &
-           ATT1,ATT2,ATT3,ATT4)
+  call get_attenuation_model_3D_or_1D(myrank, prnamel, &
+                                     omsb_crust_mantle_dble, &
+                                     factor_common_crust_mantle_dble, &
+                                     factor_scale_crust_mantle_dble,tau_sigma_dble, &
+                                     ATT1,ATT2,ATT3,ATT4)
 
   ! INNER_CORE ATTENUATION
   ! initializes
@@ -738,19 +738,21 @@
 
   ! reads in attenuation values
   call create_name_database(prnamel, myrank, IREGION_INNER_CORE, LOCAL_PATH)
-  call get_attenuation_model_3D_or_1D(myrank, prnamel, omsb_inner_core_dble, &
-           factor_common_inner_core_dble,factor_scale_inner_core_dble,tau_sigma_dble, &
-           ATT1,ATT2,ATT3,ATT5)
+  call get_attenuation_model_3D_or_1D(myrank, prnamel, &
+                                     omsb_inner_core_dble, &
+                                     factor_common_inner_core_dble, &
+                                     factor_scale_inner_core_dble,tau_sigma_dble, &
+                                     ATT1,ATT2,ATT3,ATT5)
 
   ! converts to custom real arrays
   if(CUSTOM_REAL == SIZE_REAL) then
-    factor_scale_crust_mantle       = sngl(factor_scale_crust_mantle_dble)
+    factor_scale_crust_mantle = sngl(factor_scale_crust_mantle_dble)
     one_minus_sum_beta_crust_mantle = sngl(omsb_crust_mantle_dble)
-    factor_common_crust_mantle      = sngl(factor_common_crust_mantle_dble)
+    factor_common_crust_mantle = sngl(factor_common_crust_mantle_dble)
 
-    factor_scale_inner_core         = sngl(factor_scale_inner_core_dble)
-    one_minus_sum_beta_inner_core   = sngl(omsb_inner_core_dble)
-    factor_common_inner_core        = sngl(factor_common_inner_core_dble)
+    factor_scale_inner_core = sngl(factor_scale_inner_core_dble)
+    one_minus_sum_beta_inner_core = sngl(omsb_inner_core_dble)
+    factor_common_inner_core = sngl(factor_common_inner_core_dble)
   else
     factor_scale_crust_mantle       = factor_scale_crust_mantle_dble
     one_minus_sum_beta_crust_mantle = omsb_crust_mantle_dble
@@ -788,6 +790,7 @@
 
           if(ANISOTROPIC_3D_MANTLE_VAL) then
             scale_factor_minus_one = scale_factor - 1.d0
+
             mul = c44store_crust_mantle(i,j,k,ispec)
             c11store_crust_mantle(i,j,k,ispec) = c11store_crust_mantle(i,j,k,ispec) &
                     + FOUR_THIRDS * scale_factor_minus_one * mul
@@ -812,6 +815,7 @@
              ! store the original value of \mu to compute \mu*\eps
               muvstore_crust_mantle_3dmovie(i,j,k,ispec)=muvstore_crust_mantle(i,j,k,ispec)
             endif
+
             muvstore_crust_mantle(i,j,k,ispec) = muvstore_crust_mantle(i,j,k,ispec) * scale_factor
 
             ! scales transverse isotropic values for mu_h
@@ -830,12 +834,14 @@
       do j=1,NGLLY
         do i=1,NGLLX
           if( ATTENUATION_3D_VAL ) then
-            scale_factor_minus_one = factor_scale_inner_core(i,j,k,ispec) - 1.d0
+            scale_factor = factor_scale_inner_core(i,j,k,ispec)
           else
-            scale_factor_minus_one = factor_scale_inner_core(1,1,1,ispec) - 1.d0
+            scale_factor = factor_scale_inner_core(1,1,1,ispec)
           endif
 
           if(ANISOTROPIC_INNER_CORE_VAL) then
+            scale_factor_minus_one = scale_factor - 1.d0
+
             mul = muvstore_inner_core(i,j,k,ispec)
             c11store_inner_core(i,j,k,ispec) = c11store_inner_core(i,j,k,ispec) &
                     + FOUR_THIRDS * scale_factor_minus_one * mul
@@ -849,11 +855,8 @@
                     + scale_factor_minus_one * mul
           endif
 
-          if( ATTENUATION_3D_VAL ) then
-            muvstore_inner_core(i,j,k,ispec) = muvstore_inner_core(i,j,k,ispec) * factor_scale_inner_core(i,j,k,ispec)
-          else
-            muvstore_inner_core(i,j,k,ispec) = muvstore_inner_core(i,j,k,ispec) * factor_scale_inner_core(1,1,1,ispec)
-          endif
+          muvstore_inner_core(i,j,k,ispec) = muvstore_inner_core(i,j,k,ispec) * scale_factor
+
         enddo
       enddo
     enddo
@@ -1129,6 +1132,7 @@
 
   ! local parameters
   integer :: ier
+  integer :: i,j,k,ispec,ispec2D,ipoin,iglob
   real :: free_mb,used_mb,total_mb
   ! dummy custom_real variables to convert from double precision
   real(kind=CUSTOM_REAL),dimension(:,:,:),allocatable:: cr_wgll_cube
@@ -1153,7 +1157,7 @@
                                   number_receiver_global,islice_selected_rec,ispec_selected_rec, &
                                   nrec, nrec_local, nadj_rec_local, &
                                   NSPEC_CRUST_MANTLE,NGLOB_CRUST_MANTLE, &
-                                  NSPEC_CRUST_MANTLE_STRAIN_ONLY,NGLOB_CRUST_MANTLE_OCEANS, &
+                                  NSPEC_CRUST_MANTLE_STRAIN_ONLY, &
                                   NSPEC_OUTER_CORE,NGLOB_OUTER_CORE, &
                                   NSPEC_INNER_CORE,NGLOB_INNER_CORE, &
                                   NSPEC_INNER_CORE_STRAIN_ONLY, &
@@ -1306,8 +1310,10 @@
   if ( NOISE_TOMOGRAPHY > 0 ) then
     if(myrank == 0 ) write(IMAIN,*) "  loading noise arrays"
 
-    call prepare_fields_noise_device(Mesh_pointer,nspec_top,ibelm_top_crust_mantle, &
-                                    NSTEP,noise_sourcearray, &
+    call prepare_fields_noise_device(Mesh_pointer,NSPEC_TOP, &
+                                    NSTEP, &
+                                    ibelm_top_crust_mantle, &
+                                    noise_sourcearray, &
                                     normal_x_noise,normal_y_noise,normal_z_noise, &
                                     mask_noise,jacobian2D_top_crust_mantle)
 
@@ -1315,10 +1321,71 @@
 
   ! prepares oceans arrays
   if ( OCEANS_VAL ) then
-     if(myrank == 0 ) write(IMAIN,*) "  loading oceans arrays"
+    if(myrank == 0 ) write(IMAIN,*) "  loading oceans arrays"
 
-     call prepare_oceans_device(Mesh_pointer,rmass_ocean_load)
+    ! prepares gpu arrays for coupling with oceans
+    !
+    ! note: handling of coupling on gpu is slightly different than in CPU routine to avoid using a mutex
+    !          to update acceleration; tests so far have shown that with a simple mutex implementation
+    !          the results differ between successive runs (probably still due to some race conditions?)
+    !          here we now totally avoid mutex usage and still update each global point only once
 
+    ! counts global points on surface to oceans
+    updated_dof_ocean_load(:) = .false.
+    ipoin = 0
+    do ispec2D = 1,nspec_top
+      ispec = ibelm_top_crust_mantle(ispec2D)
+      k = NGLLZ
+      do j = 1,NGLLY
+        do i = 1,NGLLX
+          ! get global point number
+          iglob = ibool_crust_mantle(i,j,k,ispec)
+          if(.not. updated_dof_ocean_load(iglob)) then
+            ipoin = ipoin + 1
+            updated_dof_ocean_load(iglob) = .true.
+          endif
+        enddo
+      enddo
+    enddo
+
+    ! allocates arrays with all global points on ocean surface
+    npoin_oceans = ipoin
+    allocate(iglob_ocean_load(npoin_oceans), &
+            normal_ocean_load(NDIM,npoin_oceans), &
+            rmass_ocean_load_selected(npoin_oceans), &
+            stat=ier)
+    if( ier /= 0 ) call exit_MPI(myrank,'error allocating oceans arrays')
+
+    ! fills arrays for coupling surface at oceans
+    updated_dof_ocean_load(:) = .false.
+    ipoin = 0
+    do ispec2D = 1,nspec_top
+      ispec = ibelm_top_crust_mantle(ispec2D)
+      k = NGLLZ
+      do j = 1,NGLLY
+        do i = 1,NGLLX
+          ! get global point number
+          iglob = ibool_crust_mantle(i,j,k,ispec)
+          if(.not. updated_dof_ocean_load(iglob)) then
+            ipoin = ipoin + 1
+            ! fills arrays
+            iglob_ocean_load(ipoin) = iglob
+            rmass_ocean_load_selected(ipoin) = rmass_ocean_load(iglob)
+            normal_ocean_load(:,ipoin) = normal_top_crust_mantle(:,i,j,ispec2D)
+            ! masks this global point
+            updated_dof_ocean_load(iglob) = .true.
+          endif
+        enddo
+      enddo
+    enddo
+
+    ! prepares arrays on GPU
+    call prepare_oceans_device(Mesh_pointer,npoin_oceans, &
+                              rmass_ocean_load_selected,iglob_ocean_load,normal_ocean_load)
+
+    ! frees memory
+    deallocate(iglob_ocean_load,rmass_ocean_load_selected,normal_ocean_load)
+
   endif
 
   ! crust/mantle region
@@ -1335,9 +1402,6 @@
                                  rmassx_crust_mantle, &
                                  rmassy_crust_mantle, &
                                  rmassz_crust_mantle, &
-                                 normal_top_crust_mantle, &
-                                 ibelm_top_crust_mantle, &
-                                 ibelm_bottom_crust_mantle, &
                                  ibool_crust_mantle, &
                                  xstore_crust_mantle,ystore_crust_mantle,zstore_crust_mantle, &
                                  ispec_is_tiso_crust_mantle, &
@@ -1350,8 +1414,8 @@
                                  c55store_crust_mantle,c56store_crust_mantle,c66store_crust_mantle, &
                                  num_phase_ispec_crust_mantle,phase_ispec_inner_crust_mantle, &
                                  nspec_outer_crust_mantle,nspec_inner_crust_mantle, &
-                                 NSPEC2D_TOP(IREGION_CRUST_MANTLE), &
                                  NSPEC2D_BOTTOM(IREGION_CRUST_MANTLE), &
+                                 ibelm_bottom_crust_mantle, &
                                  NCHUNKS_VAL, &
                                  num_colors_outer_crust_mantle,num_colors_inner_crust_mantle, &
                                  num_elem_colors_crust_mantle)
@@ -1365,18 +1429,18 @@
                                 gammax_outer_core,gammay_outer_core,gammaz_outer_core, &
                                 rhostore_outer_core,kappavstore_outer_core, &
                                 rmass_outer_core, &
+                                ibool_outer_core, &
+                                xstore_outer_core,ystore_outer_core,zstore_outer_core, &
+                                num_phase_ispec_outer_core,phase_ispec_inner_outer_core, &
+                                nspec_outer_outer_core,nspec_inner_outer_core, &
+                                NSPEC2D_TOP(IREGION_OUTER_CORE), &
+                                NSPEC2D_BOTTOM(IREGION_OUTER_CORE), &
                                 normal_top_outer_core, &
                                 normal_bottom_outer_core, &
                                 jacobian2D_top_outer_core, &
                                 jacobian2D_bottom_outer_core, &
                                 ibelm_top_outer_core, &
                                 ibelm_bottom_outer_core, &
-                                ibool_outer_core, &
-                                xstore_outer_core,ystore_outer_core,zstore_outer_core, &
-                                num_phase_ispec_outer_core,phase_ispec_inner_outer_core, &
-                                nspec_outer_outer_core,nspec_inner_outer_core, &
-                                NSPEC2D_TOP(IREGION_OUTER_CORE), &
-                                NSPEC2D_BOTTOM(IREGION_OUTER_CORE), &
                                 num_colors_outer_outer_core,num_colors_inner_outer_core, &
                                 num_elem_colors_outer_core)
 
@@ -1389,7 +1453,6 @@
                                 gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
                                 rhostore_inner_core,kappavstore_inner_core,muvstore_inner_core, &
                                 rmass_inner_core, &
-                                ibelm_top_inner_core, &
                                 ibool_inner_core, &
                                 xstore_inner_core,ystore_inner_core,zstore_inner_core, &
                                 c11store_inner_core,c12store_inner_core,c13store_inner_core, &
@@ -1398,6 +1461,7 @@
                                 num_phase_ispec_inner_core,phase_ispec_inner_inner_core, &
                                 nspec_outer_inner_core,nspec_inner_inner_core, &
                                 NSPEC2D_TOP(IREGION_INNER_CORE), &
+                                ibelm_top_inner_core, &
                                 num_colors_outer_inner_core,num_colors_inner_inner_core, &
                                 num_elem_colors_inner_core)
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D_par.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D_par.F90	2012-08-14 23:15:21 UTC (rev 20569)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D_par.F90	2012-08-15 19:55:44 UTC (rev 20570)
@@ -91,6 +91,11 @@
   ! flag to mask ocean-bottom degrees of freedom for ocean load
   logical, dimension(NGLOB_CRUST_MANTLE_OCEANS) :: updated_dof_ocean_load
 
+  integer :: npoin_oceans
+  integer, dimension(:),allocatable :: iglob_ocean_load
+  real(kind=CUSTOM_REAL), dimension(:,:),allocatable :: normal_ocean_load
+  real(kind=CUSTOM_REAL), dimension(:),allocatable :: rmass_ocean_load_selected
+
   !-----------------------------------------------------------------
   ! ellipticity
   !-----------------------------------------------------------------



More information about the CIG-COMMITS mailing list