[cig-commits] r19213 - in seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER: examples/homogeneous_halfspace/in_data_files examples/layered_halfspace/in_data_files examples/meshfem3D_examples/many_interfaces examples/meshfem3D_examples/simple_model examples/meshfem3D_examples/socal1D examples/meshfem3D_examples/socal1D/example_utm examples/noise_tomography/in_data_files examples/tomographic_model/in_data_files in_data_files src/cuda src/shared src/specfem3D

Thu Nov 17 15:51:04 PST 2011

Author: danielpeter
Date: 2011-11-17 15:51:03 -0800 (Thu, 17 Nov 2011)
New Revision: 19213

Modified:
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/homogeneous_halfspace/in_data_files/Par_file
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/layered_halfspace/in_data_files/Par_file
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/many_interfaces/Par_file
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/simple_model/Par_file
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/socal1D/Par_file
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/socal1D/example_utm/Par_file_utm
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/noise_tomography/in_data_files/Par_file_step1
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/noise_tomography/in_data_files/Par_file_step2
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/noise_tomography/in_data_files/Par_file_step3
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/tomographic_model/in_data_files/Par_file
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/in_data_files/Par_file.in
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_acoustic_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/param_reader.c
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/write_c_binary.c
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90
Log:
updates device initialization

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/homogeneous_halfspace/in_data_files/Par_file
===================================================================

--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/homogeneous_halfspace/in_data_files/Par_file	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/homogeneous_halfspace/in_data_files/Par_file	2011-11-17 23:51:03 UTC (rev 19213)
@@ -21,6 +21,7 @@
 ATTENUATION                     = .false.
 USE_OLSEN_ATTENUATION           = .false.
 ANISOTROPY                      = .false.
+GRAVITY                         = .false.
 
 # absorbing boundary conditions for a regional simulation
 ABSORBING_CONDITIONS            = .false.

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/layered_halfspace/in_data_files/Par_file
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/layered_halfspace/in_data_files/Par_file	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/layered_halfspace/in_data_files/Par_file	2011-11-17 23:51:03 UTC (rev 19213)
@@ -21,6 +21,7 @@
 ATTENUATION                     = .false.
 USE_OLSEN_ATTENUATION           = .false.
 ANISOTROPY                      = .false.
+GRAVITY                         = .false.
 
 # absorbing boundary conditions for a regional simulation
 ABSORBING_CONDITIONS            = .true.

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/many_interfaces/Par_file
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/many_interfaces/Par_file	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/many_interfaces/Par_file	2011-11-17 23:51:03 UTC (rev 19213)
@@ -21,6 +21,7 @@
 ATTENUATION                     = .false.
 USE_OLSEN_ATTENUATION           = .false.
 ANISOTROPY                      = .false.
+GRAVITY                         = .false.
 
 # absorbing boundary conditions for a regional simulation
 ABSORBING_CONDITIONS            = .true.

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/simple_model/Par_file
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/simple_model/Par_file	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/simple_model/Par_file	2011-11-17 23:51:03 UTC (rev 19213)
@@ -21,6 +21,7 @@
 ATTENUATION                     = .false.
 USE_OLSEN_ATTENUATION           = .false.
 ANISOTROPY                      = .false.
+GRAVITY                         = .false.
 
 # absorbing boundary conditions for a regional simulation
 ABSORBING_CONDITIONS            = .true.

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/socal1D/Par_file
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/socal1D/Par_file	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/socal1D/Par_file	2011-11-17 23:51:03 UTC (rev 19213)
@@ -21,6 +21,7 @@
 ATTENUATION                     = .false.
 USE_OLSEN_ATTENUATION           = .false.
 ANISOTROPY                      = .false.
+GRAVITY                         = .false.
 
 # absorbing boundary conditions for a regional simulation
 ABSORBING_CONDITIONS            = .true.

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/socal1D/example_utm/Par_file_utm
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/socal1D/example_utm/Par_file_utm	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/meshfem3D_examples/socal1D/example_utm/Par_file_utm	2011-11-17 23:51:03 UTC (rev 19213)
@@ -21,6 +21,7 @@
 ATTENUATION                     = .false.
 USE_OLSEN_ATTENUATION           = .false.
 ANISOTROPY                      = .false.
+GRAVITY                         = .false.
 
 # absorbing boundary conditions for a regional simulation
 ABSORBING_CONDITIONS            = .true.

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/noise_tomography/in_data_files/Par_file_step1
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/noise_tomography/in_data_files/Par_file_step1	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/noise_tomography/in_data_files/Par_file_step1	2011-11-17 23:51:03 UTC (rev 19213)
@@ -22,6 +22,7 @@
 ATTENUATION                     = .false.
 USE_OLSEN_ATTENUATION           = .false.
 ANISOTROPY                      = .false.
+GRAVITY                         = .false.
 
 # absorbing boundary conditions for a regional simulation
 ABSORBING_CONDITIONS            = .true.

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/noise_tomography/in_data_files/Par_file_step2
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/noise_tomography/in_data_files/Par_file_step2	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/noise_tomography/in_data_files/Par_file_step2	2011-11-17 23:51:03 UTC (rev 19213)
@@ -22,6 +22,7 @@
 ATTENUATION                     = .false.
 USE_OLSEN_ATTENUATION           = .false.
 ANISOTROPY                      = .false.
+GRAVITY                         = .false.
 
 # absorbing boundary conditions for a regional simulation
 ABSORBING_CONDITIONS            = .true.

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/noise_tomography/in_data_files/Par_file_step3
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/noise_tomography/in_data_files/Par_file_step3	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/noise_tomography/in_data_files/Par_file_step3	2011-11-17 23:51:03 UTC (rev 19213)
@@ -22,6 +22,7 @@
 ATTENUATION                     = .false.
 USE_OLSEN_ATTENUATION           = .false.
 ANISOTROPY                      = .false.
+GRAVITY                         = .false.
 
 # absorbing boundary conditions for a regional simulation
 ABSORBING_CONDITIONS            = .true.

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/tomographic_model/in_data_files/Par_file
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/tomographic_model/in_data_files/Par_file	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/examples/tomographic_model/in_data_files/Par_file	2011-11-17 23:51:03 UTC (rev 19213)
@@ -21,6 +21,7 @@
 ATTENUATION                     = .false.
 USE_OLSEN_ATTENUATION           = .false.
 ANISOTROPY                      = .false.
+GRAVITY                         = .false.
 
 # absorbing boundary conditions for a regional simulation
 ABSORBING_CONDITIONS            = .true.

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/in_data_files/Par_file.in
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/in_data_files/Par_file.in	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/in_data_files/Par_file.in	2011-11-17 23:51:03 UTC (rev 19213)
@@ -21,6 +21,7 @@
 ATTENUATION                     = .false.
 USE_OLSEN_ATTENUATION           = .false.
 ANISOTROPY                      = .false.
+GRAVITY                         = .false.
 
 # absorbing boundary conditions for a regional simulation
 ABSORBING_CONDITIONS            = .false.

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu	2011-11-17 23:51:03 UTC (rev 19213)
@@ -368,7 +368,7 @@
   realw* h_max;
   int blocksize = 256;
 
-  int num_blocks_x = ceil(mp->NGLOB_AB/blocksize);
+  int num_blocks_x = (int) ceil(mp->NGLOB_AB/blocksize);
   //printf("num_blocks_x %i \n",num_blocks_x);
 
   h_max = (realw*) calloc(num_blocks_x,sizeof(realw));
@@ -506,7 +506,7 @@
   realw* h_max;
   int blocksize = 256;
 
-  int num_blocks_x = ceil(mp->NGLOB_AB/blocksize);
+  int num_blocks_x = (int) ceil(mp->NGLOB_AB/blocksize);
   //printf("num_blocks_x %i \n",num_blocks_x);
 
   h_max = (realw*) calloc(num_blocks_x,sizeof(realw));

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_acoustic_cuda.cu	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_acoustic_cuda.cu	2011-11-17 23:51:03 UTC (rev 19213)
@@ -117,7 +117,7 @@
   int num_blocks_x = NSOURCES;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 
@@ -171,7 +171,7 @@
   int num_blocks_x = NSOURCES;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 
@@ -293,7 +293,7 @@
   int num_blocks_x = mp->nadj_rec_local;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu	2011-11-17 23:51:03 UTC (rev 19213)
@@ -114,7 +114,7 @@
   int num_blocks_x = NSOURCES;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 
@@ -168,7 +168,7 @@
   int num_blocks_x = NSOURCES;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 
@@ -331,7 +331,7 @@
   int num_blocks_x = mp->nadj_rec_local;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu	2011-11-17 23:51:03 UTC (rev 19213)
@@ -133,7 +133,7 @@
   int num_blocks_x = num_coupling_ac_el_faces;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 
@@ -287,7 +287,7 @@
   int num_blocks_x = num_coupling_ac_el_faces;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu	2011-11-17 23:51:03 UTC (rev 19213)
@@ -88,7 +88,7 @@
   int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 
@@ -196,7 +196,7 @@
   int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 
@@ -546,7 +546,7 @@
   int num_blocks_x = nb_blocks_to_compute;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 
@@ -778,7 +778,7 @@
    int num_blocks_x = size_padded/blocksize;
    int num_blocks_y = 1;
    while(num_blocks_x > 65535) {
-     num_blocks_x = ceil(num_blocks_x/2.0);
+     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
      num_blocks_y = num_blocks_y*2;
    }
    dim3 grid(num_blocks_x,num_blocks_y);
@@ -822,7 +822,7 @@
   int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
   dim3 grid(num_blocks_x,num_blocks_y);
@@ -913,7 +913,7 @@
     int num_blocks_x = mp->num_free_surface_faces;
     int num_blocks_y = 1;
     while(num_blocks_x > 65535) {
-      num_blocks_x = ceil(num_blocks_x/2.0);
+      num_blocks_x = (int) ceil(num_blocks_x*0.5f);
       num_blocks_y = num_blocks_y*2;
     }
     dim3 grid(num_blocks_x,num_blocks_y,1);

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu	2011-11-17 23:51:03 UTC (rev 19213)
@@ -102,7 +102,7 @@
   int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 
@@ -212,7 +212,7 @@
   int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 
@@ -1146,7 +1146,7 @@
   int num_blocks_x = nb_blocks_to_compute;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 
@@ -1580,7 +1580,7 @@
    int num_blocks_x = size_padded/blocksize;
    int num_blocks_y = 1;
    while(num_blocks_x > 65535) {
-     num_blocks_x = ceil(num_blocks_x/2.0);
+     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
      num_blocks_y = num_blocks_y*2;
    }
 
@@ -1633,7 +1633,7 @@
   int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 
@@ -1738,7 +1738,7 @@
   int num_blocks_x = mp->num_free_surface_faces;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu	2011-11-17 23:51:03 UTC (rev 19213)
@@ -119,7 +119,7 @@
   int num_blocks_x = mp->NSPEC_AB;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 
@@ -213,7 +213,7 @@
   int num_blocks_x = mp->num_free_surface_faces;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 
@@ -429,7 +429,7 @@
   int num_blocks_x = mp->NSPEC_AB;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 
@@ -596,7 +596,7 @@
   int num_blocks_x = mp->NSPEC_AB;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu	2011-11-17 23:51:03 UTC (rev 19213)
@@ -146,7 +146,7 @@
   int num_blocks_x = mp->d_num_abs_boundary_faces;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu	2011-11-17 23:51:03 UTC (rev 19213)
@@ -158,7 +158,7 @@
   int num_blocks_x = mp->d_num_abs_boundary_faces;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu	2011-11-17 23:51:03 UTC (rev 19213)
@@ -98,7 +98,7 @@
   int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 
@@ -200,7 +200,7 @@
   int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu	2011-11-17 23:51:03 UTC (rev 19213)
@@ -138,7 +138,7 @@
   int num_blocks_x = mp->num_free_surface_faces;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
   dim3 grid(num_blocks_x,num_blocks_y,1);
@@ -250,7 +250,7 @@
   int num_blocks_x = mp->num_free_surface_faces;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
   dim3 grid(num_blocks_x,num_blocks_y,1);

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2011-11-17 23:51:03 UTC (rev 19213)
@@ -156,7 +156,7 @@
 
   get_free_memory(&free_db,&used_db,&total_db);
 
-  sprintf(filename,"../in_out_files/OUTPUT_FILES/gpu_mem_usage_proc_%03d.txt",myrank);
+  sprintf(filename,"../in_out_files/OUTPUT_FILES/gpu_mem_usage_proc_%06d.txt",myrank);
   fp = fopen(filename,"a+");
   fprintf(fp,"%d: @%s GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", myrank, info_str,
    used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
@@ -420,6 +420,123 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 extern "C"
+void FC_FUNC_(prepare_cuda_device,
+              PREPARE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) {
+  TRACE("prepare_cuda_device");
+
+  // Gets rank number of MPI process
+  int myrank = *myrank_f;
+
+  // cuda initialization (needs -lcuda library)
+  // daniel: note, cuInit initializes the driver API.
+  //             it is needed for any following CUDA driver API function call (format cuFUNCTION(..) )
+  //             however, for the CUDA runtime API functions (format cudaFUNCTION(..) )
+  //             the initialization is implicit, thus cuInit() here would not be needed...
+  CUresult status = cuInit(0);
+  if ( CUDA_SUCCESS != status ) exit_on_error("CUDA driver API device initialization failed\n");
+
+  // returns a handle to the first cuda compute device
+  CUdevice dev;
+  status = cuDeviceGet(&dev, 0);
+  if ( CUDA_SUCCESS != status ) exit_on_error("CUDA device not found\n");
+
+  // gets device properties
+  int major,minor;
+  status = cuDeviceComputeCapability(&major,&minor,dev);
+  if ( CUDA_SUCCESS != status ) exit_on_error("CUDA device information not found\n");
+
+  // make sure that the device has compute capability >= 1.3
+  if (major < 1){
+    fprintf(stderr,"Compute capability major number should be at least 1, got: %d \nexiting...\n",major);
+    exit_on_error("CUDA Compute capability major number should be at least 1\n");
+  }
+  if (major == 1 && minor < 3){
+    fprintf(stderr,"Compute capability should be at least 1.3, got: %d.%d \nexiting...\n",major,minor);
+    exit_on_error("CUDA Compute capability major number should be at least 1.3\n");
+  }
+
+  //daniel: from here on we use the runtime API  ...
+  // Gets number of GPU devices
+  int device_count = 0;
+  cudaGetDeviceCount(&device_count);
+  exit_on_cuda_error("CUDA runtime cudaGetDeviceCount: check if driver and runtime libraries work together\nexiting...\n");
+  
+  // returns device count to fortran
+  if (device_count == 0) exit_on_error("CUDA runtime error: there is no device supporting CUDA\n");
+  *ncuda_devices = device_count;
+
+  // Sets the active device
+  if(device_count > 1) {
+    // generalized for more GPUs per node
+    cudaSetDevice( myrank % device_count );
+    exit_on_cuda_error("cudaSetDevice");
+  }
+
+  // returns a handle to the active device
+  int device;
+  cudaGetDevice(&device);
+
+  // get device properties
+  struct cudaDeviceProp deviceProp;
+  cudaGetDeviceProperties(&deviceProp,device);
+
+  // exit if the machine has no CUDA-enabled device
+  if (deviceProp.major == 9999 && deviceProp.minor == 9999){
+    fprintf(stderr,"No CUDA-enabled device found, exiting...\n\n");
+    exit_on_error("CUDA runtime error: there is no CUDA-enabled device found\n");
+  }
+
+  // outputs device infos to file
+  char filename[BUFSIZ];
+  FILE* fp;
+  sprintf(filename,"../in_out_files/OUTPUT_FILES/gpu_device_info_proc_%06d.txt",myrank);
+  fp = fopen(filename,"a+");
+
+  // display device properties
+  fprintf(fp,"Device Name = %s\n",deviceProp.name);
+  fprintf(fp,"multiProcessorCount: %d\n",deviceProp.multiProcessorCount);
+  fprintf(fp,"totalGlobalMem (in MB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f));
+  fprintf(fp,"totalGlobalMem (in GB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f * 1024.f));
+  fprintf(fp,"sharedMemPerBlock (in bytes): %lu\n",(unsigned long) deviceProp.sharedMemPerBlock);
+  fprintf(fp,"Maximum number of threads per block: %d\n",deviceProp.maxThreadsPerBlock);
+  fprintf(fp,"Maximum size of each dimension of a block: %d x %d x %d\n",
+          deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]);
+  fprintf(fp,"Maximum sizes of each dimension of a grid: %d x %d x %d\n",
+          deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]);
+  fprintf(fp,"Compute capability of the device = %d.%d\n", deviceProp.major, deviceProp.minor);
+  if(deviceProp.canMapHostMemory){
+    fprintf(fp,"canMapHostMemory: TRUE\n");
+  }else{
+    fprintf(fp,"canMapHostMemory: FALSE\n");
+  }
+  if(deviceProp.deviceOverlap){
+    fprintf(fp,"deviceOverlap: TRUE\n");
+  }else{
+    fprintf(fp,"deviceOverlap: FALSE\n");
+  }
+
+  // make sure that the device has compute capability >= 1.3
+  //if (deviceProp.major < 1){
+  //  fprintf(stderr,"Compute capability major number should be at least 1, exiting...\n\n");
+  //  exit_on_error("CUDA Compute capability major number should be at least 1");
+  //}
+  //if (deviceProp.major == 1 && deviceProp.minor < 3){
+  //  fprintf(stderr,"Compute capability should be at least 1.3, exiting...\n");
+  //  exit_on_error("CUDA Compute capability major number should be at least 1.3");
+  //}
+
+  // outputs initial memory infos via cudaMemGetInfo()
+  double free_db,used_db,total_db;
+  get_free_memory(&free_db,&used_db,&total_db);
+  fprintf(fp,"%d: GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", myrank,
+          used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
+
+  fclose(fp);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
 void FC_FUNC_(prepare_constants_device,
               PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
                                         int* h_NGLLX,
@@ -453,34 +570,10 @@
                                         int* nrec_local_f,
                                         int* SIMULATION_TYPE,
                                         int* USE_MESH_COLORING_GPU_f,
-                                        int* nspec_acoustic,int* nspec_elastic,
-                                        int* myrank_f,int* ncuda_devices) {
+                                        int* nspec_acoustic,int* nspec_elastic) {
 
 TRACE("prepare_constants_device");
 
-  int device_count = 0;
-
-  // cuda initialization (needs -lcuda library)
-  //cuInit(0);
-  CUresult status = cuInit(0);
-  if ( CUDA_SUCCESS != status ) exit_on_error("CUDA device initialization failed");
-
-  // Gets number of GPU devices
-  cudaGetDeviceCount(&device_count);
-  //printf("Cuda Devices: %d\n", device_count);
-  if (device_count == 0) exit_on_error("There is no device supporting CUDA\n");
-  *ncuda_devices = device_count;
-
-  // Gets rank number of MPI process
-  int myrank = *myrank_f;
-
-  // Sets the active device
-  if(device_count > 1) {
-    // generalized for more GPUs per node
-    cudaSetDevice( myrank%device_count );
-    exit_on_cuda_error("cudaSetDevice");
-  }
-
   // allocates mesh parameter structure
   Mesh* mp = (Mesh*) malloc( sizeof(Mesh) );
   if (mp == NULL) exit_on_error("error allocating mesh pointer");
@@ -670,75 +763,9 @@
 #endif
 }
 
-/* ----------------------------------------------------------------------------------------------- */
 
-// purely adjoint & kernel simulations
-
 /* ----------------------------------------------------------------------------------------------- */
 
-extern "C"
-void FC_FUNC_(prepare_sim2_or_3_const_device,
-              PREPARE_SIM2_OR_3_CONST_DEVICE)(
-                                              long* Mesh_pointer_f,
-                                              int* islice_selected_rec,
-                                              int* islice_selected_rec_size,
-                                              int* nadj_rec_local,
-                                              int* nrec,
-                                              int* myrank) {
-
-TRACE("prepare_sim2_or_3_const_device");
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
-
-  // allocates arrays for receivers
-  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_islice_selected_rec,
-                                     *islice_selected_rec_size*sizeof(int)),7001);
-  // copies arrays to GPU device
-  print_CUDA_error_if_any(cudaMemcpy(mp->d_islice_selected_rec,islice_selected_rec,
-                                     *islice_selected_rec_size*sizeof(int),cudaMemcpyHostToDevice),7002);
-
-  // adjoint source arrays
-  mp->nadj_rec_local = *nadj_rec_local;
-  if( mp->nadj_rec_local > 0 ){
-    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_adj_sourcearrays,
-                                       (mp->nadj_rec_local)*3*NGLL3*sizeof(realw)),7003);
-
-    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_pre_computed_irec,
-                                       (mp->nadj_rec_local)*sizeof(int)),7004);
-
-    // prepares local irec array:
-    // the irec_local variable needs to be precomputed (as
-    // h_pre_comp..), because normally it is in the loop updating accel,
-    // and due to how it's incremented, it cannot be parallelized
-    int* h_pre_computed_irec = (int*) malloc( (mp->nadj_rec_local)*sizeof(int) );
-    if( h_pre_computed_irec == NULL ) exit_on_error("prepare_sim2_or_3_const_device: h_pre_computed_irec not allocated\n");
-
-    int irec_local = 0;
-    for(int irec = 0; irec < *nrec; irec++) {
-      if(*myrank == islice_selected_rec[irec]) {
-        irec_local++;
-        h_pre_computed_irec[irec_local-1] = irec;
-      }
-    }
-    if( irec_local != mp->nadj_rec_local ) exit_on_error("prepare_sim2_or_3_const_device: irec_local not equal\n");
-    // copies values onto GPU
-    print_CUDA_error_if_any(cudaMemcpy(mp->d_pre_computed_irec,h_pre_computed_irec,
-                                       (mp->nadj_rec_local)*sizeof(int),cudaMemcpyHostToDevice),7010);
-    free(h_pre_computed_irec);
-
-    // temporary array to prepare extracted source array values
-    mp->h_adj_sourcearrays_slice = (realw*) malloc( (mp->nadj_rec_local)*3*NGLL3*sizeof(realw) );
-    if( mp->h_adj_sourcearrays_slice == NULL ) exit_on_error("h_adj_sourcearrays_slice not allocated\n");
-
-  }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("prepare_sim2_or_3_const_device");
-#endif
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
 // for ACOUSTIC simulations
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -778,42 +805,42 @@
   int size_glob = mp->NGLOB_AB;
 
   // allocates arrays on device (GPU)
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_acoustic),sizeof(realw)*size_glob),9001);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_acoustic),sizeof(realw)*size_glob),9002);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_dot_acoustic),sizeof(realw)*size_glob),9003);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_acoustic),sizeof(realw)*size_glob),2001);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_acoustic),sizeof(realw)*size_glob),2002);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_dot_acoustic),sizeof(realw)*size_glob),2003);
 
   // mpi buffer
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_potential_dot_dot_buffer),
-                      (mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),9004);
+                      (mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),2004);
 
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_acoustic),sizeof(realw)*size_glob),9005);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_acoustic),sizeof(realw)*size_glob),2005);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_acoustic,rmass_acoustic,
-                                     sizeof(realw)*size_glob,cudaMemcpyHostToDevice),9100);
+                                     sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
 
   // padded array
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rhostore),size_padded*sizeof(realw)),9006);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rhostore),size_padded*sizeof(realw)),2006);
   // transfer constant element data with padding
   for(int i=0; i < mp->NSPEC_AB; i++) {
     print_CUDA_error_if_any(cudaMemcpy(mp->d_rhostore+i*NGLL3_PADDED, &rhostore[i*NGLL3],
-                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),9106);
+                                       NGLL3*sizeof(realw),cudaMemcpyHostToDevice),2106);
   }
 
   // non-padded array
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappastore),size_nonpadded*sizeof(realw)),9007);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappastore),size_nonpadded*sizeof(realw)),2007);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_kappastore,kappastore,
-                                     NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),9105);
+                                     NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),2105);
 
   // phase elements
   mp->num_phase_ispec_acoustic = *num_phase_ispec_acoustic;
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_acoustic),
-                                      mp->num_phase_ispec_acoustic*2*sizeof(int)),9008);
+                                      mp->num_phase_ispec_acoustic*2*sizeof(int)),2008);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_acoustic,phase_ispec_inner_acoustic,
-                                     mp->num_phase_ispec_acoustic*2*sizeof(int),cudaMemcpyHostToDevice),9101);
+                                     mp->num_phase_ispec_acoustic*2*sizeof(int),cudaMemcpyHostToDevice),2101);
 
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_acoustic),
-                                     mp->NSPEC_AB*sizeof(int)),9009);
+                                     mp->NSPEC_AB*sizeof(int)),2009);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_acoustic,ispec_is_acoustic,
-                                     mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),9102);
+                                     mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),2102);
 
   // free surface
   if( *NOISE_TOMOGRAPHY == 0 ){
@@ -821,30 +848,30 @@
     mp->num_free_surface_faces = *num_free_surface_faces;
     if( mp->num_free_surface_faces > 0 ){
       print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec),
-                                       mp->num_free_surface_faces*sizeof(int)),9201);
+                                       mp->num_free_surface_faces*sizeof(int)),2201);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec,
-                                       mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),9203);
+                                       mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),2203);
 
       print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk),
-                                       3*NGLL2*mp->num_free_surface_faces*sizeof(int)),9202);
+                                       3*NGLL2*mp->num_free_surface_faces*sizeof(int)),2202);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
-                                       3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),9204);
+                                       3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),2204);
     }
   }
 
   // absorbing boundaries
   if( *ABSORBING_CONDITIONS ){
     mp->d_b_reclen_potential = *b_reclen_potential;
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_potential),mp->d_b_reclen_potential),9301);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_potential),mp->d_b_reclen_potential),2301);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_potential,b_absorb_potential,
-                                       mp->d_b_reclen_potential,cudaMemcpyHostToDevice),9302);
+                                       mp->d_b_reclen_potential,cudaMemcpyHostToDevice),2302);
   }
 
 
   // for seismograms
   if( mp->nrec_local > 0 ){
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_potential),
-                                       mp->nrec_local*NGLL3*sizeof(realw)),9107);
+                                       mp->nrec_local*NGLL3*sizeof(realw)),2107);
 
     mp->h_station_seismo_potential = (realw*) malloc( mp->nrec_local*NGLL3*sizeof(realw) );
     if( mp->h_station_seismo_potential == NULL) exit_on_error("error allocating h_station_seismo_potential");
@@ -854,24 +881,24 @@
   // coupling with elastic parts
   if( *ELASTIC_SIMULATION && *num_coupling_ac_el_faces > 0 ){
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_ispec),
-                                       (*num_coupling_ac_el_faces)*sizeof(int)),9601);
+                                       (*num_coupling_ac_el_faces)*sizeof(int)),2601);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_ispec,coupling_ac_el_ispec,
-                                       (*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),9602);
+                                       (*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),2602);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_ijk),
-                                       3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int)),9603);
+                                       3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int)),2603);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_ijk,coupling_ac_el_ijk,
-                                       3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),9604);
+                                       3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),2604);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_normal),
-                                        3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),9605);
+                                        3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),2605);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_normal,coupling_ac_el_normal,
-                                        3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),9606);
+                                        3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),2606);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_jacobian2Dw),
-                                        NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),9607);
+                                        NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),2607);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_jacobian2Dw,coupling_ac_el_jacobian2Dw,
-                                        NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),9608);
+                                        NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),2608);
 
   }
 
@@ -906,26 +933,26 @@
   if( *SIMULATION_TYPE != 3 ) return;
 
   // allocates backward/reconstructed arrays on device (GPU)
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_acoustic),sizeof(realw)*size_glob),9014);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_acoustic),sizeof(realw)*size_glob),9015);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_dot_acoustic),sizeof(realw)*size_glob),9016);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_acoustic),sizeof(realw)*size_glob),3014);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_acoustic),sizeof(realw)*size_glob),3015);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_dot_acoustic),sizeof(realw)*size_glob),3016);
 
   // allocates kernels
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),9017);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),9018);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),3017);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),3018);
 
   // initializes kernel values to zero
   print_CUDA_error_if_any(cudaMemset(mp->d_rho_ac_kl,0,
-                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),9019);
+                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),3019);
   print_CUDA_error_if_any(cudaMemset(mp->d_kappa_ac_kl,0,
-                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),9020);
+                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),3020);
 
   // preconditioner
   if( *APPROXIMATE_HESS_KL ){
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),9030);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),3030);
     // initializes with zeros
     print_CUDA_error_if_any(cudaMemset(mp->d_hess_ac_kl,0,
-                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),9031);
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),3031);
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -1003,25 +1030,25 @@
   int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
   int size_nonpadded = NGLL3 * (mp->NSPEC_AB);
 
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ),sizeof(realw)*(*size)),8001);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(realw)*(*size)),8002);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel),sizeof(realw)*(*size)),8003);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ),sizeof(realw)*(*size)),4001);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(realw)*(*size)),4002);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel),sizeof(realw)*(*size)),4003);
 
   // mpi buffer
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),
-                        3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),8004);
+                        3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),4004);
 
   // mass matrix
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass),sizeof(realw)*mp->NGLOB_AB),8005);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass),sizeof(realw)*mp->NGLOB_AB),4005);
   // transfer element data
   print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass,rmass,
-                                     sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),8010);
+                                     sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),4010);
 
 
   // element indices
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_elastic),mp->NSPEC_AB*sizeof(int)),8009);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_elastic),mp->NSPEC_AB*sizeof(int)),4009);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_elastic,ispec_is_elastic,
-                                     mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),8012);
+                                     mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),4012);
 
   // daniel: check
   //check_ispec_is(Mesh_pointer_f,1);
@@ -1029,9 +1056,9 @@
   // phase elements
   mp->num_phase_ispec_elastic = *num_phase_ispec_elastic;
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_elastic),
-                                     mp->num_phase_ispec_elastic*2*sizeof(int)),8008);
+                                     mp->num_phase_ispec_elastic*2*sizeof(int)),4008);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_elastic,phase_ispec_inner_elastic,
-                                     mp->num_phase_ispec_elastic*2*sizeof(int),cudaMemcpyHostToDevice),8011);
+                                     mp->num_phase_ispec_elastic*2*sizeof(int),cudaMemcpyHostToDevice),4011);
 
   //daniel: check
   //check_phase_ispec(Mesh_pointer_f,1);
@@ -1039,7 +1066,7 @@
   // for seismograms
   if( mp->nrec_local > 0 ){
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_field),
-                                     3*NGLL3*(mp->nrec_local)*sizeof(realw)),8015);
+                                     3*NGLL3*(mp->nrec_local)*sizeof(realw)),4015);
 
     mp->h_station_seismo_field = (realw*) malloc( 3*NGLL3*(mp->nrec_local)*sizeof(realw) );
     if( mp->h_station_seismo_field == NULL) exit_on_error("h_station_seismo_field not allocated \n");
@@ -1048,22 +1075,22 @@
   // absorbing conditions
   if( *ABSORBING_CONDITIONS && mp->d_num_abs_boundary_faces > 0){
     // non-padded arrays
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vp),size_nonpadded*sizeof(realw)),8006);
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vs),size_nonpadded*sizeof(realw)),8007);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vp),size_nonpadded*sizeof(realw)),4006);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vs),size_nonpadded*sizeof(realw)),4007);
 
     // rho_vp, rho_vs non-padded; they are needed for stacey boundary condition
     print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vp, rho_vp,
-                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),8013);
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4013);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vs, rho_vs,
-                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),8014);
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4014);
 
     // absorb_field array used for file i/o
     if(*SIMULATION_TYPE == 3 || ( *SIMULATION_TYPE == 1 && *SAVE_FORWARD )){
       mp->d_b_reclen_field = *h_b_reclen_field;
       print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_field),
-                                       mp->d_b_reclen_field),8016);
+                                       mp->d_b_reclen_field),4016);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field, h_b_absorb_field,
-                                       mp->d_b_reclen_field,cudaMemcpyHostToDevice),8017);
+                                       mp->d_b_reclen_field,cudaMemcpyHostToDevice),4017);
     }
   }
 
@@ -1073,25 +1100,25 @@
     int epsilondev_size = NGLL3*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
 
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xx,
-                                       epsilondev_size*sizeof(realw)),8301);
+                                       epsilondev_size*sizeof(realw)),4301);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xx,epsilondev_xx,epsilondev_size*sizeof(realw),
-                                       cudaMemcpyHostToDevice),8302);
+                                       cudaMemcpyHostToDevice),4302);
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yy,
-                                       epsilondev_size*sizeof(realw)),8302);
+                                       epsilondev_size*sizeof(realw)),4302);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yy,epsilondev_yy,epsilondev_size*sizeof(realw),
-                                       cudaMemcpyHostToDevice),8303);
+                                       cudaMemcpyHostToDevice),4303);
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xy,
-                                       epsilondev_size*sizeof(realw)),8304);
+                                       epsilondev_size*sizeof(realw)),4304);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xy,epsilondev_xy,epsilondev_size*sizeof(realw),
-                                       cudaMemcpyHostToDevice),8305);
+                                       cudaMemcpyHostToDevice),4305);
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xz,
-                                       epsilondev_size*sizeof(realw)),8306);
+                                       epsilondev_size*sizeof(realw)),4306);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xz,epsilondev_xz,epsilondev_size*sizeof(realw),
-                                       cudaMemcpyHostToDevice),8307);
+                                       cudaMemcpyHostToDevice),4307);
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yz,
-                                       epsilondev_size*sizeof(realw)),8308);
+                                       epsilondev_size*sizeof(realw)),4308);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yz,epsilondev_yz,epsilondev_size*sizeof(realw),
-                                       cudaMemcpyHostToDevice),8309);
+                                       cudaMemcpyHostToDevice),4309);
 
   }
 
@@ -1099,56 +1126,56 @@
   if( *ATTENUATION ){
     // memory arrays
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xx),
-                                       (*R_size)*sizeof(realw)),8401);
+                                       (*R_size)*sizeof(realw)),4401);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xx,R_xx,(*R_size)*sizeof(realw),
-                                       cudaMemcpyHostToDevice),8402);
+                                       cudaMemcpyHostToDevice),4402);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yy),
-                                       (*R_size)*sizeof(realw)),8403);
+                                       (*R_size)*sizeof(realw)),4403);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yy,R_yy,(*R_size)*sizeof(realw),
-                                       cudaMemcpyHostToDevice),8404);
+                                       cudaMemcpyHostToDevice),4404);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xy),
-                                       (*R_size)*sizeof(realw)),8405);
+                                       (*R_size)*sizeof(realw)),4405);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xy,R_xy,(*R_size)*sizeof(realw),
-                                       cudaMemcpyHostToDevice),8406);
+                                       cudaMemcpyHostToDevice),4406);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xz),
-                                       (*R_size)*sizeof(realw)),8407);
+                                       (*R_size)*sizeof(realw)),4407);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xz,R_xz,(*R_size)*sizeof(realw),
-                                       cudaMemcpyHostToDevice),8408);
+                                       cudaMemcpyHostToDevice),4408);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yz),
-                                       (*R_size)*sizeof(realw)),8409);
+                                       (*R_size)*sizeof(realw)),4409);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yz,R_yz,(*R_size)*sizeof(realw),
-                                       cudaMemcpyHostToDevice),8410);
+                                       cudaMemcpyHostToDevice),4410);
 
     // attenuation factors
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_one_minus_sum_beta),
-                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),8430);
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),4430);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_one_minus_sum_beta ,one_minus_sum_beta,
-                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),8431);
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4431);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_factor_common),
-                                       N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw)),8432);
+                                       N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw)),4432);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_factor_common ,factor_common,
-                                       N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),8433);
+                                       N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4433);
 
     // alpha,beta,gamma factors
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_alphaval),
-                                       N_SLS*sizeof(realw)),8434);
+                                       N_SLS*sizeof(realw)),4434);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_alphaval ,alphaval,
-                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),8435);
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4435);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_betaval),
-                                       N_SLS*sizeof(realw)),8436);
+                                       N_SLS*sizeof(realw)),4436);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_betaval ,betaval,
-                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),8437);
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4437);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_gammaval),
-                                       N_SLS*sizeof(realw)),8438);
+                                       N_SLS*sizeof(realw)),4438);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaval ,gammaval,
-                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),8439);
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4439);
 
   }
 
@@ -1156,92 +1183,92 @@
   if( *ANISOTROPY ){
     // allocates memory on GPU
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c11store),
-                                       size_padded*sizeof(realw)),8700);
+                                       size_padded*sizeof(realw)),4700);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c12store),
-                                       size_padded*sizeof(realw)),8701);
+                                       size_padded*sizeof(realw)),4701);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c13store),
-                                       size_padded*sizeof(realw)),8702);
+                                       size_padded*sizeof(realw)),4702);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c14store),
-                                       size_padded*sizeof(realw)),8703);
+                                       size_padded*sizeof(realw)),4703);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c15store),
-                                       size_padded*sizeof(realw)),8704);
+                                       size_padded*sizeof(realw)),4704);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c16store),
-                                       size_padded*sizeof(realw)),8705);
+                                       size_padded*sizeof(realw)),4705);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c22store),
-                                       size_padded*sizeof(realw)),8706);
+                                       size_padded*sizeof(realw)),4706);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c23store),
-                                       size_padded*sizeof(realw)),8707);
+                                       size_padded*sizeof(realw)),4707);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c24store),
-                                       size_padded*sizeof(realw)),8708);
+                                       size_padded*sizeof(realw)),4708);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c25store),
-                                       size_padded*sizeof(realw)),8709);
+                                       size_padded*sizeof(realw)),4709);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c26store),
-                                       size_padded*sizeof(realw)),8710);
+                                       size_padded*sizeof(realw)),4710);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c33store),
-                                       size_padded*sizeof(realw)),8711);
+                                       size_padded*sizeof(realw)),4711);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c34store),
-                                       size_padded*sizeof(realw)),8712);
+                                       size_padded*sizeof(realw)),4712);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c35store),
-                                       size_padded*sizeof(realw)),8713);
+                                       size_padded*sizeof(realw)),4713);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c36store),
-                                       size_padded*sizeof(realw)),8714);
+                                       size_padded*sizeof(realw)),4714);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c44store),
-                                       size_padded*sizeof(realw)),8715);
+                                       size_padded*sizeof(realw)),4715);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c45store),
-                                       size_padded*sizeof(realw)),8716);
+                                       size_padded*sizeof(realw)),4716);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c46store),
-                                       size_padded*sizeof(realw)),8717);
+                                       size_padded*sizeof(realw)),4717);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c55store),
-                                       size_padded*sizeof(realw)),8718);
+                                       size_padded*sizeof(realw)),4718);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c56store),
-                                       size_padded*sizeof(realw)),8719);
+                                       size_padded*sizeof(realw)),4719);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c66store),
-                                       size_padded*sizeof(realw)),8720);
+                                       size_padded*sizeof(realw)),4720);
 
     // transfer constant element data with padding
     for(int i=0;i < mp->NSPEC_AB;i++) {
       print_CUDA_error_if_any(cudaMemcpy(mp->d_c11store + i*NGLL3_PADDED, &c11store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8800);
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4800);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_c12store + i*NGLL3_PADDED, &c12store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8801);
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4801);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_c13store + i*NGLL3_PADDED, &c13store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8802);
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4802);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_c14store + i*NGLL3_PADDED, &c14store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8803);
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4803);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_c15store + i*NGLL3_PADDED, &c15store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8804);
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4804);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_c16store + i*NGLL3_PADDED, &c16store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8805);
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4805);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_c22store + i*NGLL3_PADDED, &c22store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8806);
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4806);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_c23store + i*NGLL3_PADDED, &c23store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8807);
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4807);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_c24store + i*NGLL3_PADDED, &c24store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8808);
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4808);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_c25store + i*NGLL3_PADDED, &c25store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8809);
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4809);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_c26store + i*NGLL3_PADDED, &c26store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8810);
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4810);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_c33store + i*NGLL3_PADDED, &c33store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8811);
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4811);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_c34store + i*NGLL3_PADDED, &c34store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8812);
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4812);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_c35store + i*NGLL3_PADDED, &c35store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8813);
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4813);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_c36store + i*NGLL3_PADDED, &c36store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8814);
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4814);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_c44store + i*NGLL3_PADDED, &c44store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8815);
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4815);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_c45store + i*NGLL3_PADDED, &c45store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8816);
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4816);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_c46store + i*NGLL3_PADDED, &c46store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8817);
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4817);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_c55store + i*NGLL3_PADDED, &c55store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8818);
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4818);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_c56store + i*NGLL3_PADDED, &c56store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8819);
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4819);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_c66store + i*NGLL3_PADDED, &c66store[i*NGLL3],
-                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8820);
+                                         NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4820);
     }
   }
 
@@ -1252,28 +1279,28 @@
     if( mp->num_free_surface_faces > 0 ){
       // mass matrix
       print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_ocean_load),
-                                         sizeof(realw)*mp->NGLOB_AB),8501);
+                                         sizeof(realw)*mp->NGLOB_AB),4501);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_ocean_load,rmass_ocean_load,
-                                         sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),8502);
+                                         sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),4502);
       // surface normal
       print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_normal),
-                                         3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw)),8503);
+                                         3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw)),4503);
       print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_normal,free_surface_normal,
-                                         3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice),8504);
+                                         3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice),4504);
 
       // temporary global array: used to synchronize updates on global accel array
       print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_updated_dof_ocean_load),
-                                         sizeof(int)*mp->NGLOB_AB),8505);
+                                         sizeof(int)*mp->NGLOB_AB),4505);
 
       if( *NOISE_TOMOGRAPHY == 0 && *ACOUSTIC_SIMULATION == 0 ){
         print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec),
-                                          mp->num_free_surface_faces*sizeof(int)),8601);
+                                          mp->num_free_surface_faces*sizeof(int)),4601);
         print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec,
-                                          mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),8603);
+                                          mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4603);
         print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk),
-                                          3*NGLL2*mp->num_free_surface_faces*sizeof(int)),8602);
+                                          3*NGLL2*mp->num_free_surface_faces*sizeof(int)),4602);
         print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
-                                          3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),8604);
+                                          3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4604);
       }
     }
   }
@@ -1317,22 +1344,22 @@
 
   // kernel simulations
   // allocates backward/reconstructed arrays on device (GPU)
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ),sizeof(realw)*(*size)),8201);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc),sizeof(realw)*(*size)),8202);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel),sizeof(realw)*(*size)),8203);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ),sizeof(realw)*(*size)),5201);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc),sizeof(realw)*(*size)),5202);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel),sizeof(realw)*(*size)),5203);
 
   // allocates kernels
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),8204);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_mu_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),8205);
-  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),8206);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5204);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_mu_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5205);
+  print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5206);
 
   // initializes kernel values to zero
   print_CUDA_error_if_any(cudaMemset(mp->d_rho_kl,0,
-                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),8207);
+                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),5207);
   print_CUDA_error_if_any(cudaMemset(mp->d_mu_kl,0,
-                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),8208);
+                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),5208);
   print_CUDA_error_if_any(cudaMemset(mp->d_kappa_kl,0,
-                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),8209);
+                                     NGLL3*mp->NSPEC_AB*sizeof(realw)),5209);
 
   // strains used for attenuation and kernel simulations
   if( *COMPUTE_AND_STORE_STRAIN ){
@@ -1341,87 +1368,87 @@
 
     // solid pressure
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_epsilon_trace_over_3),
-                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),8310);
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),5310);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilon_trace_over_3,epsilon_trace_over_3,
-                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),8311);
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),5311);
     // backward solid pressure
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilon_trace_over_3),
-                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),8312);
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),5312);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilon_trace_over_3 ,b_epsilon_trace_over_3,
-                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),8313);
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),5313);
     // prepares backward strains
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xx),
-                                       epsilondev_size*sizeof(realw)),8321);
+                                       epsilondev_size*sizeof(realw)),5321);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yy),
-                                       epsilondev_size*sizeof(realw)),8322);
+                                       epsilondev_size*sizeof(realw)),5322);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xy),
-                                       epsilondev_size*sizeof(realw)),8323);
+                                       epsilondev_size*sizeof(realw)),5323);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xz),
-                                       epsilondev_size*sizeof(realw)),8324);
+                                       epsilondev_size*sizeof(realw)),5324);
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yz),
-                                       epsilondev_size*sizeof(realw)),8325);
+                                       epsilondev_size*sizeof(realw)),5325);
 
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx,
-                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),8326);
+                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5326);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy,
-                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),8327);
+                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5327);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy,
-                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),8328);
+                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5328);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz,
-                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),8329);
+                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5329);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz,
-                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),8330);
+                                       epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5330);
   }
 
   // attenuation memory variables
   if( *ATTENUATION ){
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xx),
-                                       (*R_size)*sizeof(realw)),8421);
+                                       (*R_size)*sizeof(realw)),5421);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xx,b_R_xx,(*R_size)*sizeof(realw),
-                                       cudaMemcpyHostToDevice),8422);
+                                       cudaMemcpyHostToDevice),5422);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yy),
-                                       (*R_size)*sizeof(realw)),8423);
+                                       (*R_size)*sizeof(realw)),5423);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yy,b_R_yy,(*R_size)*sizeof(realw),
-                                       cudaMemcpyHostToDevice),8424);
+                                       cudaMemcpyHostToDevice),5424);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xy),
-                                       (*R_size)*sizeof(realw)),8425);
+                                       (*R_size)*sizeof(realw)),5425);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xy,b_R_xy,(*R_size)*sizeof(realw),
-                                       cudaMemcpyHostToDevice),8426);
+                                       cudaMemcpyHostToDevice),5426);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xz),
-                                       (*R_size)*sizeof(realw)),8427);
+                                       (*R_size)*sizeof(realw)),5427);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xz,b_R_xz,(*R_size)*sizeof(realw),
-                                       cudaMemcpyHostToDevice),8428);
+                                       cudaMemcpyHostToDevice),5428);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yz),
-                                       (*R_size)*sizeof(realw)),8429);
+                                       (*R_size)*sizeof(realw)),5429);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yz,b_R_yz,(*R_size)*sizeof(realw),
-                                       cudaMemcpyHostToDevice),8420);
+                                       cudaMemcpyHostToDevice),5420);
 
     // alpha,beta,gamma factors for backward fields
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_alphaval),
-                                       N_SLS*sizeof(realw)),8434);
+                                       N_SLS*sizeof(realw)),5434);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_alphaval ,b_alphaval,
-                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),8435);
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5435);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_betaval),
-                                       N_SLS*sizeof(realw)),8436);
+                                       N_SLS*sizeof(realw)),5436);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_betaval ,b_betaval,
-                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),8437);
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5437);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_gammaval),
-                                       N_SLS*sizeof(realw)),8438);
+                                       N_SLS*sizeof(realw)),5438);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_gammaval ,b_gammaval,
-                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),8439);
+                                       N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5439);
   }
 
   if( *APPROXIMATE_HESS_KL ){
-    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_el_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),8450);
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_el_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5450);
     // initializes with zeros
     print_CUDA_error_if_any(cudaMemset(mp->d_hess_el_kl,0,
-                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),8451);
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),5451);
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -1429,10 +1456,76 @@
 #endif
 }
 
+/* ----------------------------------------------------------------------------------------------- */
 
+// purely adjoint & kernel simulations
 
 /* ----------------------------------------------------------------------------------------------- */
 
+extern "C"
+void FC_FUNC_(prepare_sim2_or_3_const_device,
+              PREPARE_SIM2_OR_3_CONST_DEVICE)(
+                                              long* Mesh_pointer_f,
+                                              int* islice_selected_rec,
+                                              int* islice_selected_rec_size,
+                                              int* nadj_rec_local,
+                                              int* nrec,
+                                              int* myrank) {
+
+  TRACE("prepare_sim2_or_3_const_device");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  // allocates arrays for receivers
+  print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_islice_selected_rec,
+                                     *islice_selected_rec_size*sizeof(int)),6001);
+  // copies arrays to GPU device
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_islice_selected_rec,islice_selected_rec,
+                                     *islice_selected_rec_size*sizeof(int),cudaMemcpyHostToDevice),6002);
+
+  // adjoint source arrays
+  mp->nadj_rec_local = *nadj_rec_local;
+  if( mp->nadj_rec_local > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_adj_sourcearrays,
+                                       (mp->nadj_rec_local)*3*NGLL3*sizeof(realw)),6003);
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_pre_computed_irec,
+                                       (mp->nadj_rec_local)*sizeof(int)),6004);
+
+    // prepares local irec array:
+    // the irec_local variable needs to be precomputed (as
+    // h_pre_comp..), because normally it is in the loop updating accel,
+    // and due to how it's incremented, it cannot be parallelized
+    int* h_pre_computed_irec = (int*) malloc( (mp->nadj_rec_local)*sizeof(int) );
+    if( h_pre_computed_irec == NULL ) exit_on_error("prepare_sim2_or_3_const_device: h_pre_computed_irec not allocated\n");
+
+    int irec_local = 0;
+    for(int irec = 0; irec < *nrec; irec++) {
+      if(*myrank == islice_selected_rec[irec]) {
+        irec_local++;
+        h_pre_computed_irec[irec_local-1] = irec;
+      }
+    }
+    if( irec_local != mp->nadj_rec_local ) exit_on_error("prepare_sim2_or_3_const_device: irec_local not equal\n");
+    // copies values onto GPU
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_pre_computed_irec,h_pre_computed_irec,
+                                       (mp->nadj_rec_local)*sizeof(int),cudaMemcpyHostToDevice),6010);
+    free(h_pre_computed_irec);
+
+    // temporary array to prepare extracted source array values
+    mp->h_adj_sourcearrays_slice = (realw*) malloc( (mp->nadj_rec_local)*3*NGLL3*sizeof(realw) );
+    if( mp->h_adj_sourcearrays_slice == NULL ) exit_on_error("h_adj_sourcearrays_slice not allocated\n");
+
+  }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("prepare_sim2_or_3_const_device");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
 // for NOISE simulations
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -1462,25 +1555,25 @@
   mp->num_free_surface_faces = *num_free_surface_faces;
 
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ispec,
-                                     mp->num_free_surface_faces*sizeof(int)),4001);
+                                     mp->num_free_surface_faces*sizeof(int)),7001);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec, free_surface_ispec,
-                                     mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4002);
+                                     mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),7002);
 
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ijk,
-                                     3*NGLL2*mp->num_free_surface_faces*sizeof(int)),4003);
+                                     3*NGLL2*mp->num_free_surface_faces*sizeof(int)),7003);
   print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
-                                     3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4004);
+                                     3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),7004);
 
   // alloc storage for the surface buffer to be copied
   print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_noise_surface_movie,
-                                     3*NGLL2*mp->num_free_surface_faces*sizeof(realw)),4005);
+                                     3*NGLL2*mp->num_free_surface_faces*sizeof(realw)),7005);
 
   // prepares noise source array
   if( *NOISE_TOMOGRAPHY == 1 ){
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_noise_sourcearray,
-                                       3*NGLL3*(*NSTEP)*sizeof(realw)),4101);
+                                       3*NGLL3*(*NSTEP)*sizeof(realw)),7101);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_noise_sourcearray, noise_sourcearray,
-                                       3*NGLL3*(*NSTEP)*sizeof(realw),cudaMemcpyHostToDevice),4102);
+                                       3*NGLL3*(*NSTEP)*sizeof(realw),cudaMemcpyHostToDevice),7102);
   }
 
   // prepares noise directions
@@ -1488,35 +1581,35 @@
     int nface_size = NGLL2*(*num_free_surface_faces);
     // allocates memory on GPU
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_x_noise,
-                                       nface_size*sizeof(realw)),4301);
+                                       nface_size*sizeof(realw)),7301);
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_y_noise,
-                                       nface_size*sizeof(realw)),4302);
+                                       nface_size*sizeof(realw)),7302);
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_z_noise,
-                                       nface_size*sizeof(realw)),4303);
+                                       nface_size*sizeof(realw)),7303);
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_mask_noise,
-                                       nface_size*sizeof(realw)),4304);
+                                       nface_size*sizeof(realw)),7304);
     print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_free_surface_jacobian2Dw,
-                                       nface_size*sizeof(realw)),4305);
+                                       nface_size*sizeof(realw)),7305);
     // transfers data onto GPU
     print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_x_noise, normal_x_noise,
-                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),4306);
+                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),7306);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_y_noise, normal_y_noise,
-                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),4307);
+                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),7307);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_z_noise, normal_z_noise,
-                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),4308);
+                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),7308);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_mask_noise, mask_noise,
-                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),4309);
+                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),7309);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_jacobian2Dw, free_surface_jacobian2Dw,
-                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),4310);
+                                       nface_size*sizeof(realw),cudaMemcpyHostToDevice),7310);
   }
 
   // prepares noise strength kernel
   if( *NOISE_TOMOGRAPHY == 3 ){
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_Sigma_kl),
-                                       NGLL3*(mp->NSPEC_AB)*sizeof(realw)),4401);
+                                       NGLL3*(mp->NSPEC_AB)*sizeof(realw)),7401);
     // initializes kernel values to zero
     print_CUDA_error_if_any(cudaMemset(mp->d_Sigma_kl,0,
-                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),4403);
+                                       NGLL3*mp->NSPEC_AB*sizeof(realw)),7403);
 
   }
 
@@ -1551,25 +1644,25 @@
   mp->gravity = *GRAVITY;
   if( mp->gravity ){
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_minus_deriv_gravity),
-                                       (mp->NGLOB_AB)*sizeof(realw)),7700);
+                                       (mp->NGLOB_AB)*sizeof(realw)),8000);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_minus_deriv_gravity, minus_deriv_gravity,
-                                       (mp->NGLOB_AB)*sizeof(realw),cudaMemcpyHostToDevice),7701);
+                                       (mp->NGLOB_AB)*sizeof(realw),cudaMemcpyHostToDevice),8001);
 
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_minus_g),
-                                       (mp->NGLOB_AB)*sizeof(realw)),7702);
+                                       (mp->NGLOB_AB)*sizeof(realw)),8002);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_minus_g, minus_g,
-                                       (mp->NGLOB_AB)*sizeof(realw),cudaMemcpyHostToDevice),7703);
+                                       (mp->NGLOB_AB)*sizeof(realw),cudaMemcpyHostToDevice),8003);
 
 
     if( *ACOUSTIC_SIMULATION == 0 ){
       // rhostore not allocated yet
       int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
       // padded array
-      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rhostore),size_padded*sizeof(realw)),7706);
+      print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rhostore),size_padded*sizeof(realw)),8006);
       // transfer constant element data with padding
       for(int i=0; i < mp->NSPEC_AB; i++) {
         print_CUDA_error_if_any(cudaMemcpy(mp->d_rhostore+i*NGLL3_PADDED, &rhostore[i*NGLL3],
-                                           NGLL3*sizeof(realw),cudaMemcpyHostToDevice),7707);
+                                           NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8007);
       }
     }
   }

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c	2011-11-17 23:51:03 UTC (rev 19213)
@@ -72,7 +72,7 @@
   fp = fopen(filename, "wb");
   fwrite(vector, sizeof(int), *size, fp);
   fclose(fp);
-  
+
 }
 
 
@@ -82,7 +82,7 @@
   int procid;
   MPI_Comm_rank(MPI_COMM_WORLD,&procid);
   sprintf(filename,"/scratch/eiger/rietmann/SPECFEM3D_AIGLE/in_out_files/DATABASES_MPI/proc%06d_surface_movie",procid);
-  
+
   FILE* fp; int it;
   printf("Opening %s for analysis\n",filename);
   fp = fopen(filename,"rb");
@@ -92,8 +92,8 @@
     printf("FILE ERROR:%s\n",strerror(errno));
     perror("file error\n");
     exit(1);
-  }  
-  
+  }
+
   float* vector = (float*)malloc(nodes_per_iteration*sizeof(float));
   float max_val;
   int i;
@@ -121,7 +121,7 @@
     if(vector1[i] != vector2[i]) {
       error_count++;
       if(error_count < 10) {
-	printf("err[%d]: %e != %e\n",i,vector1[i],vector2[i]);
+  printf("err[%d]: %e != %e\n",i,vector1[i],vector2[i]);
       }
     }
   }
@@ -137,20 +137,20 @@
   for(i=0;i<size;++i) {
     if(vector1[i] != 0) {
       if( fabsf(vector1[i]-vector2[i])/vector1[i] > 0.01) {
-	if(fabsf(vector1[i]-vector2[i]) > 1e-20) {
-	error_count++;
-    	if(error_count<10) {
-    	  printf("err[%d]: %e != %e\n",i,vector1[i],vector2[i]);
-    	}
+  if(fabsf(vector1[i]-vector2[i]) > 1e-20) {
+  error_count++;
+      if(error_count<10) {
+        printf("err[%d]: %e != %e\n",i,vector1[i],vector2[i]);
       }
       }
+      }
     }
     /* if(vector1[i] != vector2[i]) { */
     /*   if(fabsf(vector1[i]-vector2[i]) > 1e-25) { */
-    /* 	error_count++; */
-    /* 	if(error_count<50) { */
-    /* 	  printf("err[%d]: %e != %e\n",i,vector1[i],vector2[i]); */
-    /* 	} */
+    /*  error_count++; */
+    /*  if(error_count<50) { */
+    /*    printf("err[%d]: %e != %e\n",i,vector1[i],vector2[i]); */
+    /*  } */
     /*   } */
     /* } */
   }
@@ -159,7 +159,7 @@
 }
 
 void compare_surface_files_(int* bytes_per_iteration, int* number_of_iterations) {
-  
+
   char* cpu_file = "/scratch/eiger/rietmann/SPECFEM3D/in_out_files/DATABASES_MPI/cpu_proc000001_surface_movie";
   char* gpu_file = "/scratch/eiger/rietmann/SPECFEM3D/in_out_files/DATABASES_MPI/cpu_v2_proc000001_surface_movie";
 
@@ -169,7 +169,7 @@
   if(fp_cpu == 0) {
     //errorstr = (char*) strerror(errno);
     //printf("CPU FILE ERROR:%s\n",errorstr);
-    printf("CPU FILE ERROR:%s\n",strerror(errno));    
+    printf("CPU FILE ERROR:%s\n",strerror(errno));
     perror("cpu file error\n");
   }
   FILE* fp_gpu;
@@ -178,14 +178,14 @@
   if(fp_gpu == NULL) {
     //errorstr = (char*) strerror(errno);
     //printf("GPU FILE ERROR:%s\n",errorstr);
-    printf("GPU FILE ERROR:%s\n",strerror(errno));    
+    printf("GPU FILE ERROR:%s\n",strerror(errno));
     perror("gpu file error\n");
   }
-  
+
   /* pause_for_debug(); */
-  
+
   float* gpu_vector = (float*)malloc(*bytes_per_iteration);
-  float* cpu_vector = (float*)malloc(*bytes_per_iteration);  
+  float* cpu_vector = (float*)malloc(*bytes_per_iteration);
   int i,it,error_count=0;
   for(it=0;it<*number_of_iterations;it++) {
     int pos = (*bytes_per_iteration)*(it);
@@ -203,14 +203,14 @@
     float cpu_max_val=0;
     if(it<100) {
       for(i=0;i<size;i++) {
-	if((fabs(cpu_vector[i] - gpu_vector[i])/(fabs(cpu_vector[i])+1e-31) > 0.01)) {
-	  if(error_count < 30) printf("ERROR[%d]: %g != %g\n",i,cpu_vector[i], gpu_vector[i]);
-	  if(cpu_vector[i] > 1e-30) error_count++;
-	}
-	if(gpu_vector[i]>gpu_max_val) gpu_max_val = gpu_vector[i];
-	if(gpu_vector[i]<gpu_min_val) gpu_min_val = gpu_vector[i];
-	if(cpu_vector[i]>cpu_max_val) cpu_max_val = cpu_vector[i];
-	if(cpu_vector[i]<cpu_min_val) cpu_min_val = cpu_vector[i];
+  if((fabs(cpu_vector[i] - gpu_vector[i])/(fabs(cpu_vector[i])+1e-31) > 0.01)) {
+    if(error_count < 30) printf("ERROR[%d]: %g != %g\n",i,cpu_vector[i], gpu_vector[i]);
+    if(cpu_vector[i] > 1e-30) error_count++;
+  }
+  if(gpu_vector[i]>gpu_max_val) gpu_max_val = gpu_vector[i];
+  if(gpu_vector[i]<gpu_min_val) gpu_min_val = gpu_vector[i];
+  if(cpu_vector[i]>cpu_max_val) cpu_max_val = cpu_vector[i];
+  if(cpu_vector[i]<cpu_min_val) cpu_min_val = cpu_vector[i];
       }
       printf("%d Total Errors\n",error_count);
       printf("size:%d\n",size);
@@ -245,7 +245,7 @@
     else
        printf("File read error.");
   }
-  
+
   fclose(fp);
 
   int i;
@@ -253,11 +253,11 @@
   for(i=0;i<*size;i++) {
     if((fabs(vector[i] - compare_vector[i])/vector[i] > 0.0001)) {
       if(error_count < 30) {
-	printf("ERROR[%d]: %g != %g\n",i,compare_vector[i], vector[i]);
+  printf("ERROR[%d]: %g != %g\n",i,compare_vector[i], vector[i]);
       }
       error_count++;
-      /* if(compare_vector[i] > 1e-30) error_count++; */      
-    }    
+      /* if(compare_vector[i] > 1e-30) error_count++; */
+    }
   }
   printf("%d Total Errors\n",error_count);
   printf("size:%d\n",*size);
@@ -289,7 +289,7 @@
     else
        printf("File read error.");
   }
-  
+
   fclose(fp);
 
   int i;
@@ -298,7 +298,7 @@
     if((abs(vector[i] - compare_vector[i])/vector[i] > 0.01) && error_count < 30) {
       printf("ERROR[%d]: %g != %g\n",i,compare_vector[i], vector[i]);
       error_count++;
-    }    
+    }
   }
   printf("%d Total Errors\n",error_count);
 }

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2011-11-17 23:51:03 UTC (rev 19213)
@@ -6,9 +6,9 @@
 /* from check_fields_cuda.cu */
 void FC_FUNC_(check_max_norm_displ_gpu,
               CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID){}
-				       
+
 void FC_FUNC_(check_max_norm_vector,
-              CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID){}				       
+              CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID){}
 void FC_FUNC_(check_max_norm_displ,
               CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID){}
 
@@ -34,23 +34,23 @@
               GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer){}
 
 void FC_FUNC_(get_norm_acoustic_from_device,
-              GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm, 
+              GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
                                                   long* Mesh_pointer_f,
                                                   int* SIMULATION_TYPE){}
 
 void FC_FUNC_(get_norm_elastic_from_device,
-              GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm, 
+              GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
                                                  long* Mesh_pointer_f,
                                                  int* SIMULATION_TYPE){}
 
-						
+
 /* from file compute_add_sources_elastic_cuda.cu */
 
 void FC_FUNC_(compute_add_sources_el_cuda,
               COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f,
                                            int* phase_is_innerf,
                                            int* NSOURCESf,
-                                           double* h_stf_pre_compute, 
+                                           double* h_stf_pre_compute,
                                            int* myrankf){}
 
 void FC_FUNC_(compute_add_sources_el_s3_cuda,
@@ -61,21 +61,21 @@
                                               int* myrank){}
 
 void FC_FUNC_(add_source_master_rec_noise_cu,
-              ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer_f, 
-                                                int* myrank_f,  
-                                                int* it_f, 
-                                                int* irec_master_noise_f, 
+              ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer_f,
+                                                int* myrank_f,
+                                                int* it_f,
+                                                int* irec_master_noise_f,
                                                 int* islice_selected_rec){}
 
 void FC_FUNC_(add_sources_el_sim_type_2_or_3,
-              ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer, 
+              ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
                                            realw* h_adj_sourcearrays,
                                            int* phase_is_inner,
                                            int* h_ispec_is_inner,
-                                           int* h_ispec_is_elastic,                                            
+                                           int* h_ispec_is_elastic,
                                            int* h_ispec_selected_rec,
-                                           int* myrank, 
-                                           int* nrec, 
+                                           int* myrank,
+                                           int* nrec,
                                            int* time_index,
                                            int* h_islice_selected_rec,
                                            int* nadj_rec_local,
@@ -84,23 +84,23 @@
 /* from file compute_add_sources_acoustic_cuda.cu */
 
 void FC_FUNC_(compute_add_sources_ac_cuda,
-              COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f, 
+              COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f,
                                                  int* phase_is_innerf,
-                                                 int* NSOURCESf, 
+                                                 int* NSOURCESf,
                                                  int* SIMULATION_TYPEf,
-                                                 double* h_stf_pre_compute, 
+                                                 double* h_stf_pre_compute,
                                                  int* myrankf){}
 
 void FC_FUNC_(compute_add_sources_ac_s3_cuda,
-              COMPUTE_ADD_SOURCES_AC_S3_CUDA)(long* Mesh_pointer_f, 
+              COMPUTE_ADD_SOURCES_AC_S3_CUDA)(long* Mesh_pointer_f,
                                                       int* phase_is_innerf,
-                                                      int* NSOURCESf, 
+                                                      int* NSOURCESf,
                                                       int* SIMULATION_TYPEf,
-                                                      double* h_stf_pre_compute, 
+                                                      double* h_stf_pre_compute,
                                                       int* myrankf){}
 
 void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
-              ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer, 
+              ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer,
                                                realw* h_adj_sourcearrays,
                                                int* phase_is_inner,
                                                int* h_ispec_is_inner,
@@ -114,43 +114,43 @@
                                                int* NTSTEP_BETWEEN_ADJSRC){}
 
 /* from compute_coupling_cuda.cu */
-							
+
 void FC_FUNC_(compute_coupling_ac_el_cuda,
               COMPUTE_COUPLING_AC_EL_CUDA)(
-                                            long* Mesh_pointer_f, 
-                                            int* phase_is_innerf, 
-                                            int* num_coupling_ac_el_facesf, 
+                                            long* Mesh_pointer_f,
+                                            int* phase_is_innerf,
+                                            int* num_coupling_ac_el_facesf,
                                             int* SIMULATION_TYPEf){}
 
 void FC_FUNC_(compute_coupling_el_ac_cuda,
               COMPUTE_COUPLING_EL_AC_CUDA)(
-                                                 long* Mesh_pointer_f, 
-                                                 int* phase_is_innerf, 
-                                                 int* num_coupling_ac_el_facesf, 
+                                                 long* Mesh_pointer_f,
+                                                 int* phase_is_innerf,
+                                                 int* num_coupling_ac_el_facesf,
                                                  int* SIMULATION_TYPEf){}
 
 /* from compute_forces_acoustic_cuda.cu */
 
 void FC_FUNC_(transfer_boun_pot_from_device,
               TRANSFER_BOUN_POT_FROM_DEVICE)(
-                                              int* size, 
-                                              long* Mesh_pointer_f, 
-                                              realw* potential_dot_dot_acoustic, 
+                                              int* size,
+                                              long* Mesh_pointer_f,
+                                              realw* potential_dot_dot_acoustic,
                                               realw* send_potential_dot_dot_buffer,
-                                              int* num_interfaces_ext_mesh, 
+                                              int* num_interfaces_ext_mesh,
                                               int* max_nibool_interfaces_ext_mesh,
-                                              int* nibool_interfaces_ext_mesh, 
+                                              int* nibool_interfaces_ext_mesh,
                                               int* ibool_interfaces_ext_mesh,
                                               int* FORWARD_OR_ADJOINT){}
 
 void FC_FUNC_(transfer_asmbl_pot_to_device,
               TRANSFER_ASMBL_POT_TO_DEVICE)(
-                                                long* Mesh_pointer, 
-                                                realw* potential_dot_dot_acoustic, 
+                                                long* Mesh_pointer,
+                                                realw* potential_dot_dot_acoustic,
                                                 realw* buffer_recv_scalar_ext_mesh,
-                                                int* num_interfaces_ext_mesh, 
+                                                int* num_interfaces_ext_mesh,
                                                 int* max_nibool_interfaces_ext_mesh,
-                                                int* nibool_interfaces_ext_mesh, 
+                                                int* nibool_interfaces_ext_mesh,
                                                 int* ibool_interfaces_ext_mesh,
                                                 int* FORWARD_OR_ADJOINT){}
 
@@ -163,18 +163,18 @@
 
 void FC_FUNC_(kernel_3_a_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)(
                              long* Mesh_pointer,
-                             int* size_F, 
+                             int* size_F,
                              int* SIMULATION_TYPE){}
 
 void FC_FUNC_(kernel_3_b_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)(
                                                              long* Mesh_pointer,
-                                                             int* size_F, 
-                                                             realw* deltatover2_F, 
-                                                             int* SIMULATION_TYPE, 
+                                                             int* size_F,
+                                                             realw* deltatover2_F,
+                                                             int* SIMULATION_TYPE,
                                                              realw* b_deltatover2_F){}
 
 void FC_FUNC_(acoustic_enforce_free_surf_cuda,
-              ACOUSTIC_ENFORCE_FREE_SURF_CUDA)(long* Mesh_pointer_f, 
+              ACOUSTIC_ENFORCE_FREE_SURF_CUDA)(long* Mesh_pointer_f,
                                                   int* SIMULATION_TYPE,
                                                   int* ABSORB_FREE_SURFACE){}
 
@@ -182,22 +182,22 @@
 /* from compute_forces_elastic_cuda.cu */
 void FC_FUNC_(transfer_boun_accel_from_device,
               TRANSFER_BOUN_ACCEL_FROM_DEVICE)(int* size, long* Mesh_pointer_f, realw* accel,
-						   realw* send_accel_buffer,
-						   int* num_interfaces_ext_mesh,
-						   int* max_nibool_interfaces_ext_mesh,
-						   int* nibool_interfaces_ext_mesh,
-						   int* ibool_interfaces_ext_mesh,
-						   int* FORWARD_OR_ADJOINT){}
+               realw* send_accel_buffer,
+               int* num_interfaces_ext_mesh,
+               int* max_nibool_interfaces_ext_mesh,
+               int* nibool_interfaces_ext_mesh,
+               int* ibool_interfaces_ext_mesh,
+               int* FORWARD_OR_ADJOINT){}
 
-						  
+
 void FC_FUNC_(transfer_asmbl_accel_to_device,
-              TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer, 
+              TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer,
                                               realw* accel,
                                               realw* buffer_recv_vector_ext_mesh,
                                               int* num_interfaces_ext_mesh,
                                               int* max_nibool_interfaces_ext_mesh,
                                               int* nibool_interfaces_ext_mesh,
-						     int* ibool_interfaces_ext_mesh,int* FORWARD_OR_ADJOINT){}
+                 int* ibool_interfaces_ext_mesh,int* FORWARD_OR_ADJOINT){}
 
 void FC_FUNC_(compute_forces_elastic_cuda,
               COMPUTE_FORCES_ELASTIC_CUDA)(long* Mesh_pointer_f,
@@ -211,91 +211,91 @@
 
 void FC_FUNC_(kernel_3_a_cuda,
               KERNEL_3_A_CUDA)(long* Mesh_pointer,
-                               int* size_F, 
-                               realw* deltatover2_F, 
-                               int* SIMULATION_TYPE_f, 
+                               int* size_F,
+                               realw* deltatover2_F,
+                               int* SIMULATION_TYPE_f,
                                realw* b_deltatover2_F,
                                int* OCEANS){}
 
 void FC_FUNC_(kernel_3_b_cuda,
               KERNEL_3_B_CUDA)(long* Mesh_pointer,
-                             int* size_F, 
-                             realw* deltatover2_F, 
-                             int* SIMULATION_TYPE_f, 
-			       realw* b_deltatover2_F){}
+                             int* size_F,
+                             realw* deltatover2_F,
+                             int* SIMULATION_TYPE_f,
+             realw* b_deltatover2_F){}
 
 void FC_FUNC_(elastic_ocean_load_cuda,
-              ELASTIC_OCEAN_LOAD_CUDA)(long* Mesh_pointer_f, 
+              ELASTIC_OCEAN_LOAD_CUDA)(long* Mesh_pointer_f,
                                        int* SIMULATION_TYPE){}
 
 /* from file compute_kernels_cuda.cu */
-				      
+
 void FC_FUNC_(compute_kernels_elastic_cuda,
               COMPUTE_KERNELS_ELASTIC_CUDA)(long* Mesh_pointer,
                                             realw* deltat_f){}
 
 void FC_FUNC_(compute_kernels_strgth_noise_cu,
-              COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer, 
+              COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer,
                                                realw* h_noise_surface_movie,
                                                realw* deltat){}
 
 void FC_FUNC_(compute_kernels_acoustic_cuda,
               COMPUTE_KERNELS_ACOUSTIC_CUDA)(
-                                             long* Mesh_pointer, 
+                                             long* Mesh_pointer,
                                              realw* deltat_f){}
 
 void FC_FUNC_(compute_kernels_hess_cuda,
               COMPUTE_KERNELS_HESS_CUDA)(long* Mesh_pointer,
                                          realw* deltat_f) {}
-                                         
+
 /* from file compute_stacey_acoustic_cuda.cu */
 void FC_FUNC_(compute_stacey_acoustic_cuda,
               COMPUTE_STACEY_ACOUSTIC_CUDA)(
-                                    long* Mesh_pointer_f, 
-                                    int* phase_is_innerf, 
-                                    int* SIMULATION_TYPEf, 
+                                    long* Mesh_pointer_f,
+                                    int* phase_is_innerf,
+                                    int* SIMULATION_TYPEf,
                                     int* SAVE_FORWARDf,
                                     realw* h_b_absorb_potential){}
 
 
 /* from file compute_stacey_elastic_cuda.cu */
-					   
+
 void FC_FUNC_(compute_stacey_elastic_cuda,
-              COMPUTE_STACEY_ELASTIC_CUDA)(long* Mesh_pointer_f, 
-                                           int* phase_is_innerf, 
-                                           int* SIMULATION_TYPEf, 
+              COMPUTE_STACEY_ELASTIC_CUDA)(long* Mesh_pointer_f,
+                                           int* phase_is_innerf,
+                                           int* SIMULATION_TYPEf,
                                            int* SAVE_FORWARDf,
                                            realw* h_b_absorb_field){}
 
 /* from file it_update_displacement_cuda.cu */
-					  
+
 void FC_FUNC_(it_update_displacement_cuda,
               it_update_displacement_cuda)(long* Mesh_pointer_f,
-                                                 int* size_F, 
-                                                 realw* deltat_F, 
-                                                 realw* deltatsqover2_F, 
+                                                 int* size_F,
+                                                 realw* deltat_F,
+                                                 realw* deltatsqover2_F,
                                                  realw* deltatover2_F,
-                                                 int* SIMULATION_TYPE, 
-                                                 realw* b_deltat_F, 
-                                                 realw* b_deltatsqover2_F, 
+                                                 int* SIMULATION_TYPE,
+                                                 realw* b_deltat_F,
+                                                 realw* b_deltatsqover2_F,
                                                  realw* b_deltatover2_F){}
 
 void FC_FUNC_(it_update_displacement_ac_cuda,
-              IT_UPDATE_DISPLACEMENT_AC_CUDA)(long* Mesh_pointer_f, 
+              IT_UPDATE_DISPLACEMENT_AC_CUDA)(long* Mesh_pointer_f,
                                                            int* size_F,
-                                                           realw* deltat_F, 
-                                                           realw* deltatsqover2_F, 
+                                                           realw* deltat_F,
+                                                           realw* deltatsqover2_F,
                                                            realw* deltatover2_F,
-                                                           int* SIMULATION_TYPE, 
-                                                           realw* b_deltat_F, 
-                                                           realw* b_deltatsqover2_F, 
+                                                           int* SIMULATION_TYPE,
+                                                           realw* b_deltat_F,
+                                                           realw* b_deltatsqover2_F,
                                                            realw* b_deltatover2_F){}
 
 /* from file noise_tomography_cuda.cu */
-							  
-void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){}							  
-void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id){}					
 
+void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){}
+void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id){}
+
 void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val){}
 
 void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val){}
@@ -307,13 +307,13 @@
                                         realw* h_noise_surface_movie){}
 
 void FC_FUNC_(noise_read_add_surface_movie_cu,
-              NOISE_READ_ADD_SURFACE_MOVIE_CU)(long* Mesh_pointer_f, 
-                                               realw* h_noise_surface_movie, 
+              NOISE_READ_ADD_SURFACE_MOVIE_CU)(long* Mesh_pointer_f,
+                                               realw* h_noise_surface_movie,
                                                int* NOISE_TOMOGRAPHYf){}
 
-						
-/* from file prepare_mesh_constants_cuda.cu						 */
-						
+
+/* from file prepare_mesh_constants_cuda.cu            */
+
 void FC_FUNC_(pause_for_debug,PAUSE_FOR_DEBUG)(){}
 void FC_FUNC_(output_free_device_memory,
               OUTPUT_FREE_DEVICE_MEMORY)(int* id){}
@@ -324,26 +324,30 @@
 void FC_FUNC_(get_free_device_memory,
               get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ){}
 
+extern "C"
+void FC_FUNC_(prepare_cuda_device,
+              PREPARE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices){}
+
 void FC_FUNC_(prepare_constants_device,
               PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
-                                        int* h_NGLLX, 
+                                        int* h_NGLLX,
                                         int* NSPEC_AB, int* NGLOB_AB,
                                         realw* h_xix, realw* h_xiy, realw* h_xiz,
                                         realw* h_etax, realw* h_etay, realw* h_etaz,
                                         realw* h_gammax, realw* h_gammay, realw* h_gammaz,
                                         realw* h_kappav, realw* h_muv,
-                                        int* h_ibool, 
+                                        int* h_ibool,
                                         int* num_interfaces_ext_mesh, int* max_nibool_interfaces_ext_mesh,
                                         int* h_nibool_interfaces_ext_mesh, int* h_ibool_interfaces_ext_mesh,
-                                        realw* h_hprime_xx,realw* h_hprime_yy,realw* h_hprime_zz, 
+                                        realw* h_hprime_xx,realw* h_hprime_yy,realw* h_hprime_zz,
                                         realw* h_hprimewgll_xx,realw* h_hprimewgll_yy,realw* h_hprimewgll_zz,
-                                        realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz,        
-                                        int* ABSORBING_CONDITIONS,    
+                                        realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz,
+                                        int* ABSORBING_CONDITIONS,
                                         int* h_abs_boundary_ispec, int* h_abs_boundary_ijk,
                                         realw* h_abs_boundary_normal,
                                         realw* h_abs_boundary_jacobian2Dw,
                                         int* h_num_abs_boundary_faces,
-                                        int* h_ispec_is_inner, 
+                                        int* h_ispec_is_inner,
                                         int* NSOURCES,
                                         int* nsources_local,
                                         realw* h_sourcearrays,
@@ -355,8 +359,7 @@
                                         int* nrec_local_f,
                                         int* SIMULATION_TYPE,
                                         int* USE_MESH_COLORING_GPU,
-                                        int* nspec_acoustic,int* nspec_elastic,                                        
-                                        int* myrank_f, int* ncuda_devices)
+                                        int* nspec_acoustic,int* nspec_elastic)
 {
   fprintf(stderr,"ERROR: GPU_MODE enabled without GPU/CUDA Support. To enable GPU support, reconfigure with --with-cuda flag.\n");
   exit(1);
@@ -372,11 +375,11 @@
                                               int* myrank){}
 
 void FC_FUNC_(prepare_fields_acoustic_device,
-              PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f, 
-                                              realw* rmass_acoustic, 
+              PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f,
+                                              realw* rmass_acoustic,
                                               realw* rhostore,
                                               realw* kappastore,
-                                              int* num_phase_ispec_acoustic, 
+                                              int* num_phase_ispec_acoustic,
                                               int* phase_ispec_inner_acoustic,
                                               int* ispec_is_acoustic,
                                               int* NOISE_TOMOGRAPHY,
@@ -394,12 +397,12 @@
                                               realw* coupling_ac_el_jacobian2Dw,
                                               int* num_colors_outer_acoustic,
                                               int* num_colors_inner_acoustic,
-                                              int* num_elem_colors_acoustic){}							 
+                                              int* num_elem_colors_acoustic){}
 
 void FC_FUNC_(prepare_fields_acoustic_adj_dev,
               PREPARE_FIELDS_ACOUSTIC_ADJ_DEV)(long* Mesh_pointer_f,
                                                int* SIMULATION_TYPE,
-                                               int* APPROXIMATE_HESS_KL) {}                                              
+                                               int* APPROXIMATE_HESS_KL) {}
 void FC_FUNC_(prepare_fields_elastic_device,
               PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer_f,
                                              int* size,
@@ -416,7 +419,7 @@
                                              int* COMPUTE_AND_STORE_STRAIN,
                                              realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
                                              realw* epsilondev_xz,realw* epsilondev_yz,
-                                             int* ATTENUATION, 
+                                             int* ATTENUATION,
                                              int* R_size,
                                              realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
                                              realw* one_minus_sum_beta,realw* factor_common,
@@ -426,7 +429,7 @@
                                              int* NOISE_TOMOGRAPHY,
                                              realw* free_surface_normal,
                                              int* free_surface_ispec,
-                                             int* free_surface_ijk,                                             
+                                             int* free_surface_ijk,
                                              int* num_free_surface_faces,
                                              int* ACOUSTIC_SIMULATION,
                                              int* num_colors_outer_elastic,
@@ -441,7 +444,7 @@
                                              realw *c16store,
                                              realw *c22store,
                                              realw *c23store,
-                                             realw *c24store,                                             
+                                             realw *c24store,
                                              realw *c25store,
                                              realw *c26store,
                                              realw *c33store,
@@ -454,25 +457,25 @@
                                              realw *c55store,
                                              realw *c56store,
                                              realw *c66store){}
-  
+
 void FC_FUNC_(prepare_fields_elastic_adj_dev,
               PREPARE_FIELDS_ELASTIC_ADJ_DEV)(long* Mesh_pointer_f,
                                               int* size,
                                               int* SIMULATION_TYPE,
                                               int* COMPUTE_AND_STORE_STRAIN,
-                                              realw* epsilon_trace_over_3,                                             
+                                              realw* epsilon_trace_over_3,
                                               realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
                                               realw* b_epsilondev_xz,realw* b_epsilondev_yz,
                                               realw* b_epsilon_trace_over_3,
-                                              int* ATTENUATION, 
+                                              int* ATTENUATION,
                                               int* R_size,
                                               realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
                                               realw* b_alphaval,realw* b_betaval,realw* b_gammaval,
                                               int* APPROXIMATE_HESS_KL){}
-  
 
+
 void FC_FUNC_(prepare_fields_noise_device,
-              PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f, 
+              PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f,
                                            int* NSPEC_AB, int* NGLOB_AB,
                                            int* free_surface_ispec,
                                            int* free_surface_ijk,
@@ -504,12 +507,12 @@
 void FC_FUNC_(prepare_fields_gravity_device,
               PREPARE_FIELDS_gravity_DEVICE)(long* Mesh_pointer_f,
                                              int* GRAVITY,
-                                             realw* minus_deriv_gravity, 
+                                             realw* minus_deriv_gravity,
                                              realw* minus_g,
                                              realw* h_wgll_cube,
                                              int* ACOUSTIC_SIMULATION,
                                              realw* rhostore){}
-/* from file transfer_fields_cuda.cu				      */
+/* from file transfer_fields_cuda.cu              */
 
 void FC_FUNC_(transfer_fields_el_to_device,
               TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f){}
@@ -573,7 +576,7 @@
                                                           realw* b_epsilon_trace_over_3,
                                                           int* size_epsilon_trace_over_3) {}
 */
-                                                          
+
 void FC_FUNC_(transfer_b_fields_att_to_device,
               TRANSFER_B_FIELDS_ATT_TO_DEVICE)(long* Mesh_pointer,
                                              realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
@@ -597,46 +600,46 @@
                                                int* size_epsilondev){}
 
 void FC_FUNC_(transfer_kernels_el_to_host,
-              TRANSFER_KERNELS_EL_TO_HOST)(long* Mesh_pointer, 
+              TRANSFER_KERNELS_EL_TO_HOST)(long* Mesh_pointer,
                                                     realw* h_rho_kl,
-                                                    realw* h_mu_kl, 
+                                                    realw* h_mu_kl,
                                                     realw* h_kappa_kl,
                                                     int* NSPEC_AB){}
 
 void FC_FUNC_(transfer_kernels_noise_to_host,
-              TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer, 
+              TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer,
                                               realw* h_Sigma_kl,
                                               int* NSPEC_AB){}
 
-							 
+
 void FC_FUNC_(transfer_fields_ac_to_device,
               TRANSFER_FIELDS_AC_TO_DEVICE)(
-                                                  int* size, 
-                                                  realw* potential_acoustic, 
-                                                  realw* potential_dot_acoustic, 
+                                                  int* size,
+                                                  realw* potential_acoustic,
+                                                  realw* potential_dot_acoustic,
                                                   realw* potential_dot_dot_acoustic,
                                                   long* Mesh_pointer_f){}
 
 void FC_FUNC_(transfer_b_fields_ac_to_device,
               TRANSFER_B_FIELDS_AC_TO_DEVICE)(
-                                                    int* size, 
-                                                    realw* b_potential_acoustic, 
-                                                    realw* b_potential_dot_acoustic, 
+                                                    int* size,
+                                                    realw* b_potential_acoustic,
+                                                    realw* b_potential_dot_acoustic,
                                                     realw* b_potential_dot_dot_acoustic,
                                                     long* Mesh_pointer_f){}
 
 void FC_FUNC_(transfer_fields_ac_from_device,TRANSFER_FIELDS_AC_FROM_DEVICE)(
-                                                                             int* size, 
-                                                                             realw* potential_acoustic, 
-                                                                             realw* potential_dot_acoustic, 
+                                                                             int* size,
+                                                                             realw* potential_acoustic,
+                                                                             realw* potential_dot_acoustic,
                                                                              realw* potential_dot_dot_acoustic,
                                                                              long* Mesh_pointer_f){}
 
 void FC_FUNC_(transfer_b_fields_ac_from_device,
               TRANSFER_B_FIELDS_AC_FROM_DEVICE)(
-                                                      int* size, 
-                                                      realw* b_potential_acoustic, 
-                                                      realw* b_potential_dot_acoustic, 
+                                                      int* size,
+                                                      realw* b_potential_acoustic,
+                                                      realw* b_potential_dot_acoustic,
                                                       realw* b_potential_dot_dot_acoustic,
                                                       long* Mesh_pointer_f){}
 
@@ -647,7 +650,7 @@
               TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, realw* b_potential_dot_dot_acoustic,long* Mesh_pointer_f){}
 
 void FC_FUNC_(transfer_kernels_ac_to_host,
-              TRANSFER_KERNELS_AC_TO_HOST)(long* Mesh_pointer, 
+              TRANSFER_KERNELS_AC_TO_HOST)(long* Mesh_pointer,
                                                              realw* h_rho_ac_kl,
                                                              realw* h_kappa_ac_kl,
                                                              int* NSPEC_AB){}
@@ -675,8 +678,8 @@
                                                 realw* potential_acoustic,
                                                 realw* potential_dot_acoustic,
                                                 realw* potential_dot_dot_acoustic,
-                                                realw* b_potential_acoustic, 
-                                                realw* b_potential_dot_acoustic, 
+                                                realw* b_potential_acoustic,
+                                                realw* b_potential_dot_acoustic,
                                                 realw* b_potential_dot_dot_acoustic,
                                                 long* Mesh_pointer_f,
                                                 int* number_receiver_global,
@@ -685,5 +688,5 @@
                                                 int* ibool,
                                                 int* SIMULATION_TYPEf){}
 
-							   
-							    
+
+

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu	2011-11-17 23:51:03 UTC (rev 19213)
@@ -79,7 +79,7 @@
   int num_blocks_x = mp->nrec_local;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 
@@ -205,7 +205,7 @@
   int num_blocks_x = mp->nrec_local;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
-    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/param_reader.c
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/param_reader.c	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/param_reader.c	2011-11-17 23:51:03 UTC (rev 19213)
@@ -142,7 +142,7 @@
   /* Regular expression for parsing lines from param file.
    ** Good luck reading this regular expression.  Basically, the lines of
    ** the parameter file should be of the form 'parameter = value',
-   ** optionally followed by a #-delimited comment.  
+   ** optionally followed by a #-delimited comment.
    ** 'value' can be any number of space- or tab-separated words. Blank
    ** lines, lines containing only white space and lines whose first non-
    ** whitespace character is '#' are ignored.  White space is generally

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/write_c_binary.c
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/write_c_binary.c	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/write_c_binary.c	2011-11-17 23:51:03 UTC (rev 19213)
@@ -1,683 +1,683 @@
-/*
-!=====================================================================
-!
-!               S p e c f e m 3 D  V e r s i o n  2 . 0
-!               ---------------------------------------
-!
-!          Main authors: Dimitri Komatitsch and Jeroen Tromp
-!    Princeton University, USA and University of Pau / CNRS / INRIA
-! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
-!                            April 2011
-!
-! This program is free software; you can redistribute it and/or modify
-! it under the terms of the GNU General Public License as published by
-! the Free Software Foundation; either version 2 of the License, or
-! (at your option) any later version.
-!
-! This program is distributed in the hope that it will be useful,
-! but WITHOUT ANY WARRANTY; without even the implied warranty of
-! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-! GNU General Public License for more details.
-!
-! You should have received a copy of the GNU General Public License along
-! with this program; if not, write to the Free Software Foundation, Inc.,
-! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-!
-!=====================================================================
-*/
-
-// for large files
-#define _FILE_OFFSET_BITS  64
-
-// after Brian's function
-
-#include "config.h"
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-
-static int fd;
-
-void
-FC_FUNC_(open_file,OPEN_FILE)(char *file) {
-  /*    fprintf(stderr, "Opening file: %s\n", file); */
-  fd = open(file, O_WRONLY | O_CREAT, 0644);
-  if(fd == -1) {
-    fprintf(stderr, "Error opening file: %s exiting\n", file);
-    exit(-1);
-  }
-}
-
-void
-FC_FUNC_(close_file,CLOSE_FILE)() {
-  /*    fprintf(stderr, "Closing file\n"); */
-  close(fd);
-}
-
-void
-FC_FUNC_(write_integer,WRITE_INTEGER)(int *z) {
-  int dummy_unused_variable = write(fd, z, sizeof(int));
-}
-
-void
-FC_FUNC_(write_real,WRITE_REAL)(float *z) {
-  int dummy_unused_variable = write(fd, z, sizeof(float));
-}
-
-
-/* ---------------------------------------
-
- IO performance test
-
- Software Optimization for High Performance Computing: Creating Faster Applications
-
- By Isom L. Crawford and Kevin R. Wadleigh
- Jul 18, 2003
-
- - uses functions fopen/fread/fwrite for binary file I/O
-  
- --------------------------------------- */
-
-#define __USE_GNU
-#include <string.h>
-#include <regex.h>
-
-#define MIN(x,y) ((x) < (y) ? (x) : (y))
-
-/* fastest performance on nehalem nodes:
-
- Linux 2.6.18-164.11.1.el5 #1 SMP Wed Jan 20 10:04:55 EST 2010 x86_64 x86_64 x86_64 GNU/Linux
-
- achieved with 16 KB buffers: */
-
-//#define MAX_B 65536 // 64 KB
-//#define MAX_B 32768 // 32 KB
-#define MAX_B 16384 // 16 KB
-//#define MAX_B 8192 // 8 KB
-
-// absorbing files: instead of passing file descriptor, we use the array index
-//                          index 0 for elastic domain file
-//                          index 1 for acoustic domain file
-//                          (reserved, but unused yet) index 2 - for NOISE_TOMOGRAPHY (SURFACE_MOVIE)
-#define ABS_FILEID 3  // number of absorbing files, note: in C we start indexing arrays from 0
-
-// file points
-static FILE * fp_abs[ABS_FILEID];
-// file work buffers
-static char * work_buffer[ABS_FILEID];
-
-
-//void
-//FC_FUNC_(open_file_abs_r_fbin,OPEN_FILE_ABS_R_FBIN)(int *fid, char *filename,int *length, int *filesize){
-void open_file_abs_r_fbin(int *fid, char *filename,int *length, long long *filesize){
-
-  // opens file for read access
-
-  //This sequence assigns the MAX_B array work_buffer to the file pointer
-  // to be used for its buffering. performance should benefit.
-  char * fncopy;
-  char * blank;
-  FILE *ft;
-  int ret;
-  
-  // checks filesize
-  if( *filesize == 0 ){
-    perror("Error file size for reading");
-    exit(EXIT_FAILURE);
-  }
-  
-  // Trim the file name.
-  fncopy = strndup(filename, *length);
-  blank = strchr(fncopy, ' ');
-  if (blank != NULL) {
-    fncopy[blank - fncopy] = '\0';
-  }
-
-/*
-//debug checks file size
-// see: 
-//https://www.securecoding.cert.org/confluence/display/seccode/FIO19-C.+Do+not+use+fseek()+and+ftell()+to+compute+the+size+of+a+file
-  printf("file size: %lld \n",*filesize);
-  int fd;
-  struct stat stbuf;
-  long long size;
-  fd = open(fncopy, O_RDONLY);
-  if(fd == -1) {
-    fprintf(stderr, "Error opening file: %s exiting\n", fncopy);
-    exit(-1);
-  }  
-  if( fstat(fd, &stbuf) == 0 ){ 
-    size = stbuf.st_size;
-    printf("file size found is: %lld (Bytes) \n",size);  
-  }
-  close(fd);
-*/
-
-  // opens file
-  //ft = fopen( fncopy, "r+" );
-  ft = fopen( fncopy, "rb+" ); // read binary file
-  if( ft == NULL ) { perror("fopen"); exit(-1); }
-
-  
-  // sets mode for full buffering
-  work_buffer[*fid] = (char *)malloc(MAX_B);
-  ret = setvbuf( ft, work_buffer[*fid], _IOFBF, (size_t)MAX_B );
-  if( ret != 0 ){
-    perror("Error setting working buffer");
-    exit(EXIT_FAILURE);  
-  }
-    
-  // stores file index id fid: from 0 to 8
-  fp_abs[*fid] = ft;
-
-  free(fncopy);
-}
-
-//void
-//FC_FUNC_(open_file_abs_w_fbin,OPEN_FILE_ABS_W_FBIN)(int *fid, char *filename, int *length, int *filesize){
-void open_file_abs_w_fbin(int *fid, char *filename, int *length, long long *filesize){
-
-  // opens file for write access
-
-  //This sequence assigns the MAX_B array work_buffer to the file pointer
-  // to be used for its buffering. performance should benefit.
-  char * fncopy;
-  char * blank;
-  FILE *ft;
-  int ret;
-  
-  // checks filesize
-  if( *filesize == 0 ){
-    perror("Error file size for writing");
-    exit(EXIT_FAILURE);
-  }
-
-  // Trim the file name.
-  fncopy = strndup(filename, *length);
-  blank = strchr(fncopy, ' ');
-  if (blank != NULL) {
-    fncopy[blank - fncopy] = '\0';
-  }
-
-  // opens file
-  //ft = fopen( fncopy, "w+" );
-  ft = fopen( fncopy, "wb+" ); // write binary file
-  if( ft == NULL ) { perror("fopen"); exit(-1); }
-
-  // sets mode for full buffering
-  work_buffer[*fid] = (char *)malloc(MAX_B);
-  ret = setvbuf( ft, work_buffer[*fid], _IOFBF, (size_t)MAX_B );
-  if( ret != 0 ){
-    perror("Error setting working buffer");
-    exit(EXIT_FAILURE);  
-  }
-  
-  // stores file index id fid: from 0 to 8
-  fp_abs[*fid] = ft;
-
-  free(fncopy);
-
-}
-
-//void
-//FC_FUNC_(close_file_abs_fbin,CLOSE_FILE_ABS_FBIN)(int * fid){
-void close_file_abs_fbin(int * fid){
-
-  // closes file
-
-  fclose(fp_abs[*fid]);
-
-  free(work_buffer[*fid]);
-
-}
-
-//void
-//FC_FUNC_(write_abs_fbin,WRITE_ABS_FBIN)(int *fid, void *buffer, int *length, int *index){
-void write_abs_fbin(int *fid, char *buffer, int *length, int *index){
-
-  // writes binary file data in chunks of MAX_B
-
-  FILE *ft;
-  int itemlen,remlen,donelen,ret;
-  void *buf;
-
-  // file pointer
-  ft = fp_abs[*fid];
-
-  donelen = 0;
-  remlen = *length;
-  buf = buffer;
-  ret = 0;
-
-/*
-//debug
-  float dat[*length/4];
-  memcpy(dat,buffer,*length);
-  printf("buffer length: %d %d\n",*length,*index);
-  printf("buffer size: %d %d \n",sizeof(dat),sizeof(buffer));
-  int i;
-  for(i=0;i< 50;i++){
-    printf("buffer: %d %e \n",i,dat[i]);
-  }
-  
-  // positions file pointer (for reverse time access)
-  // make sure to use 64-bit arithmetic to avoid overflow for very large files
-  long long pos,cur;
-  
-  pos = ((long long)*length) * (*index -1 );  
-  cur = ftell(ft);
-  
-  printf("current position: %d %lld %lld \n",*fid,cur,pos);  
-  ret = fseek(ft, pos , SEEK_SET);
-  if ( ret != 0 ) {
-    perror("Error fseek");
-    exit(EXIT_FAILURE);
-  }
- */
-  
-  
-  // writes items of maximum MAX_B to the file
-  while (remlen > 0){
-
-    itemlen = MIN(remlen,MAX_B);
-    // note: we want to write out exactly *itemlen* bytes
-    ret = fwrite(buf,1,itemlen,ft);
-    if (ret > 0){
-      donelen = donelen + ret;
-      remlen = remlen - MAX_B;
-      buf += MAX_B;
-    }
-    else{
-      remlen = 0;
-    }
-  }
-
-  //debug
-  //  printf("buffer done length: %d %d\n",donelen,*length);
-}
-
-//void
-//FC_FUNC_(read_abs_fbin,READ_ABS_FBIN)(int *fid, void *buffer, int *length, int *index){
-void read_abs_fbin(int *fid, char *buffer, int *length, int *index){
-
-  // reads binary file data in chunks of MAX_B
-
-  FILE *ft;
-  int ret,itemlen,remlen,donelen;
-  long long pos;
-  void *buf;
-
-  // file pointer
-  ft = fp_abs[*fid];
-
-  // positions file pointer (for reverse time access)
-  // make sure to use 64-bit arithmetic to avoid overflow for very large files
-  pos = ((long long)*length) * (*index -1 );
-  
-  ret = fseek(ft, pos , SEEK_SET);
-  if ( ret != 0 ) {
-    perror("Error fseek");
-    exit(EXIT_FAILURE);
-  }
-
-  donelen = 0;
-  remlen = *length;
-  buf = buffer;
-  ret = 0;
-
-  // cleans buffer 
-  //memset( buf,0,remlen);
-  
-  // reads items of maximum MAX_B to the file
-  while (remlen > 0){
-
-    // checks end of file
-    if (ferror(ft) || feof(ft)) return;
-
-    itemlen = MIN(remlen,MAX_B);
-    ret = fread(buf,1,itemlen,ft);
-
-    if (ferror(ft) || feof(ft)) return;
-
-    if (ret > 0){
-      donelen = donelen + ret;
-      remlen = remlen - MAX_B;
-      buf += MAX_B;
-    }
-    else{
-      remlen = 0;
-    }
-  }
-
-/*
-// debug
-  printf("position: %lld %d %d \n",pos,*length,*index);
-  printf("buffer done length: %d %d\n",donelen,*length);
-  float dat[*length/4];
-  memcpy(dat,buffer,*length);
-  printf("return buffer length: %d %d\n",*length,*index);
-  printf("return buffer size: %d %d \n",sizeof(dat),sizeof(buffer));  
-  int i;
-  for(i=0;i< 50;i++){
-    printf("return buffer: %d %e \n",i,dat[i]);
-  }
-*/
-}
-
-
-
-
-/* ---------------------------------------
-
- IO performance test
-
-
- A Performance Comparison of "read" and "mmap" in the Solaris 8 OS
-
- By Oyetunde Fadele, September 2002
-
- http://developers.sun.com/solaris/articles/read_mmap.html
-
- or
-
- High-performance network programming, Part 2: Speed up processing at both the client and server
-
- by Girish Venkatachalam
-
- http://www.ibm.com/developerworks/aix/library/au-highperform2/
-
-
- - uses functions mmap/memcpy for mapping file I/O
-
- -------------------------------------  */
-
-
-#include <errno.h>
-#include <limits.h>
-#include <sys/mman.h>
-
-// file maps
-static char * map_abs[ABS_FILEID];
-// file descriptors
-static int map_fd_abs[ABS_FILEID];
-// file sizes
-static long long filesize_abs[ABS_FILEID];
-
-//void
-//FC_FUNC_(open_file_abs_w_map,OPEN_FILE_ABS_W_MAP)(int *fid, char *filename, int *length, int *filesize){
-void open_file_abs_w_map(int *fid, char *filename, int *length, long long *filesize){
-
-  // opens file for write access
-
-  int ft;
-  int result;
-  char *map;
-  char *fncopy;
-  char *blank;
-
-  // checks filesize
-  if( *filesize == 0 ){
-    perror("Error file size for writing");
-    exit(EXIT_FAILURE);
-  }
-
-  /*
-   // debug check filesize below or above 2 GB
-   //            filesize gives bytes needed; 4-byte integer limited to +- 2,147,483,648 bytes ~ 2 GB
-   float s = *filesize / 1024. / 1024. / 1024.;
-   if( s > 2.0 ){
-   printf("file size bigger than 2 GB: %lld B or %f GB \n",*filesize,s);
-   }
-   */  
-
-  // Trim the file name.
-  fncopy = strndup(filename, *length);
-  blank = strchr(fncopy, ' ');
-  if (blank != NULL) {
-    fncopy[blank - fncopy] = '\0';
-  }
-
-  /* Open a file for writing.
-   *  - Creating the file if it doesn't exist.
-   *  - Truncating it to 0 size if it already exists. (not really needed)
-   *
-   * Note: "O_WRONLY" mode is not sufficient when mmaping.
-   */
-  ft = open(fncopy, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600);
-  if (ft == -1) {
-    perror("Error opening file for writing");
-    exit(EXIT_FAILURE);
-  }
-
-  // file index id fid: from 0 to 8
-  map_fd_abs[*fid] = ft;
-
-  free(fncopy);
-
-  /* Stretch the file size to the size of the (mmapped) array of ints
-   */
-  filesize_abs[*fid] = *filesize;
-  result = lseek(ft, filesize_abs[*fid] - 1, SEEK_SET);
-  if (result == -1) {
-    close(ft);
-    perror("Error calling lseek() to 'stretch' the file");
-    exit(EXIT_FAILURE);
-  }
-
-  //printf("file length: %d \n",filesize_abs[*fid]);
-
-
-  /* Something needs to be written at the end of the file to
-   * have the file actually have the new size.
-   * Just writing an empty string at the current file position will do.
-   *
-   * Note:
-   *  - The current position in the file is at the end of the stretched
-   *    file due to the call to lseek().
-   *  - An empty string is actually a single '\0' character, so a zero-byte
-   *    will be written at the last byte of the file.
-   */
-  result = write(ft, "", 1);
-  if (result != 1) {
-    close(ft);
-    perror("Error writing last byte of the file");
-    exit(EXIT_FAILURE);
-  }
-
-  /* Now the file is ready to be mmapped.
-   */
-  map = mmap(0, filesize_abs[*fid], PROT_READ | PROT_WRITE, MAP_SHARED, ft, 0);
-  if (map == MAP_FAILED) {
-    close(ft);
-    perror("Error mmapping the file");
-    exit(EXIT_FAILURE);
-  }
-
-  map_abs[*fid] = map;
-
-  //printf("file map: %d\n",*fid);
-
-}
-
-//void
-//FC_FUNC_(open_file_abs_r_map,OPEN_FILE_ABS_R_MAP)(int *fid, char *filename,int *length, int *filesize){
-void open_file_abs_r_map(int *fid, char *filename,int *length, long long *filesize){
-
-  // opens file for read access
-  char * fncopy;
-  char * blank;
-  int ft;
-  char *map;
-
-  // checks filesize
-  if( *filesize == 0 ){
-    perror("Error file size for reading");
-    exit(EXIT_FAILURE);
-  }
-
-  // Trim the file name.
-  fncopy = strndup(filename, *length);
-  blank = strchr(fncopy, ' ');
-  if (blank != NULL) {
-    fncopy[blank - fncopy] = '\0';
-  }
-
-
-  ft = open(fncopy, O_RDONLY);
-  if (ft == -1) {
-    perror("Error opening file for reading");
-    exit(EXIT_FAILURE);
-  }
-
-  // file index id fid: from 0 to 8
-  map_fd_abs[*fid] = ft;
-
-  free(fncopy);
-
-  filesize_abs[*fid] = *filesize;
-
-  map = mmap(0, filesize_abs[*fid], PROT_READ, MAP_SHARED, ft, 0);
-  if (map == MAP_FAILED) {
-    close(ft);
-    perror("Error mmapping the file");
-    exit(EXIT_FAILURE);
-  }
-
-  map_abs[*fid] = map;
-
-  //printf("file length r: %d \n",filesize_abs[*fid]);
-  //printf("file map r: %d\n",*fid);
-
-}
-
-
-//void
-//FC_FUNC_(close_file_abs_map,CLOSE_FILE_ABS_MAP)(int * fid){
-void close_file_abs_map(int * fid){
-
-  /* Don't forget to free the mmapped memory
-   */
-  if (munmap(map_abs[*fid], filesize_abs[*fid]) == -1) {
-    perror("Error un-mmapping the file");
-    /* Decide here whether to close(fd) and exit() or not. Depends... */
-  }
-
-  /* Un-mmaping doesn't close the file, so we still need to do that.
-   */
-  close(map_fd_abs[*fid]);
-}
-
-
-//void
-//FC_FUNC_(write_abs_map,WRITE_ABS_MAP)(int *fid, char *buffer, int *length , int *index){
-void write_abs_map(int *fid, char *buffer, int *length , int *index){
-
-  char *map;
-  long long offset;
-
-  map = map_abs[*fid];
-
-  // offset in bytes
-  // make sure to use 64-bit arithmetic to avoid overflow for very large files
-  offset =  ((long long)*index -1 ) * (*length) ;
-
-  // copies buffer to map
-  memcpy( &map[offset], buffer ,*length );
-
-}
-
-//void
-//FC_FUNC_(read_abs_map,READ_ABS_MAP)(int *fid, char *buffer, int *length , int *index){
-void read_abs_map(int *fid, char *buffer, int *length , int *index){
-
-  char *map;
-  long long offset;
-
-  map = map_abs[*fid];
-
-  // offset in bytes
-  // make sure to use 64-bit arithmetic to avoid overflow for very large files
-  offset =  ((long long)*index -1 ) * (*length) ;
-
-  // copies map to buffer
-  memcpy( buffer, &map[offset], *length );
-
-}
-
-
-/*
-
- wrapper functions
-
- - for your preferred, optimized file i/o ;
- e.g. uncomment  // #define USE_MAP... in config.h to use mmap routines
- or comment out (default) to use fopen/fwrite/fread functions
-
- note: mmap functions should work fine for local harddisk directories, but can lead to
- problems with global (e.g. NFS) directories
-
- (on nehalem, Linux 2.6.18-164.11.1.el5 #1 SMP Wed Jan 20 10:04:55 EST 2010 x86_64 x86_64 x86_64 GNU/Linux
- - mmap functions are about 20 % faster than conventional fortran, unformatted file i/o
- - fwrite/fread function are about 12 % faster than conventional fortran, unformatted file i/o )
-
- */
-
-void
-FC_FUNC_(open_file_abs_w,OPEN_FILE_ABS_W)(int *fid, char *filename,int *length, long long *filesize) {
-
-#ifdef   USE_MAP_FUNCTION
-  open_file_abs_w_map(fid,filename,length,filesize);
-#else
-  open_file_abs_w_fbin(fid,filename,length,filesize);
-#endif
-
-}
-
-void
-FC_FUNC_(open_file_abs_r,OPEN_FILE_ABS_R)(int *fid, char *filename,int *length, long long *filesize) {
-
-#ifdef   USE_MAP_FUNCTION
-  open_file_abs_r_map(fid,filename,length,filesize);
-#else
-  open_file_abs_r_fbin(fid,filename,length,filesize);
-#endif
-
-}
-
-void
-FC_FUNC_(close_file_abs,CLOSE_FILES_ABS)(int *fid) {
-
-#ifdef   USE_MAP_FUNCTION
-  close_file_abs_map(fid);
-#else
-  close_file_abs_fbin(fid);
-#endif
-
-}
-
-void
-FC_FUNC_(write_abs,WRITE_ABS)(int *fid, char *buffer, int *length , int *index) {
-
-#ifdef   USE_MAP_FUNCTION
-  write_abs_map(fid,buffer,length,index);
-#else
-  write_abs_fbin(fid,buffer,length,index);
-#endif
-
-}
-
-void
-FC_FUNC_(read_abs,READ_ABS)(int *fid, char *buffer, int *length , int *index) {
-
-#ifdef   USE_MAP_FUNCTION
-  read_abs_map(fid,buffer,length,index);
-#else
-  read_abs_fbin(fid,buffer,length,index);
-#endif
-
-}
-
-
+/*
+!=====================================================================
+!
+!               S p e c f e m 3 D  V e r s i o n  2 . 0
+!               ---------------------------------------
+!
+!          Main authors: Dimitri Komatitsch and Jeroen Tromp
+!    Princeton University, USA and University of Pau / CNRS / INRIA
+! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+!                            April 2011
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+*/
+
+// for large files
+#define _FILE_OFFSET_BITS  64
+
+// after Brian's function
+
+#include "config.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+static int fd;
+
+void
+FC_FUNC_(open_file,OPEN_FILE)(char *file) {
+  /*    fprintf(stderr, "Opening file: %s\n", file); */
+  fd = open(file, O_WRONLY | O_CREAT, 0644);
+  if(fd == -1) {
+    fprintf(stderr, "Error opening file: %s exiting\n", file);
+    exit(-1);
+  }
+}
+
+void
+FC_FUNC_(close_file,CLOSE_FILE)() {
+  /*    fprintf(stderr, "Closing file\n"); */
+  close(fd);
+}
+
+void
+FC_FUNC_(write_integer,WRITE_INTEGER)(int *z) {
+  int dummy_unused_variable = write(fd, z, sizeof(int));
+}
+
+void
+FC_FUNC_(write_real,WRITE_REAL)(float *z) {
+  int dummy_unused_variable = write(fd, z, sizeof(float));
+}
+
+
+/* ---------------------------------------
+
+ IO performance test
+
+ Software Optimization for High Performance Computing: Creating Faster Applications
+
+ By Isom L. Crawford and Kevin R. Wadleigh
+ Jul 18, 2003
+
+ - uses functions fopen/fread/fwrite for binary file I/O
+
+ --------------------------------------- */
+
+#define __USE_GNU
+#include <string.h>
+#include <regex.h>
+
+#define MIN(x,y) ((x) < (y) ? (x) : (y))
+
+/* fastest performance on nehalem nodes:
+
+ Linux 2.6.18-164.11.1.el5 #1 SMP Wed Jan 20 10:04:55 EST 2010 x86_64 x86_64 x86_64 GNU/Linux
+
+ achieved with 16 KB buffers: */
+
+//#define MAX_B 65536 // 64 KB
+//#define MAX_B 32768 // 32 KB
+#define MAX_B 16384 // 16 KB
+//#define MAX_B 8192 // 8 KB
+
+// absorbing files: instead of passing file descriptor, we use the array index
+//                          index 0 for elastic domain file
+//                          index 1 for acoustic domain file
+//                          (reserved, but unused yet) index 2 - for NOISE_TOMOGRAPHY (SURFACE_MOVIE)
+#define ABS_FILEID 3  // number of absorbing files, note: in C we start indexing arrays from 0
+
+// file points
+static FILE * fp_abs[ABS_FILEID];
+// file work buffers
+static char * work_buffer[ABS_FILEID];
+
+
+//void
+//FC_FUNC_(open_file_abs_r_fbin,OPEN_FILE_ABS_R_FBIN)(int *fid, char *filename,int *length, int *filesize){
+void open_file_abs_r_fbin(int *fid, char *filename,int *length, long long *filesize){
+
+  // opens file for read access
+
+  //This sequence assigns the MAX_B array work_buffer to the file pointer
+  // to be used for its buffering. performance should benefit.
+  char * fncopy;
+  char * blank;
+  FILE *ft;
+  int ret;
+
+  // checks filesize
+  if( *filesize == 0 ){
+    perror("Error file size for reading");
+    exit(EXIT_FAILURE);
+  }
+
+  // Trim the file name.
+  fncopy = strndup(filename, *length);
+  blank = strchr(fncopy, ' ');
+  if (blank != NULL) {
+    fncopy[blank - fncopy] = '\0';
+  }
+
+/*
+//debug checks file size
+// see:
+//https://www.securecoding.cert.org/confluence/display/seccode/FIO19-C.+Do+not+use+fseek()+and+ftell()+to+compute+the+size+of+a+file
+  printf("file size: %lld \n",*filesize);
+  int fd;
+  struct stat stbuf;
+  long long size;
+  fd = open(fncopy, O_RDONLY);
+  if(fd == -1) {
+    fprintf(stderr, "Error opening file: %s exiting\n", fncopy);
+    exit(-1);
+  }
+  if( fstat(fd, &stbuf) == 0 ){
+    size = stbuf.st_size;
+    printf("file size found is: %lld (Bytes) \n",size);
+  }
+  close(fd);
+*/
+
+  // opens file
+  //ft = fopen( fncopy, "r+" );
+  ft = fopen( fncopy, "rb+" ); // read binary file
+  if( ft == NULL ) { perror("fopen"); exit(-1); }
+
+
+  // sets mode for full buffering
+  work_buffer[*fid] = (char *)malloc(MAX_B);
+  ret = setvbuf( ft, work_buffer[*fid], _IOFBF, (size_t)MAX_B );
+  if( ret != 0 ){
+    perror("Error setting working buffer");
+    exit(EXIT_FAILURE);
+  }
+
+  // stores file index id fid: from 0 to 8
+  fp_abs[*fid] = ft;
+
+  free(fncopy);
+}
+
+//void
+//FC_FUNC_(open_file_abs_w_fbin,OPEN_FILE_ABS_W_FBIN)(int *fid, char *filename, int *length, int *filesize){
+void open_file_abs_w_fbin(int *fid, char *filename, int *length, long long *filesize){
+
+  // opens file for write access
+
+  //This sequence assigns the MAX_B array work_buffer to the file pointer
+  // to be used for its buffering. performance should benefit.
+  char * fncopy;
+  char * blank;
+  FILE *ft;
+  int ret;
+
+  // checks filesize
+  if( *filesize == 0 ){
+    perror("Error file size for writing");
+    exit(EXIT_FAILURE);
+  }
+
+  // Trim the file name.
+  fncopy = strndup(filename, *length);
+  blank = strchr(fncopy, ' ');
+  if (blank != NULL) {
+    fncopy[blank - fncopy] = '\0';
+  }
+
+  // opens file
+  //ft = fopen( fncopy, "w+" );
+  ft = fopen( fncopy, "wb+" ); // write binary file
+  if( ft == NULL ) { perror("fopen"); exit(-1); }
+
+  // sets mode for full buffering
+  work_buffer[*fid] = (char *)malloc(MAX_B);
+  ret = setvbuf( ft, work_buffer[*fid], _IOFBF, (size_t)MAX_B );
+  if( ret != 0 ){
+    perror("Error setting working buffer");
+    exit(EXIT_FAILURE);
+  }
+
+  // stores file index id fid: from 0 to 8
+  fp_abs[*fid] = ft;
+
+  free(fncopy);
+
+}
+
+//void
+//FC_FUNC_(close_file_abs_fbin,CLOSE_FILE_ABS_FBIN)(int * fid){
+void close_file_abs_fbin(int * fid){
+
+  // closes file
+
+  fclose(fp_abs[*fid]);
+
+  free(work_buffer[*fid]);
+
+}
+
+//void
+//FC_FUNC_(write_abs_fbin,WRITE_ABS_FBIN)(int *fid, void *buffer, int *length, int *index){
+void write_abs_fbin(int *fid, char *buffer, int *length, int *index){
+
+  // writes binary file data in chunks of MAX_B
+
+  FILE *ft;
+  int itemlen,remlen,donelen,ret;
+  void *buf;
+
+  // file pointer
+  ft = fp_abs[*fid];
+
+  donelen = 0;
+  remlen = *length;
+  buf = buffer;
+  ret = 0;
+
+/*
+//debug
+  float dat[*length/4];
+  memcpy(dat,buffer,*length);
+  printf("buffer length: %d %d\n",*length,*index);
+  printf("buffer size: %d %d \n",sizeof(dat),sizeof(buffer));
+  int i;
+  for(i=0;i< 50;i++){
+    printf("buffer: %d %e \n",i,dat[i]);
+  }
+
+  // positions file pointer (for reverse time access)
+  // make sure to use 64-bit arithmetic to avoid overflow for very large files
+  long long pos,cur;
+
+  pos = ((long long)*length) * (*index -1 );
+  cur = ftell(ft);
+
+  printf("current position: %d %lld %lld \n",*fid,cur,pos);
+  ret = fseek(ft, pos , SEEK_SET);
+  if ( ret != 0 ) {
+    perror("Error fseek");
+    exit(EXIT_FAILURE);
+  }
+ */
+
+
+  // writes items of maximum MAX_B to the file
+  while (remlen > 0){
+
+    itemlen = MIN(remlen,MAX_B);
+    // note: we want to write out exactly *itemlen* bytes
+    ret = fwrite(buf,1,itemlen,ft);
+    if (ret > 0){
+      donelen = donelen + ret;
+      remlen = remlen - MAX_B;
+      buf += MAX_B;
+    }
+    else{
+      remlen = 0;
+    }
+  }
+
+  //debug
+  //  printf("buffer done length: %d %d\n",donelen,*length);
+}
+
+//void
+//FC_FUNC_(read_abs_fbin,READ_ABS_FBIN)(int *fid, void *buffer, int *length, int *index){
+void read_abs_fbin(int *fid, char *buffer, int *length, int *index){
+
+  // reads binary file data in chunks of MAX_B
+
+  FILE *ft;
+  int ret,itemlen,remlen,donelen;
+  long long pos;
+  void *buf;
+
+  // file pointer
+  ft = fp_abs[*fid];
+
+  // positions file pointer (for reverse time access)
+  // make sure to use 64-bit arithmetic to avoid overflow for very large files
+  pos = ((long long)*length) * (*index -1 );
+
+  ret = fseek(ft, pos , SEEK_SET);
+  if ( ret != 0 ) {
+    perror("Error fseek");
+    exit(EXIT_FAILURE);
+  }
+
+  donelen = 0;
+  remlen = *length;
+  buf = buffer;
+  ret = 0;
+
+  // cleans buffer
+  //memset( buf,0,remlen);
+
+  // reads items of maximum MAX_B to the file
+  while (remlen > 0){
+
+    // checks end of file
+    if (ferror(ft) || feof(ft)) return;
+
+    itemlen = MIN(remlen,MAX_B);
+    ret = fread(buf,1,itemlen,ft);
+
+    if (ferror(ft) || feof(ft)) return;
+
+    if (ret > 0){
+      donelen = donelen + ret;
+      remlen = remlen - MAX_B;
+      buf += MAX_B;
+    }
+    else{
+      remlen = 0;
+    }
+  }
+
+/*
+// debug
+  printf("position: %lld %d %d \n",pos,*length,*index);
+  printf("buffer done length: %d %d\n",donelen,*length);
+  float dat[*length/4];
+  memcpy(dat,buffer,*length);
+  printf("return buffer length: %d %d\n",*length,*index);
+  printf("return buffer size: %d %d \n",sizeof(dat),sizeof(buffer));
+  int i;
+  for(i=0;i< 50;i++){
+    printf("return buffer: %d %e \n",i,dat[i]);
+  }
+*/
+}
+
+
+
+
+/* ---------------------------------------
+
+ IO performance test
+
+
+ A Performance Comparison of "read" and "mmap" in the Solaris 8 OS
+
+ By Oyetunde Fadele, September 2002
+
+ http://developers.sun.com/solaris/articles/read_mmap.html
+
+ or
+
+ High-performance network programming, Part 2: Speed up processing at both the client and server
+
+ by Girish Venkatachalam
+
+ http://www.ibm.com/developerworks/aix/library/au-highperform2/
+
+
+ - uses functions mmap/memcpy for mapping file I/O
+
+ -------------------------------------  */
+
+
+#include <errno.h>
+#include <limits.h>
+#include <sys/mman.h>
+
+// file maps
+static char * map_abs[ABS_FILEID];
+// file descriptors
+static int map_fd_abs[ABS_FILEID];
+// file sizes
+static long long filesize_abs[ABS_FILEID];
+
+//void
+//FC_FUNC_(open_file_abs_w_map,OPEN_FILE_ABS_W_MAP)(int *fid, char *filename, int *length, int *filesize){
+void open_file_abs_w_map(int *fid, char *filename, int *length, long long *filesize){
+
+  // opens file for write access
+
+  int ft;
+  int result;
+  char *map;
+  char *fncopy;
+  char *blank;
+
+  // checks filesize
+  if( *filesize == 0 ){
+    perror("Error file size for writing");
+    exit(EXIT_FAILURE);
+  }
+
+  /*
+   // debug check filesize below or above 2 GB
+   //            filesize gives bytes needed; 4-byte integer limited to +- 2,147,483,648 bytes ~ 2 GB
+   float s = *filesize / 1024. / 1024. / 1024.;
+   if( s > 2.0 ){
+   printf("file size bigger than 2 GB: %lld B or %f GB \n",*filesize,s);
+   }
+   */
+
+  // Trim the file name.
+  fncopy = strndup(filename, *length);
+  blank = strchr(fncopy, ' ');
+  if (blank != NULL) {
+    fncopy[blank - fncopy] = '\0';
+  }
+
+  /* Open a file for writing.
+   *  - Creating the file if it doesn't exist.
+   *  - Truncating it to 0 size if it already exists. (not really needed)
+   *
+   * Note: "O_WRONLY" mode is not sufficient when mmaping.
+   */
+  ft = open(fncopy, O_RDWR | O_CREAT | O_TRUNC, (mode_t)0600);
+  if (ft == -1) {
+    perror("Error opening file for writing");
+    exit(EXIT_FAILURE);
+  }
+
+  // file index id fid: from 0 to 8
+  map_fd_abs[*fid] = ft;
+
+  free(fncopy);
+
+  /* Stretch the file size to the size of the (mmapped) array of ints
+   */
+  filesize_abs[*fid] = *filesize;
+  result = lseek(ft, filesize_abs[*fid] - 1, SEEK_SET);
+  if (result == -1) {
+    close(ft);
+    perror("Error calling lseek() to 'stretch' the file");
+    exit(EXIT_FAILURE);
+  }
+
+  //printf("file length: %d \n",filesize_abs[*fid]);
+
+
+  /* Something needs to be written at the end of the file to
+   * have the file actually have the new size.
+   * Just writing an empty string at the current file position will do.
+   *
+   * Note:
+   *  - The current position in the file is at the end of the stretched
+   *    file due to the call to lseek().
+   *  - An empty string is actually a single '\0' character, so a zero-byte
+   *    will be written at the last byte of the file.
+   */
+  result = write(ft, "", 1);
+  if (result != 1) {
+    close(ft);
+    perror("Error writing last byte of the file");
+    exit(EXIT_FAILURE);
+  }
+
+  /* Now the file is ready to be mmapped.
+   */
+  map = mmap(0, filesize_abs[*fid], PROT_READ | PROT_WRITE, MAP_SHARED, ft, 0);
+  if (map == MAP_FAILED) {
+    close(ft);
+    perror("Error mmapping the file");
+    exit(EXIT_FAILURE);
+  }
+
+  map_abs[*fid] = map;
+
+  //printf("file map: %d\n",*fid);
+
+}
+
+//void
+//FC_FUNC_(open_file_abs_r_map,OPEN_FILE_ABS_R_MAP)(int *fid, char *filename,int *length, int *filesize){
+void open_file_abs_r_map(int *fid, char *filename,int *length, long long *filesize){
+
+  // opens file for read access
+  char * fncopy;
+  char * blank;
+  int ft;
+  char *map;
+
+  // checks filesize
+  if( *filesize == 0 ){
+    perror("Error file size for reading");
+    exit(EXIT_FAILURE);
+  }
+
+  // Trim the file name.
+  fncopy = strndup(filename, *length);
+  blank = strchr(fncopy, ' ');
+  if (blank != NULL) {
+    fncopy[blank - fncopy] = '\0';
+  }
+
+
+  ft = open(fncopy, O_RDONLY);
+  if (ft == -1) {
+    perror("Error opening file for reading");
+    exit(EXIT_FAILURE);
+  }
+
+  // file index id fid: from 0 to 8
+  map_fd_abs[*fid] = ft;
+
+  free(fncopy);
+
+  filesize_abs[*fid] = *filesize;
+
+  map = mmap(0, filesize_abs[*fid], PROT_READ, MAP_SHARED, ft, 0);
+  if (map == MAP_FAILED) {
+    close(ft);
+    perror("Error mmapping the file");
+    exit(EXIT_FAILURE);
+  }
+
+  map_abs[*fid] = map;
+
+  //printf("file length r: %d \n",filesize_abs[*fid]);
+  //printf("file map r: %d\n",*fid);
+
+}
+
+
+//void
+//FC_FUNC_(close_file_abs_map,CLOSE_FILE_ABS_MAP)(int * fid){
+void close_file_abs_map(int * fid){
+
+  /* Don't forget to free the mmapped memory
+   */
+  if (munmap(map_abs[*fid], filesize_abs[*fid]) == -1) {
+    perror("Error un-mmapping the file");
+    /* Decide here whether to close(fd) and exit() or not. Depends... */
+  }
+
+  /* Un-mmaping doesn't close the file, so we still need to do that.
+   */
+  close(map_fd_abs[*fid]);
+}
+
+
+//void
+//FC_FUNC_(write_abs_map,WRITE_ABS_MAP)(int *fid, char *buffer, int *length , int *index){
+void write_abs_map(int *fid, char *buffer, int *length , int *index){
+
+  char *map;
+  long long offset;
+
+  map = map_abs[*fid];
+
+  // offset in bytes
+  // make sure to use 64-bit arithmetic to avoid overflow for very large files
+  offset =  ((long long)*index -1 ) * (*length) ;
+
+  // copies buffer to map
+  memcpy( &map[offset], buffer ,*length );
+
+}
+
+//void
+//FC_FUNC_(read_abs_map,READ_ABS_MAP)(int *fid, char *buffer, int *length , int *index){
+void read_abs_map(int *fid, char *buffer, int *length , int *index){
+
+  char *map;
+  long long offset;
+
+  map = map_abs[*fid];
+
+  // offset in bytes
+  // make sure to use 64-bit arithmetic to avoid overflow for very large files
+  offset =  ((long long)*index -1 ) * (*length) ;
+
+  // copies map to buffer
+  memcpy( buffer, &map[offset], *length );
+
+}
+
+
+/*
+
+ wrapper functions
+
+ - for your preferred, optimized file i/o ;
+ e.g. uncomment  // #define USE_MAP... in config.h to use mmap routines
+ or comment out (default) to use fopen/fwrite/fread functions
+
+ note: mmap functions should work fine for local harddisk directories, but can lead to
+ problems with global (e.g. NFS) directories
+
+ (on nehalem, Linux 2.6.18-164.11.1.el5 #1 SMP Wed Jan 20 10:04:55 EST 2010 x86_64 x86_64 x86_64 GNU/Linux
+ - mmap functions are about 20 % faster than conventional fortran, unformatted file i/o
+ - fwrite/fread function are about 12 % faster than conventional fortran, unformatted file i/o )
+
+ */
+
+void
+FC_FUNC_(open_file_abs_w,OPEN_FILE_ABS_W)(int *fid, char *filename,int *length, long long *filesize) {
+
+#ifdef   USE_MAP_FUNCTION
+  open_file_abs_w_map(fid,filename,length,filesize);
+#else
+  open_file_abs_w_fbin(fid,filename,length,filesize);
+#endif
+
+}
+
+void
+FC_FUNC_(open_file_abs_r,OPEN_FILE_ABS_R)(int *fid, char *filename,int *length, long long *filesize) {
+
+#ifdef   USE_MAP_FUNCTION
+  open_file_abs_r_map(fid,filename,length,filesize);
+#else
+  open_file_abs_r_fbin(fid,filename,length,filesize);
+#endif
+
+}
+
+void
+FC_FUNC_(close_file_abs,CLOSE_FILES_ABS)(int *fid) {
+
+#ifdef   USE_MAP_FUNCTION
+  close_file_abs_map(fid);
+#else
+  close_file_abs_fbin(fid);
+#endif
+
+}
+
+void
+FC_FUNC_(write_abs,WRITE_ABS)(int *fid, char *buffer, int *length , int *index) {
+
+#ifdef   USE_MAP_FUNCTION
+  write_abs_map(fid,buffer,length,index);
+#else
+  write_abs_fbin(fid,buffer,length,index);
+#endif
+
+}
+
+void
+FC_FUNC_(read_abs,READ_ABS)(int *fid, char *buffer, int *length , int *index) {
+
+#ifdef   USE_MAP_FUNCTION
+  read_abs_map(fid,buffer,length,index);
+#else
+  read_abs_fbin(fid,buffer,length,index);
+#endif
+
+}
+
+

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2011-11-17 01:58:40 UTC (rev 19212)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2011-11-17 23:51:03 UTC (rev 19213)
@@ -860,6 +860,14 @@
     write(IMAIN,*)
   endif
 
+  ! initializes GPU and outputs info to files for all processes
+  call prepare_cuda_device(myrank,ncuda_devices)
+
+  ! collects min/max of local devices found for statistics
+  call sync_all()
+  call min_all_i(ncuda_devices,ncuda_devices_min)
+  call max_all_i(ncuda_devices,ncuda_devices_max)
+
   ! prepares general fields on GPU
   call prepare_constants_device(Mesh_pointer, &
                                   NGLLX, NSPEC_AB, NGLOB_AB, &
@@ -881,11 +889,9 @@
                                   number_receiver_global, ispec_selected_rec, &
                                   nrec, nrec_local, &
                                   SIMULATION_TYPE, &
-                                  USE_MESH_COLORING_GPU,nspec_acoustic,nspec_elastic, &
-                                  myrank,ncuda_devices)
+                                  USE_MESH_COLORING_GPU, &
+                                  nspec_acoustic,nspec_elastic)
 
-  call min_all_i(ncuda_devices,ncuda_devices_min)
-  call max_all_i(ncuda_devices,ncuda_devices_max)
 
   ! prepares fields on GPU for acoustic simulations
   if( ACOUSTIC_SIMULATION ) then
@@ -1014,5 +1020,4 @@
     write(IMAIN,*)
   endif
 
-
   end subroutine prepare_timerun_GPU