[cig-commits] r20591 - in seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src: cuda meshfem3D specfem3D

danielpeter at geodynamics.org danielpeter at geodynamics.org
Sat Aug 18 18:24:47 PDT 2012


Author: danielpeter
Date: 2012-08-18 18:24:46 -0700 (Sat, 18 Aug 2012)
New Revision: 20591

Modified:
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_scalar_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_vector_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/create_mass_matrices.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_scalar.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_coupling.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_outer_core_Dev.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D_par.F90
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_seismograms.f90
Log:
fixes inner core coupling; fixes acceleration updates with rotation

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_scalar_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_scalar_cuda.cu	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_scalar_cuda.cu	2012-08-19 01:24:46 UTC (rev 20591)
@@ -79,7 +79,7 @@
   int size_padded = ((int)ceil(((double)(mp->max_nibool_interfaces_outer_core))/((double)blocksize)))*blocksize;
   int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
@@ -161,7 +161,7 @@
   int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_outer_core)/((double)blocksize)))*blocksize;
   int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_vector_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_vector_cuda.cu	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_vector_cuda.cu	2012-08-19 01:24:46 UTC (rev 20591)
@@ -84,7 +84,7 @@
       size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_crust_mantle)/((double)blocksize)))*blocksize;
       num_blocks_x = size_padded/blocksize;
       num_blocks_y = 1;
-      while(num_blocks_x > 65535) {
+      while(num_blocks_x > MAXIMUM_GRID_DIM) {
         num_blocks_x = (int) ceil(num_blocks_x*0.5f);
         num_blocks_y = num_blocks_y*2;
       }
@@ -124,7 +124,7 @@
       size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_inner_core)/((double)blocksize)))*blocksize;
       num_blocks_x = size_padded/blocksize;
       num_blocks_y = 1;
-      while(num_blocks_x > 65535) {
+      while(num_blocks_x > MAXIMUM_GRID_DIM) {
         num_blocks_x = (int) ceil(num_blocks_x*0.5f);
         num_blocks_y = num_blocks_y*2;
       }
@@ -210,7 +210,7 @@
       size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_crust_mantle)/((double)blocksize)))*blocksize;
       num_blocks_x = size_padded/blocksize;
       num_blocks_y = 1;
-      while(num_blocks_x > 65535) {
+      while(num_blocks_x > MAXIMUM_GRID_DIM) {
         num_blocks_x = (int) ceil(num_blocks_x*0.5f);
         num_blocks_y = num_blocks_y*2;
       }
@@ -256,7 +256,7 @@
       size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_inner_core)/((double)blocksize)))*blocksize;
       num_blocks_x = size_padded/blocksize;
       num_blocks_y = 1;
-      while(num_blocks_x > 65535) {
+      while(num_blocks_x > MAXIMUM_GRID_DIM) {
         num_blocks_x = (int) ceil(num_blocks_x*0.5f);
         num_blocks_y = num_blocks_y*2;
       }

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu	2012-08-19 01:24:46 UTC (rev 20591)
@@ -374,7 +374,7 @@
   int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
   int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
@@ -519,7 +519,7 @@
   size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
   num_blocks_x = size_padded/blocksize;
   num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
@@ -557,7 +557,7 @@
   size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
   num_blocks_x = size_padded/blocksize;
   num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
@@ -629,7 +629,7 @@
   size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
   num_blocks_x = size_padded/blocksize;
   num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
@@ -665,7 +665,7 @@
   size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
   num_blocks_x = size_padded/blocksize;
   num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu	2012-08-19 01:24:46 UTC (rev 20591)
@@ -59,23 +59,25 @@
   int i = threadIdx.x;
   int j = threadIdx.y;
   int k = threadIdx.z;
-
   int isource  = blockIdx.x + gridDim.x*blockIdx.y; // bx
 
-  if(isource < NSOURCES) { // when NSOURCES > 65535, but mod(nspec_top,2) > 0, we end up with an extra block.
-
+  // when NSOURCES > MAXIMUM_GRID_DIM, but mod(nspec_top,2) > 0, we end up with an extra block.
+  if(isource < NSOURCES) {
     if(myrank == islice_selected_source[isource]) {
 
       ispec = ispec_selected_source[isource]-1;
 
       stf = (realw) stf_pre_compute[isource];
-      iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+      iglob = ibool[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)]-1;
 
       // note: for global version, sourcearrays has dimensions
       //            sourcearrays(NDIM,NGLLX,NGLLY,NGLLZ,NSOURCES)
-      atomicAdd(&accel[3*iglob], sourcearrays[INDEX5(3,5,5,5, 0,i,j,k,isource)]*stf);
-      atomicAdd(&accel[3*iglob+1], sourcearrays[INDEX5(3,5,5,5, 1,i,j,k,isource)]*stf);
-      atomicAdd(&accel[3*iglob+2], sourcearrays[INDEX5(3,5,5,5, 2,i,j,k,isource)]*stf);
+      atomicAdd(&accel[3*iglob], sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
+                                                     0,i,j,k,isource)]*stf);
+      atomicAdd(&accel[3*iglob+1], sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
+                                                       1,i,j,k,isource)]*stf);
+      atomicAdd(&accel[3*iglob+2], sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
+                                                       2,i,j,k,isource)]*stf);
     }
   }
 }
@@ -100,12 +102,12 @@
 
   int num_blocks_x = NSOURCES;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
   dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(5,5,5);
+  dim3 threads(NGLLX,NGLLX,NGLLX);
 
   // copies source time function buffer values to GPU
   print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
@@ -148,12 +150,12 @@
 
   int num_blocks_x = NSOURCES;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
   dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(5,5,5);
+  dim3 threads(NGLLX,NGLLX,NGLLX);
 
   // copies source time function buffer values to GPU
   print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
@@ -193,8 +195,8 @@
 
   int irec_local = blockIdx.x + gridDim.x*blockIdx.y;
 
-  if(irec_local < nadj_rec_local) { // when nrec > 65535, but mod(nspec_top,2) > 0, we end up with an extra block.
-
+  // when nrec > MAXIMUM_GRID_DIM, but mod(nspec_top,2) > 0, we end up with an extra block.
+  if(irec_local < nadj_rec_local) {
     irec = pre_computed_irec[irec_local];
     ispec = ispec_selected_rec[irec]-1;
 
@@ -205,13 +207,13 @@
 
     // atomic operations are absolutely necessary for correctness!
     atomicAdd(&accel[3*iglob], adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
-                                                      0,i,j,k,irec_local)]);
+                                                       0,i,j,k,irec_local)]);
 
     atomicAdd(&accel[3*iglob+1], adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
-                                                      1,i,j,k,irec_local)]);
+                                                         1,i,j,k,irec_local)]);
 
     atomicAdd(&accel[3*iglob+2], adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
-                                                      2,i,j,k,irec_local)]);
+                                                         2,i,j,k,irec_local)]);
   }
 }
 
@@ -233,15 +235,15 @@
   // check if anything to do
   if( mp->nadj_rec_local == 0 ) return;
 
-  // make sure grid dimension is less than 65535 in x dimension
+  // make sure grid dimension is less than MAXIMUM_GRID_DIM in x dimension
   int num_blocks_x = mp->nadj_rec_local;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
   dim3 grid(num_blocks_x,num_blocks_y,1);
-  dim3 threads(5,5,5);
+  dim3 threads(NGLLX,NGLLX,NGLLX);
 
   // build slice of adj_sourcearrays because full array is *very* large.
   // note: this extracts array values for local adjoint sources at given time step "time_index"
@@ -260,17 +262,17 @@
             // note: global version uses dimensions
             //          h_adj_sourcearrays(NDIM,NGLLX,NGLLY,NGLLZ,nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC)
             mp->h_adj_sourcearrays_slice[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
-                                            0,i,j,k,irec_local-1)]
+                                                0,i,j,k,irec_local-1)]
                     = h_adj_sourcearrays[INDEX6(NDIM,NGLLX,NGLLX,NGLLX,mp->nadj_rec_local,
                                                 0,i,j,k,irec_local-1,*time_index-1)];
 
             mp->h_adj_sourcearrays_slice[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
-                                            1,i,j,k,irec_local-1)]
+                                                1,i,j,k,irec_local-1)]
                     = h_adj_sourcearrays[INDEX6(NDIM,NGLLX,NGLLX,NGLLX,mp->nadj_rec_local,
                                                 1,i,j,k,irec_local-1,*time_index-1)];
 
             mp->h_adj_sourcearrays_slice[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
-                                            2,i,j,k,irec_local-1)]
+                                                2,i,j,k,irec_local-1)]
                     = h_adj_sourcearrays[INDEX6(NDIM,NGLLX,NGLLX,NGLLX,mp->nadj_rec_local,
                                                 2,i,j,k,irec_local-1,*time_index-1)];
           }
@@ -283,7 +285,7 @@
 
   // copies extracted array values onto GPU
   cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
-             (mp->nadj_rec_local)*3*NGLL3*sizeof(realw),cudaMemcpyHostToDevice);
+             (mp->nadj_rec_local)*NDIM*NGLL3*sizeof(realw),cudaMemcpyHostToDevice);
 
 
   // the irec_local variable needs to be precomputed (as

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_coupling_cuda.cu	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_coupling_cuda.cu	2012-08-19 01:24:46 UTC (rev 20591)
@@ -100,9 +100,7 @@
     iglob_oc = ibool_outer_core[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)] - 1;
 
     // update fluid acceleration/pressure
-    atomicAdd(&accel_outer_core[iglob_oc],+weight*displ_n);
-
-  // }
+    atomicAdd(&accel_outer_core[iglob_oc], + weight*displ_n);
   }
 }
 
@@ -165,9 +163,8 @@
     iglob_oc = ibool_outer_core[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)] - 1;
 
     // update fluid acceleration/pressure
-    atomicAdd(&accel_outer_core[iglob_oc],+ weight*displ_n);
-
-  // }
+    // note: sign changes to minus because of normal pointing down into inner core
+    atomicAdd(&accel_outer_core[iglob_oc], - weight*displ_n);
   }
 }
 
@@ -184,7 +181,7 @@
 
   int num_blocks_x = mp->nspec2D_top_outer_core;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
@@ -238,7 +235,7 @@
 
   int num_blocks_x = mp->nspec2D_bottom_outer_core;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
@@ -294,9 +291,9 @@
                                                   realw* wgllwgll_xy,
                                                   int* ibool_outer_core,
                                                   int* ibelm_top_outer_core,
-                                                  double RHO_TOP_OC,
+                                                  realw RHO_TOP_OC,
                                                   realw minus_g_cmb,
-                                                  int GRAVITY_VAL,
+                                                  int GRAVITY,
                                                   int NSPEC_BOTTOM_CM) {
 
   int i = threadIdx.x;
@@ -331,11 +328,11 @@
     iglob_cm = ibool_crust_mantle[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)] - 1;
 
     // compute pressure, taking gravity into account
-    if( GRAVITY_VAL ){
-      pressure = RHO_TOP_OC * ( - accel_outer_core[iglob_oc] + minus_g_cmb
-         * (displ_crust_mantle[iglob_cm*3]*nx
-            + displ_crust_mantle[iglob_cm*3+1]*ny
-            + displ_crust_mantle[iglob_cm*3+2]*nz) );
+    if( GRAVITY ){
+      pressure = RHO_TOP_OC * ( - accel_outer_core[iglob_oc]
+        + minus_g_cmb * (displ_crust_mantle[iglob_cm*3]*nx
+                         + displ_crust_mantle[iglob_cm*3+1]*ny
+                         + displ_crust_mantle[iglob_cm*3+2]*nz) );
     }else{
       pressure = - RHO_TOP_OC * accel_outer_core[iglob_oc];
     }
@@ -344,11 +341,9 @@
     weight = jacobian2D_top_outer_core[INDEX3(NGLLX,NGLLX,i,j,iface)]*wgllwgll_xy[INDEX2(NGLLX,i,j)];
 
     // update fluid acceleration/pressure
-    atomicAdd(&accel_crust_mantle[iglob_cm*3],+ weight*nx*pressure);
-    atomicAdd(&accel_crust_mantle[iglob_cm*3+1],+ weight*ny*pressure);
-    atomicAdd(&accel_crust_mantle[iglob_cm*3+2],+ weight*nz*pressure);
-
-    //  }
+    atomicAdd(&accel_crust_mantle[iglob_cm*3], + weight*nx*pressure);
+    atomicAdd(&accel_crust_mantle[iglob_cm*3+1], + weight*ny*pressure);
+    atomicAdd(&accel_crust_mantle[iglob_cm*3+2], + weight*nz*pressure);
   }
 }
 
@@ -364,9 +359,9 @@
                                                   realw* wgllwgll_xy,
                                                   int* ibool_outer_core,
                                                   int* ibelm_bottom_outer_core,
-                                                  double RHO_BOTTOM_OC,
+                                                  realw RHO_BOTTOM_OC,
                                                   realw minus_g_icb,
-                                                  int GRAVITY_VAL,
+                                                  int GRAVITY,
                                                   int NSPEC_TOP_IC) {
 
   int i = threadIdx.x;
@@ -401,11 +396,11 @@
     iglob_ic = ibool_inner_core[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)] - 1;
 
     // compute pressure, taking gravity into account
-    if( GRAVITY_VAL ){
-      pressure = RHO_BOTTOM_OC * ( - accel_outer_core[iglob_oc] + minus_g_icb
-         * (displ_inner_core[iglob_ic*3]*nx
-            + displ_inner_core[iglob_ic*3+1]*ny
-            + displ_inner_core[iglob_ic*3+2]*nz) );
+    if( GRAVITY ){
+      pressure = RHO_BOTTOM_OC * ( - accel_outer_core[iglob_oc]
+        + minus_g_icb * (displ_inner_core[iglob_ic*3]*nx
+                         + displ_inner_core[iglob_ic*3+1]*ny
+                         + displ_inner_core[iglob_ic*3+2]*nz) );
     }else{
       pressure = - RHO_BOTTOM_OC * accel_outer_core[iglob_oc];
     }
@@ -414,11 +409,10 @@
     weight = jacobian2D_bottom_outer_core[INDEX3(NGLLX,NGLLX,i,j,iface)]*wgllwgll_xy[INDEX2(NGLLX,i,j)];
 
     // update fluid acceleration/pressure
-    atomicAdd(&accel_inner_core[iglob_ic*3],+ weight*nx*pressure);
-    atomicAdd(&accel_inner_core[iglob_ic*3+1],+ weight*ny*pressure);
-    atomicAdd(&accel_inner_core[iglob_ic*3+2],+ weight*nz*pressure);
-
-    //  }
+    // note: sign changes to minus because of normal pointing down into inner core
+    atomicAdd(&accel_inner_core[iglob_ic*3], - weight*nx*pressure);
+    atomicAdd(&accel_inner_core[iglob_ic*3+1], - weight*ny*pressure);
+    atomicAdd(&accel_inner_core[iglob_ic*3+2], - weight*nz*pressure);
   }
 }
 
@@ -426,10 +420,7 @@
 
 extern "C"
 void FC_FUNC_(compute_coupling_cmb_fluid_cuda,
-              COMPUTE_COUPLING_CMB_FLUID_CUDA)(long* Mesh_pointer_f,
-                                               double RHO_TOP_OC,
-                                               realw minus_g_cmb,
-                                               int GRAVITY_VAL) {
+              COMPUTE_COUPLING_CMB_FLUID_CUDA)(long* Mesh_pointer_f) {
 
   TRACE("compute_coupling_cmb_fluid_cuda");
   //double start_time = get_time();
@@ -438,7 +429,7 @@
 
   int num_blocks_x = mp->nspec2D_bottom_crust_mantle;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
@@ -457,9 +448,9 @@
                                                       mp->d_wgllwgll_xy,
                                                       mp->d_ibool_outer_core,
                                                       mp->d_ibelm_top_outer_core,
-                                                      RHO_TOP_OC,
-                                                      minus_g_cmb,
-                                                      GRAVITY_VAL,
+                                                      mp->RHO_TOP_OC,
+                                                      mp->minus_g_cmb,
+                                                      mp->gravity,
                                                       mp->nspec2D_bottom_crust_mantle);
 
   //  adjoint simulations
@@ -474,9 +465,9 @@
                                                         mp->d_wgllwgll_xy,
                                                         mp->d_ibool_outer_core,
                                                         mp->d_ibelm_top_outer_core,
-                                                        RHO_TOP_OC,
-                                                        minus_g_cmb,
-                                                        GRAVITY_VAL,
+                                                        mp->RHO_TOP_OC,
+                                                        mp->minus_g_cmb,
+                                                        mp->gravity,
                                                         mp->nspec2D_bottom_crust_mantle);
   }
 
@@ -491,10 +482,7 @@
 
 extern "C"
 void FC_FUNC_(compute_coupling_icb_fluid_cuda,
-              COMPUTE_COUPLING_ICB_FLUID_CUDA)(long* Mesh_pointer_f,
-                                               double RHO_BOTTOM_OC,
-                                               realw minus_g_icb,
-                                               int GRAVITY_VAL) {
+              COMPUTE_COUPLING_ICB_FLUID_CUDA)(long* Mesh_pointer_f) {
 
   TRACE("compute_coupling_icb_fluid_cuda");
   //double start_time = get_time();
@@ -503,7 +491,7 @@
 
   int num_blocks_x = mp->nspec2D_top_inner_core;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
@@ -522,9 +510,9 @@
                                                       mp->d_wgllwgll_xy,
                                                       mp->d_ibool_outer_core,
                                                       mp->d_ibelm_bottom_outer_core,
-                                                      RHO_BOTTOM_OC,
-                                                      minus_g_icb,
-                                                      GRAVITY_VAL,
+                                                      mp->RHO_BOTTOM_OC,
+                                                      mp->minus_g_icb,
+                                                      mp->gravity,
                                                       mp->nspec2D_top_inner_core);
 
   //  adjoint simulations
@@ -539,9 +527,9 @@
                                                         mp->d_wgllwgll_xy,
                                                         mp->d_ibool_outer_core,
                                                         mp->d_ibelm_bottom_outer_core,
-                                                        RHO_BOTTOM_OC,
-                                                        minus_g_icb,
-                                                        GRAVITY_VAL,
+                                                        mp->RHO_BOTTOM_OC,
+                                                        mp->minus_g_icb,
+                                                        mp->gravity,
                                                         mp->nspec2D_top_inner_core);
   }
 
@@ -559,13 +547,13 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 __global__ void compute_coupling_ocean_cuda_kernel(realw* accel_crust_mantle,
-                                                       realw* rmassx_crust_mantle,
-                                                       realw* rmassy_crust_mantle,
-                                                       realw* rmassz_crust_mantle,
-                                                       realw* rmass_ocean_load,
-                                                       int npoin_ocean_load,
-                                                       int* iglob_ocean_load,
-                                                       realw* normal_ocean_load) {
+                                                   realw* rmassx_crust_mantle,
+                                                   realw* rmassy_crust_mantle,
+                                                   realw* rmassz_crust_mantle,
+                                                   realw* rmass_ocean_load,
+                                                   int npoin_ocean_load,
+                                                   int* ibool_ocean_load,
+                                                   realw* normal_ocean_load) {
 
   int ipoin = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
 
@@ -579,7 +567,7 @@
 
     // get global point number
     // "-1" from index values to convert from Fortran-> C indexing
-    iglob = iglob_ocean_load[ipoin] - 1;
+    iglob = ibool_ocean_load[ipoin] - 1;
 
     // get normal
     nx = normal_ocean_load[INDEX2(NDIM,0,ipoin)]; // (1,ipoin)
@@ -624,7 +612,7 @@
 
   int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
@@ -639,7 +627,7 @@
                                                          mp->d_rmassz_crust_mantle,
                                                          mp->d_rmass_ocean_load,
                                                          mp->npoin_oceans,
-                                                         mp->d_iglob_ocean_load,
+                                                         mp->d_ibool_ocean_load,
                                                          mp->d_normal_ocean_load);
 
     // for backward/reconstructed potentials
@@ -650,7 +638,7 @@
                                                            mp->d_rmassz_crust_mantle,
                                                            mp->d_rmass_ocean_load,
                                                            mp->npoin_oceans,
-                                                           mp->d_iglob_ocean_load,
+                                                           mp->d_ibool_ocean_load,
                                                            mp->d_normal_ocean_load);
     }
   }else{
@@ -660,7 +648,7 @@
                                                          mp->d_rmassz_crust_mantle,
                                                          mp->d_rmass_ocean_load,
                                                          mp->npoin_oceans,
-                                                         mp->d_iglob_ocean_load,
+                                                         mp->d_ibool_ocean_load,
                                                          mp->d_normal_ocean_load);
 
     // for backward/reconstructed potentials
@@ -671,7 +659,7 @@
                                                            mp->d_rmassz_crust_mantle,
                                                            mp->d_rmass_ocean_load,
                                                            mp->npoin_oceans,
-                                                           mp->d_iglob_ocean_load,
+                                                           mp->d_ibool_ocean_load,
                                                            mp->d_normal_ocean_load);
     }
   }

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu	2012-08-19 01:24:46 UTC (rev 20591)
@@ -85,7 +85,6 @@
     *sigma_xz = *sigma_xz - R_xz[i_sls + N_SLS*(tx + NGLL3*working_element)];
     *sigma_yz = *sigma_yz - R_yz[i_sls + N_SLS*(tx + NGLL3*working_element)];
   }
-  return;
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -167,7 +166,6 @@
     R_yz[i_sls + N_SLS*(tx + NGLL3*working_element)] =
       alphaval_loc * R_yz[i_sls + N_SLS*(tx + NGLL3*working_element)] + betaval_loc * Sn + gammaval_loc * Snp1;
   }
-  return;
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -224,10 +222,17 @@
   theta = d_ystore[iglob];
   phi = d_zstore[iglob];
 
-  cos_theta = cos(theta);
-  sin_theta = sin(theta);
-  cos_phi = cos(phi);
-  sin_phi = sin(phi);
+  if( sizeof( theta ) == sizeof( float ) ){
+    // float operations
+    // sincos function return sinus and cosine for given value
+    sincosf(theta, &sin_theta, &cos_theta);
+    sincosf(phi, &sin_phi, &cos_phi);
+  }else{
+    cos_theta = cos(theta);
+    sin_theta = sin(theta);
+    cos_phi = cos(phi);
+    sin_phi = sin(phi);
+  }
 
   // for efficiency replace with lookup table every 100 m in radial direction
   // note: radius in crust mantle should never be zero,
@@ -288,8 +293,6 @@
   *rho_s_H1 = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl);
   *rho_s_H2 = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl);
   *rho_s_H3 = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl);
-
-  return;
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -367,7 +370,6 @@
              c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl;
   *sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl +
              c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl;
-  return;
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -674,8 +676,6 @@
 
   *sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl +
               c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl;
-
-  return;
 }
 
 
@@ -695,7 +695,7 @@
                                           int* d_phase_ispec_inner,
                                           int num_phase_ispec,
                                           int d_iphase,
-                                          realw d_deltat,
+                                          realw deltat,
                                           int use_mesh_coloring_gpu,
                                           realw* d_displ,
                                           realw* d_veloc,
@@ -737,9 +737,9 @@
                                           realw* wgll_cube,
                                           int NSPEC_CRUST_MANTLE_STRAIN_ONLY){
 
-  /* int bx = blockIdx.y*blockDim.x+blockIdx.x; //possible bug in original code*/
+  // block id == spectral-element id
   int bx = blockIdx.y*gridDim.x+blockIdx.x;
-  /* int bx = blockIdx.x; */
+  // thread id == GLL point id
   int tx = threadIdx.x;
 
   int K = (tx/NGLL2);
@@ -838,13 +838,13 @@
         // use first order Taylor expansion of displacement for local storage of stresses
         // at this current time step, to fix attenuation in a consistent way
 #ifdef USE_TEXTURES_FIELDS
-        s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * tex1Dfetch(d_displ_cm_tex, iglob*3);
-        s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * tex1Dfetch(d_displ_cm_tex, iglob*3 + 1);
-        s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * tex1Dfetch(d_displ_cm_tex, iglob*3 + 2);
+        s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + deltat * tex1Dfetch(d_displ_cm_tex, iglob*3);
+        s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + deltat * tex1Dfetch(d_displ_cm_tex, iglob*3 + 1);
+        s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + deltat * tex1Dfetch(d_displ_cm_tex, iglob*3 + 2);
 #else
-        s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * d_veloc[iglob*3];
-        s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * d_veloc[iglob*3 + 1];
-        s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * d_veloc[iglob*3 + 2];
+        s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + deltat * d_veloc[iglob*3];
+        s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + deltat * d_veloc[iglob*3 + 1];
+        s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + deltat * d_veloc[iglob*3 + 2];
 #endif
       }else{
         // takes old routines
@@ -1419,7 +1419,6 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 void Kernel_2_crust_mantle(int nb_blocks_to_compute,Mesh* mp,
-                          realw d_deltat,
                           int d_iphase,
                           int* d_ibool,
                           int* d_ispec_is_tiso,
@@ -1459,8 +1458,7 @@
                           realw* d_c25store,realw* d_c26store,realw* d_c33store,
                           realw* d_c34store,realw* d_c35store,realw* d_c36store,
                           realw* d_c44store,realw* d_c45store,realw* d_c46store,
-                          realw* d_c55store,realw* d_c56store,realw* d_c66store
-                          ){
+                          realw* d_c55store,realw* d_c56store,realw* d_c66store){
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("before kernel Kernel_2_crust_mantle");
@@ -1472,7 +1470,7 @@
 
   int num_blocks_x = nb_blocks_to_compute;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
@@ -1495,7 +1493,7 @@
                                                 mp->d_phase_ispec_inner_crust_mantle,
                                                 mp->num_phase_ispec_crust_mantle,
                                                 d_iphase,
-                                                d_deltat,
+                                                mp->deltat,
                                                 mp->use_mesh_coloring_gpu,
                                                 mp->d_displ_crust_mantle,
                                                 mp->d_veloc_crust_mantle,
@@ -1546,7 +1544,7 @@
                                                    mp->d_phase_ispec_inner_crust_mantle,
                                                    mp->num_phase_ispec_crust_mantle,
                                                    d_iphase,
-                                                   d_deltat,
+                                                   mp->b_deltat,
                                                    mp->use_mesh_coloring_gpu,
                                                    mp->d_b_displ_crust_mantle,
                                                    mp->d_b_veloc_crust_mantle,
@@ -1609,7 +1607,6 @@
 extern "C"
 void FC_FUNC_(compute_forces_crust_mantle_cuda,
               COMPUTE_FORCES_CRUST_MANTLE_CUDA)(long* Mesh_pointer_f,
-                                                realw* deltat,
                                                 int* iphase) {
 
   TRACE("compute_forces_crust_mantle_cuda");
@@ -1701,7 +1698,6 @@
 #endif
 
       Kernel_2_crust_mantle(nb_blocks_to_compute,mp,
-                            *deltat,
                             *iphase,
                             mp->d_ibool_crust_mantle + color_offset_nonpadded,
                             mp->d_ispec_is_tiso_crust_mantle + color_offset_ispec,
@@ -1794,7 +1790,6 @@
     // no mesh coloring: uses atomic updates
 
     Kernel_2_crust_mantle(num_elements,mp,
-                          *deltat,
                           *iphase,
                           mp->d_ibool_crust_mantle,
                           mp->d_ispec_is_tiso_crust_mantle,

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu	2012-08-19 01:24:46 UTC (rev 20591)
@@ -87,7 +87,6 @@
     *sigma_xz = *sigma_xz - R_xz[offset];
     *sigma_yz = *sigma_yz - R_yz[offset];
   }
-  return;
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -157,7 +156,6 @@
     Snp1   = factor_loc * epsilondev_yz_loc;
     R_yz[offset] = alphaval_loc * R_yz[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
   }
-  return;
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -220,10 +218,17 @@
   theta = d_ystore[iglob];
   phi = d_zstore[iglob];
 
-  cos_theta = cos(theta);
-  sin_theta = sin(theta);
-  cos_phi = cos(phi);
-  sin_phi = sin(phi);
+  if( sizeof( theta ) == sizeof( float ) ){
+    // float operations
+    // sincos function return sinus and cosine for given value
+    sincosf(theta, &sin_theta, &cos_theta);
+    sincosf(phi, &sin_phi, &cos_phi);
+  }else{
+    cos_theta = cos(theta);
+    sin_theta = sin(theta);
+    cos_phi = cos(phi);
+    sin_phi = sin(phi);
+  }
 
   // for efficiency replace with lookup table every 100 m in radial direction
   // note: radius in crust mantle should never be zero,
@@ -286,8 +291,6 @@
   *rho_s_H1 = factor * (sx_l * Hxxl + sy_l * Hxyl + sz_l * Hxzl);
   *rho_s_H2 = factor * (sx_l * Hxyl + sy_l * Hyyl + sz_l * Hyzl);
   *rho_s_H3 = factor * (sx_l * Hxzl + sy_l * Hyzl + sz_l * Hzzl);
-
-  return;
 }
 
 
@@ -306,7 +309,7 @@
                                          int* d_phase_ispec_inner,
                                          int num_phase_ispec,
                                          int d_iphase,
-                                         realw d_deltat,
+                                         realw deltat,
                                          int use_mesh_coloring_gpu,
                                          realw* d_displ,
                                          realw* d_veloc,
@@ -343,9 +346,9 @@
                                          int NSPEC_INNER_CORE_STRAIN_ONLY,
                                          int NSPEC_INNER_CORE){
 
-  /* int bx = blockIdx.y*blockDim.x+blockIdx.x; //possible bug in original code*/
+  // block id
   int bx = blockIdx.y*gridDim.x+blockIdx.x;
-  /* int bx = blockIdx.x; */
+  // thread id
   int tx = threadIdx.x;
 
   int K = (tx/NGLL2);
@@ -449,13 +452,13 @@
           // use first order Taylor expansion of displacement for local storage of stresses
           // at this current time step, to fix attenuation in a consistent way
 #ifdef USE_TEXTURES_FIELDS
-          s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * tex1Dfetch(d_displ_ic_tex, iglob*3);
-          s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * tex1Dfetch(d_displ_ic_tex, iglob*3 + 1);
-          s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * tex1Dfetch(d_displ_ic_tex, iglob*3 + 2);
+          s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + deltat * tex1Dfetch(d_displ_ic_tex, iglob*3);
+          s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + deltat * tex1Dfetch(d_displ_ic_tex, iglob*3 + 1);
+          s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + deltat * tex1Dfetch(d_displ_ic_tex, iglob*3 + 2);
 #else
-          s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * d_veloc[iglob*3];
-          s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * d_veloc[iglob*3 + 1];
-          s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * d_veloc[iglob*3 + 2];
+          s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + deltat * d_veloc[iglob*3];
+          s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + deltat * d_veloc[iglob*3 + 1];
+          s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + deltat * d_veloc[iglob*3 + 2];
 #endif
         }
         else{
@@ -1070,7 +1073,6 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 void Kernel_2_inner_core(int nb_blocks_to_compute,Mesh* mp,
-                         realw d_deltat,
                          int d_iphase,
                          int* d_ibool,
                          int* d_idoubling,
@@ -1116,7 +1118,7 @@
 
   int num_blocks_x = nb_blocks_to_compute;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
@@ -1139,7 +1141,7 @@
                                              mp->d_phase_ispec_inner_inner_core,
                                              mp->num_phase_ispec_inner_core,
                                              d_iphase,
-                                             d_deltat,
+                                             mp->deltat,
                                              mp->use_mesh_coloring_gpu,
                                              mp->d_displ_inner_core,
                                              mp->d_veloc_inner_core,
@@ -1188,7 +1190,7 @@
                                                 mp->d_phase_ispec_inner_inner_core,
                                                 mp->num_phase_ispec_inner_core,
                                                 d_iphase,
-                                                d_deltat,
+                                                mp->b_deltat,
                                                 mp->use_mesh_coloring_gpu,
                                                 mp->d_b_displ_inner_core,
                                                 mp->d_b_veloc_inner_core,
@@ -1249,7 +1251,6 @@
 extern "C"
 void FC_FUNC_(compute_forces_inner_core_cuda,
               COMPUTE_FORCES_INNER_CORE_CUDA)(long* Mesh_pointer_f,
-                                              realw* deltat,
                                               int* iphase) {
 
   TRACE("compute_forces_inner_core_cuda");
@@ -1386,7 +1387,6 @@
 #endif
 
       Kernel_2_inner_core(nb_blocks_to_compute,mp,
-                          *deltat,
                           *iphase,
                           mp->d_ibool_inner_core + color_offset_nonpadded,
                           mp->d_idoubling_inner_core + color_offset_ispec,
@@ -1429,8 +1429,7 @@
                           mp->d_c12store_inner_core + color_offset,
                           mp->d_c13store_inner_core + color_offset,
                           mp->d_c33store_inner_core + color_offset,
-                          mp->d_c44store_inner_core + color_offset
-                          );
+                          mp->d_c44store_inner_core + color_offset);
 
       // for padded and aligned arrays
       color_offset += nb_blocks_to_compute * NGLL3_PADDED;
@@ -1459,7 +1458,6 @@
     // no mesh coloring: uses atomic updates
 
     Kernel_2_inner_core(num_elements,mp,
-                        *deltat,
                         *iphase,
                         mp->d_ibool_inner_core,
                         mp->d_idoubling_inner_core,
@@ -1493,8 +1491,7 @@
                         mp->d_b_R_xz_inner_core,
                         mp->d_b_R_yz_inner_core,
                         mp->d_c11store_inner_core,mp->d_c12store_inner_core,mp->d_c13store_inner_core,
-                        mp->d_c33store_inner_core,mp->d_c44store_inner_core
-                        );
+                        mp->d_c33store_inner_core,mp->d_c44store_inner_core);
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu	2012-08-19 01:24:46 UTC (rev 20591)
@@ -60,39 +60,40 @@
                                             realw deltat,
                                             realw* d_A_array_rotation,
                                             realw* d_B_array_rotation,
-                                            realw dpotentialdxl, realw dpotentialdyl,
+                                            realw dpotentialdxl,
+                                            realw dpotentialdyl,
                                             realw* dpotentialdx_with_rot,
                                             realw* dpotentialdy_with_rot) {
 
   realw two_omega_deltat,cos_two_omega_t,sin_two_omega_t;
   realw A_rotation,B_rotation;
-  realw ux_rotation,uy_rotation;
   realw source_euler_A,source_euler_B;
 
   // store the source for the Euler scheme for A_rotation and B_rotation
+  if( sizeof( sin_two_omega_t ) == sizeof( float ) ){
+    // float operations
+    // sincos function return sinus and cosine for given value
+    sincosf(two_omega_earth*time, &sin_two_omega_t, &cos_two_omega_t);
+  }else{
+    cos_two_omega_t = cos(two_omega_earth*time);
+    sin_two_omega_t = sin(two_omega_earth*time);
+  }
+
+  // time step deltat of Euler scheme is included in the source
   two_omega_deltat = deltat * two_omega_earth;
 
-  cos_two_omega_t = cos(two_omega_earth*time);
-  sin_two_omega_t = sin(two_omega_earth*time);
-
-  // time step deltat of Euler scheme is included in the source
   source_euler_A = two_omega_deltat * (cos_two_omega_t * dpotentialdyl + sin_two_omega_t * dpotentialdxl);
   source_euler_B = two_omega_deltat * (sin_two_omega_t * dpotentialdyl - cos_two_omega_t * dpotentialdxl);
 
   A_rotation = d_A_array_rotation[tx + working_element*NGLL3]; // (non-padded offset)
   B_rotation = d_B_array_rotation[tx + working_element*NGLL3];
 
-  ux_rotation =   A_rotation*cos_two_omega_t + B_rotation*sin_two_omega_t;
-  uy_rotation = - A_rotation*sin_two_omega_t + B_rotation*cos_two_omega_t;
+  *dpotentialdx_with_rot = dpotentialdxl + (  A_rotation*cos_two_omega_t + B_rotation*sin_two_omega_t);
+  *dpotentialdy_with_rot = dpotentialdyl + (- A_rotation*sin_two_omega_t + B_rotation*cos_two_omega_t);
 
-  *dpotentialdx_with_rot = dpotentialdxl + ux_rotation;
-  *dpotentialdy_with_rot = dpotentialdyl + uy_rotation;
-
   // updates rotation term with Euler scheme (non-padded offset)
   d_A_array_rotation[tx + working_element*NGLL3] += source_euler_A;
   d_B_array_rotation[tx + working_element*NGLL3] += source_euler_B;
-
-  return;
 }
 
 
@@ -105,31 +106,35 @@
 
 
 __global__ void Kernel_2_outer_core_impl(int nb_blocks_to_compute,
-                                       int NGLOB, int* d_ibool,
-                                       int* d_phase_ispec_inner,
-                                       int num_phase_ispec,
-                                       int d_iphase,
-                                       int use_mesh_coloring_gpu,
-                                       realw* d_potential, realw* d_potential_dot_dot,
-                                       realw* d_xix, realw* d_xiy, realw* d_xiz,
-                                       realw* d_etax, realw* d_etay, realw* d_etaz,
-                                       realw* d_gammax, realw* d_gammay, realw* d_gammaz,
-                                       realw* d_hprime_xx,
-                                       realw* hprimewgll_xx, realw* hprimewgll_yy, realw* hprimewgll_zz,
-                                       realw* wgllwgll_xy,realw* wgllwgll_xz,realw* wgllwgll_yz,
-                                       int GRAVITY,
-                                       realw* d_xstore, realw* d_ystore, realw* d_zstore,
-                                       realw* d_d_ln_density_dr_table,
-                                       realw* d_minus_rho_g_over_kappa_fluid,
-                                       realw* wgll_cube,
-                                       int ROTATION,
-                                       realw time,
-                                       realw two_omega_earth,
-                                       realw deltat,
-                                       realw* d_A_array_rotation,realw* d_B_array_rotation,
-                                       int NSPEC_OUTER_CORE){
+                                         int NGLOB,
+                                         int* d_ibool,
+                                         int* d_phase_ispec_inner,
+                                         int num_phase_ispec,
+                                         int d_iphase,
+                                         int use_mesh_coloring_gpu,
+                                         realw* d_potential, realw* d_potential_dot_dot,
+                                         realw* d_xix, realw* d_xiy, realw* d_xiz,
+                                         realw* d_etax, realw* d_etay, realw* d_etaz,
+                                         realw* d_gammax, realw* d_gammay, realw* d_gammaz,
+                                         realw* d_hprime_xx,
+                                         realw* hprimewgll_xx, realw* hprimewgll_yy, realw* hprimewgll_zz,
+                                         realw* wgllwgll_xy,realw* wgllwgll_xz,realw* wgllwgll_yz,
+                                         int GRAVITY,
+                                         realw* d_xstore, realw* d_ystore, realw* d_zstore,
+                                         realw* d_d_ln_density_dr_table,
+                                         realw* d_minus_rho_g_over_kappa_fluid,
+                                         realw* wgll_cube,
+                                         int ROTATION,
+                                         realw time,
+                                         realw two_omega_earth,
+                                         realw deltat,
+                                         realw* d_A_array_rotation,
+                                         realw* d_B_array_rotation,
+                                         int NSPEC_OUTER_CORE){
 
+  // block id == spectral-element id
   int bx = blockIdx.y*gridDim.x+blockIdx.x;
+  // thread id == GLL point id
   int tx = threadIdx.x;
 
   // R_EARTH_KM is the radius of the bottom of the oceans (radius of Earth in km)
@@ -142,34 +147,33 @@
   int I = (tx-K*NGLL2-J*NGLLX);
 
   int active,offset;
-  int iglob = 0;
+  int iglob;
   int working_element;
 
   realw temp1l,temp2l,temp3l;
   realw xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
+
   realw dpotentialdxl,dpotentialdyl,dpotentialdzl;
   realw dpotentialdx_with_rot,dpotentialdy_with_rot;
+
   realw fac1,fac2,fac3;
   realw sum_terms;
   realw gravity_term;
   realw gxl,gyl,gzl;
+
   realw radius,theta,phi;
   realw cos_theta,sin_theta,cos_phi,sin_phi;
   realw grad_x_ln_rho,grad_y_ln_rho,grad_z_ln_rho;
   int int_radius;
-
-
 #ifndef MANUALLY_UNROLLED_LOOPS
   int l;
   int offset1,offset2,offset3;
 #endif
 
   __shared__ realw s_dummy_loc[NGLL3];
-
   __shared__ realw s_temp1[NGLL3];
   __shared__ realw s_temp2[NGLL3];
   __shared__ realw s_temp3[NGLL3];
-
   __shared__ realw sh_hprime_xx[NGLL2];
 
 // use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads,
@@ -275,8 +279,8 @@
 
     //  compute the jacobian
     jacobianl = 1.f / (xixl*(etayl*gammazl-etazl*gammayl)
-                      -xiyl*(etaxl*gammazl-etazl*gammaxl)
-                      +xizl*(etaxl*gammayl-etayl*gammaxl));
+                    - xiyl*(etaxl*gammazl-etazl*gammaxl)
+                    + xizl*(etaxl*gammayl-etayl*gammaxl));
 
     // derivatives of potential
     dpotentialdxl = xixl*temp1l + etaxl*temp2l + gammaxl*temp3l;
@@ -285,12 +289,12 @@
 
     // compute contribution of rotation and add to gradient of potential
     // this term has no Z component
-    if(ROTATION){
-      compute_element_oc_rotation(tx,working_element,time,two_omega_earth,deltat,
+    if( ROTATION ){
+      compute_element_oc_rotation(tx,working_element,
+                                  time,two_omega_earth,deltat,
                                   d_A_array_rotation,d_B_array_rotation,
                                   dpotentialdxl,dpotentialdyl,
                                   &dpotentialdx_with_rot,&dpotentialdy_with_rot);
-
     }else{
       dpotentialdx_with_rot = dpotentialdxl;
       dpotentialdy_with_rot = dpotentialdyl;
@@ -304,16 +308,28 @@
     theta = d_ystore[iglob];
     phi = d_zstore[iglob];
 
-    cos_theta = cos(theta);
-    sin_theta = sin(theta);
-    cos_phi = cos(phi);
-    sin_phi = sin(phi);
+    if( sizeof( theta ) == sizeof( float ) ){
+      // float operations
+      // sincos function return sinus and cosine for given value
+      sincosf(theta, &sin_theta, &cos_theta);
+      sincosf(phi, &sin_phi, &cos_phi);
+    }else{
+      cos_theta = cos(theta);
+      sin_theta = sin(theta);
+      cos_phi = cos(phi);
+      sin_phi = sin(phi);
+    }
 
     // for efficiency replace with lookup table every 100 m in radial direction
     // note: radius in outer core should never be zero,
     //          and arrays in C start from 0, thus we need to subtract -1
     int_radius = rint(radius * R_EARTH_KM * 10.0f ) - 1;
 
+    //debug: checks bounds NRAD_GRAVITY == 70000
+    //if( int_radius < 0 || int_radius >= 70000 ){
+    //  printf("gravity: in_radius out of bounds %d radius=%e\n",int_radius,radius);
+    //}
+
     // depending on gravity or not, different potential definitions are used
     if( ! GRAVITY ){
       // add (chi/rho)grad(rho) term in no gravity case
@@ -324,9 +340,9 @@
       grad_z_ln_rho = cos_theta * d_d_ln_density_dr_table[int_radius];
 
       // adding (chi/rho)grad(rho)
-      dpotentialdx_with_rot = dpotentialdx_with_rot + s_dummy_loc[tx] * grad_x_ln_rho;
-      dpotentialdy_with_rot = dpotentialdy_with_rot + s_dummy_loc[tx] * grad_y_ln_rho;
-      dpotentialdzl = dpotentialdzl + s_dummy_loc[tx] * grad_z_ln_rho;
+      dpotentialdx_with_rot += s_dummy_loc[tx] * grad_x_ln_rho;
+      dpotentialdy_with_rot += s_dummy_loc[tx] * grad_y_ln_rho;
+      dpotentialdzl += s_dummy_loc[tx] * grad_z_ln_rho;
 
     }else{
 
@@ -360,9 +376,12 @@
     }
 
     // form the dot product with the test vector
-    s_temp1[tx] = jacobianl*(xixl*dpotentialdx_with_rot + xiyl*dpotentialdy_with_rot + xizl*dpotentialdzl);
-    s_temp2[tx] = jacobianl*(etaxl*dpotentialdx_with_rot + etayl*dpotentialdy_with_rot + etazl*dpotentialdzl);
-    s_temp3[tx] = jacobianl*(gammaxl*dpotentialdx_with_rot + gammayl*dpotentialdy_with_rot + gammazl*dpotentialdzl);
+    s_temp1[tx] = jacobianl*(xixl*dpotentialdx_with_rot
+                             + xiyl*dpotentialdy_with_rot + xizl*dpotentialdzl);
+    s_temp2[tx] = jacobianl*(etaxl*dpotentialdx_with_rot
+                             + etayl*dpotentialdy_with_rot + etazl*dpotentialdzl);
+    s_temp3[tx] = jacobianl*(gammaxl*dpotentialdx_with_rot
+                             + gammayl*dpotentialdy_with_rot + gammazl*dpotentialdzl);
   }
 
 // synchronize all the threads (one thread for each of the NGLL grid points of the
@@ -454,7 +473,6 @@
         atomicAdd(&d_potential_dot_dot[iglob],sum_terms);
       }
 
-
     }else{
 
       atomicAdd(&d_potential_dot_dot[iglob],sum_terms);
@@ -473,8 +491,10 @@
                          realw* d_etax,realw* d_etay,realw* d_etaz,
                          realw* d_gammax,realw* d_gammay,realw* d_gammaz,
                          realw time, realw b_time,
-                         realw* d_A_array_rotation,realw* d_B_array_rotation,
-                         realw* d_b_A_array_rotation,realw* d_b_B_array_rotation){
+                         realw* d_A_array_rotation,
+                         realw* d_B_array_rotation,
+                         realw* d_b_A_array_rotation,
+                         realw* d_b_B_array_rotation){
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("before outer_core kernel Kernel_2");
@@ -486,14 +506,19 @@
 
   int num_blocks_x = nb_blocks_to_compute;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
 
-  int threads_2 = NGLL3_PADDED;//BLOCK_SIZE_K2;
-  dim3 grid_2(num_blocks_x,num_blocks_y);
+  int blocksize = NGLL3_PADDED;
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
 
+  // 2d grid
+  //int threads_2 = NGLL3_PADDED;//BLOCK_SIZE_K2;
+  //dim3 grid_2(num_blocks_x,num_blocks_y);
+
   // Cuda timing
   // cudaEvent_t start, stop;
   // realw time;
@@ -501,7 +526,7 @@
   // cudaEventCreate(&stop);
   // cudaEventRecord( start, 0 );
 
-  Kernel_2_outer_core_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,
+  Kernel_2_outer_core_impl<<<grid,threads>>>(nb_blocks_to_compute,
                                                           mp->NGLOB_OUTER_CORE,
                                                           d_ibool,
                                                           mp->d_phase_ispec_inner_outer_core,
@@ -523,13 +548,14 @@
                                                           mp->d_wgll_cube,
                                                           mp->rotation,
                                                           time,
-                                                          mp->d_two_omega_earth,
-                                                          mp->d_deltat,
-                                                          d_A_array_rotation,d_B_array_rotation,
+                                                          mp->two_omega_earth,
+                                                          mp->deltat,
+                                                          d_A_array_rotation,
+                                                          d_B_array_rotation,
                                                           mp->NSPEC_OUTER_CORE);
 
   if(mp->simulation_type == 3) {
-    Kernel_2_outer_core_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,
+    Kernel_2_outer_core_impl<<<grid,threads>>>(nb_blocks_to_compute,
                                                             mp->NGLOB_OUTER_CORE,
                                                             d_ibool,
                                                             mp->d_phase_ispec_inner_outer_core,
@@ -551,9 +577,10 @@
                                                             mp->d_wgll_cube,
                                                             mp->rotation,
                                                             b_time,
-                                                            mp->d_b_two_omega_earth,
-                                                            mp->d_b_deltat,
-                                                            d_b_A_array_rotation,d_b_B_array_rotation,
+                                                            mp->b_two_omega_earth,
+                                                            mp->b_deltat,
+                                                            d_b_A_array_rotation,
+                                                            d_b_B_array_rotation,
                                                             mp->NSPEC_OUTER_CORE);
   }
 
@@ -563,7 +590,6 @@
   // cudaEventDestroy( start );
   // cudaEventDestroy( stop );
   // printf("Kernel2 Execution Time: %f ms\n",time);
-
   /* cudaThreadSynchronize(); */
   /* TRACE("Kernel 2 finished"); */
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -581,9 +607,9 @@
 extern "C"
 void FC_FUNC_(compute_forces_outer_core_cuda,
               COMPUTE_FORCES_OUTER_CORE_CUDA)(long* Mesh_pointer_f,
-                                            int* iphase,
-                                            realw* time_f,
-                                            realw* b_time_f) {
+                                              int* iphase,
+                                              realw* time_f,
+                                              realw* b_time_f) {
 
   TRACE("compute_forces_outer_core_cuda");
 
@@ -592,11 +618,11 @@
   //double start_time = get_time();
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
-
-  int num_elements;
   realw time = *time_f;
   realw b_time = *b_time_f;
 
+  int num_elements;
+
   if( *iphase == 1 )
     num_elements = mp->nspec_outer_outer_core;
   else
@@ -683,8 +709,7 @@
                           mp->d_A_array_rotation + color_offset_nonpadded,
                           mp->d_B_array_rotation + color_offset_nonpadded,
                           mp->d_b_A_array_rotation + color_offset_nonpadded,
-                          mp->d_b_B_array_rotation + color_offset_nonpadded
-                         );
+                          mp->d_b_B_array_rotation + color_offset_nonpadded);
 
       // for padded and aligned arrays
       color_offset += nb_blocks_to_compute * NGLL3_PADDED;
@@ -702,9 +727,10 @@
                         mp->d_etax_outer_core,mp->d_etay_outer_core,mp->d_etaz_outer_core,
                         mp->d_gammax_outer_core,mp->d_gammay_outer_core,mp->d_gammaz_outer_core,
                         time,b_time,
-                        mp->d_A_array_rotation,mp->d_B_array_rotation,
-                        mp->d_b_A_array_rotation,mp->d_b_B_array_rotation
-                        );
+                        mp->d_A_array_rotation,
+                        mp->d_B_array_rotation,
+                        mp->d_b_A_array_rotation,
+                        mp->d_b_B_array_rotation);
 
   }
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu	2012-08-19 01:24:46 UTC (rev 20591)
@@ -226,7 +226,7 @@
 
   int num_blocks_x = mp->NSPEC_CRUST_MANTLE;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
@@ -299,7 +299,7 @@
 
   int num_blocks_x = mp->NSPEC_INNER_CORE;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
@@ -513,7 +513,7 @@
 
   int num_blocks_x = mp->NSPEC_OUTER_CORE;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
@@ -605,7 +605,7 @@
 
   int num_blocks_x = mp->nspec2D_top_crust_mantle;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
@@ -683,7 +683,7 @@
 
   int num_blocks_x = mp->NSPEC_CRUST_MANTLE;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu	2012-08-19 01:24:46 UTC (rev 20591)
@@ -228,8 +228,8 @@
     case 8:
       // zmin
       num_abs_boundary_faces = mp->nspec2D_zmin_outer_core;
-      d_abs_boundary_ispec = mp->d_ibelm_zmin_outer_core;
-      d_abs_boundary_jacobian2D = mp->d_jacobian2D_zmin_outer_core;
+      d_abs_boundary_ispec = mp->d_ibelm_bottom_outer_core;
+      d_abs_boundary_jacobian2D = mp->d_jacobian2D_bottom_outer_core;
       d_b_absorb_potential = mp->d_absorb_zmin_outer_core;
       d_wgllwgll = mp->d_wgllwgll_xy;
       break;
@@ -252,7 +252,7 @@
 
   int num_blocks_x = num_abs_boundary_faces;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
@@ -263,7 +263,8 @@
   if (mp->simulation_type == 3 && num_abs_boundary_faces > 0 ){
     // copies array to GPU
     print_CUDA_error_if_any(cudaMemcpy(d_b_absorb_potential,absorb_potential,
-                            NGLL2*num_abs_boundary_faces*sizeof(realw),cudaMemcpyHostToDevice),7700);
+                            NGLL2*num_abs_boundary_faces*sizeof(realw),
+                            cudaMemcpyHostToDevice),7700);
   }
 
   compute_stacey_acoustic_kernel<<<grid,threads>>>(mp->d_veloc_outer_core,
@@ -290,7 +291,8 @@
   if (mp->simulation_type == 1 && mp->save_forward && num_abs_boundary_faces > 0 ){
     // copies array to CPU
     print_CUDA_error_if_any(cudaMemcpy(absorb_potential,d_b_absorb_potential,
-                            NGLL2*num_abs_boundary_faces*sizeof(realw),cudaMemcpyDeviceToHost),7701);
+                                       NGLL2*num_abs_boundary_faces*sizeof(realw),
+                                       cudaMemcpyDeviceToHost),7701);
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu	2012-08-19 01:24:46 UTC (rev 20591)
@@ -261,7 +261,7 @@
 
   int num_blocks_x = num_abs_boundary_faces;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu	2012-08-19 01:24:46 UTC (rev 20591)
@@ -51,6 +51,9 @@
               INITIALIZE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) {
   TRACE("initialize_cuda_device");
 
+  int device;
+  int device_count;
+
   // Gets rank number of MPI process
   int myrank = *myrank_f;
 
@@ -87,7 +90,7 @@
   // note: from here on we use the runtime API  ...
 
   // Gets number of GPU devices
-  int device_count = 0;
+  device_count = 0;
   cudaGetDeviceCount(&device_count);
   exit_on_cuda_error("CUDA runtime error: cudaGetDeviceCount failed\ncheck if driver and runtime libraries work together\nexiting...\n");
 
@@ -107,12 +110,20 @@
     //MPI_Barrier(MPI_COMM_WORLD);
 
     // sets active device
-    cudaSetDevice( myrank % device_count );
-    exit_on_cuda_error("cudaSetDevice");
+    device = myrank % device_count;
+    cudaSetDevice( device );
+    exit_on_cuda_error("cudaSetDevice has invalid device");
+
+    // double check that device was  properly selected
+    cudaGetDevice(&device);
+    if( device != (myrank % device_count) ){
+       printf("error rank: %d devices: %d \n",myrank,device_count);
+       printf("  cudaSetDevice()=%d\n  cudaGetDevice()=%d\n",myrank%device_count,device);
+       exit_on_error("CUDA set/get device error: device id conflict \n");
+    }
   }
 
   // returns a handle to the active device
-  int device;
   cudaGetDevice(&device);
 
   // get device properties

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu	2012-08-19 01:24:46 UTC (rev 20591)
@@ -91,7 +91,7 @@
 
   int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
@@ -154,7 +154,7 @@
 
   int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
@@ -248,7 +248,7 @@
 
   int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
@@ -288,8 +288,10 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 __global__ void kernel_3_cuda_device(realw* veloc,
-                                     realw* accel, int size,
+                                     realw* accel,
+                                     int size,
                                      realw deltatover2,
+                                     realw two_omega_earth,
                                      realw* rmassx,
                                      realw* rmassy,
                                      realw* rmassz) {
@@ -298,8 +300,9 @@
   /* because of block and grid sizing problems, there is a small */
   /* amount of buffer at the end of the calculation */
   if(id < size) {
-    accel[3*id] = accel[3*id]*rmassx[id];
-    accel[3*id+1] = accel[3*id+1]*rmassy[id];
+    // note: update adds rotational acceleration in case two_omega_earth is non-zero
+    accel[3*id] = accel[3*id]*rmassx[id] + two_omega_earth*veloc[3*id+1]; // (2,i);
+    accel[3*id+1] = accel[3*id+1]*rmassy[id] - two_omega_earth*veloc[3*id]; //(1,i);
     accel[3*id+2] = accel[3*id+2]*rmassz[id];
 
     veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
@@ -311,7 +314,9 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 __global__ void kernel_3_accel_cuda_device(realw* accel,
+                                           realw* veloc,
                                            int size,
+                                           realw two_omega_earth,
                                            realw* rmassx,
                                            realw* rmassy,
                                            realw* rmassz) {
@@ -320,8 +325,9 @@
   /* because of block and grid sizing problems, there is a small */
   /* amount of buffer at the end of the calculation */
   if(id < size) {
-    accel[3*id] = accel[3*id]*rmassx[id];
-    accel[3*id+1] = accel[3*id+1]*rmassy[id];
+    // note: update adds rotational acceleration in case two_omega_earth is non-zero
+    accel[3*id] = accel[3*id]*rmassx[id] + two_omega_earth*veloc[3*id+1]; // (2,i);
+    accel[3*id+1] = accel[3*id+1]*rmassy[id] - two_omega_earth*veloc[3*id]; //(1,i);
     accel[3*id+2] = accel[3*id+2]*rmassz[id];
   }
 }
@@ -351,13 +357,13 @@
                                realw* deltatover2_F,
                                int* SIMULATION_TYPE_f,
                                realw* b_deltatover2_F,
-                               int* OCEANS,
                                int* NCHUNKS_VAL) {
   TRACE("kernel_3_a_cuda");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
 
   int SIMULATION_TYPE = *SIMULATION_TYPE_f;
+
   realw deltatover2 = *deltatover2_F;
   realw b_deltatover2 = *b_deltatover2_F;
 
@@ -366,7 +372,7 @@
 
   int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
@@ -376,7 +382,7 @@
 
   // crust/mantle region only
   // check whether we can update accel and veloc, or only accel at this point
-  if( *OCEANS == 0 ){
+  if( mp->oceans == 0 ){
     // updates both, accel and veloc
 
     if( *NCHUNKS_VAL != 6 && mp->absorbing_conditions){
@@ -384,6 +390,7 @@
                                                mp->d_accel_crust_mantle,
                                                mp->NGLOB_CRUST_MANTLE,
                                                deltatover2,
+                                               mp->two_omega_earth,
                                                mp->d_rmassx_crust_mantle,
                                                mp->d_rmassy_crust_mantle,
                                                mp->d_rmassz_crust_mantle);
@@ -393,6 +400,7 @@
                                                  mp->d_b_accel_crust_mantle,
                                                  mp->NGLOB_CRUST_MANTLE,
                                                  b_deltatover2,
+                                                 mp->b_two_omega_earth,
                                                  mp->d_rmassx_crust_mantle,
                                                  mp->d_rmassy_crust_mantle,
                                                  mp->d_rmassz_crust_mantle);
@@ -402,6 +410,7 @@
                                                mp->d_accel_crust_mantle,
                                                mp->NGLOB_CRUST_MANTLE,
                                                deltatover2,
+                                               mp->two_omega_earth,
                                                mp->d_rmassz_crust_mantle,
                                                mp->d_rmassz_crust_mantle,
                                                mp->d_rmassz_crust_mantle);
@@ -411,6 +420,7 @@
                                                  mp->d_b_accel_crust_mantle,
                                                  mp->NGLOB_CRUST_MANTLE,
                                                  b_deltatover2,
+                                                 mp->b_two_omega_earth,
                                                  mp->d_rmassz_crust_mantle,
                                                  mp->d_rmassz_crust_mantle,
                                                  mp->d_rmassz_crust_mantle);
@@ -422,28 +432,36 @@
 
     if( *NCHUNKS_VAL != 6 && mp->absorbing_conditions){
       kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_accel_crust_mantle,
+                                                     mp->d_veloc_crust_mantle,
                                                      mp->NGLOB_CRUST_MANTLE,
+                                                     mp->two_omega_earth,
                                                      mp->d_rmassx_crust_mantle,
                                                      mp->d_rmassy_crust_mantle,
                                                      mp->d_rmassz_crust_mantle);
 
       if(SIMULATION_TYPE == 3) {
         kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_b_accel_crust_mantle,
+                                                       mp->d_b_veloc_crust_mantle,
                                                        mp->NGLOB_CRUST_MANTLE,
+                                                       mp->b_two_omega_earth,
                                                        mp->d_rmassx_crust_mantle,
                                                        mp->d_rmassy_crust_mantle,
                                                        mp->d_rmassz_crust_mantle);
       }
     }else{
       kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_accel_crust_mantle,
+                                                     mp->d_veloc_crust_mantle,
                                                      mp->NGLOB_CRUST_MANTLE,
+                                                     mp->two_omega_earth,
                                                      mp->d_rmassz_crust_mantle,
                                                      mp->d_rmassz_crust_mantle,
                                                      mp->d_rmassz_crust_mantle);
 
       if(SIMULATION_TYPE == 3) {
         kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_b_accel_crust_mantle,
+                                                       mp->d_b_veloc_crust_mantle,
                                                        mp->NGLOB_CRUST_MANTLE,
+                                                       mp->b_two_omega_earth,
                                                        mp->d_rmassz_crust_mantle,
                                                        mp->d_rmassz_crust_mantle,
                                                        mp->d_rmassz_crust_mantle);
@@ -464,8 +482,7 @@
               KERNEL_3_B_CUDA)(long* Mesh_pointer,
                                realw* deltatover2_F,
                                int* SIMULATION_TYPE_f,
-                               realw* b_deltatover2_F,
-                               int* OCEANS) {
+                               realw* b_deltatover2_F) {
   TRACE("kernel_3_b_cuda");
   int size_padded,num_blocks_x,num_blocks_y;
 
@@ -479,11 +496,11 @@
 
   // crust/mantle region
   // in case of ocean loads, we still have to update the velocity for crust/mantle region
-  if( *OCEANS ){
+  if( mp->oceans ){
     size_padded = ((int)ceil(((double)mp->NGLOB_CRUST_MANTLE)/((double)blocksize)))*blocksize;
     num_blocks_x = size_padded/blocksize;
     num_blocks_y = 1;
-    while(num_blocks_x > 65535) {
+    while(num_blocks_x > MAXIMUM_GRID_DIM) {
       num_blocks_x = (int) ceil(num_blocks_x*0.5f);
       num_blocks_y = num_blocks_y*2;
     }
@@ -504,11 +521,11 @@
     }
   }
 
-  // inner core
+  // inner core region
   size_padded = ((int)ceil(((double)mp->NGLOB_INNER_CORE)/((double)blocksize)))*blocksize;
   num_blocks_x = size_padded/blocksize;
   num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
@@ -520,6 +537,7 @@
                                            mp->d_accel_inner_core,
                                            mp->NGLOB_INNER_CORE,
                                            deltatover2,
+                                           mp->two_omega_earth,
                                            mp->d_rmass_inner_core,
                                            mp->d_rmass_inner_core,
                                            mp->d_rmass_inner_core);
@@ -529,6 +547,7 @@
                                              mp->d_b_accel_inner_core,
                                              mp->NGLOB_INNER_CORE,
                                              b_deltatover2,
+                                             mp->b_two_omega_earth,
                                              mp->d_rmass_inner_core,
                                              mp->d_rmass_inner_core,
                                              mp->d_rmass_inner_core);
@@ -586,10 +605,11 @@
   realw b_deltatover2 = *b_deltatover2_F;
 
   int blocksize = BLOCKSIZE_KERNEL3;
+
   int size_padded = ((int)ceil(((double)mp->NGLOB_OUTER_CORE)/((double)blocksize)))*blocksize;
   int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2012-08-19 01:24:46 UTC (rev 20591)
@@ -154,14 +154,18 @@
 // Texture memory usage:
 // requires CUDA version >= 4.0, see check below
 // Use textures for d_displ and d_accel -- 10% performance boost
-#define USE_TEXTURES_FIELDS
 
+//daniel: todo benchmark fields
+//#define USE_TEXTURES_FIELDS
+
 // Using texture memory for the hprime-style constants is slower on
 // Fermi generation hardware, but *may* be faster on Kepler
 // generation.
 // Use textures for hprime_xx
-#define USE_TEXTURES_CONSTANTS
 
+//daniel: todo benchmark constants
+//#define USE_TEXTURES_CONSTANTS
+
 // CUDA version >= 4.0 needed for cudaTextureType1D and cudaDeviceSynchronize()
 #if CUDA_VERSION < 4000
 #undef USE_TEXTURES_FIELDS
@@ -187,6 +191,9 @@
 #define BLOCKSIZE_KERNEL3 128
 #define BLOCKSIZE_TRANSFER 256
 
+// maximum grid dimension in one direction of GPU
+#define MAXIMUM_GRID_DIM 65535
+
 /* ----------------------------------------------------------------------------------------------- */
 
 // indexing
@@ -492,8 +499,8 @@
   int npoin_oceans;
 
   // model parameter
+  int* d_ibool_ocean_load;
   realw* d_rmass_ocean_load;
-  int* d_iglob_ocean_load;
   realw* d_normal_ocean_load;
 
   // ------------------------------------------------------------------ //
@@ -551,6 +558,9 @@
   int anisotropic_kl;
   int approximate_hess_kl;
 
+  realw deltat;
+  realw b_deltat;
+
   // ------------------------------------------------------------------ //
   // gravity
   // ------------------------------------------------------------------ //
@@ -560,16 +570,20 @@
   realw* d_minus_deriv_gravity_table;
   realw* d_density_table;
 
+  realw minus_g_icb;
+  realw minus_g_cmb;
+
+  realw RHO_BOTTOM_OC;
+  realw RHO_TOP_OC;
+
   // ------------------------------------------------------------------ //
   // rotation
   // ------------------------------------------------------------------ //
-  realw d_two_omega_earth;
-  realw d_deltat;
+  realw two_omega_earth;
   realw* d_A_array_rotation; realw* d_B_array_rotation;
 
   // needed for backward/reconstructed fields (kernel runs)
-  realw d_b_two_omega_earth;
-  realw d_b_deltat;
+  realw b_two_omega_earth;
   realw* d_b_A_array_rotation; realw* d_b_B_array_rotation;
 
   // ------------------------------------------------------------------ //
@@ -656,11 +670,9 @@
 
   int* d_ibelm_xmin_outer_core, *d_ibelm_xmax_outer_core;
   int* d_ibelm_ymin_outer_core, *d_ibelm_ymax_outer_core;
-  int* d_ibelm_zmin_outer_core;
 
   realw* d_jacobian2D_xmin_outer_core, *d_jacobian2D_xmax_outer_core;
   realw* d_jacobian2D_ymin_outer_core, *d_jacobian2D_ymax_outer_core;
-  realw* d_jacobian2D_zmin_outer_core;
 
   realw* d_absorb_xmin_outer_core, *d_absorb_xmax_outer_core;
   realw* d_absorb_ymin_outer_core, *d_absorb_ymax_outer_core;

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu	2012-08-19 01:24:46 UTC (rev 20591)
@@ -156,7 +156,7 @@
 
   int num_blocks_x = mp->nspec2D_top_crust_mantle;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
@@ -254,7 +254,7 @@
   int igll = threadIdx.x;
   int iface = blockIdx.x + gridDim.x*blockIdx.y; // surface element id
 
-  // when nspec_top > 65535, but mod(nspec_top,2) > 0, we end up with an extra block.
+  // when nspec_top > MAXIMUM_GRID_DIM, but mod(nspec_top,2) > 0, we end up with an extra block.
   if(iface < nspec_top) {
 
     int ispec = ibelm_top[iface]-1;
@@ -306,7 +306,7 @@
 
   int num_blocks_x = mp->nspec2D_top_crust_mantle;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2012-08-19 01:24:46 UTC (rev 20591)
@@ -94,7 +94,7 @@
               PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
                                         int* myrank_f,
                                         int* h_NGLLX,
-                                        realw* h_hprime_xx,realw* h_hprime_yy,realw* h_hprime_zz,
+                                        realw* h_hprime_xx,
                                         realw* h_hprimewgll_xx,realw* h_hprimewgll_yy,realw* h_hprimewgll_zz,
                                         realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz,
                                         int* NSOURCES,int* nsources_local,
@@ -125,7 +125,9 @@
                                         int* SAVE_BOUNDARY_MESH_f,
                                         int* USE_MESH_COLORING_GPU_f,
                                         int* ANISOTROPIC_KL_f,
-                                        int* APPROXIMATE_HESS_KL_f) {
+                                        int* APPROXIMATE_HESS_KL_f,
+                                        realw* deltat_f,
+                                        realw* b_deltat_f) {
 
   TRACE("prepare_constants_device");
 
@@ -294,6 +296,15 @@
 
   }
 
+  // for rotation and new attenuation
+  mp->deltat = *deltat_f;
+  if( mp->simulation_type == 3 ){
+    mp->b_deltat = *b_deltat_f;
+  }
+  // initializes for rotational effects
+  mp->two_omega_earth = 0.f;
+  mp->b_two_omega_earth = 0.f;
+
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("prepare_constants_device");
 #endif
@@ -308,12 +319,10 @@
 extern "C"
 void FC_FUNC_(prepare_fields_rotation_device,
               PREPARE_FIELDS_ROTATION_DEVICE)(long* Mesh_pointer_f,
-                                              realw* two_omega_earth,
-                                              realw* deltat,
+                                              realw* two_omega_earth_f,
                                               realw* A_array_rotation,
                                               realw* B_array_rotation,
-                                              realw* b_two_omega_earth,
-                                              realw* b_deltat,
+                                              realw* b_two_omega_earth_f,
                                               realw* b_A_array_rotation,
                                               realw* b_B_array_rotation,
                                               int* NSPEC_OUTER_CORE_ROTATION) {
@@ -323,22 +332,26 @@
   Mesh* mp = (Mesh*)(*Mesh_pointer_f);
 
   // arrays only needed when rotation is required
-  if( ! mp->rotation ){ exit_on_cuda_error("prepare_fields_rotation_device rotation not properly initialized"); }
+  if( ! mp->rotation ){
+    exit_on_cuda_error("prepare_fields_rotation_device: rotation flag not properly initialized");
+  }
+  // checks array size
+  if( *NSPEC_OUTER_CORE_ROTATION != mp->NSPEC_OUTER_CORE){
+    printf("error prepare_fields_rotation_device: rotation array has wrong size: %d instead of %d\n",
+           *NSPEC_OUTER_CORE_ROTATION,mp->NSPEC_OUTER_CORE);
+    exit_on_cuda_error("prepare_fields_rotation_device: rotation array has wrong size");
+  }
 
   // rotation arrays (needed only for outer core region)
-  mp->d_two_omega_earth = *two_omega_earth;
-  mp->d_deltat = *deltat;
+  mp->two_omega_earth = *two_omega_earth_f;
+  copy_todevice_realw((void**)&mp->d_A_array_rotation,A_array_rotation,NGLL3*mp->NSPEC_OUTER_CORE);
+  copy_todevice_realw((void**)&mp->d_B_array_rotation,B_array_rotation,NGLL3*mp->NSPEC_OUTER_CORE);
 
-  copy_todevice_realw((void**)&mp->d_A_array_rotation,A_array_rotation,NGLL3*(*NSPEC_OUTER_CORE_ROTATION));
-  copy_todevice_realw((void**)&mp->d_B_array_rotation,B_array_rotation,NGLL3*(*NSPEC_OUTER_CORE_ROTATION));
-
   // backward/reconstructed fields
   if( mp->simulation_type == 3 ){
-    mp->d_b_two_omega_earth = *b_two_omega_earth;
-    mp->d_b_deltat = *b_deltat;
-
-    copy_todevice_realw((void**)&mp->d_b_A_array_rotation,b_A_array_rotation,NGLL3*(*NSPEC_OUTER_CORE_ROTATION));
-    copy_todevice_realw((void**)&mp->d_b_B_array_rotation,b_B_array_rotation,NGLL3*(*NSPEC_OUTER_CORE_ROTATION));
+    mp->b_two_omega_earth = *b_two_omega_earth_f;
+    copy_todevice_realw((void**)&mp->d_b_A_array_rotation,b_A_array_rotation,NGLL3*mp->NSPEC_OUTER_CORE);
+    copy_todevice_realw((void**)&mp->d_b_B_array_rotation,b_B_array_rotation,NGLL3*mp->NSPEC_OUTER_CORE);
   }
 }
 
@@ -358,7 +371,11 @@
                                              realw* minus_deriv_gravity_table,
                                              realw* density_table,
                                              realw* h_wgll_cube,
-                                             int* NRAD_GRAVITY) {
+                                             int* NRAD_GRAVITY,
+                                             realw* minus_g_icb,
+                                             realw* minus_g_cmb,
+                                             double* RHO_BOTTOM_OC,
+                                             double* RHO_TOP_OC) {
 
   TRACE("prepare_fields_gravity_device");
 
@@ -372,6 +389,8 @@
 
   }else{
     // gravity case
+    mp->minus_g_icb = *minus_g_icb;
+    mp->minus_g_cmb = *minus_g_cmb;
 
     // sets up gll weights cubed
     setConst_wgll_cube(h_wgll_cube,mp);
@@ -382,6 +401,11 @@
     copy_todevice_realw((void**)&mp->d_minus_deriv_gravity_table,minus_deriv_gravity_table,(*NRAD_GRAVITY));
     copy_todevice_realw((void**)&mp->d_density_table,density_table,(*NRAD_GRAVITY));
   }
+
+  // constants
+  mp->RHO_BOTTOM_OC = (realw) *RHO_BOTTOM_OC;
+  mp->RHO_TOP_OC = (realw) *RHO_TOP_OC;
+
 }
 
 
@@ -600,10 +624,8 @@
                                             int* nkmin_xi_outer_core,int* nkmin_eta_outer_core,
                                             int* ibelm_xmin_outer_core,int* ibelm_xmax_outer_core,
                                             int* ibelm_ymin_outer_core,int* ibelm_ymax_outer_core,
-                                            int* ibelm_bottom_outer_core,
                                             realw* jacobian2D_xmin_outer_core, realw* jacobian2D_xmax_outer_core,
                                             realw* jacobian2D_ymin_outer_core, realw* jacobian2D_ymax_outer_core,
-                                            realw* jacobian2D_bottom_outer_core,
                                             realw* vp_outer_core) {
 
   TRACE("prepare_fields_absorb_device");
@@ -759,9 +781,8 @@
 
   // zmin
   if( mp->nspec2D_zmin_outer_core > 0 ){
-    copy_todevice_int((void**)&mp->d_ibelm_zmin_outer_core,ibelm_bottom_outer_core,mp->nspec2D_zmin_outer_core);
-    copy_todevice_realw((void**)&mp->d_jacobian2D_zmin_outer_core,jacobian2D_bottom_outer_core,
-                        NGLL2*(mp->nspec2D_zmin_outer_core));
+    // note: ibelm_bottom_outer_core and jacobian2D_bottom_outer_core will be allocated
+    //          when preparing the outer core
     // boundary buffer
     if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){
       print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_absorb_zmin_outer_core,
@@ -908,7 +929,6 @@
     // initializes kernel values to zero
     print_CUDA_error_if_any(cudaMemset(mp->d_Sigma_kl,0,
                                        NGLL3*mp->NSPEC_CRUST_MANTLE*sizeof(realw)),7403);
-
   }
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -927,8 +947,8 @@
 void FC_FUNC_(prepare_oceans_device,
               PREPARE_OCEANS_DEVICE)(long* Mesh_pointer_f,
                                      int* npoin_oceans,
+                                     int* h_iglob_ocean_load,
                                      realw* h_rmass_ocean_load_selected,
-                                     int* h_iglob_ocean_load,
                                      realw* h_normal_ocean_load) {
 
   TRACE("prepare_oceans_device");
@@ -941,12 +961,12 @@
   // checks for global partitions, each slice must have a top surface with points on it
   if( mp->npoin_oceans == 0 ){ exit_on_cuda_error("prepare_oceans_device has zero npoin_oceans"); }
 
+  // global point indices
+  copy_todevice_int((void**)&mp->d_ibool_ocean_load,h_iglob_ocean_load,mp->npoin_oceans);
+
   // mass matrix
   copy_todevice_realw((void**)&mp->d_rmass_ocean_load,h_rmass_ocean_load_selected,mp->npoin_oceans);
 
-  // global point indices
-  copy_todevice_int((void**)&mp->d_iglob_ocean_load,h_iglob_ocean_load,mp->npoin_oceans);
-
   // normals
   copy_todevice_realw((void**)&mp->d_normal_ocean_load,h_normal_ocean_load,NDIM*mp->npoin_oceans);
 
@@ -1396,18 +1416,17 @@
   // CMB/ICB coupling
   mp->nspec2D_top_outer_core = *NSPEC2D_TOP_OC;
   mp->nspec2D_bottom_outer_core = *NSPEC2D_BOTTOM_OC;
+
   int size_toc = NGLL2*(mp->nspec2D_top_outer_core);
-  int size_boc = NGLL2*(mp->nspec2D_bottom_outer_core);
-
+  copy_todevice_int((void**)&mp->d_ibelm_top_outer_core,h_ibelm_top_outer_core,mp->nspec2D_top_outer_core);
+  copy_todevice_realw((void**)&mp->d_jacobian2D_top_outer_core,h_jacobian2D_top_outer_core,size_toc);
   copy_todevice_realw((void**)&mp->d_normal_top_outer_core,h_normal_top_outer_core,NDIM*size_toc);
-  copy_todevice_realw((void**)&mp->d_normal_bottom_outer_core,h_normal_bottom_outer_core,NDIM*size_boc);
 
-  copy_todevice_realw((void**)&mp->d_jacobian2D_top_outer_core,h_jacobian2D_top_outer_core,size_toc);
+  int size_boc = NGLL2*(mp->nspec2D_bottom_outer_core);
+  copy_todevice_int((void**)&mp->d_ibelm_bottom_outer_core,h_ibelm_bottom_outer_core,mp->nspec2D_bottom_outer_core);
   copy_todevice_realw((void**)&mp->d_jacobian2D_bottom_outer_core,h_jacobian2D_bottom_outer_core,size_boc);
+  copy_todevice_realw((void**)&mp->d_normal_bottom_outer_core,h_normal_bottom_outer_core,NDIM*size_boc);
 
-  copy_todevice_int((void**)&mp->d_ibelm_top_outer_core,h_ibelm_top_outer_core,mp->nspec2D_top_outer_core);
-  copy_todevice_int((void**)&mp->d_ibelm_bottom_outer_core,h_ibelm_bottom_outer_core,mp->nspec2D_bottom_outer_core);
-
   // wavefield
   int size = mp->NGLOB_OUTER_CORE;
   print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ_outer_core),sizeof(realw)*size),4001);
@@ -1887,8 +1906,6 @@
       }
     }
     if( mp->nspec2D_zmin_outer_core > 0 ){
-      cudaFree(mp->d_ibelm_zmin_outer_core);
-      cudaFree(mp->d_jacobian2D_zmin_outer_core);
       if( (mp->simulation_type == 1 && mp->save_forward ) || (mp->simulation_type == 3) ){
         cudaFree(mp->d_absorb_zmin_outer_core);
       }
@@ -2007,6 +2024,7 @@
     }
     if(mp->approximate_hess_kl){ cudaFree(mp->d_hess_kl_crust_mantle);}
   }
+  // mass matrix
   if( *NCHUNKS_VAL != 6 && mp->absorbing_conditions){
     cudaFree(mp->d_rmassx_crust_mantle);
     cudaFree(mp->d_rmassy_crust_mantle);
@@ -2039,12 +2057,11 @@
   cudaFree(mp->d_phase_ispec_inner_outer_core);
 
   cudaFree(mp->d_ibelm_top_outer_core);
+  cudaFree(mp->d_jacobian2D_top_outer_core);
+  cudaFree(mp->d_normal_top_outer_core);
+
   cudaFree(mp->d_ibelm_bottom_outer_core);
-
-  cudaFree(mp->d_normal_top_outer_core);
   cudaFree(mp->d_normal_bottom_outer_core);
-
-  cudaFree(mp->d_jacobian2D_top_outer_core);
   cudaFree(mp->d_jacobian2D_bottom_outer_core);
 
   cudaFree(mp->d_displ_outer_core);
@@ -2057,6 +2074,7 @@
     cudaFree(mp->d_rho_kl_outer_core);
     cudaFree(mp->d_alpha_kl_outer_core);
   }
+  // mass matrix
   cudaFree(mp->d_rmass_outer_core);
 
   //------------------------------------------
@@ -2075,6 +2093,7 @@
   cudaFree(mp->d_muvstore_inner_core);
   cudaFree(mp->d_ibool_inner_core);
 
+  // gravity
   if( mp->gravity ){
     cudaFree(mp->d_xstore_inner_core);
     cudaFree(mp->d_ystore_inner_core);
@@ -2116,11 +2135,13 @@
     cudaFree(mp->d_alpha_kl_inner_core);
     cudaFree(mp->d_beta_kl_inner_core);
   }
+  // mass matrix
   cudaFree(mp->d_rmass_inner_core);
 
+  // oceans
   if( mp->oceans ){
     cudaFree(mp->d_rmass_ocean_load);
-    cudaFree(mp->d_iglob_ocean_load);
+    cudaFree(mp->d_ibool_ocean_load);
     cudaFree(mp->d_normal_ocean_load);
   }
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c	2012-08-19 01:24:46 UTC (rev 20591)
@@ -161,16 +161,10 @@
               COMPUTE_COUPLING_FLUID_ICB_CUDA)(long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(compute_coupling_cmb_fluid_cuda,
-              COMPUTE_COUPLING_CMB_FLUID_CUDA)(long* Mesh_pointer_f,
-                                               double RHO_TOP_OC,
-                                               realw minus_g_cmb,
-                                               int GRAVITY_VAL) {} 
+              COMPUTE_COUPLING_CMB_FLUID_CUDA)(long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(compute_coupling_icb_fluid_cuda,
-              COMPUTE_COUPLING_ICB_FLUID_CUDA)(long* Mesh_pointer_f,
-                                               double RHO_BOTTOM_OC,
-                                               realw minus_g_icb,
-                                               int GRAVITY_VAL) {} 
+              COMPUTE_COUPLING_ICB_FLUID_CUDA)(long* Mesh_pointer_f) {} 
 
 void FC_FUNC_(compute_coupling_ocean_cuda,
               COMPUTE_COUPLING_OCEAN_CUDA)(long* Mesh_pointer_f,
@@ -183,7 +177,6 @@
 
 void FC_FUNC_(compute_forces_crust_mantle_cuda,
               COMPUTE_FORCES_CRUST_MANTLE_CUDA)(long* Mesh_pointer_f,
-                                                realw* deltat,
                                                 int* iphase) {} 
 
 
@@ -193,7 +186,6 @@
 
 void FC_FUNC_(compute_forces_inner_core_cuda,
               COMPUTE_FORCES_INNER_CORE_CUDA)(long* Mesh_pointer_f,
-                                              realw* deltat,
                                               int* iphase) {} 
 
 
@@ -203,9 +195,9 @@
 
 void FC_FUNC_(compute_forces_outer_core_cuda,
               COMPUTE_FORCES_OUTER_CORE_CUDA)(long* Mesh_pointer_f,
-                                            int* iphase,
-                                            realw* time_f,
-                                            realw* b_time_f) {} 
+                                              int* iphase,
+                                              realw* time_f,
+                                              realw* b_time_f) {} 
 
 
 //
@@ -298,15 +290,13 @@
                                realw* deltatover2_F,
                                int* SIMULATION_TYPE_f,
                                realw* b_deltatover2_F,
-                               int* OCEANS,
                                int* NCHUNKS_VAL) {} 
 
 void FC_FUNC_(kernel_3_b_cuda,
               KERNEL_3_B_CUDA)(long* Mesh_pointer,
                                realw* deltatover2_F,
                                int* SIMULATION_TYPE_f,
-                               realw* b_deltatover2_F,
-                               int* OCEANS) {} 
+                               realw* b_deltatover2_F) {} 
 
 void FC_FUNC_(kernel_3_outer_core_cuda,
               KERNEL_3_OUTER_CORE_CUDA)(long* Mesh_pointer,
@@ -352,7 +342,7 @@
               PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
                                         int* myrank_f,
                                         int* h_NGLLX,
-                                        realw* h_hprime_xx,realw* h_hprime_yy,realw* h_hprime_zz,
+                                        realw* h_hprime_xx,
                                         realw* h_hprimewgll_xx,realw* h_hprimewgll_yy,realw* h_hprimewgll_zz,
                                         realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz,
                                         int* NSOURCES,int* nsources_local,
@@ -383,16 +373,16 @@
                                         int* SAVE_BOUNDARY_MESH_f,
                                         int* USE_MESH_COLORING_GPU_f,
                                         int* ANISOTROPIC_KL_f,
-                                        int* APPROXIMATE_HESS_KL_f) {} 
+                                        int* APPROXIMATE_HESS_KL_f,
+                                        realw* deltat_f,
+                                        realw* b_deltat_f) {} 
 
 void FC_FUNC_(prepare_fields_rotation_device,
               PREPARE_FIELDS_ROTATION_DEVICE)(long* Mesh_pointer_f,
-                                              realw* two_omega_earth,
-                                              realw* deltat,
+                                              realw* two_omega_earth_f,
                                               realw* A_array_rotation,
                                               realw* B_array_rotation,
-                                              realw* b_two_omega_earth,
-                                              realw* b_deltat,
+                                              realw* b_two_omega_earth_f,
                                               realw* b_A_array_rotation,
                                               realw* b_B_array_rotation,
                                               int* NSPEC_OUTER_CORE_ROTATION) {} 
@@ -405,7 +395,11 @@
                                              realw* minus_deriv_gravity_table,
                                              realw* density_table,
                                              realw* h_wgll_cube,
-                                             int* NRAD_GRAVITY) {} 
+                                             int* NRAD_GRAVITY,
+                                             realw* minus_g_icb,
+                                             realw* minus_g_cmb,
+                                             double* RHO_BOTTOM_OC,
+                                             double* RHO_TOP_OC) {} 
 
 void FC_FUNC_(prepare_fields_attenuat_device,
               PREPARE_FIELDS_ATTENUAT_DEVICE)(long* Mesh_pointer_f,
@@ -478,10 +472,8 @@
                                             int* nkmin_xi_outer_core,int* nkmin_eta_outer_core,
                                             int* ibelm_xmin_outer_core,int* ibelm_xmax_outer_core,
                                             int* ibelm_ymin_outer_core,int* ibelm_ymax_outer_core,
-                                            int* ibelm_bottom_outer_core,
                                             realw* jacobian2D_xmin_outer_core, realw* jacobian2D_xmax_outer_core,
                                             realw* jacobian2D_ymin_outer_core, realw* jacobian2D_ymax_outer_core,
-                                            realw* jacobian2D_bottom_outer_core,
                                             realw* vp_outer_core) {} 
 
 void FC_FUNC_(prepare_mpi_buffers_device,
@@ -514,8 +506,8 @@
 void FC_FUNC_(prepare_oceans_device,
               PREPARE_OCEANS_DEVICE)(long* Mesh_pointer_f,
                                      int* npoin_oceans,
-                                     realw* h_rmass_ocean_load_selected,
                                      int* h_iglob_ocean_load,
+                                     realw* h_rmass_ocean_load_selected,
                                      realw* h_normal_ocean_load) {} 
 
 void FC_FUNC_(prepare_crust_mantle_device,
@@ -766,7 +758,8 @@
 //
 
 void FC_FUNC_(write_seismograms_transfer_cuda,
-              WRITE_SEISMOGRAMS_TRANSFER_CUDA)(realw* displ,
+              WRITE_SEISMOGRAMS_TRANSFER_CUDA)(long* Mesh_pointer_f,
+                                               realw* displ,
                                                realw* b_displ,
                                                realw* eps_trace_over_3,
                                                realw* epsilondev_xx,
@@ -774,24 +767,8 @@
                                                realw* epsilondev_xy,
                                                realw* epsilondev_xz,
                                                realw* epsilondev_yz,
-                                               long* Mesh_pointer_f,
                                                int* number_receiver_global,
                                                int* ispec_selected_rec,
                                                int* ispec_selected_source,
                                                int* ibool) {} 
 
-void FC_FUNC_(transfer_station_ac_from_device,
-              TRANSFER_STATION_AC_FROM_DEVICE)(
-                                                realw* potential_acoustic,
-                                                realw* potential_dot_acoustic,
-                                                realw* potential_dot_dot_acoustic,
-                                                realw* b_potential_acoustic,
-                                                realw* b_potential_dot_acoustic,
-                                                realw* b_potential_dot_dot_acoustic,
-                                                long* Mesh_pointer_f,
-                                                int* number_receiver_global,
-                                                int* ispec_selected_rec,
-                                                int* ispec_selected_source,
-                                                int* ibool,
-                                                int* SIMULATION_TYPEf) {} 
-

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/write_seismograms_cuda.cu	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/write_seismograms_cuda.cu	2012-08-19 01:24:46 UTC (rev 20591)
@@ -97,13 +97,17 @@
 
 TRACE("write_seismograms_transfer_from_device");
 
+  int irec_local,irec;
+  int ispec,iglob,i;
+
   // checks if anything to do
   if( mp->nrec_local == 0 ) return;
 
   int blocksize = NGLL3;
+
   int num_blocks_x = mp->nrec_local;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
@@ -122,13 +126,12 @@
   cudaMemcpy(mp->h_station_seismo_field,mp->d_station_seismo_field,
              3*NGLL3*(mp->nrec_local)*sizeof(realw),cudaMemcpyDeviceToHost);
 
-  int irec_local;
   for(irec_local = 0 ; irec_local < mp->nrec_local; irec_local++) {
-    int irec = number_receiver_global[irec_local] - 1;
-    int ispec = h_ispec_selected[irec] - 1;
+    irec = number_receiver_global[irec_local] - 1;
+    ispec = h_ispec_selected[irec] - 1;
 
-    for(int i = 0; i < NGLL3; i++) {
-      int iglob = ibool[i+NGLL3*ispec] - 1;
+    for(i = 0; i < NGLL3; i++) {
+      iglob = ibool[i+NGLL3*ispec] - 1;
       h_field[0+3*iglob] = mp->h_station_seismo_field[0+3*i+irec_local*NGLL3*3];
       h_field[1+3*iglob] = mp->h_station_seismo_field[1+3*i+irec_local*NGLL3*3];
       h_field[2+3*iglob] = mp->h_station_seismo_field[2+3*i+irec_local*NGLL3*3];
@@ -151,13 +154,17 @@
 
   TRACE("write_seismograms_transfer_scalar_from_device");
 
+  int irec_local,irec;
+  int ispec,iglob,i;
+
   // checks if anything to do
   if( mp->nrec_local == 0 ) return;
 
   int blocksize = NGLL3;
+
   int num_blocks_x = mp->nrec_local;
   int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
+  while(num_blocks_x > MAXIMUM_GRID_DIM) {
     num_blocks_x = (int) ceil(num_blocks_x*0.5f);
     num_blocks_y = num_blocks_y*2;
   }
@@ -174,15 +181,13 @@
 
   // copies array to CPU
   cudaMemcpy(mp->h_station_seismo_field,mp->d_station_seismo_field,
-             1*NGLL3*(mp->nrec_local)*sizeof(realw),cudaMemcpyDeviceToHost);
+             NGLL3*(mp->nrec_local)*sizeof(realw),cudaMemcpyDeviceToHost);
 
-  int irec_local;
   for(irec_local = 0 ; irec_local < mp->nrec_local; irec_local++) {
-    int irec = number_receiver_global[irec_local] - 1;
-    int ispec = h_ispec_selected[irec] - 1;
-
-    for(int i = 0; i < NGLL3; i++) {
-      int iglob = ibool[i+NGLL3*ispec] - 1;
+    irec = number_receiver_global[irec_local] - 1;
+    ispec = h_ispec_selected[irec] - 1;
+    for(i = 0; i < NGLL3; i++) {
+      iglob = ibool[i+NGLL3*ispec] - 1;
       h_field[iglob] = mp->h_station_seismo_field[i+irec_local*NGLL3];
     }
   }
@@ -196,7 +201,8 @@
 
 extern "C"
 void FC_FUNC_(write_seismograms_transfer_cuda,
-              WRITE_SEISMOGRAMS_TRANSFER_CUDA)(realw* displ,
+              WRITE_SEISMOGRAMS_TRANSFER_CUDA)(long* Mesh_pointer_f,
+                                               realw* displ,
                                                realw* b_displ,
                                                realw* eps_trace_over_3,
                                                realw* epsilondev_xx,
@@ -204,7 +210,6 @@
                                                realw* epsilondev_xy,
                                                realw* epsilondev_xz,
                                                realw* epsilondev_yz,
-                                               long* Mesh_pointer_f,
                                                int* number_receiver_global,
                                                int* ispec_selected_rec,
                                                int* ispec_selected_source,
@@ -219,10 +224,10 @@
   switch( mp->simulation_type ){
     case 1:
       write_seismograms_transfer_from_device(mp,mp->d_displ_crust_mantle,
-                                           displ,
-                                           number_receiver_global,
-                                           mp->d_ispec_selected_rec,
-                                           ispec_selected_rec, ibool);
+                                             displ,
+                                             number_receiver_global,
+                                             mp->d_ispec_selected_rec,
+                                             ispec_selected_rec, ibool);
       break;
 
     case 2:
@@ -271,161 +276,5 @@
                                                  ispec_selected_rec, ibool);
       break;
   }
-
 }
 
-/* ----------------------------------------------------------------------------------------------- */
-
-// ACOUSTIC simulations
-
-/* ----------------------------------------------------------------------------------------------- */
-/*
-__global__ void transfer_stations_fields_acoustic_from_device_kernel(int* number_receiver_global,
-                                                                     int* ispec_selected_rec,
-                                                                     int* ibool,
-                                                                     realw* station_seismo_potential,
-                                                                     realw* desired_potential) {
-
-  int blockID = blockIdx.x + blockIdx.y*gridDim.x;
-  int nodeID = threadIdx.x + blockID*blockDim.x;
-
-  int irec = number_receiver_global[blockID]-1;
-  int ispec = ispec_selected_rec[irec]-1;
-  int iglob = ibool[threadIdx.x + NGLL3*ispec]-1;
-
-  //if(threadIdx.x == 0 ) printf("node acoustic: %i %i %i %i %i %e \n",blockID,nodeID,irec,ispec,iglob,desired_potential[iglob]);
-
-  station_seismo_potential[nodeID] = desired_potential[iglob];
-}
-*/
-/* ----------------------------------------------------------------------------------------------- */
-/*
-void transfer_field_acoustic_from_device(Mesh* mp,
-                                         realw* d_potential,
-                                         realw* h_potential,
-                                         int* number_receiver_global,
-                                         int* d_ispec_selected,
-                                         int* h_ispec_selected,
-                                         int* ibool) {
-
-TRACE("transfer_field_acoustic_from_device");
-
-  int irec_local,irec,ispec,iglob,j;
-
-  // checks if anything to do
-  if( mp->nrec_local == 0 ) return;
-
-  // sets up kernel dimensions
-  int blocksize = NGLL3;
-  int num_blocks_x = mp->nrec_local;
-  int num_blocks_y = 1;
-  while(num_blocks_x > 65535) {
-    num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-    num_blocks_y = num_blocks_y*2;
-  }
-
-  dim3 grid(num_blocks_x,num_blocks_y);
-  dim3 threads(blocksize,1,1);
-
-  // prepare field transfer array on device
-  transfer_stations_fields_acoustic_from_device_kernel<<<grid,threads>>>(mp->d_number_receiver_global,
-                                                                         d_ispec_selected,
-                                                                         mp->d_ibool,
-                                                                         mp->d_station_seismo_potential,
-                                                                         d_potential);
-
-
-  print_CUDA_error_if_any(cudaMemcpy(mp->h_station_seismo_potential,mp->d_station_seismo_potential,
-                                     mp->nrec_local*NGLL3*sizeof(realw),cudaMemcpyDeviceToHost),500);
-
-  //printf("copy local receivers: %i \n",mp->nrec_local);
-
-  for(irec_local=0; irec_local < mp->nrec_local; irec_local++) {
-    irec = number_receiver_global[irec_local]-1;
-    ispec = h_ispec_selected[irec]-1;
-
-    // copy element values
-    // note: iglob may vary and can be irregularly accessing the h_potential array
-    for(j=0; j < NGLL3; j++){
-      iglob = ibool[j+NGLL3*ispec]-1;
-      h_potential[iglob] = mp->h_station_seismo_potential[j+irec_local*NGLL3];
-    }
-
-    // copy each station element's points to working array
-    // note: this works if iglob values would be all aligned...
-    //memcpy(&(h_potential[iglob]),&(mp->h_station_seismo_potential[irec_local*NGLL3]),NGLL3*sizeof(realw));
-
-  }
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("transfer_field_acoustic_from_device");
-#endif
-}
-*/
-/* ----------------------------------------------------------------------------------------------- */
-/*
-extern "C"
-void FC_FUNC_(transfer_station_ac_from_device,
-              TRANSFER_STATION_AC_FROM_DEVICE)(
-                                                realw* potential_acoustic,
-                                                realw* potential_dot_acoustic,
-                                                realw* potential_dot_dot_acoustic,
-                                                realw* b_potential_acoustic,
-                                                realw* b_potential_dot_acoustic,
-                                                realw* b_potential_dot_dot_acoustic,
-                                                long* Mesh_pointer_f,
-                                                int* number_receiver_global,
-                                                int* ispec_selected_rec,
-                                                int* ispec_selected_source,
-                                                int* ibool,
-                                                int* SIMULATION_TYPEf) {
-
-TRACE("transfer_station_ac_from_device");
-  //double start_time = get_time();
-
-  Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
-  // checks if anything to do
-  if( mp->nrec_local == 0 ) return;
-
-  int SIMULATION_TYPE = *SIMULATION_TYPEf;
-
-  if(SIMULATION_TYPE == 1) {
-    transfer_field_acoustic_from_device(mp,mp->d_potential_acoustic,potential_acoustic,
-                                        number_receiver_global,
-                                        mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
-    transfer_field_acoustic_from_device(mp,mp->d_potential_dot_acoustic,potential_dot_acoustic,
-                                        number_receiver_global,
-                                        mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
-    transfer_field_acoustic_from_device(mp,mp->d_potential_dot_dot_acoustic,potential_dot_dot_acoustic,
-                                        number_receiver_global,
-                                        mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
-  }
-  else if(SIMULATION_TYPE == 2) {
-    transfer_field_acoustic_from_device(mp,mp->d_potential_acoustic,potential_acoustic,
-                                        number_receiver_global,
-                                        mp->d_ispec_selected_source, ispec_selected_source, ibool);
-    transfer_field_acoustic_from_device(mp,mp->d_potential_dot_acoustic,potential_dot_acoustic,
-                                        number_receiver_global,
-                                        mp->d_ispec_selected_source, ispec_selected_source, ibool);
-    transfer_field_acoustic_from_device(mp,mp->d_potential_dot_dot_acoustic,potential_dot_dot_acoustic,
-                                        number_receiver_global,
-                                        mp->d_ispec_selected_source, ispec_selected_source, ibool);
-  }
-  else if(SIMULATION_TYPE == 3) {
-    transfer_field_acoustic_from_device(mp,mp->d_b_potential_acoustic,b_potential_acoustic,
-                                        number_receiver_global,
-                                        mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
-    transfer_field_acoustic_from_device(mp,mp->d_b_potential_dot_acoustic,b_potential_dot_acoustic,
-                                        number_receiver_global,
-                                        mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
-    transfer_field_acoustic_from_device(mp,mp->d_b_potential_dot_dot_acoustic,b_potential_dot_dot_acoustic,
-                                        number_receiver_global,
-                                        mp->d_ispec_selected_rec, ispec_selected_rec, ibool);
-  }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  //double end_time = get_time();
-  //printf("Elapsed time: %e\n",end_time-start_time);
-  exit_on_cuda_error("transfer_station_ac_from_device");
-#endif
-}
-*/

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/create_mass_matrices.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/create_mass_matrices.f90	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/meshfem3D/create_mass_matrices.f90	2012-08-19 01:24:46 UTC (rev 20591)
@@ -162,9 +162,15 @@
       do j = 1,NGLLY
         do i = 1,NGLLX
 
-          ! if 3D Earth with topography, compute local height of oceans
+          ! note: old version (5.1.4)
+          ! only for models where 3D crustal stretching was used (even without topography?)
+          !if( CASE_3D ) then
+
+          ! note: new version:
+          ! for 3D Earth with topography, compute local height of oceans
           if( TOPOGRAPHY ) then
 
+
             ! get coordinates of current point
             xval = xstore(i,j,k,ispec)
             yval = ystore(i,j,k,ispec)

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_scalar.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_scalar.f90	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_scalar.f90	2012-08-19 01:24:46 UTC (rev 20591)
@@ -85,18 +85,12 @@
     do iinterface = 1, num_interfaces
       ! non-blocking synchronous send request
       call isend_cr(buffer_send_scalar(1:nibool_interfaces(iinterface),iinterface), &
-           nibool_interfaces(iinterface), &
-           my_neighbours(iinterface), &
-           itag, &
-           request_send_scalar(iinterface) &
-           )
+                   nibool_interfaces(iinterface),my_neighbours(iinterface), &
+                   itag,request_send_scalar(iinterface) )
       ! receive request
       call irecv_cr(buffer_recv_scalar(1:nibool_interfaces(iinterface),iinterface), &
-           nibool_interfaces(iinterface), &
-           my_neighbours(iinterface), &
-           itag, &
-           request_recv_scalar(iinterface) &
-           )
+                   nibool_interfaces(iinterface),my_neighbours(iinterface), &
+                   itag,request_recv_scalar(iinterface) )
     enddo
 
     ! wait for communications completion (recv)
@@ -177,18 +171,12 @@
     do iinterface = 1, num_interfaces
       ! non-blocking synchronous send request
       call isend_cr(buffer_send_scalar(1:nibool_interfaces(iinterface),iinterface), &
-           nibool_interfaces(iinterface), &
-           my_neighbours(iinterface), &
-           itag, &
-           request_send_scalar(iinterface) &
-           )
+                   nibool_interfaces(iinterface),my_neighbours(iinterface), &
+                   itag,request_send_scalar(iinterface))
       ! receive request
       call irecv_cr(buffer_recv_scalar(1:nibool_interfaces(iinterface),iinterface), &
-           nibool_interfaces(iinterface), &
-           my_neighbours(iinterface), &
-           itag, &
-           request_recv_scalar(iinterface) &
-           )
+                   nibool_interfaces(iinterface),my_neighbours(iinterface), &
+                   itag,request_recv_scalar(iinterface))
 
     enddo
 
@@ -303,18 +291,12 @@
     do iinterface = 1, num_interfaces
       ! non-blocking synchronous send request
       call isend_cr(buffer_send_scalar(1:nibool_interfaces(iinterface),iinterface), &
-           nibool_interfaces(iinterface), &
-           my_neighbours(iinterface), &
-           itag, &
-           request_send_scalar(iinterface) &
-           )
+                   nibool_interfaces(iinterface),my_neighbours(iinterface), &
+                   itag,request_send_scalar(iinterface))
       ! receive request
       call irecv_cr(buffer_recv_scalar(1:nibool_interfaces(iinterface),iinterface), &
-           nibool_interfaces(iinterface), &
-           my_neighbours(iinterface), &
-           itag, &
-           request_recv_scalar(iinterface) &
-           )
+                   nibool_interfaces(iinterface),my_neighbours(iinterface), &
+                   itag,request_recv_scalar(iinterface) )
 
     enddo
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_coupling.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_coupling.f90	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_coupling.f90	2012-08-19 01:24:46 UTC (rev 20591)
@@ -26,11 +26,11 @@
 !=====================================================================
 
   subroutine compute_coupling_fluid_CMB(displ_crust_mantle,b_displ_crust_mantle, &
-                            ibool_crust_mantle,ibelm_bottom_crust_mantle,  &
-                            accel_outer_core,b_accel_outer_core, &
-                            normal_top_outer_core,jacobian2D_top_outer_core, &
-                            wgllwgll_xy,ibool_outer_core,ibelm_top_outer_core, &
-                            SIMULATION_TYPE,nspec2D_top)
+                                       ibool_crust_mantle,ibelm_bottom_crust_mantle,  &
+                                       accel_outer_core,b_accel_outer_core, &
+                                       normal_top_outer_core,jacobian2D_top_outer_core, &
+                                       wgllwgll_xy,ibool_outer_core,ibelm_top_outer_core, &
+                                       SIMULATION_TYPE,nspec2D_top)
 
   implicit none
 
@@ -289,14 +289,15 @@
 
         ! get global point number
         ! corresponding points are located at the top of the outer core
-        iglob_oc = ibool_outer_core(i,j,NGLLZ,ispec_selected)
+        iglob_oc = ibool_outer_core(i,j,k_corresp,ispec_selected)
         iglob_mantle = ibool_crust_mantle(i,j,k,ispec)
 
         ! compute pressure, taking gravity into account
         if(GRAVITY_VAL) then
           pressure = RHO_TOP_OC * (- accel_outer_core(iglob_oc) &
              + minus_g_cmb *(displ_crust_mantle(1,iglob_mantle)*nx &
-             + displ_crust_mantle(2,iglob_mantle)*ny + displ_crust_mantle(3,iglob_mantle)*nz))
+                            + displ_crust_mantle(2,iglob_mantle)*ny &
+                            + displ_crust_mantle(3,iglob_mantle)*nz))
         else
           pressure = - RHO_TOP_OC * accel_outer_core(iglob_oc)
         endif
@@ -401,7 +402,7 @@
         if(GRAVITY_VAL) then
           pressure = RHO_BOTTOM_OC * (- accel_outer_core(iglob) &
              + minus_g_icb *(displ_inner_core(1,iglob_inner_core)*nx &
-             + displ_inner_core(2,iglob_inner_core)*ny + displ_inner_core(3,iglob_inner_core)*nz))
+                           + displ_inner_core(2,iglob_inner_core)*ny + displ_inner_core(3,iglob_inner_core)*nz))
         else
           pressure = - RHO_BOTTOM_OC * accel_outer_core(iglob)
         endif

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90	2012-08-19 01:24:46 UTC (rev 20591)
@@ -144,7 +144,7 @@
              ! crust/mantle region
              call compute_forces_crust_mantle_Dev( NSPEC_CRUST_MANTLE_ADJOINT,NGLOB_CRUST_MANTLE_ADJOINT, &
                   NSPEC_CRUST_MANTLE_STR_AND_ATT, &
-                  deltat, &
+                  b_deltat, &
                   b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle, &
                   phase_is_inner, &
                   b_R_xx_crust_mantle,b_R_yy_crust_mantle,b_R_xy_crust_mantle, &
@@ -159,7 +159,7 @@
              ! inner core region
              call compute_forces_inner_core_Dev( NSPEC_INNER_CORE_ADJOINT,NGLOB_INNER_CORE_ADJOINT, &
                   NSPEC_INNER_CORE_STR_AND_ATT, &
-                  deltat, &
+                  b_deltat, &
                   b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core, &
                   phase_is_inner, &
                   b_R_xx_inner_core,b_R_yy_inner_core,b_R_xy_inner_core, &
@@ -177,7 +177,7 @@
              ! crust/mantle region
              call compute_forces_crust_mantle( NSPEC_CRUST_MANTLE_ADJOINT,NGLOB_CRUST_MANTLE_ADJOINT, &
                   NSPEC_CRUST_MANTLE_STR_AND_ATT, &
-                  deltat, &
+                  b_deltat, &
                   b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle, &
                   phase_is_inner, &
                   b_R_xx_crust_mantle,b_R_yy_crust_mantle,b_R_xy_crust_mantle, &
@@ -193,7 +193,7 @@
              ! inner core region
              call compute_forces_inner_core( NSPEC_INNER_CORE_ADJOINT,NGLOB_INNER_CORE_ADJOINT, &
                   NSPEC_INNER_CORE_STR_AND_ATT, &
-                  deltat, &
+                  b_deltat, &
                   b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core, &
                   phase_is_inner, &
                   b_R_xx_inner_core,b_R_yy_inner_core,b_R_xy_inner_core, &
@@ -212,9 +212,9 @@
        ! on GPU
        ! contains both forward SIM_TYPE==1 and backward SIM_TYPE==3 simulations
        ! for crust/mantle
-       call compute_forces_crust_mantle_cuda(Mesh_pointer,deltat,iphase)
+       call compute_forces_crust_mantle_cuda(Mesh_pointer,iphase)
        ! for inner core
-       call compute_forces_inner_core_cuda(Mesh_pointer,deltat,iphase)
+       call compute_forces_inner_core_cuda(Mesh_pointer,iphase)
     endif ! GPU_MODE
 
 
@@ -280,27 +280,27 @@
           !--- couple with outer core at the bottom of the mantle
           !---
           if(ACTUALLY_COUPLE_FLUID_CMB) &
-               call compute_coupling_CMB_fluid(displ_crust_mantle,b_displ_crust_mantle, &
-               accel_crust_mantle,b_accel_crust_mantle, &
-               ibool_crust_mantle,ibelm_bottom_crust_mantle,  &
-               accel_outer_core,b_accel_outer_core, &
-               normal_top_outer_core,jacobian2D_top_outer_core, &
-               wgllwgll_xy,ibool_outer_core,ibelm_top_outer_core, &
-               RHO_TOP_OC,minus_g_cmb, &
-               SIMULATION_TYPE,NSPEC2D_BOTTOM(IREGION_CRUST_MANTLE))
+            call compute_coupling_CMB_fluid(displ_crust_mantle,b_displ_crust_mantle, &
+                                           accel_crust_mantle,b_accel_crust_mantle, &
+                                           ibool_crust_mantle,ibelm_bottom_crust_mantle,  &
+                                           accel_outer_core,b_accel_outer_core, &
+                                           normal_top_outer_core,jacobian2D_top_outer_core, &
+                                           wgllwgll_xy,ibool_outer_core,ibelm_top_outer_core, &
+                                           RHO_TOP_OC,minus_g_cmb, &
+                                           SIMULATION_TYPE,NSPEC2D_BOTTOM(IREGION_CRUST_MANTLE))
 
           !---
           !--- couple with outer core at the top of the inner core
           !---
           if(ACTUALLY_COUPLE_FLUID_ICB) &
-               call compute_coupling_ICB_fluid(displ_inner_core,b_displ_inner_core, &
-               accel_inner_core,b_accel_inner_core, &
-               ibool_inner_core,ibelm_top_inner_core,  &
-               accel_outer_core,b_accel_outer_core, &
-               normal_bottom_outer_core,jacobian2D_bottom_outer_core, &
-               wgllwgll_xy,ibool_outer_core,ibelm_bottom_outer_core, &
-               RHO_BOTTOM_OC,minus_g_icb, &
-               SIMULATION_TYPE,NSPEC2D_TOP(IREGION_INNER_CORE))
+            call compute_coupling_ICB_fluid(displ_inner_core,b_displ_inner_core, &
+                                           accel_inner_core,b_accel_inner_core, &
+                                           ibool_inner_core,ibelm_top_inner_core,  &
+                                           accel_outer_core,b_accel_outer_core, &
+                                           normal_bottom_outer_core,jacobian2D_bottom_outer_core, &
+                                           wgllwgll_xy,ibool_outer_core,ibelm_bottom_outer_core, &
+                                           RHO_BOTTOM_OC,minus_g_icb, &
+                                           SIMULATION_TYPE,NSPEC2D_TOP(IREGION_INNER_CORE))
 
        else
           ! on GPU
@@ -308,14 +308,12 @@
           !--- couple with outer core at the bottom of the mantle
           !---
           if( ACTUALLY_COUPLE_FLUID_CMB ) &
-               call compute_coupling_cmb_fluid_cuda(Mesh_pointer, &
-               RHO_TOP_OC,minus_g_cmb,GRAVITY_VAL)
+               call compute_coupling_cmb_fluid_cuda(Mesh_pointer)
           !---
           !--- couple with outer core at the top of the inner core
           !---
           if( ACTUALLY_COUPLE_FLUID_ICB ) &
-               call compute_coupling_icb_fluid_cuda(Mesh_pointer, &
-               RHO_BOTTOM_OC,minus_g_icb,GRAVITY_VAL)
+               call compute_coupling_icb_fluid_cuda(Mesh_pointer)
 
        endif
     endif ! iphase == 1
@@ -491,7 +489,7 @@
 
   enddo ! iphase
 
-  if(NCHUNKS_VAL /= 6 .and. ABSORBING_CONDITIONS) then
+  if( NCHUNKS_VAL /= 6 .and. ABSORBING_CONDITIONS ) then
      NGLOB_XY = NGLOB_CRUST_MANTLE
   else
      NGLOB_XY = 1
@@ -500,25 +498,25 @@
   ! updates (only) acceleration w/ rotation in the crust/mantle region (touches oceans)
   if(.NOT. GPU_MODE) then
      ! on CPU
-     call compute_forces_el_update_accel(NGLOB_CRUST_MANTLE,NGLOB_XY,veloc_crust_mantle,accel_crust_mantle,two_omega_earth, &
+     call compute_forces_el_update_accel(NGLOB_CRUST_MANTLE,NGLOB_XY,veloc_crust_mantle,accel_crust_mantle, &
+                                        two_omega_earth, &
                                         rmassx_crust_mantle,rmassy_crust_mantle,rmassz_crust_mantle, &
                                         NCHUNKS_VAL,ABSORBING_CONDITIONS)
-
      ! adjoint / kernel runs
      if (SIMULATION_TYPE == 3) &
           call compute_forces_el_update_accel(NGLOB_CRUST_MANTLE_ADJOINT,NGLOB_XY,b_veloc_crust_mantle,b_accel_crust_mantle, &
-                                             b_two_omega_earth,rmassx_crust_mantle,rmassy_crust_mantle,rmassz_crust_mantle, &
+                                             b_two_omega_earth, &
+                                             rmassx_crust_mantle,rmassy_crust_mantle,rmassz_crust_mantle, &
                                              NCHUNKS_VAL,ABSORBING_CONDITIONS)
-
   else
      ! on GPU
      call kernel_3_a_cuda(Mesh_pointer, &
-                         deltatover2,SIMULATION_TYPE,b_deltatover2,OCEANS_VAL,NCHUNKS_VAL)
+                         deltatover2,SIMULATION_TYPE,b_deltatover2,NCHUNKS_VAL)
   endif
 
   ! couples ocean with crust mantle
   ! (updates acceleration with ocean load approximation)
-  if(OCEANS_VAL) then
+  if( OCEANS_VAL ) then
     if(.NOT. GPU_MODE) then
       ! on CPU
       call compute_coupling_ocean(accel_crust_mantle,b_accel_crust_mantle, &
@@ -551,7 +549,7 @@
   else
     ! on GPU
     call kernel_3_b_cuda(Mesh_pointer, &
-                        deltatover2,SIMULATION_TYPE,b_deltatover2,OCEANS_VAL)
+                        deltatover2,SIMULATION_TYPE,b_deltatover2)
   endif
 
   end subroutine compute_forces_elastic
@@ -559,7 +557,8 @@
 
 !=====================================================================
 
-  subroutine compute_forces_el_update_accel(NGLOB,NGLOB_XY,veloc_crust_mantle,accel_crust_mantle,two_omega_earth, &
+  subroutine compute_forces_el_update_accel(NGLOB,NGLOB_XY,veloc_crust_mantle,accel_crust_mantle, &
+                                           two_omega_earth, &
                                            rmassx_crust_mantle,rmassy_crust_mantle,rmassz_crust_mantle, &
                                            NCHUNKS_VAL,ABSORBING_CONDITIONS)
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_outer_core_Dev.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_outer_core_Dev.f90	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_outer_core_Dev.f90	2012-08-19 01:24:46 UTC (rev 20591)
@@ -73,20 +73,21 @@
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: tempx1,tempx2,tempx3
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: newtempx1,newtempx2,newtempx3
   ! for gravity
-  integer int_radius
-  double precision radius,theta,phi,gxl,gyl,gzl
-  double precision cos_theta,sin_theta,cos_phi,sin_phi
+  integer :: int_radius
+  double precision :: radius,theta,phi,gxl,gyl,gzl
+  double precision :: cos_theta,sin_theta,cos_phi,sin_phi
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: gravity_term
   ! for the Euler scheme for rotation
-  real(kind=CUSTOM_REAL) two_omega_deltat,cos_two_omega_t,sin_two_omega_t,A_rotation,B_rotation, &
+  real(kind=CUSTOM_REAL) :: two_omega_deltat,cos_two_omega_t,sin_two_omega_t,A_rotation,B_rotation, &
        ux_rotation,uy_rotation,dpotentialdx_with_rot,dpotentialdy_with_rot
+
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: source_euler_A,source_euler_B
 
-  integer ispec,iglob
-  integer i,j,k
-  real(kind=CUSTOM_REAL) xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
-  real(kind=CUSTOM_REAL) dpotentialdxl,dpotentialdyl,dpotentialdzl
-  real(kind=CUSTOM_REAL) sum_terms
+  integer :: ispec,iglob
+  integer :: i,j,k
+  real(kind=CUSTOM_REAL) :: xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+  real(kind=CUSTOM_REAL) :: dpotentialdxl,dpotentialdyl,dpotentialdzl
+  real(kind=CUSTOM_REAL) :: sum_terms
 
   ! manually inline the calls to the Deville et al. (2002) routines
   real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc
@@ -253,9 +254,6 @@
             source_euler_B(i,j,k) = two_omega_deltat &
                   * (sin_two_omega_t * dpotentialdyl - cos_two_omega_t * dpotentialdxl)
 
-            A_rotation = A_array_rotation(i,j,k,ispec)
-            B_rotation = B_array_rotation(i,j,k,ispec)
-
             ux_rotation =   A_rotation*cos_two_omega_t + B_rotation*sin_two_omega_t
             uy_rotation = - A_rotation*sin_two_omega_t + B_rotation*cos_two_omega_t
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2012-08-19 01:24:46 UTC (rev 20591)
@@ -1149,12 +1149,14 @@
 
   ! prepares general fields on GPU
   call prepare_constants_device(Mesh_pointer,myrank,NGLLX, &
-                                  hprime_xx, hprime_yy, hprime_zz, &
+                                  hprime_xx, &
                                   hprimewgll_xx, hprimewgll_yy, hprimewgll_zz, &
                                   wgllwgll_xy, wgllwgll_xz, wgllwgll_yz, &
                                   NSOURCES, nsources_local, &
-                                  sourcearrays,islice_selected_source,ispec_selected_source, &
-                                  number_receiver_global,islice_selected_rec,ispec_selected_rec, &
+                                  sourcearrays, &
+                                  islice_selected_source,ispec_selected_source, &
+                                  number_receiver_global, &
+                                  islice_selected_rec,ispec_selected_rec, &
                                   nrec, nrec_local, nadj_rec_local, &
                                   NSPEC_CRUST_MANTLE,NGLOB_CRUST_MANTLE, &
                                   NSPEC_CRUST_MANTLE_STRAIN_ONLY, &
@@ -1163,25 +1165,28 @@
                                   NSPEC_INNER_CORE_STRAIN_ONLY, &
                                   SIMULATION_TYPE,NOISE_TOMOGRAPHY, &
                                   SAVE_FORWARD,ABSORBING_CONDITIONS, &
-                                  OCEANS_VAL,GRAVITY_VAL,ROTATION_VAL, &
+                                  OCEANS_VAL, &
+                                  GRAVITY_VAL, &
+                                  ROTATION_VAL, &
                                   ATTENUATION_VAL,ATTENUATION_NEW_VAL, &
                                   USE_ATTENUATION_MIMIC,ATTENUATION_3D_VAL, &
                                   COMPUTE_AND_STORE_STRAIN, &
                                   ANISOTROPIC_3D_MANTLE_VAL,ANISOTROPIC_INNER_CORE_VAL, &
                                   SAVE_BOUNDARY_MESH, &
                                   USE_MESH_COLORING_GPU, &
-                                  ANISOTROPIC_KL,APPROXIMATE_HESS_KL)
+                                  ANISOTROPIC_KL,APPROXIMATE_HESS_KL, &
+                                  deltat,b_deltat)
 
   ! prepares rotation arrays
   if( ROTATION_VAL ) then
     if(myrank == 0 ) write(IMAIN,*) "  loading rotation arrays"
 
     call prepare_fields_rotation_device(Mesh_pointer, &
-                                  two_omega_earth,deltat, &
-                                  A_array_rotation,B_array_rotation, &
-                                  b_two_omega_earth,b_deltat, &
-                                  b_A_array_rotation,b_B_array_rotation, &
-                                  NSPEC_OUTER_CORE_ROTATION)
+                                       two_omega_earth, &
+                                       A_array_rotation,B_array_rotation, &
+                                       b_two_omega_earth, &
+                                       b_A_array_rotation,b_B_array_rotation, &
+                                       NSPEC_OUTER_CORE_ROTATION)
   endif
 
   ! prepares arrays related to gravity
@@ -1197,18 +1202,30 @@
           cr_density_table(NRAD_GRAVITY), &
           stat=ier)
   if( ier /= 0 ) stop 'error allocating cr_minus_rho_g_over_kappa_fluid, etc...'
-  ! d_ln_density_dr_table needed for no gravity case
-  cr_d_ln_density_dr_table(:) = d_ln_density_dr_table(:)
-  ! these are needed for gravity cases only
-  cr_minus_rho_g_over_kappa_fluid(:) = minus_rho_g_over_kappa_fluid(:)
-  cr_minus_gravity_table(:) = minus_gravity_table(:)
-  cr_minus_deriv_gravity_table(:) = minus_deriv_gravity_table(:)
-  cr_density_table(:) = density_table(:)
 
   allocate(cr_wgll_cube(NGLLX,NGLLY,NGLLZ),stat=ier)
   if( ier /= 0 ) stop 'error allocating cr_wgll_cube'
-  cr_wgll_cube(:,:,:) = wgll_cube(:,:,:)
 
+  if(CUSTOM_REAL == SIZE_REAL) then
+    ! d_ln_density_dr_table needed for no gravity case
+    cr_d_ln_density_dr_table(:) = sngl(d_ln_density_dr_table(:))
+    ! these are needed for gravity cases only
+    cr_minus_rho_g_over_kappa_fluid(:) = sngl(minus_rho_g_over_kappa_fluid(:))
+    cr_minus_gravity_table(:) = sngl(minus_gravity_table(:))
+    cr_minus_deriv_gravity_table(:) = sngl(minus_deriv_gravity_table(:))
+    cr_density_table(:) = sngl(density_table(:))
+    cr_wgll_cube(:,:,:) = sngl(wgll_cube(:,:,:))
+  else
+    ! d_ln_density_dr_table needed for no gravity case
+    cr_d_ln_density_dr_table(:) = d_ln_density_dr_table(:)
+    ! these are needed for gravity cases only
+    cr_minus_rho_g_over_kappa_fluid(:) = minus_rho_g_over_kappa_fluid(:)
+    cr_minus_gravity_table(:) = minus_gravity_table(:)
+    cr_minus_deriv_gravity_table(:) = minus_deriv_gravity_table(:)
+    cr_density_table(:) = density_table(:)
+    cr_wgll_cube(:,:,:) = wgll_cube(:,:,:)
+  endif
+
   ! prepares on GPU
   call prepare_fields_gravity_device(Mesh_pointer, &
                                     cr_d_ln_density_dr_table, &
@@ -1217,7 +1234,10 @@
                                     cr_minus_deriv_gravity_table, &
                                     cr_density_table, &
                                     cr_wgll_cube, &
-                                    NRAD_GRAVITY)
+                                    NRAD_GRAVITY, &
+                                    minus_g_icb,minus_g_cmb, &
+                                    RHO_BOTTOM_OC,RHO_TOP_OC)
+
   deallocate(cr_d_ln_density_dr_table,cr_minus_rho_g_over_kappa_fluid, &
             cr_minus_gravity_table,cr_minus_deriv_gravity_table, &
             cr_density_table)
@@ -1287,10 +1307,8 @@
                                     nkmin_xi_outer_core,nkmin_eta_outer_core, &
                                     ibelm_xmin_outer_core,ibelm_xmax_outer_core, &
                                     ibelm_ymin_outer_core,ibelm_ymax_outer_core, &
-                                    ibelm_bottom_outer_core, &
                                     jacobian2D_xmin_outer_core,jacobian2D_xmax_outer_core, &
                                     jacobian2D_ymin_outer_core,jacobian2D_ymax_outer_core, &
-                                    jacobian2D_bottom_outer_core, &
                                     vp_outer_core)
 
   endif
@@ -1350,7 +1368,7 @@
 
     ! allocates arrays with all global points on ocean surface
     npoin_oceans = ipoin
-    allocate(iglob_ocean_load(npoin_oceans), &
+    allocate(ibool_ocean_load(npoin_oceans), &
             normal_ocean_load(NDIM,npoin_oceans), &
             rmass_ocean_load_selected(npoin_oceans), &
             stat=ier)
@@ -1369,7 +1387,7 @@
           if(.not. updated_dof_ocean_load(iglob)) then
             ipoin = ipoin + 1
             ! fills arrays
-            iglob_ocean_load(ipoin) = iglob
+            ibool_ocean_load(ipoin) = iglob
             rmass_ocean_load_selected(ipoin) = rmass_ocean_load(iglob)
             normal_ocean_load(:,ipoin) = normal_top_crust_mantle(:,i,j,ispec2D)
             ! masks this global point
@@ -1381,10 +1399,12 @@
 
     ! prepares arrays on GPU
     call prepare_oceans_device(Mesh_pointer,npoin_oceans, &
-                              rmass_ocean_load_selected,iglob_ocean_load,normal_ocean_load)
+                              ibool_ocean_load, &
+                              rmass_ocean_load_selected, &
+                              normal_ocean_load)
 
     ! frees memory
-    deallocate(iglob_ocean_load,rmass_ocean_load_selected,normal_ocean_load)
+    deallocate(ibool_ocean_load,rmass_ocean_load_selected,normal_ocean_load)
 
   endif
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D_par.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D_par.F90	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/specfem3D_par.F90	2012-08-19 01:24:46 UTC (rev 20591)
@@ -92,7 +92,7 @@
   logical, dimension(NGLOB_CRUST_MANTLE_OCEANS) :: updated_dof_ocean_load
 
   integer :: npoin_oceans
-  integer, dimension(:),allocatable :: iglob_ocean_load
+  integer, dimension(:),allocatable :: ibool_ocean_load
   real(kind=CUSTOM_REAL), dimension(:,:),allocatable :: normal_ocean_load
   real(kind=CUSTOM_REAL), dimension(:),allocatable :: rmass_ocean_load_selected
 

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_seismograms.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_seismograms.f90	2012-08-18 17:02:11 UTC (rev 20590)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/write_seismograms.f90	2012-08-19 01:24:46 UTC (rev 20591)
@@ -44,13 +44,14 @@
     ! gets resulting array values onto CPU
     if( GPU_MODE ) then
       ! this transfers fields only in elements with stations for efficiency
-      call write_seismograms_transfer_cuda(displ_crust_mantle,b_displ_crust_mantle, &
-                            eps_trace_over_3_crust_mantle, &
-                            epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
-                            epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
-                            Mesh_pointer,number_receiver_global, &
-                            ispec_selected_rec,ispec_selected_source, &
-                            ibool_crust_mantle)
+      call write_seismograms_transfer_cuda(Mesh_pointer, &
+                                displ_crust_mantle,b_displ_crust_mantle, &
+                                eps_trace_over_3_crust_mantle, &
+                                epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+                                epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
+                                number_receiver_global, &
+                                ispec_selected_rec,ispec_selected_source, &
+                                ibool_crust_mantle)
     endif
 
     ! computes traces at interpolated receiver locations



More information about the CIG-COMMITS mailing list