[cig-commits] r19050 - in seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src: cuda generate_databases specfem3D

Fri Oct 7 18:09:46 PDT 2011

Author: danielpeter
Date: 2011-10-07 18:09:46 -0700 (Fri, 07 Oct 2011)
New Revision: 19050

Added:
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
Modified:
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/create_mass_matrices.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/Makefile.in
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_acoustic.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_stacey_acoustic.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_stacey_elastic.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/noise_tomography.f90
   seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90
Log:
updates acoustic/elastic coupling and oceans routines

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu
===================================================================

--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu	2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu	2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
 #include <stdio.h>
 #include <cuda.h>
 #include <cublas.h>

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu	2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu	2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
 #include <stdio.h>
 #include <cuda.h>
 #include <cublas.h>
@@ -657,9 +685,9 @@
           i = threadIdx.x;
           j = threadIdx.y;
           k = threadIdx.z;
-          iglob = ibool[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)]-1;
+          iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
           
-          //kappal = kappastore[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)];
+          //kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
           
           //potential_dot_dot_acoustic[iglob] += adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5,
           //                                            pre_computed_irec_local_index[irec],

Added: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu	                        (rev 0)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu	2011-10-08 01:09:46 UTC (rev 19050)
@@ -0,0 +1,326 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+#include <mpi.h>
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ACOUSTIC - ELASTIC coupling
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void compute_coupling_acoustic_el_kernel(float* displ, 
+                                                    float* potential_dot_dot_acoustic, 
+                                                    int num_coupling_ac_el_faces,
+                                                    int* coupling_ac_el_ispec,
+                                                    int* coupling_ac_el_ijk, 
+                                                    float* coupling_ac_el_normal,
+                                                    float* coupling_ac_el_jacobian2Dw,
+                                                    int* ibool,
+                                                    int* ispec_is_inner, 
+                                                    int phase_is_inner) {
+  
+  int igll = threadIdx.x; 
+  int iface = blockIdx.x + gridDim.x*blockIdx.y; 
+  
+  int i,j,k,iglob,ispec;
+  realw displ_x,displ_y,displ_z,displ_n;
+  realw nx,ny,nz;
+  realw jacobianw;
+  
+  if( iface < num_coupling_ac_el_faces){
+  
+    // don't compute points outside NGLLSQUARE==NGLL2==25  
+    // way 2: no further check needed since blocksize = 25
+    //  if(igll<NGLL2) {    
+    
+    // "-1" from index values to convert from Fortran-> C indexing
+    ispec = coupling_ac_el_ispec[iface]-1;
+    
+    if(ispec_is_inner[ispec] == phase_is_inner ) {
+      
+      i = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1;
+      j = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
+      k = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
+      iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-  1;
+      
+      // elastic displacement on global point
+      displ_x = displ[iglob*3] ; // (1,iglob)
+      displ_y = displ[iglob*3+1] ; // (2,iglob)
+      displ_z = displ[iglob*3+2] ; // (3,iglob)
+      
+      // gets associated normal on GLL point
+      nx = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,0,igll,iface)]; // (1,igll,iface)
+      ny = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,1,igll,iface)]; // (2,igll,iface)
+      nz = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,2,igll,iface)]; // (3,igll,iface)
+
+      // calculates displacement component along normal
+      // (normal points outwards of acoustic element)
+      displ_n = displ_x*nx + displ_y*ny + displ_z*nz;
+      
+      
+      // gets associated, weighted jacobian      
+      jacobianw = coupling_ac_el_jacobian2Dw[INDEX2(NGLL2,igll,iface)];            
+      
+      //daniel
+      //if( igll == 0 ) printf("gpu: %i %i %i %i %i %e \n",i,j,k,ispec,iglob,jacobianw);
+
+
+      // continuity of pressure and normal displacement on global point
+      
+      // note: newark time scheme together with definition of scalar potential:
+      //          pressure = - chi_dot_dot
+      //          requires that this coupling term uses the updated displacement at time step [t+delta_t],
+      //          which is done at the very beginning of the time loop
+      //          (see e.g. Chaljub & Vilotte, Nissen-Meyer thesis...)
+      //          it also means you have to calculate and update this here first before
+      //          calculating the coupling on the elastic side for the acceleration...
+      atomicAdd(&potential_dot_dot_acoustic[iglob],+ jacobianw*displ_n);
+            
+    }
+  //  }  
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C" 
+void FC_FUNC_(compute_coupling_acoustic_el_cuda,
+              COMPUTE_COUPLING_ACOUSTIC_EL_CUDA)(
+                                            long* Mesh_pointer_f, 
+                                            int* phase_is_innerf, 
+                                            int* num_coupling_ac_el_facesf, 
+                                            int* SIMULATION_TYPEf) {
+  TRACE("compute_coupling_acoustic_el_cuda");
+  //double start_time = get_time();
+  
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  int phase_is_inner            = *phase_is_innerf;
+  int num_coupling_ac_el_faces  = *num_coupling_ac_el_facesf;
+  int SIMULATION_TYPE           = *SIMULATION_TYPEf;
+  
+  // way 1: exact blocksize to match NGLLSQUARE
+  int blocksize = 25; 
+  
+  int num_blocks_x = num_coupling_ac_el_faces;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_y = num_blocks_y*2;
+  }
+  
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+//daniel
+// printf("gpu: %i %i %i \n",num_coupling_ac_el_faces,SIMULATION_TYPE,phase_is_inner);
+
+    
+  compute_coupling_acoustic_el_kernel<<<grid,threads>>>(mp->d_displ,
+                                                       mp->d_potential_dot_dot_acoustic,
+                                                       num_coupling_ac_el_faces,
+                                                       mp->d_coupling_ac_el_ispec, 
+                                                       mp->d_coupling_ac_el_ijk, 
+                                                       mp->d_coupling_ac_el_normal,
+                                                       mp->d_coupling_ac_el_jacobian2Dw, 
+                                                       mp->d_ibool, 
+                                                       mp->d_ispec_is_inner, 
+                                                       phase_is_inner);
+  
+  //  adjoint simulations
+  if (SIMULATION_TYPE == 3 ){  
+    compute_coupling_acoustic_el_kernel<<<grid,threads>>>(mp->d_b_displ,
+                                                          mp->d_b_potential_dot_dot_acoustic,
+                                                          num_coupling_ac_el_faces,
+                                                          mp->d_coupling_ac_el_ispec, 
+                                                          mp->d_coupling_ac_el_ijk, 
+                                                          mp->d_coupling_ac_el_normal,
+                                                          mp->d_coupling_ac_el_jacobian2Dw, 
+                                                          mp->d_ibool, 
+                                                          mp->d_ispec_is_inner, 
+                                                          phase_is_inner);
+    
+  }
+  
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING  
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  exit_on_cuda_error("compute_coupling_acoustic_el_kernel");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ELASTIC - ACOUSTIC coupling
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void compute_coupling_elastic_ac_kernel(float* potential_dot_dot_acoustic, 
+                                                    float* accel, 
+                                                    int num_coupling_ac_el_faces,
+                                                    int* coupling_ac_el_ispec,
+                                                    int* coupling_ac_el_ijk, 
+                                                    float* coupling_ac_el_normal,
+                                                    float* coupling_ac_el_jacobian2Dw,
+                                                    int* ibool,
+                                                    int* ispec_is_inner, 
+                                                    int phase_is_inner) {
+  
+  int igll = threadIdx.x; 
+  int iface = blockIdx.x + gridDim.x*blockIdx.y; 
+  
+  int i,j,k,iglob,ispec;
+  realw pressure;
+  realw nx,ny,nz;
+  realw jacobianw;
+  
+  if( iface < num_coupling_ac_el_faces){
+    
+    // don't compute points outside NGLLSQUARE==NGLL2==25  
+    // way 2: no further check needed since blocksize = 25
+    //  if(igll<NGLL2) {    
+    
+    // "-1" from index values to convert from Fortran-> C indexing
+    ispec = coupling_ac_el_ispec[iface]-1;
+    
+    if(ispec_is_inner[ispec] == phase_is_inner ) {
+      
+      i = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1;
+      j = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
+      k = coupling_ac_el_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
+      iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-  1;
+      
+      // acoustic pressure on global point
+      pressure = - potential_dot_dot_acoustic[iglob];
+            
+      // gets associated normal on GLL point
+      nx = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,0,igll,iface)]; // (1,igll,iface)
+      ny = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,1,igll,iface)]; // (2,igll,iface)
+      nz = coupling_ac_el_normal[INDEX3(NDIM,NGLL2,2,igll,iface)]; // (3,igll,iface)
+      
+      // gets associated, weighted jacobian      
+      jacobianw = coupling_ac_el_jacobian2Dw[INDEX2(NGLL2,igll,iface)];            
+      
+      //daniel
+      //if( igll == 0 ) printf("gpu: %i %i %i %i %i %e \n",i,j,k,ispec,iglob,jacobianw);
+      
+      
+      // continuity of displacement and pressure on global point
+      //
+      // note: newark time scheme together with definition of scalar potential:
+      //          pressure = - chi_dot_dot
+      //          requires that this coupling term uses the *UPDATED* pressure (chi_dot_dot), i.e.
+      //          pressure at time step [t + delta_t]
+      //          (see e.g. Chaljub & Vilotte, Nissen-Meyer thesis...)
+      //          it means you have to calculate and update the acoustic pressure first before
+      //          calculating this term...
+      atomicAdd(&accel[iglob*3],+ jacobianw*nx*pressure);
+      atomicAdd(&accel[iglob*3+1],+ jacobianw*ny*pressure);
+      atomicAdd(&accel[iglob*3+2],+ jacobianw*nz*pressure);
+    }
+    //  }  
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C" 
+void FC_FUNC_(compute_coupling_elastic_ac_cuda,
+              COMPUTE_COUPLING_ELASTIC_AC_CUDA)(
+                                                 long* Mesh_pointer_f, 
+                                                 int* phase_is_innerf, 
+                                                 int* num_coupling_ac_el_facesf, 
+                                                 int* SIMULATION_TYPEf) {
+  TRACE("compute_coupling_elastic_ac_cuda");
+  //double start_time = get_time();
+  
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+  int phase_is_inner            = *phase_is_innerf;
+  int num_coupling_ac_el_faces  = *num_coupling_ac_el_facesf;
+  int SIMULATION_TYPE           = *SIMULATION_TYPEf;
+  
+  // way 1: exact blocksize to match NGLLSQUARE
+  int blocksize = 25; 
+  
+  int num_blocks_x = num_coupling_ac_el_faces;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_y = num_blocks_y*2;
+  }
+  
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+  
+  //daniel
+  // printf("gpu: %i %i %i \n",num_coupling_ac_el_faces,SIMULATION_TYPE,phase_is_inner);
+  
+  
+  compute_coupling_elastic_ac_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
+                                                       mp->d_accel,
+                                                       num_coupling_ac_el_faces,
+                                                       mp->d_coupling_ac_el_ispec, 
+                                                       mp->d_coupling_ac_el_ijk, 
+                                                       mp->d_coupling_ac_el_normal,
+                                                       mp->d_coupling_ac_el_jacobian2Dw, 
+                                                       mp->d_ibool, 
+                                                       mp->d_ispec_is_inner, 
+                                                       phase_is_inner);
+  
+  //  adjoint simulations
+  if (SIMULATION_TYPE == 3 ){  
+    compute_coupling_elastic_ac_kernel<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
+                                                         mp->d_b_accel,
+                                                         num_coupling_ac_el_faces,
+                                                         mp->d_coupling_ac_el_ispec, 
+                                                         mp->d_coupling_ac_el_ijk, 
+                                                         mp->d_coupling_ac_el_normal,
+                                                         mp->d_coupling_ac_el_jacobian2Dw, 
+                                                         mp->d_ibool, 
+                                                         mp->d_ispec_is_inner, 
+                                                         phase_is_inner);
+    
+  }
+  
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING  
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+  exit_on_cuda_error("compute_coupling_elastic_ac_cuda");
+#endif
+}

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu	2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu	2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,32 +1,31 @@
 /*
-!=====================================================================
-!
-!               S p e c f e m 3 D  V e r s i o n  2 . 0
-!               ---------------------------------------
-!
-!          Main authors: Dimitri Komatitsch and Jeroen Tromp
-!    Princeton University, USA and University of Pau / CNRS / INRIA
-! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
-!                            April 2011
-!
-! This program is free software; you can redistribute it and/or modify
-! it under the terms of the GNU General Public License as published by
-! the Free Software Foundation; either version 2 of the License, or
-! (at your option) any later version.
-!
-! This program is distributed in the hope that it will be useful,
-! but WITHOUT ANY WARRANTY; without even the implied warranty of
-! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-! GNU General Public License for more details.
-!
-! You should have received a copy of the GNU General Public License along
-! with this program; if not, write to the Free Software Foundation, Inc.,
-! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-!
-!=====================================================================
-*/
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
 
-
 #include <stdio.h>
 #include <cuda.h>
 #include <cublas.h>
@@ -51,8 +50,6 @@
                                                      int* d_ibool_interfaces_ext_mesh) {
 
   int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
-  //int bx = blockIdx.y*gridDim.x+blockIdx.x;
-  //int tx = threadIdx.x;
   int iinterface=0;  
   
   for( iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
@@ -144,8 +141,6 @@
                                                       int* d_ibool_interfaces_ext_mesh) {
 
   int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
-  //int bx = blockIdx.y*gridDim.x+blockIdx.x;
-  //int tx = threadIdx.x;
   int iinterface=0;  
 
   for( iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
@@ -186,11 +181,19 @@
 TRACE("transfer_and_assemble_potential_to_device");
 
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+  //double start_time = get_time();
+  // cudaEvent_t start, stop; 
+  // float time; 
+  // cudaEventCreate(&start); 
+  // cudaEventCreate(&stop); 
+  // cudaEventRecord( start, 0 );
 
+  // copies buffer onto GPU  
   cudaMemcpy(mp->d_send_potential_dot_dot_buffer, buffer_recv_scalar_ext_mesh, 
              *max_nibool_interfaces_ext_mesh* *num_interfaces_ext_mesh*sizeof(real), cudaMemcpyHostToDevice);
 
-  int blocksize = 256;
+  // assembles on GPU
+  int blocksize = 256;  
   int size_padded = ((int)ceil(((double)*max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
   int num_blocks_x = size_padded/blocksize;
   int num_blocks_y = 1;
@@ -201,12 +204,7 @@
 
   dim3 grid(num_blocks_x,num_blocks_y);
   dim3 threads(blocksize,1,1);
-  //double start_time = get_time();
-  // cudaEvent_t start, stop; 
-  // float time; 
-  // cudaEventCreate(&start); 
-  // cudaEventCreate(&stop); 
-  // cudaEventRecord( start, 0 );
+
   if(*FORWARD_OR_ADJOINT == 1) { 
     //assemble forward field
     assemble_boundary_potential_on_device<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic, 
@@ -226,13 +224,15 @@
                                                             mp->d_ibool_interfaces_ext_mesh);    
   }
   
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   // cudaEventRecord( stop, 0 );
   // cudaEventSynchronize( stop );
   // cudaEventElapsedTime( &time, start, stop );
   // cudaEventDestroy( start );
   // cudaEventDestroy( stop );
   // printf("Boundary Assemble Kernel Execution Time: %f ms\n",time);
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
   exit_on_cuda_error("transfer_and_assemble_potential_to_device");
 #endif  
 }
@@ -267,6 +267,7 @@
                                             int* SIMULATION_TYPE) {
 
 TRACE("compute_forces_acoustic_cuda");
+  //double start_time = get_time();
   
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
 
@@ -284,12 +285,19 @@
   Kernel_2_acoustic(num_elements, mp, *iphase, *SIMULATION_TYPE);  
   
   cudaThreadSynchronize();
-/* MPI_Barrier(MPI_COMM_WORLD); */
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  /* MPI_Barrier(MPI_COMM_WORLD); */
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+#endif
 }
 
 
 /* ----------------------------------------------------------------------------------------------- */
+
 /* KERNEL 2 */
+
 /* ----------------------------------------------------------------------------------------------- */
 
 void Kernel_2_acoustic(int nb_blocks_to_compute, Mesh* mp, int d_iphase, int SIMULATION_TYPE)
@@ -358,12 +366,13 @@
 #endif
 }
 
+/* ----------------------------------------------------------------------------------------------- */
 
 /* KERNEL 2 on device*/
 
-//typedef double reald;
-//typedef float reald;
+/* ----------------------------------------------------------------------------------------------- */
 
+
 __global__ void Kernel_2_acoustic_impl(int nb_blocks_to_compute,int NGLOB, int* d_ibool,
                                       int* d_phase_ispec_inner_acoustic, 
                                       int num_phase_ispec_acoustic, int d_iphase,
@@ -374,15 +383,9 @@
                                       float* wgllwgll_xy,float* wgllwgll_xz,float* wgllwgll_yz,
                                       float* d_rhostore){
     
-  /* int bx = blockIdx.y*blockDim.x+blockIdx.x; //possible bug in original code*/
   int bx = blockIdx.y*gridDim.x+blockIdx.x;
-  /* int bx = blockIdx.x; */
   int tx = threadIdx.x;
 
-  
-  
-  //const int NGLLX = 5;
-  // const int NGLL2 = 25; 
   const int NGLL3 = 125;
   const int NGLL3_ALIGN = 128;
     
@@ -466,23 +469,23 @@
         }
 #else
 
-            temp1l = s_dummy_loc[K*NGLL2+J*NGLLX]*hprime_xx[I]
-                    + s_dummy_loc[K*NGLL2+J*NGLLX+1]*hprime_xx[NGLLX+I]
-                    + s_dummy_loc[K*NGLL2+J*NGLLX+2]*hprime_xx[2*NGLLX+I]
-                    + s_dummy_loc[K*NGLL2+J*NGLLX+3]*hprime_xx[3*NGLLX+I]
-                    + s_dummy_loc[K*NGLL2+J*NGLLX+4]*hprime_xx[4*NGLLX+I];	    
-	    
-            temp2l = s_dummy_loc[K*NGLL2+I]*hprime_xx[J]
-                    + s_dummy_loc[K*NGLL2+NGLLX+I]*hprime_xx[NGLLX+J]
-                    + s_dummy_loc[K*NGLL2+2*NGLLX+I]*hprime_xx[2*NGLLX+J]
-                    + s_dummy_loc[K*NGLL2+3*NGLLX+I]*hprime_xx[3*NGLLX+J]
-                    + s_dummy_loc[K*NGLL2+4*NGLLX+I]*hprime_xx[4*NGLLX+J];
+        temp1l = s_dummy_loc[K*NGLL2+J*NGLLX]*hprime_xx[I]
+                + s_dummy_loc[K*NGLL2+J*NGLLX+1]*hprime_xx[NGLLX+I]
+                + s_dummy_loc[K*NGLL2+J*NGLLX+2]*hprime_xx[2*NGLLX+I]
+                + s_dummy_loc[K*NGLL2+J*NGLLX+3]*hprime_xx[3*NGLLX+I]
+                + s_dummy_loc[K*NGLL2+J*NGLLX+4]*hprime_xx[4*NGLLX+I];	    
+  
+        temp2l = s_dummy_loc[K*NGLL2+I]*hprime_xx[J]
+                + s_dummy_loc[K*NGLL2+NGLLX+I]*hprime_xx[NGLLX+J]
+                + s_dummy_loc[K*NGLL2+2*NGLLX+I]*hprime_xx[2*NGLLX+J]
+                + s_dummy_loc[K*NGLL2+3*NGLLX+I]*hprime_xx[3*NGLLX+J]
+                + s_dummy_loc[K*NGLL2+4*NGLLX+I]*hprime_xx[4*NGLLX+J];
 
-            temp3l = s_dummy_loc[J*NGLLX+I]*hprime_xx[K]
-                    + s_dummy_loc[NGLL2+J*NGLLX+I]*hprime_xx[NGLLX+K]
-                    + s_dummy_loc[2*NGLL2+J*NGLLX+I]*hprime_xx[2*NGLLX+K]
-                    + s_dummy_loc[3*NGLL2+J*NGLLX+I]*hprime_xx[3*NGLLX+K]
-                    + s_dummy_loc[4*NGLL2+J*NGLLX+I]*hprime_xx[4*NGLLX+K];
+        temp3l = s_dummy_loc[J*NGLLX+I]*hprime_xx[K]
+                + s_dummy_loc[NGLL2+J*NGLLX+I]*hprime_xx[NGLLX+K]
+                + s_dummy_loc[2*NGLL2+J*NGLLX+I]*hprime_xx[2*NGLLX+K]
+                + s_dummy_loc[3*NGLL2+J*NGLLX+I]*hprime_xx[3*NGLLX+K]
+                + s_dummy_loc[4*NGLL2+J*NGLLX+I]*hprime_xx[4*NGLLX+K];
 
 #endif
 
@@ -546,25 +549,25 @@
         }
 #else
 
-            temp1l = s_temp1[K*NGLL2+J*NGLLX]*hprimewgll_xx[I*NGLLX]
-                    + s_temp1[K*NGLL2+J*NGLLX+1]*hprimewgll_xx[I*NGLLX+1]
-                    + s_temp1[K*NGLL2+J*NGLLX+2]*hprimewgll_xx[I*NGLLX+2]
-                    + s_temp1[K*NGLL2+J*NGLLX+3]*hprimewgll_xx[I*NGLLX+3]
-                    + s_temp1[K*NGLL2+J*NGLLX+4]*hprimewgll_xx[I*NGLLX+4];
-	    
+        temp1l = s_temp1[K*NGLL2+J*NGLLX]*hprimewgll_xx[I*NGLLX]
+                + s_temp1[K*NGLL2+J*NGLLX+1]*hprimewgll_xx[I*NGLLX+1]
+                + s_temp1[K*NGLL2+J*NGLLX+2]*hprimewgll_xx[I*NGLLX+2]
+                + s_temp1[K*NGLL2+J*NGLLX+3]*hprimewgll_xx[I*NGLLX+3]
+                + s_temp1[K*NGLL2+J*NGLLX+4]*hprimewgll_xx[I*NGLLX+4];
+  
 
-            temp2l = s_temp2[K*NGLL2+I]*hprimewgll_xx[J*NGLLX]
-                    + s_temp2[K*NGLL2+NGLLX+I]*hprimewgll_xx[J*NGLLX+1]
-                    + s_temp2[K*NGLL2+2*NGLLX+I]*hprimewgll_xx[J*NGLLX+2]
-                    + s_temp2[K*NGLL2+3*NGLLX+I]*hprimewgll_xx[J*NGLLX+3]
-                    + s_temp2[K*NGLL2+4*NGLLX+I]*hprimewgll_xx[J*NGLLX+4];
+        temp2l = s_temp2[K*NGLL2+I]*hprimewgll_xx[J*NGLLX]
+                + s_temp2[K*NGLL2+NGLLX+I]*hprimewgll_xx[J*NGLLX+1]
+                + s_temp2[K*NGLL2+2*NGLLX+I]*hprimewgll_xx[J*NGLLX+2]
+                + s_temp2[K*NGLL2+3*NGLLX+I]*hprimewgll_xx[J*NGLLX+3]
+                + s_temp2[K*NGLL2+4*NGLLX+I]*hprimewgll_xx[J*NGLLX+4];
 
 
-            temp3l = s_temp3[J*NGLLX+I]*hprimewgll_xx[K*NGLLX]
-                    + s_temp3[NGLL2+J*NGLLX+I]*hprimewgll_xx[K*NGLLX+1]
-                    + s_temp3[2*NGLL2+J*NGLLX+I]*hprimewgll_xx[K*NGLLX+2]
-                    + s_temp3[3*NGLL2+J*NGLLX+I]*hprimewgll_xx[K*NGLLX+3]
-                    + s_temp3[4*NGLL2+J*NGLLX+I]*hprimewgll_xx[K*NGLLX+4];
+        temp3l = s_temp3[J*NGLLX+I]*hprimewgll_xx[K*NGLLX]
+                + s_temp3[NGLL2+J*NGLLX+I]*hprimewgll_xx[K*NGLLX+1]
+                + s_temp3[2*NGLL2+J*NGLLX+I]*hprimewgll_xx[K*NGLLX+2]
+                + s_temp3[3*NGLL2+J*NGLLX+I]*hprimewgll_xx[K*NGLLX+3]
+                + s_temp3[4*NGLL2+J*NGLLX+I]*hprimewgll_xx[K*NGLLX+4];
 
 
 #endif
@@ -582,7 +585,7 @@
       // d_accel[iglob*3 + 1] -= (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
       // d_accel[iglob*3 + 2] -= (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);		
 	
-	atomicAdd(&d_potential_dot_dot_acoustic[iglob],-(fac1*temp1l + fac2*temp2l + fac3*temp3l));		
+        atomicAdd(&d_potential_dot_dot_acoustic[iglob],-(fac1*temp1l + fac2*temp2l + fac3*temp3l));		
 	
 #endif
     }
@@ -594,7 +597,9 @@
 
 
 /* ----------------------------------------------------------------------------------------------- */
+
 /* KERNEL 3 */
+
 /* ----------------------------------------------------------------------------------------------- */
 
 
@@ -717,11 +722,11 @@
 
 
 /* ----------------------------------------------------------------------------------------------- */
+
 /* KERNEL for enforce free surface */
+
 /* ----------------------------------------------------------------------------------------------- */
 
-#define INDEX(i,j,k,ispec) i + (j)*5 + (k)*25 + (ispec)*125
-#define INDEX_IJK(x,y,z) x + (y)*3 + (z)*3*25
 
 __global__ void enforce_free_surface_cuda_kernel(
                                        float* potential_acoustic, 
@@ -748,17 +753,17 @@
     if( ispec_is_acoustic[ispec] == 1 ){
   
       // gets global point index
-      int tx = threadIdx.x + threadIdx.y*blockDim.x;  
+      int igll = threadIdx.x + threadIdx.y*blockDim.x;  
 
 //#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING    
-//      if( tx > 25-1 ){printf("device tx: %i \n",tx);}
+//      if( igll > 25-1 ){printf("device igll: %i \n",igll);}
 //#endif
     
-      int i = free_surface_ijk[INDEX_IJK(0,tx,iface)] - 1;
-      int j = free_surface_ijk[INDEX_IJK(1,tx,iface)] - 1;
-      int k = free_surface_ijk[INDEX_IJK(2,tx,iface)] - 1;
+      int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1; // (1,igll,iface)
+      int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
+      int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
       
-      int iglob = ibool[INDEX(i,j,k,ispec)] - 1;
+      int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1;
       
       // sets potentials to zero at free surface
       potential_acoustic[iglob] = 0;
@@ -766,7 +771,7 @@
       potential_dot_dot_acoustic[iglob] = 0; 
 
 //#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING    
-//    if( ispec == 160 && tx < 25 ){printf("device: %i %i %i %i %i \n",tx,i,j,k,iglob);}
+//    if( ispec == 160 && igll < 25 ){printf("device: %i %i %i %i %i \n",igll,i,j,k,iglob);}
 //#endif
     }
   }  

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu	2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu	2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
 #include <stdio.h>
 #include <cuda.h>
 #include <cublas.h>
@@ -44,9 +72,6 @@
                               float* R_xx,float* R_yy,float* R_xy,float* R_xz,float* R_yz,
                               float* alphaval,float* betaval,float* gammaval);
 
-__global__ void kernel_3_cuda_device(real* veloc,
-                                     real* accel, int size,
-                                     real deltatover2, real* rmass);
 
 /* ----------------------------------------------------------------------------------------------- */
 
@@ -264,6 +289,7 @@
 TRACE("compute_forces_elastic_cuda");  
 // EPIK_TRACER("compute_forces_elastic_cuda");
 //printf("Running compute_forces\n");
+  //double start_time = get_time();
   
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
 
@@ -278,11 +304,15 @@
   /* MPI_Comm_rank(MPI_COMM_WORLD,&myrank); */
   /* if(myrank==0) { */
 
-  Kernel_2(num_elements,mp,*iphase,*COMPUTE_AND_STORE_STRAIN,*SIMULATION_TYPE,*ATTENUATION);
+  Kernel_2(num_elements,mp,*iphase,*COMPUTE_AND_STORE_STRAIN,*SIMULATION_TYPE,*ATTENUATION);  
   
+  cudaThreadSynchronize();
   
-  cudaThreadSynchronize();
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
 /* MPI_Barrier(MPI_COMM_WORLD); */
+  //double end_time = get_time();
+  //printf("Elapsed time: %e\n",end_time-start_time);
+#endif
 }
 
 
@@ -1037,64 +1067,155 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
+__global__ void kernel_3_cuda_device(real* veloc,
+                                     real* accel, int size,
+                                     real deltatover2, real* rmass) {
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+  
+  /* because of block and grid sizing problems, there is a small */
+  /* amount of buffer at the end of the calculation */
+  if(id < size) {
+    accel[3*id] = accel[3*id]*rmass[id]; 
+    accel[3*id+1] = accel[3*id+1]*rmass[id]; 
+    accel[3*id+2] = accel[3*id+2]*rmass[id];
+    
+    veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
+    veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1];
+    veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2];      
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void kernel_3_accel_cuda_device(real* accel, 
+                                           int size,
+                                           real* rmass) {
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+  
+  /* because of block and grid sizing problems, there is a small */
+  /* amount of buffer at the end of the calculation */
+  if(id < size) {
+    accel[3*id] = accel[3*id]*rmass[id]; 
+    accel[3*id+1] = accel[3*id+1]*rmass[id]; 
+    accel[3*id+2] = accel[3*id+2]*rmass[id];    
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void kernel_3_veloc_cuda_device(real* veloc,
+                                           real* accel, 
+                                           int size,
+                                           real deltatover2) {
+  int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+  
+  /* because of block and grid sizing problems, there is a small */
+  /* amount of buffer at the end of the calculation */
+  if(id < size) {
+    veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
+    veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1];
+    veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2];      
+  }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
 extern "C" 
-void FC_FUNC_(kernel_3_cuda,
-              KERNEL_3_CUDA)(long* Mesh_pointer,
-                             int* size_F, 
-                             float* deltatover2_F, 
-                             int* SIMULATION_TYPE_f, 
-                             float* b_deltatover2) {
-TRACE("kernel_3_cuda");  
+void FC_FUNC_(kernel_3_a_cuda,
+              KERNEL_3_A_CUDA)(long* Mesh_pointer,
+                               int* size_F, 
+                               float* deltatover2_F, 
+                               int* SIMULATION_TYPE_f, 
+                               float* b_deltatover2_F,
+                               int* OCEANS) {
+TRACE("kernel_3_a_cuda");  
 
    Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
    int size = *size_F;
    int SIMULATION_TYPE = *SIMULATION_TYPE_f;
    real deltatover2 = *deltatover2_F;
+   real b_deltatover2 = *b_deltatover2_F;
+  
    int blocksize=128;
    int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+   
    int num_blocks_x = size_padded/blocksize;
    int num_blocks_y = 1;
    while(num_blocks_x > 65535) {
      num_blocks_x = ceil(num_blocks_x/2.0);
      num_blocks_y = num_blocks_y*2;
    }
+   
    dim3 grid(num_blocks_x,num_blocks_y);
    dim3 threads(blocksize,1,1);
    
-   kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc, mp->d_accel, size, deltatover2, mp->d_rmass);
+   // check whether we can update accel and veloc, or only accel at this point
+   if( *OCEANS == 0 ){
+     // updates both, accel and veloc
+     kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc, mp->d_accel, size, deltatover2, mp->d_rmass);
 
-   if(SIMULATION_TYPE == 3) {
-     kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc, mp->d_b_accel, size, *b_deltatover2,mp->d_rmass);
+     if(SIMULATION_TYPE == 3) {
+       kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc, mp->d_b_accel, size, b_deltatover2,mp->d_rmass);
+     }
+   }else{
+     // updates only accel 
+     kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_accel, size, mp->d_rmass);
+     
+     if(SIMULATION_TYPE == 3) {
+       kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_b_accel, size, mp->d_rmass);
+     }     
    }
    
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
    //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
-  exit_on_cuda_error("after kernel 3");
+  exit_on_cuda_error("after kernel 3 a");
 #endif
 }
 
 /* ----------------------------------------------------------------------------------------------- */
 
- __global__ void kernel_3_cuda_device(real* veloc,
-					real* accel, int size,
-					real deltatover2, real* rmass) {
-    int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+extern "C" 
+void FC_FUNC_(kernel_3_b_cuda,
+              KERNEL_3_B_CUDA)(long* Mesh_pointer,
+                             int* size_F, 
+                             float* deltatover2_F, 
+                             int* SIMULATION_TYPE_f, 
+                             float* b_deltatover2_F) {
+  TRACE("kernel_3_b_cuda");  
+  
+  Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+  int size = *size_F;
+  int SIMULATION_TYPE = *SIMULATION_TYPE_f;
+  real deltatover2 = *deltatover2_F;
+  real b_deltatover2 = *b_deltatover2_F;
+  
+  int blocksize=128;
+  int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
 
-    /* because of block and grid sizing problems, there is a small */
-    /* amount of buffer at the end of the calculation */
-    if(id < size) {
-      accel[3*id] = accel[3*id]*rmass[id]; 
-      accel[3*id+1] = accel[3*id+1]*rmass[id]; 
-      accel[3*id+2] = accel[3*id+2]*rmass[id];
-      
-      veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
-      veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1];
-      veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2];      
-    }
+  int num_blocks_x = size_padded/blocksize;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_y = num_blocks_y*2;
   }
 
-/* ----------------------------------------------------------------------------------------------- */
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+  
+  // updates only veloc at this point
+  kernel_3_veloc_cuda_device<<< grid, threads>>>(mp->d_veloc,mp->d_accel,size,deltatover2);
+    
+  if(SIMULATION_TYPE == 3) {
+    kernel_3_veloc_cuda_device<<< grid, threads>>>(mp->d_b_veloc,mp->d_b_accel,size,b_deltatover2);
+  }     
+  
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
+  exit_on_cuda_error("after kernel 3 b");
+#endif
+}
 
+
 /* ----------------------------------------------------------------------------------------------- */
 
 /* note: 
@@ -1241,3 +1362,129 @@
   }      
   
 }
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/* KERNEL for ocean load on free surface */
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+__global__ void elastic_ocean_load_cuda_kernel(float* accel, 
+                                                 float* rmass,
+                                                 float* rmass_ocean_load, 
+                                                 int num_free_surface_faces,
+                                                 int* free_surface_ispec,
+                                                 int* free_surface_ijk,
+                                                 float* free_surface_normal,
+                                                 int* ibool,
+                                                 int* updated_dof_ocean_load) {
+  // gets spectral element face id
+  int igll = threadIdx.x ;  //  threadIdx.y*blockDim.x will be always = 0 for thread block (25,1,1)
+  int iface = blockIdx.x + gridDim.x*blockIdx.y;   
+  realw nx,ny,nz;
+  realw force_normal_comp,additional_term;
+  
+  // for all faces on free surface
+  if( iface < num_free_surface_faces ){
+    
+    int ispec = free_surface_ispec[iface]-1;
+        
+    // gets global point index            
+    int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1; // (1,igll,iface)
+    int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
+    int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
+      
+    int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1;
+      
+    // only update this global point once
+    // daniel: todo - workaround to not use the temporary update array
+    //            atomicExch returns the old value, i.e. 0 indicates that we still have to do this point
+    if( atomicExch(&updated_dof_ocean_load[iglob],1) == 0){
+      
+      // get normal
+      nx = free_surface_normal[INDEX3(NDIM,NGLL2,0,igll,iface)]; //(1,igll,iface)
+      ny = free_surface_normal[INDEX3(NDIM,NGLL2,1,igll,iface)];
+      nz = free_surface_normal[INDEX3(NDIM,NGLL2,2,igll,iface)];      
+      
+      // make updated component of right-hand side
+      // we divide by rmass() which is 1 / M
+      // we use the total force which includes the Coriolis term above
+      force_normal_comp = ( accel[iglob*3]*nx + accel[iglob*3+1]*ny + accel[iglob*3+2]*nz ) / rmass[iglob];
+      
+      additional_term = (rmass_ocean_load[iglob] - rmass[iglob]) * force_normal_comp;
+      
+      // daniel: probably wouldn't need atomicAdd anymore, but just to be sure...
+      atomicAdd(&accel[iglob*3], + additional_term * nx);
+      atomicAdd(&accel[iglob*3+1], + additional_term * ny);
+      atomicAdd(&accel[iglob*3+2], + additional_term * nz);      
+    }
+  }  
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(elastic_ocean_load_cuda,
+              ELASTIC_OCEAN_LOAD_CUDA)(long* Mesh_pointer_f, 
+                                       int* SIMULATION_TYPE) {
+  
+TRACE("elastic_ocean_load_cuda");
+  
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  // checks if anything to do
+  if( mp->num_free_surface_faces == 0 ) return;
+  
+  // block sizes: exact blocksize to match NGLLSQUARE
+  int blocksize = 25;
+  
+  int num_blocks_x = mp->num_free_surface_faces;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_y = num_blocks_y*2;
+  }
+  
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(blocksize,1,1);
+
+  // temporary global array: used to synchronize updates on global accel array
+  int* d_updated_dof_ocean_load;
+  print_CUDA_error_if_any(cudaMalloc((void**)&(d_updated_dof_ocean_load),sizeof(int)*mp->NGLOB_AB),88501);
+  // initializes array
+  cudaMemset((void*)d_updated_dof_ocean_load,0,sizeof(int)*mp->NGLOB_AB);
+  
+  elastic_ocean_load_cuda_kernel<<<grid,threads>>>(mp->d_accel, 
+                                                   mp->d_rmass, 
+                                                   mp->d_rmass_ocean_load,
+                                                   mp->num_free_surface_faces,
+                                                   mp->d_free_surface_ispec, 
+                                                   mp->d_free_surface_ijk,
+                                                   mp->d_free_surface_normal,
+                                                   mp->d_ibool,
+                                                   d_updated_dof_ocean_load);
+  // for backward/reconstructed potentials
+  if(*SIMULATION_TYPE == 3) {
+    // re-initializes array
+    cudaMemset(d_updated_dof_ocean_load,0,sizeof(int)*mp->NGLOB_AB);
+
+    elastic_ocean_load_cuda_kernel<<<grid,threads>>>(mp->d_b_accel, 
+                                                       mp->d_rmass, 
+                                                       mp->d_rmass_ocean_load,
+                                                       mp->num_free_surface_faces,
+                                                       mp->d_free_surface_ispec, 
+                                                       mp->d_free_surface_ijk,
+                                                       mp->d_free_surface_normal,
+                                                       mp->d_ibool,
+                                                       d_updated_dof_ocean_load);      
+      
+  }
+  
+  cudaFree(d_updated_dof_ocean_load);
+  
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("enforce_free_surface_cuda");
+#endif  
+}

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu	2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu	2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
 #include <stdio.h>
 #include <cuda.h>
 #include <cublas.h>
@@ -185,15 +213,15 @@
     int ispec = free_surface_ispec[iface]-1;
     int igll = threadIdx.x;        
     int ipoin = igll + 25*iface;
-    int i = free_surface_ijk[INDEX3(3,25,0,igll,iface)]-1;
-    int j = free_surface_ijk[INDEX3(3,25,0,igll,iface)]-1;
-    int k = free_surface_ijk[INDEX3(3,25,0,igll,iface)]-1;
+    int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
+    int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
+    int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
     
     int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
     
-    float eta = (noise_surface_movie[INDEX3(3,25,0,igll,iface)]*normal_x_noise[ipoin]+
-		 noise_surface_movie[INDEX3(3,25,1,igll,iface)]*normal_y_noise[ipoin]+ 
-		 noise_surface_movie[INDEX3(3,25,2,igll,iface)]*normal_z_noise[ipoin]);
+    float eta = ( noise_surface_movie[INDEX3(NDIM,NGLL2,0,igll,iface)]*normal_x_noise[ipoin]+
+                 noise_surface_movie[INDEX3(NDIM,NGLL2,1,igll,iface)]*normal_y_noise[ipoin]+ 
+                 noise_surface_movie[INDEX3(NDIM,NGLL2,2,igll,iface)]*normal_z_noise[ipoin]);
 
     // if(ijk_ispec == 78496) {
     //   d_debug[0] = Sigma_kl[ijk_ispec];

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu	2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu	2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
 #include <stdio.h>
 #include <cuda.h>
 #include <cublas.h>
@@ -16,30 +44,33 @@
                                                float* potential_dot_dot_acoustic, 
                                                int* abs_boundary_ispec,
                                                int* abs_boundary_ijk, 
+                                               real* abs_boundary_jacobian2Dw,
                                                int* ibool,
                                                float* rhostore,
                                                float* kappastore,
-                                               real* abs_boundary_jacobian2Dw,
                                                int* ispec_is_inner, 
                                                int* ispec_is_acoustic,
                                                int phase_is_inner,
                                                int SIMULATION_TYPE, int SAVE_FORWARD,
+                                               int num_abs_boundary_faces,
                                                float* b_potential_dot_acoustic,
                                                float* b_potential_dot_dot_acoustic,
-                                               float* b_absorb_potential // ,float* debug_val,int* debug_val_int
+                                               float* b_absorb_potential 
                                                ) {
 
-  int igll = threadIdx.x; // tx
-  int iface = blockIdx.x + gridDim.x*blockIdx.y; // bx
-
+  int igll = threadIdx.x; 
+  int iface = blockIdx.x + gridDim.x*blockIdx.y; 
   
   int i,j,k,iglob,ispec;
   realw rhol,kappal,cpl;
   realw jacobianw;
   
+  
   // don't compute points outside NGLLSQUARE==NGLL2==25  
   // way 2: no further check needed since blocksize = 25
-  //  if(igll<NGLL2) {    
+  if( iface < num_abs_boundary_faces){
+  
+  //  if(igll<NGLL2 && iface < num_abs_boundary_faces) {    
     
     // "-1" from index values to convert from Fortran-> C indexing
     ispec = abs_boundary_ispec[iface]-1;
@@ -49,12 +80,12 @@
       i = abs_boundary_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
       j = abs_boundary_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
       k = abs_boundary_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
-      iglob = ibool[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)]-1;
+      iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
       
       // determines bulk sound speed
       rhol = rhostore[INDEX4_PADDED(NGLLX,NGLLX,NGLLX,i,j,k,ispec)];
 
-      kappal = kappastore[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)];
+      kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
 
       cpl = sqrt( kappal / rhol );
             
@@ -77,7 +108,7 @@
       
     }
 //  }
-
+  }
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -87,7 +118,6 @@
               COMPUTE_STACEY_ACOUSTIC_CUDA)(
                                     long* Mesh_pointer_f, 
                                     int* phase_is_innerf, 
-                                    int* num_abs_boundary_facesf, 
                                     int* SIMULATION_TYPEf, 
                                     int* SAVE_FORWARDf,
                                     float* h_b_absorb_potential) {
@@ -96,7 +126,6 @@
 
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
   int phase_is_inner          = *phase_is_innerf;
-  int num_abs_boundary_faces  = *num_abs_boundary_facesf;
   int SIMULATION_TYPE         = *SIMULATION_TYPEf;
   int SAVE_FORWARD            = *SAVE_FORWARDf;              
 
@@ -108,7 +137,7 @@
   // > NGLLSQUARE==NGLL2==25, no further check inside kernel
   int blocksize = 25; 
   
-  int num_blocks_x = num_abs_boundary_faces;
+  int num_blocks_x = mp->d_num_abs_boundary_faces;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
     num_blocks_x = ceil(num_blocks_x/2.0);
@@ -119,33 +148,34 @@
   dim3 threads(blocksize,1,1);
 
   //  adjoint simulations: reads in absorbing boundary
-  if (SIMULATION_TYPE == 3 && num_abs_boundary_faces > 0 ){  
+  if (SIMULATION_TYPE == 3 && mp->d_num_abs_boundary_faces > 0 ){  
     // copies array to GPU
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_potential,h_b_absorb_potential,
-                                       mp->d_b_reclen_potential,cudaMemcpyHostToDevice),700);    
+                                       mp->d_b_reclen_potential,cudaMemcpyHostToDevice),7700);    
   }
       
   compute_stacey_acoustic_kernel<<<grid,threads>>>(mp->d_potential_dot_acoustic,
                                                    mp->d_potential_dot_dot_acoustic,
                                                    mp->d_abs_boundary_ispec, 
                                                    mp->d_abs_boundary_ijk, 
+                                                   mp->d_abs_boundary_jacobian2Dw, 
                                                    mp->d_ibool, 
                                                    mp->d_rhostore, 
                                                    mp->d_kappastore, 
-                                                   mp->d_abs_boundary_jacobian2Dw, 
                                                    mp->d_ispec_is_inner, 
                                                    mp->d_ispec_is_acoustic, 
                                                    phase_is_inner,
                                                    SIMULATION_TYPE,SAVE_FORWARD,
+                                                   mp->d_num_abs_boundary_faces,
                                                    mp->d_b_potential_dot_acoustic,
                                                    mp->d_b_potential_dot_dot_acoustic,
                                                    mp->d_b_absorb_potential);
   
   //  adjoint simulations: stores absorbed wavefield part
-  if (SIMULATION_TYPE == 1 && SAVE_FORWARD == 1 && num_abs_boundary_faces > 0 ){  
+  if (SIMULATION_TYPE == 1 && SAVE_FORWARD == 1 && mp->d_num_abs_boundary_faces > 0 ){  
     // copies array to CPU  
     print_CUDA_error_if_any(cudaMemcpy(h_b_absorb_potential,mp->d_b_absorb_potential,
-                                       mp->d_b_reclen_potential,cudaMemcpyDeviceToHost),701);
+                                       mp->d_b_reclen_potential,cudaMemcpyDeviceToHost),7701);
   }
   
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING  

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu	2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu	2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
 #include <stdio.h>
 #include <cuda.h>
 #include <cublas.h>
@@ -14,17 +42,23 @@
 
 __global__ void compute_stacey_elastic_kernel(real* veloc, 
                                               real* accel, 
-                                              real* b_accel, 
                                               int* abs_boundary_ispec,
-                                              int* abs_boundary_ijk, int* ibool,
+                                              int* abs_boundary_ijk, 
                                               real* abs_boundary_normal,
-                                              real* rho_vp, real* rho_vs,
                                               real* abs_boundary_jacobian2Dw,
-                                              real* b_absorb_field,
-                                              int* ispec_is_inner, int* ispec_is_elastic,
-                                              int phase_is_inner,float* debug_val,int* debug_val_int,
+                                              int* ibool,
+                                              real* rho_vp, 
+                                              real* rho_vs,
+                                              int* ispec_is_inner, 
+                                              int* ispec_is_elastic,
+                                              int phase_is_inner,
+                                              int SIMULATION_TYPE,int SAVE_FORWARD,
                                               int num_abs_boundary_faces,
-                                              int SAVE_FORWARD,int SIMULATION_TYPE) {
+                                              real* b_accel, 
+                                              real* b_absorb_field,                                              
+                                              float* debug_val,
+                                              int* debug_val_int                                              
+                                              ) {
 
   int igll = threadIdx.x; // tx
   int iface = blockIdx.x + gridDim.x*blockIdx.y; // bx
@@ -39,9 +73,13 @@
   realw tx,ty,tz;
   realw jacobianw;
   
-  // don't compute points outside NGLLSQUARE == NGLL2 == 25
-  if(igll < NGLL2 && iface < num_abs_boundary_faces) {    
+
+  // don't compute points outside NGLLSQUARE==NGLL2==25  
+  // way 2: no further check needed since blocksize = 25
+  if( iface < num_abs_boundary_faces){
     
+  //if(igll < NGLL2 && iface < num_abs_boundary_faces) {    
+    
     // "-1" from index values to convert from Fortran-> C indexing
     ispec = abs_boundary_ispec[iface]-1;
     i = abs_boundary_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
@@ -82,19 +120,18 @@
       atomicAdd(&accel[iglob*3+2],-tz*jacobianw);
 
       if(SIMULATION_TYPE == 3) {
-	atomicAdd(&b_accel[iglob*3  ],-b_absorb_field[0+3*(igll+25*(iface))]);
-	atomicAdd(&b_accel[iglob*3+1],-b_absorb_field[1+3*(igll+25*(iface))]);
-	atomicAdd(&b_accel[iglob*3+2],-b_absorb_field[2+3*(igll+25*(iface))]);
+        atomicAdd(&b_accel[iglob*3  ],-b_absorb_field[INDEX3(NDIM,NGLL2,0,igll,iface)]);
+        atomicAdd(&b_accel[iglob*3+1],-b_absorb_field[INDEX3(NDIM,NGLL2,1,igll,iface)]);
+        atomicAdd(&b_accel[iglob*3+2],-b_absorb_field[INDEX3(NDIM,NGLL2,2,igll,iface)]);
       }
       else if(SAVE_FORWARD && SIMULATION_TYPE == 1) {
-	b_absorb_field[0+3*(igll+25*(iface))] = tx*jacobianw;
-	b_absorb_field[1+3*(igll+25*(iface))] = ty*jacobianw;
-	b_absorb_field[2+3*(igll+25*(iface))] = tz*jacobianw;
+        b_absorb_field[INDEX3(NDIM,NGLL2,0,igll,iface)] = tx*jacobianw;
+        b_absorb_field[INDEX3(NDIM,NGLL2,1,igll,iface)] = ty*jacobianw;
+        b_absorb_field[INDEX3(NDIM,NGLL2,2,igll,iface)] = tz*jacobianw;
       }
       
     }
   }
-
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -103,38 +140,28 @@
 extern "C"
 void FC_FUNC_(compute_stacey_elastic_cuda,
               COMPUTE_STACEY_ELASTIC_CUDA)(long* Mesh_pointer_f, 
-                                           int* NSPEC_ABf, 
-                                           int* NGLOB_ABf, 
                                            int* phase_is_innerf, 
-                                           int* num_abs_boundary_facesf, 
                                            int* SIMULATION_TYPEf, 
-                                           int* NSTEPf, 
-                                           int* NGLOB_ADJOINTf, 
-                                           int* b_num_abs_boundary_facesf, 
-                                           int* b_reclen_fieldf,
-                                           float* b_absorb_field, 
-                                           int* SAVE_FORWARDf, 
-                                           int* itf) {
+                                           int* SAVE_FORWARDf,
+                                           float* h_b_absorb_field) {
 
 TRACE("compute_stacey_elastic_cuda");  
   
   Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-  //int fid = 0;
-  //int it = *itf;
-  //int NSPEC_AB = *NSPEC_ABf;
-  //int NGLOB_AB = *NGLOB_ABf;
-  int phase_is_inner	     = *phase_is_innerf;
-  int num_abs_boundary_faces     = *num_abs_boundary_facesf;
-  int SIMULATION_TYPE	     = *SIMULATION_TYPEf;
-  //int NSTEP			     = *NSTEPf;
-  int myrank; MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
-  //int NGLOB_ADJOINT		     = *NGLOB_ADJOINTf;
-  //int b_num_abs_boundary_faces   = *b_num_abs_boundary_facesf;
-  int b_reclen_field	     = *b_reclen_fieldf;
-  int SAVE_FORWARD             = *SAVE_FORWARDf;              
 
-  int blocksize = 32; // > NGLL2=25, but we handle this inside kernel
-  int num_blocks_x = num_abs_boundary_faces;
+  int phase_is_inner	  = *phase_is_innerf;
+  int SIMULATION_TYPE	= *SIMULATION_TYPEf;
+  int SAVE_FORWARD     = *SAVE_FORWARDf;              
+
+  // way 1
+  // > NGLLSQUARE==NGLL2==25, but we handle this inside kernel
+  //int blocksize = 32;
+  
+  // way 2: seems sligthly faster 
+  // > NGLLSQUARE==NGLL2==25, no further check inside kernel
+  int blocksize = 25;
+  
+  int num_blocks_x = mp->d_num_abs_boundary_faces;
   int num_blocks_y = 1;
   while(num_blocks_x > 65535) {
     num_blocks_x = ceil(num_blocks_x/2.0);
@@ -147,30 +174,36 @@
   float* d_debug_val;
   int* d_debug_val_int;
 
-  if(SIMULATION_TYPE == 3 && num_abs_boundary_faces > 0) {
+  if(SIMULATION_TYPE == 3 && mp->d_num_abs_boundary_faces > 0) {
     // int val = NSTEP-it+1;
     // read_abs_(&fid,(char*)b_absorb_field,&b_reclen_field,&val);    
     // The read is done in fortran
-    cudaMemcpy(mp->d_b_absorb_field,b_absorb_field,b_reclen_field,cudaMemcpyHostToDevice);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field,h_b_absorb_field,
+                                       mp->d_b_reclen_field,cudaMemcpyHostToDevice),7700);
   }
   
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("between cudamemcpy and compute_stacey_elastic_kernel");
 #endif
   
-  compute_stacey_elastic_kernel<<<grid,threads>>>(mp->d_veloc,mp->d_accel,mp->d_b_accel,
-                                                  mp->d_abs_boundary_ispec, mp->d_abs_boundary_ijk, 
-                                                  mp->d_ibool, 
+  compute_stacey_elastic_kernel<<<grid,threads>>>(mp->d_veloc,
+                                                  mp->d_accel,
+                                                  mp->d_abs_boundary_ispec, 
+                                                  mp->d_abs_boundary_ijk, 
                                                   mp->d_abs_boundary_normal, 
-                                                  mp->d_rho_vp, mp->d_rho_vs, 
                                                   mp->d_abs_boundary_jacobian2Dw, 
-                                                  mp->d_b_absorb_field,
+                                                  mp->d_ibool, 
+                                                  mp->d_rho_vp, 
+                                                  mp->d_rho_vs, 
                                                   mp->d_ispec_is_inner, 
                                                   mp->d_ispec_is_elastic, 
                                                   phase_is_inner,
-                                                  d_debug_val,d_debug_val_int,
-                                                  num_abs_boundary_faces,
-                                                  SAVE_FORWARD,SIMULATION_TYPE);
+                                                  SIMULATION_TYPE,SAVE_FORWARD,
+                                                  mp->d_num_abs_boundary_faces,
+                                                  mp->d_b_accel,
+                                                  mp->d_b_absorb_field,
+                                                  d_debug_val,
+                                                  d_debug_val_int);
   
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_cuda_error("compute_stacey_elastic_kernel");  
@@ -180,8 +213,9 @@
   // if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. num_abs_boundary_faces > 0 ) &
   //   write(IOABS,rec=it) b_reclen_field,b_absorb_field,b_reclen_field
   
-  if(SIMULATION_TYPE==1 && SAVE_FORWARD && num_abs_boundary_faces>0) {
-    cudaMemcpy(b_absorb_field,mp->d_b_absorb_field,b_reclen_field,cudaMemcpyDeviceToHost);
+  if(SIMULATION_TYPE==1 && SAVE_FORWARD && mp->d_num_abs_boundary_faces>0) {
+    print_CUDA_error_if_any(cudaMemcpy(h_b_absorb_field,mp->d_b_absorb_field,
+                                       mp->d_b_reclen_field,cudaMemcpyDeviceToHost),7701);
     // The write is done in fortran
     // write_abs_(&fid,(char*)b_absorb_field,&b_reclen_field,&it);    
   }

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu	2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu	2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
 #include <stdio.h>
 #include <cuda.h>
 #include <cublas.h>

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h	2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
 #ifndef GPU_MESH_
 #define GPU_MESH_
 #include <sys/types.h>
@@ -28,8 +56,13 @@
 #define PRINT5(var,offset) // for(i=0;i<10;i++) printf("var(%d)=%f\n",i,var[offset+i]);
 #endif
 
+
 #define ENABLE_VERY_SLOW_ERROR_CHECKING
 
+/* ----------------------------------------------------------------------------------------------- */
+
+// indexing
+
 #define INDEX2(xsize,x,y) x + (y)*xsize
 #define INDEX3(xsize,ysize,x,y,z) x + (y)*xsize + (z)*xsize*ysize
 #define INDEX4(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize
@@ -44,6 +77,8 @@
 //#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*(z + zsize*i))
 //#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + xsize*(y + ysize*(z + zsize*(i + isize*j)))
 
+/* ----------------------------------------------------------------------------------------------- */
+
 #define MAX(x,y)                    (((x) < (y)) ? (y) : (x))
 
 double get_time();
@@ -122,15 +157,18 @@
   int* d_ibool_interfaces_ext_mesh;
     
   //used for absorbing stacey boundaries
+  int d_num_abs_boundary_faces;
   int* d_abs_boundary_ispec;
   int* d_abs_boundary_ijk;
   float* d_abs_boundary_normal;
-  float* d_rho_vp;
-  float* d_rho_vs;
   float* d_abs_boundary_jacobian2Dw;
+
   float* d_b_absorb_field;
-  int b_num_abs_boundary_faces;
+  int d_b_reclen_field;
   
+  float* d_rho_vp;
+  float* d_rho_vs;
+  
   // inner / outer elements  
   int* d_ispec_is_inner;
   int* d_ispec_is_elastic;
@@ -214,6 +252,10 @@
   // noise sensitivity kernel
   float* d_Sigma_kl;
 
+  // oceans
+  float* d_rmass_ocean_load;
+  float* d_free_surface_normal;
+  
   // ------------------------------------------------------------------ //
   // acoustic wavefield
   // ------------------------------------------------------------------ //
@@ -244,6 +286,13 @@
   float* d_rho_ac_kl;
   float* d_kappa_ac_kl;
   
+  // coupling acoustic-elastic
+  int* d_coupling_ac_el_ispec;
+  int* d_coupling_ac_el_ijk;
+  float* d_coupling_ac_el_normal;
+  float* d_coupling_ac_el_jacobian2Dw;
+
+
   
 } Mesh;
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu	2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu	2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
 #include <stdio.h>
 #include <cuda.h>
 #include <cublas.h>
@@ -21,15 +49,15 @@
   if(iface < num_free_surface_faces) {
     int ispec = free_surface_ispec[iface]-1; //-1 for C-based indexing
 
-    int i = free_surface_ijk[0+3*(igll + 25*(iface))]-1;
-    int j = free_surface_ijk[1+3*(igll + 25*(iface))]-1;
-    int k = free_surface_ijk[2+3*(igll + 25*(iface))]-1;
+    int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
+    int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
+    int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
         
     int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;    
     
-    noise_surface_movie[INDEX3(3,25,0,igll,iface)] = displ[iglob*3];
-    noise_surface_movie[INDEX3(3,25,1,igll,iface)] = displ[iglob*3+1];
-    noise_surface_movie[INDEX3(3,25,2,igll,iface)] = displ[iglob*3+2];
+    noise_surface_movie[INDEX3(NDIM,NGLL2,0,igll,iface)] = displ[iglob*3];
+    noise_surface_movie[INDEX3(NDIM,NGLL2,1,igll,iface)] = displ[iglob*3+1];
+    noise_surface_movie[INDEX3(NDIM,NGLL2,2,igll,iface)] = displ[iglob*3+2];
   }
 }
 
@@ -141,9 +169,9 @@
     int igll = threadIdx.x;
     
     int ipoin = 25*iface + igll;
-    int i=free_surface_ijk[0+3*(igll + 25*(iface))]-1;
-    int j=free_surface_ijk[1+3*(igll + 25*(iface))]-1;
-    int k=free_surface_ijk[2+3*(igll + 25*(iface))]-1;    
+    int i=free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
+    int j=free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
+    int k=free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;    
     
     int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
     
@@ -151,9 +179,9 @@
     real normal_y = normal_y_noise[ipoin];
     real normal_z = normal_z_noise[ipoin];
 
-    real eta = (noise_surface_movie[INDEX3(3,25,0,igll,iface)]*normal_x + 
-		noise_surface_movie[INDEX3(3,25,1,igll,iface)]*normal_y +
-		noise_surface_movie[INDEX3(3,25,2,igll,iface)]*normal_z);
+    real eta = (noise_surface_movie[INDEX3(NDIM,NGLL2,0,igll,iface)]*normal_x + 
+                noise_surface_movie[INDEX3(NDIM,NGLL2,1,igll,iface)]*normal_y +
+                noise_surface_movie[INDEX3(NDIM,NGLL2,2,igll,iface)]*normal_z);
     
     // error from cuda-memcheck and ddt seems "incorrect", because we
     // are passing a __constant__ variable pointer around like it was

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h	2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h	2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,21 +1,45 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
 #ifndef CUDA_HEADER_H
 #define CUDA_HEADER_H
+
 /* CUDA specific things from specfem3D_kernels.cu */
 
-//#define NGLL2 25
-
-/* ----------------------------------------------------------------------------------------------- */
-
-
 #ifdef USE_TEXTURES
-// declaration of textures
-texture<float, 1, cudaReadModeElementType> tex_displ;
-texture<float, 1, cudaReadModeElementType> tex_accel;
+  // declaration of textures
+  texture<float, 1, cudaReadModeElementType> tex_displ;
+  texture<float, 1, cudaReadModeElementType> tex_accel;
 
-texture<float, 1, cudaReadModeElementType> tex_potential_acoustic;
-texture<float, 1, cudaReadModeElementType> tex_potential_dot_dot_acoustic;
+  texture<float, 1, cudaReadModeElementType> tex_potential_acoustic;
+  texture<float, 1, cudaReadModeElementType> tex_potential_dot_dot_acoustic;
 
-// for binding the textures
+  // for binding the textures
 
   void bindTexturesDispl(float* d_displ)
   {
@@ -92,7 +116,5 @@
 void setConst_wgllwgll_xz(float* array, Mesh* mp);
 void setConst_wgllwgll_yz(float* array, Mesh* mp);
 
-//void exit_on_cuda_error(char* kernel_name);
-//void show_free_memory(char* info_str);
 
 #endif //CUDA_HEADER_H

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu	2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
 #include <stdio.h>
 #include <cuda.h>
 #include <cublas.h>
@@ -87,7 +115,7 @@
 {
   if (cudaSuccess != err)
   {
-    printf("\nCUDA error !!!!! <%s> !!!!! \nat CUDA call # %d\n",cudaGetErrorString(err),num);
+    printf("\nCUDA error !!!!! <%s> !!!!! \nat CUDA call error code: # %d\n",cudaGetErrorString(err),num);
     fflush(stdout);
 #ifdef USE_MPI
     MPI_Abort(MPI_COMM_WORLD,1);
@@ -217,12 +245,12 @@
                                         float* h_hprimewgll_xx,
                                         float* h_wgllwgll_xy, 
                                         float* h_wgllwgll_xz,
-                                        float* h_wgllwgll_yz,            
+                                        float* h_wgllwgll_yz,        
+                                        int* ABSORBING_CONDITIONS,    
                                         int* h_abs_boundary_ispec, int* h_abs_boundary_ijk,
                                         float* h_abs_boundary_normal,
                                         float* h_abs_boundary_jacobian2Dw,
-                                        float* h_b_absorb_field,
-                                        int* num_abs_boundary_faces, int* b_num_abs_boundary_faces,
+                                        int* h_num_abs_boundary_faces,
                                         int* h_ispec_is_inner, 
                                         int* NSOURCES,
                                         float* h_sourcearrays,
@@ -328,29 +356,30 @@
               
 
   // absorbing boundaries
-  if( *num_abs_boundary_faces > 0 ){  
+  mp->d_num_abs_boundary_faces = *h_num_abs_boundary_faces;
+  if( *ABSORBING_CONDITIONS == 1 && mp->d_num_abs_boundary_faces > 0 ){  
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_abs_boundary_ispec,
-                                       (*num_abs_boundary_faces)*sizeof(int)),1101);
+                                       (mp->d_num_abs_boundary_faces)*sizeof(int)),1101);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_ispec, h_abs_boundary_ispec,
-                                       (*num_abs_boundary_faces)*sizeof(int),
+                                       (mp->d_num_abs_boundary_faces)*sizeof(int),
                                        cudaMemcpyHostToDevice),1102);
     
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_abs_boundary_ijk,
-                                       3*25*(*num_abs_boundary_faces)*sizeof(int)),1103);
+                                       3*25*(mp->d_num_abs_boundary_faces)*sizeof(int)),1103);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_ijk, h_abs_boundary_ijk,
-                                       3*25*(*num_abs_boundary_faces)*sizeof(int),
+                                       3*25*(mp->d_num_abs_boundary_faces)*sizeof(int),
                                        cudaMemcpyHostToDevice),1104);
     
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_abs_boundary_normal,
-                                       3*25*(*num_abs_boundary_faces)*sizeof(int)),1105);
+                                       3*25*(mp->d_num_abs_boundary_faces)*sizeof(int)),1105);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_normal, h_abs_boundary_normal,
-                                       3*25*(*num_abs_boundary_faces)*sizeof(int),
+                                       3*25*(mp->d_num_abs_boundary_faces)*sizeof(int),
                                        cudaMemcpyHostToDevice),1106);
     
     print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_abs_boundary_jacobian2Dw,
-                                       25*(*num_abs_boundary_faces)*sizeof(float)),1107);
+                                       25*(mp->d_num_abs_boundary_faces)*sizeof(float)),1107);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_jacobian2Dw, h_abs_boundary_jacobian2Dw,
-                                       25*(*num_abs_boundary_faces)*sizeof(float),
+                                       25*(mp->d_num_abs_boundary_faces)*sizeof(float),
                                        cudaMemcpyHostToDevice),1108);  
   }
   
@@ -476,7 +505,13 @@
                                               float* b_absorb_potential,
                                               int* SIMULATION_TYPE,
                                               float* rho_ac_kl,
-                                              float* kappa_ac_kl
+                                              float* kappa_ac_kl,
+                                              int* ELASTIC_SIMULATION,
+                                              int* num_coupling_ac_el_faces,
+                                              int* coupling_ac_el_ispec,
+                                              int* coupling_ac_el_ijk,
+                                              float* coupling_ac_el_normal,
+                                              float* coupling_ac_el_jacobian2Dw                                              
                                               ) {
   
   TRACE("prepare_fields_acoustic_device");
@@ -567,7 +602,32 @@
                                      mp->nrec_local*125*sizeof(float)),9107);
   mp->h_station_seismo_potential = (float*)malloc(mp->nrec_local*125*sizeof(float));
   if( mp->h_station_seismo_potential == NULL) exit_on_error("error allocating h_station_seismo_potential");
+
+
+  // coupling with elastic parts
+  if( *ELASTIC_SIMULATION == 1 && *num_coupling_ac_el_faces > 0 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_ispec),
+                                       (*num_coupling_ac_el_faces)*sizeof(int)),9601); 
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_ispec,coupling_ac_el_ispec,
+                                       (*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),9602);    
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_ijk),
+                                       3*25*(*num_coupling_ac_el_faces)*sizeof(int)),9603); 
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_ijk,coupling_ac_el_ijk,
+                                       3*25*(*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),9604);    
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_normal),
+                                        3*25*(*num_coupling_ac_el_faces)*sizeof(float)),9605); 
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_normal,coupling_ac_el_normal,
+                                        3*25*(*num_coupling_ac_el_faces)*sizeof(float),cudaMemcpyHostToDevice),9606);    
+
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_jacobian2Dw),
+                                        25*(*num_coupling_ac_el_faces)*sizeof(float)),9607);  
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_jacobian2Dw,coupling_ac_el_jacobian2Dw,
+                                        25*(*num_coupling_ac_el_faces)*sizeof(float),cudaMemcpyHostToDevice),9608);
   
+  }
+  
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING       
   exit_on_cuda_error("prepare_fields_acoustic_device");  
 #endif    
@@ -592,7 +652,7 @@
                                              int* ispec_is_elastic,
                                              int* ABSORBING_CONDITIONS,
                                              float* h_b_absorb_field,
-                                             int* b_num_abs_boundary_faces,
+                                             int* h_b_reclen_field,
                                              int* SIMULATION_TYPE,
                                              float* rho_kl,
                                              float* mu_kl,
@@ -609,8 +669,9 @@
                                              float* b_R_xx,float* b_R_yy,float* b_R_xy,float* b_R_xz,float* b_R_yz,
                                              float* one_minus_sum_beta,float* factor_common,
                                              float* alphaval,float* betaval,float* gammaval,
-                                             float* b_alphaval,float* b_betaval,float* b_gammaval                                             
-                                             ){
+                                             float* b_alphaval,float* b_betaval,float* b_gammaval,
+                                             int* OCEANS,float* rmass_ocean_load,
+                                             float* free_surface_normal,int* num_free_surface_faces){
   
 TRACE("prepare_fields_elastic_device");
   
@@ -656,12 +717,12 @@
   mp->h_station_seismo_field = (float*)malloc(3*125*(mp->nrec_local)*sizeof(float));
   
   // absorbing conditions
-  if( *ABSORBING_CONDITIONS == 1 ){
-    mp->b_num_abs_boundary_faces = *b_num_abs_boundary_faces;
+  if( *ABSORBING_CONDITIONS == 1 && mp->d_num_abs_boundary_faces > 0){
+    mp->d_b_reclen_field = *h_b_reclen_field;
     print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_field), 
-                                       3*25*mp->b_num_abs_boundary_faces*sizeof(float)),8016);
+                                       mp->d_b_reclen_field),8016);
     print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field, h_b_absorb_field,
-                                       3*25*mp->b_num_abs_boundary_faces*sizeof(float),cudaMemcpyHostToDevice),8017);
+                                       mp->d_b_reclen_field,cudaMemcpyHostToDevice),8017);
   }
 
   // kernel simulations
@@ -846,11 +907,21 @@
       print_CUDA_error_if_any(cudaMemcpy(mp->d_b_gammaval ,b_gammaval,
                                          N_SLS*sizeof(float),cudaMemcpyHostToDevice),8439);
     }
+  }
+
+  if( *OCEANS == 1 ){
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_ocean_load),sizeof(float)*mp->NGLOB_AB),8501);
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_ocean_load,rmass_ocean_load,
+                                       sizeof(float)*mp->NGLOB_AB,cudaMemcpyHostToDevice),8502);    
+
+    mp->num_free_surface_faces = *num_free_surface_faces;
+    print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_normal),
+                                       3*25*(mp->num_free_surface_faces)*sizeof(float)),8503); 
+    print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_normal,free_surface_normal,
+                                       3*25*(mp->num_free_surface_faces)*sizeof(float),cudaMemcpyHostToDevice),8504);    
     
-    
   }
   
-  
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING       
   exit_on_cuda_error("prepare_fields_elastic_device");  
 #endif    
@@ -963,7 +1034,6 @@
 extern "C"
 void FC_FUNC_(prepare_cleanup_device,
               PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f,
-                                      int* num_abs_boundary_faces,
                                       int* SIMULATION_TYPE,
                                       int* ACOUSTIC_SIMULATION,
                                       int* ELASTIC_SIMULATION,
@@ -991,7 +1061,7 @@
   cudaFree(mp->d_muv);
   
   // absorbing boundaries
-  if( *num_abs_boundary_faces > 0 ){ 
+  if( *ABSORBING_CONDITIONS == 1 && mp->d_num_abs_boundary_faces > 0 ){ 
     cudaFree(mp->d_abs_boundary_ispec);
     cudaFree(mp->d_abs_boundary_ijk);
     cudaFree(mp->d_abs_boundary_normal);
@@ -1064,7 +1134,7 @@
     cudaFree(mp->d_ispec_is_elastic);
     cudaFree(mp->d_station_seismo_field);
     
-    if( *ABSORBING_CONDITIONS == 1 ) cudaFree(mp->d_b_absorb_field);
+    if( *ABSORBING_CONDITIONS == 1 && mp->d_num_abs_boundary_faces > 0) cudaFree(mp->d_b_absorb_field);
 
     if( *SIMULATION_TYPE == 3 ) {
       cudaFree(mp->d_b_displ);

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c	2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c	2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <math.h>

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu	2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu	2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
 #include <stdio.h>
 #include <cuda.h>
 #include <cublas.h>

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu	2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu	2011-10-08 01:09:46 UTC (rev 19050)
@@ -1,3 +1,31 @@
+/*
+ !=====================================================================
+ !
+ !               S p e c f e m 3 D  V e r s i o n  2 . 0
+ !               ---------------------------------------
+ !
+ !          Main authors: Dimitri Komatitsch and Jeroen Tromp
+ !    Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ !                            April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
 #include <stdio.h>
 #include <cuda.h>
 #include <cublas.h>
@@ -8,12 +36,6 @@
 #include "config.h"
 #include "mesh_constants_cuda.h"
 
-//#define INDEX2(xsize,x,y) x + (y)*xsize
-//#define INDEX3(xsize,ysize,x,y,z) x + (y)*xsize + (z)*xsize*ysize
-//#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize
-//#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize + (j)*xsize*ysize*zsize*isize
-//
-//#define ENABLE_VERY_SLOW_ERROR_CHECKING
 
 /* ----------------------------------------------------------------------------------------------- */
 
@@ -49,10 +71,7 @@
   }
 }
 
-/* ----------------------------------------------------------------------------------------------- */
 
-//extern "C" void pause_for_debuger(int);
-
 /* ----------------------------------------------------------------------------------------------- */
 
 void transfer_field_from_device(Mesh* mp, float* d_field,float* h_field,
@@ -223,11 +242,7 @@
                                                                          mp->d_station_seismo_potential,
                                                                          d_potential);
   
-  
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-  exit_on_cuda_error("transfer_stations_fields_acoustic_from_device_kernel");
-#endif
-  
+    
   print_CUDA_error_if_any(cudaMemcpy(mp->h_station_seismo_potential,mp->d_station_seismo_potential,
                                      mp->nrec_local*125*sizeof(float),cudaMemcpyDeviceToHost),500);
   
@@ -249,6 +264,9 @@
     //memcpy(&(h_potential[iglob]),&(mp->h_station_seismo_potential[irec_local*125]),125*sizeof(float));
     
   }
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("transfer_field_acoustic_from_device");
+#endif  
 }
 
 /* ----------------------------------------------------------------------------------------------- */
@@ -313,6 +331,7 @@
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   //double end_time = get_time();
   //printf("Elapsed time: %e\n",end_time-start_time);
+  exit_on_cuda_error("transfer_station_fields_acoustic_from_device");  
 #endif
 }
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/create_mass_matrices.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/create_mass_matrices.f90	2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/create_mass_matrices.f90	2011-10-08 01:09:46 UTC (rev 19050)
@@ -184,8 +184,8 @@
 
         do igll=1,NGLLSQUARE
           ix_oceans = free_surface_ijk(1,igll,ispec2D)
-          iy_oceans = free_surface_ijk(1,igll,ispec2D)
-          iz_oceans = free_surface_ijk(1,igll,ispec2D)
+          iy_oceans = free_surface_ijk(2,igll,ispec2D)
+          iz_oceans = free_surface_ijk(3,igll,ispec2D)
 
           iglobnum=ibool(ix_oceans,iy_oceans,iz_oceans,ispec_oceans)
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/Makefile.in
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/Makefile.in	2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/Makefile.in	2011-10-08 01:09:46 UTC (rev 19050)
@@ -134,6 +134,7 @@
 CUDA_OBJECTS = \
 	$O/check_fields_cuda.cuda.o \
 	$O/compute_add_sources_cuda.cuda.o \
+	$O/compute_coupling_cuda.cuda.o \
 	$O/compute_forces_acoustic_cuda.cuda.o \
 	$O/compute_forces_elastic_cuda.cuda.o \
 	$O/compute_kernels_cuda.cuda.o \

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_acoustic.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_acoustic.f90	2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_acoustic.f90	2011-10-08 01:09:46 UTC (rev 19050)
@@ -215,46 +215,30 @@
     endif
 
 ! elastic coupling
-    if(ELASTIC_SIMULATION ) then
-    
-      ! transfers potentials to CPU
-      if(GPU_MODE) then 
-        call transfer_fields_acoustic_from_device(NGLOB_AB,potential_acoustic, &
-                              potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)    
-        call transfer_fields_from_device(NDIM*NGLOB_AB,displ,veloc,accel, Mesh_pointer)
-        ! backward simulation
-        if( SIMULATION_TYPE == 3 ) then
-          call transfer_fields_acoustic_from_device(NGLOB_AB,b_potential_acoustic, &
-                              b_potential_dot_acoustic, b_potential_dot_dot_acoustic, Mesh_pointer) 
-          call transfer_fields_from_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel,Mesh_pointer)
-        endif
-      endif
-    
-      call compute_coupling_acoustic_el(NSPEC_AB,NGLOB_AB, &
+    if(ELASTIC_SIMULATION ) then    
+      if( .NOT. GPU_MODE ) then
+        call compute_coupling_acoustic_el(NSPEC_AB,NGLOB_AB, &
                         ibool,displ,potential_dot_dot_acoustic, &
                         num_coupling_ac_el_faces, &
                         coupling_ac_el_ispec,coupling_ac_el_ijk, &
                         coupling_ac_el_normal, &
                         coupling_ac_el_jacobian2Dw, &
                         ispec_is_inner,phase_is_inner)
-      ! adjoint simulations
-      if( SIMULATION_TYPE == 3 ) &
-        call compute_coupling_acoustic_el(NSPEC_ADJOINT,NGLOB_ADJOINT, &
+        ! adjoint simulations
+        if( SIMULATION_TYPE == 3 ) &
+          call compute_coupling_acoustic_el(NSPEC_ADJOINT,NGLOB_ADJOINT, &
                         ibool,b_displ,b_potential_dot_dot_acoustic, &
                         num_coupling_ac_el_faces, &
                         coupling_ac_el_ispec,coupling_ac_el_ijk, &
                         coupling_ac_el_normal, &
                         coupling_ac_el_jacobian2Dw, &
                         ispec_is_inner,phase_is_inner)
-        
-      ! transfers potentials to CPU
-      if(GPU_MODE) then 
-        ! only potential_dot_dot_acoustic/b_potential_dot_dot_acoustic is updated above
-        call transfer_fields_acoustic_to_device(NGLOB_AB,potential_acoustic, &
-                              potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
-        if( SIMULATION_TYPE == 3 ) &
-          call transfer_fields_acoustic_to_device(NGLOB_AB,b_potential_acoustic, &
-                              b_potential_dot_acoustic, b_potential_dot_dot_acoustic, Mesh_pointer) 
+      else
+        ! on GPU
+        if( num_coupling_ac_el_faces > 0 ) &
+          call compute_coupling_acoustic_el_cuda(Mesh_pointer,phase_is_inner, &
+                                              num_coupling_ac_el_faces,SIMULATION_TYPE)
+      
       endif
     endif
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90	2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90	2011-10-08 01:09:46 UTC (rev 19050)
@@ -452,23 +452,9 @@
                         GPU_MODE,Mesh_pointer)
 
 ! acoustic coupling
-    if( ACOUSTIC_SIMULATION ) then
-      
-      ! daniel: workaround - todo on GPU
-      ! transfers potentials to CPU
-      if(GPU_MODE) then 
-        call transfer_fields_acoustic_from_device(NGLOB_AB,potential_acoustic, &
-                              potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)    
-        call transfer_fields_from_device(NDIM*NGLOB_AB,displ,veloc,accel,Mesh_pointer)
-        ! backward simulation
-        if( SIMULATION_TYPE == 3 ) then
-          call transfer_fields_acoustic_from_device(NGLOB_AB,b_potential_acoustic, &
-                              b_potential_dot_acoustic, b_potential_dot_dot_acoustic, Mesh_pointer) 
-          call transfer_fields_from_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel,Mesh_pointer) 
-        endif
-      endif
-    
-      call compute_coupling_elastic_ac(NSPEC_AB,NGLOB_AB, &
+    if( ACOUSTIC_SIMULATION ) then      
+      if( .NOT. GPU_MODE ) then    
+        call compute_coupling_elastic_ac(NSPEC_AB,NGLOB_AB, &
                         ibool,accel,potential_dot_dot_acoustic, &
                         num_coupling_ac_el_faces, &
                         coupling_ac_el_ispec,coupling_ac_el_ijk, &
@@ -476,25 +462,22 @@
                         coupling_ac_el_jacobian2Dw, &
                         ispec_is_inner,phase_is_inner)
 
-      ! adjoint simulations
-      if( SIMULATION_TYPE == 3 ) &
-        call compute_coupling_elastic_ac(NSPEC_ADJOINT,NGLOB_ADJOINT, &
+        ! adjoint simulations
+        if( SIMULATION_TYPE == 3 ) &
+          call compute_coupling_elastic_ac(NSPEC_ADJOINT,NGLOB_ADJOINT, &
                         ibool,b_accel,b_potential_dot_dot_acoustic, &
                         num_coupling_ac_el_faces, &
                         coupling_ac_el_ispec,coupling_ac_el_ijk, &
                         coupling_ac_el_normal, &
                         coupling_ac_el_jacobian2Dw, &
                         ispec_is_inner,phase_is_inner)
-
-      ! daniel: workaround - todo on GPU
-      ! transfers potentials to CPU
-      if(GPU_MODE) then 
-        ! only accel/b_accel is updated above
-        call transfer_fields_to_device(NDIM*NGLOB_AB,displ,veloc,accel, Mesh_pointer) 
-        if( SIMULATION_TYPE == 3 ) &
-          call transfer_fields_to_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel, Mesh_pointer) 
+      else
+        ! on GPU
+        if( num_coupling_ac_el_faces > 0 ) &
+          call compute_coupling_elastic_ac_cuda(Mesh_pointer,phase_is_inner, &
+                                              num_coupling_ac_el_faces,SIMULATION_TYPE)
+      
       endif
-                        
     endif
 
 
@@ -615,16 +598,21 @@
        b_accel(3,:) = b_accel(3,:)*rmass(:)
     endif !adjoint
  else ! GPU_MODE == 1    
-    call kernel_3_cuda(Mesh_pointer, NGLOB_AB, deltatover2,SIMULATION_TYPE,b_deltatover2)
+    call kernel_3_a_cuda(Mesh_pointer, NGLOB_AB, deltatover2,SIMULATION_TYPE,b_deltatover2,OCEANS)
  endif
 
 ! updates acceleration with ocean load term
   if(OCEANS) then
-    call elastic_ocean_load(NSPEC_AB,NGLOB_AB, &
+    if( .NOT. GPU_MODE ) then
+      call elastic_ocean_load(NSPEC_AB,NGLOB_AB, &
                         ibool,rmass,rmass_ocean_load,accel, &
                         free_surface_normal,free_surface_ijk,free_surface_ispec, &
                         num_free_surface_faces,SIMULATION_TYPE, &
                         NGLOB_ADJOINT,b_accel)
+    else
+      ! on GPU
+      call elastic_ocean_load_cuda(Mesh_pointer,SIMULATION_TYPE)
+    endif
   endif
 
 ! updates velocities
@@ -648,6 +636,8 @@
      veloc(:,:) = veloc(:,:) + deltatover2*accel(:,:)
      ! adjoint simulations
      if (SIMULATION_TYPE == 3) b_veloc(:,:) = b_veloc(:,:) + b_deltatover2*b_accel(:,:)
+  else ! GPU_MODE == 1    
+    if( OCEANS ) call kernel_3_b_cuda(Mesh_pointer, NGLOB_AB, deltatover2,SIMULATION_TYPE,b_deltatover2)  
   endif
 
 

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_stacey_acoustic.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_stacey_acoustic.f90	2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_stacey_acoustic.f90	2011-10-08 01:09:46 UTC (rev 19050)
@@ -76,6 +76,9 @@
   real(kind=CUSTOM_REAL) :: rhol,cpl,jacobianw,absorbl
   integer :: ispec,iglob,i,j,k,iface,igll
   !integer:: reclen1,reclen2
+  
+  ! checks if anything to do
+  if( num_abs_boundary_faces == 0 ) return
 
 ! adjoint simulations:
   if (SIMULATION_TYPE == 3 .and. num_abs_boundary_faces > 0)  then
@@ -144,8 +147,7 @@
   else 
     ! GPU_MODE == .true.
     call compute_stacey_acoustic_cuda(Mesh_pointer, phase_is_inner, &
-                                num_abs_boundary_faces, SIMULATION_TYPE, &
-                                SAVE_FORWARD,b_absorb_potential)
+                                SIMULATION_TYPE,SAVE_FORWARD,b_absorb_potential)
 
   endif
   

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_stacey_elastic.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_stacey_elastic.f90	2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_stacey_elastic.f90	2011-10-08 01:09:46 UTC (rev 19050)
@@ -84,6 +84,8 @@
   integer :: ispec,iglob,i,j,k,iface,igll
   !integer:: reclen1,reclen2
 
+  ! checks if anything to do
+  if( num_abs_boundary_faces == 0 ) return
 
 ! adjoint simulations:
   if (SIMULATION_TYPE == 3 .and. num_abs_boundary_faces > 0)  then
@@ -162,10 +164,8 @@
 
   else 
     ! GPU_MODE == .true.
-    call compute_stacey_elastic_cuda(Mesh_pointer, NSPEC_AB, NGLOB_AB, phase_is_inner,&
-          num_abs_boundary_faces, SIMULATION_TYPE, NSTEP, NGLOB_ADJOINT,&
-          b_num_abs_boundary_faces, b_reclen_field,b_absorb_field, &
-          SAVE_FORWARD,it)
+    call compute_stacey_elastic_cuda(Mesh_pointer,phase_is_inner, &
+                        SIMULATION_TYPE,SAVE_FORWARD,b_absorb_field)
   endif
 
   ! adjoint simulations: stores absorbed wavefield part

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90	2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90	2011-10-08 01:09:46 UTC (rev 19050)
@@ -659,7 +659,7 @@
 
   
   ! frees allocated memory on GPU
-  call prepare_cleanup_device(Mesh_pointer,num_abs_boundary_faces, &
+  call prepare_cleanup_device(Mesh_pointer, &
                               SIMULATION_TYPE,ACOUSTIC_SIMULATION,ELASTIC_SIMULATION, &
                               ABSORBING_CONDITIONS,NOISE_TOMOGRAPHY,COMPUTE_AND_STORE_STRAIN, &
                               ATTENUATION)

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/noise_tomography.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/noise_tomography.f90	2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/noise_tomography.f90	2011-10-08 01:09:46 UTC (rev 19050)
@@ -397,22 +397,31 @@
   endif
 
   if (NOISE_TOMOGRAPHY/=0) then
-     ! save/read the surface movie using the same c routine as we do for absorbing boundaries (file ID is 2)
+    ! save/read the surface movie using the same c routine as we do for absorbing boundaries (file ID is 2)
 
-     ! size of single record
-     reclen=CUSTOM_REAL*NDIM*NGLLSQUARE*NSPEC_TOP
-     ! total file size
-     filesize = reclen
-     filesize = filesize*NSTEP
+    ! size of single record
+    reclen=CUSTOM_REAL*NDIM*NGLLSQUARE*NSPEC_TOP
      
-     write(outputname,"('/proc',i6.6,'_surface_movie')") myrank
-     if (NOISE_TOMOGRAPHY==1) call open_file_abs_w(2,trim(LOCAL_PATH)//trim(outputname), &
+    ! check integer size limit: size of b_reclen_field must fit onto an 4-byte integer 
+    if( NSPEC_TOP > 2147483647 / (CUSTOM_REAL * NGLLSQUARE * NDIM) ) then
+      print *,'reclen of noise surface_movie needed exceeds integer 4-byte limit: ',reclen
+      print *,'  ',CUSTOM_REAL, NDIM, NGLLSQUARE, NSPEC_TOP
+      print*,'bit size fortran: ',bit_size(NSPEC_TOP)
+      call exit_MPI(myrank,"error NSPEC_TOP integer limit")
+    endif
+     
+    ! total file size
+    filesize = reclen
+    filesize = filesize*NSTEP
+     
+    write(outputname,"('/proc',i6.6,'_surface_movie')") myrank
+    if (NOISE_TOMOGRAPHY==1) call open_file_abs_w(2,trim(LOCAL_PATH)//trim(outputname), &
                                       len_trim(trim(LOCAL_PATH)//trim(outputname)), &
                                       filesize)
-     if (NOISE_TOMOGRAPHY==2) call open_file_abs_r(2,trim(LOCAL_PATH)//trim(outputname), &
+    if (NOISE_TOMOGRAPHY==2) call open_file_abs_r(2,trim(LOCAL_PATH)//trim(outputname), &
                                       len_trim(trim(LOCAL_PATH)//trim(outputname)), &
                                       filesize)
-     if (NOISE_TOMOGRAPHY==3) call open_file_abs_r(2,trim(LOCAL_PATH)//trim(outputname), &
+    if (NOISE_TOMOGRAPHY==3) call open_file_abs_r(2,trim(LOCAL_PATH)//trim(outputname), &
                                       len_trim(trim(LOCAL_PATH)//trim(outputname)), &
                                       filesize)
   endif

Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2011-10-08 00:51:03 UTC (rev 19049)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90	2011-10-08 01:09:46 UTC (rev 19050)
@@ -541,6 +541,14 @@
 
         ! size of single record
         b_reclen_field = CUSTOM_REAL * NDIM * NGLLSQUARE * num_abs_boundary_faces
+
+        ! check integer size limit: size of b_reclen_field must fit onto an 4-byte integer 
+        if( num_abs_boundary_faces > 2147483647 / (CUSTOM_REAL * NDIM * NGLLSQUARE) ) then
+          print *,'reclen needed exceeds integer 4-byte limit: ',b_reclen_field
+          print *,'  ',CUSTOM_REAL, NDIM, NGLLSQUARE, num_abs_boundary_faces
+          print*,'bit size fortran: ',bit_size(b_reclen_field)
+          call exit_MPI(myrank,"error b_reclen_field integer limit")
+        endif
         
         ! total file size
         filesize = b_reclen_field
@@ -582,13 +590,21 @@
 
         ! size of single record
         b_reclen_potential = CUSTOM_REAL * NGLLSQUARE * num_abs_boundary_faces
+
+        ! check integer size limit: size of b_reclen_potential must fit onto an 4-byte integer 
+        if( num_abs_boundary_faces > 2147483647 / (CUSTOM_REAL * NGLLSQUARE) ) then
+          print *,'reclen needed exceeds integer 4-byte limit: ',b_reclen_potential
+          print *,'  ',CUSTOM_REAL, NGLLSQUARE, num_abs_boundary_faces
+          print*,'bit size fortran: ',bit_size(b_reclen_potential)
+          call exit_MPI(myrank,"error b_reclen_potential integer limit")
+        endif
         
         ! total file size (two lines to implicitly convert to 8-byte integers)
         filesize = b_reclen_potential
         filesize = filesize*NSTEP
 
         ! daniel: debug check size limit
-        !if( NSTEP > 2147483648 / b_reclen_potential ) then
+        !if( NSTEP > 2147483647 / b_reclen_potential ) then
         !  print *,'file size needed exceeds integer 4-byte limit: ',b_reclen_potential,NSTEP
         !  print *,'  ',CUSTOM_REAL, NGLLSQUARE, num_abs_boundary_faces,NSTEP
         !  print*,'file size fortran: ',filesize
@@ -739,10 +755,11 @@
                                   nibool_interfaces_ext_mesh, ibool_interfaces_ext_mesh, &
                                   hprime_xx, hprime_yy, hprime_zz, &
                                   hprimewgll_xx, wgllwgll_xy, wgllwgll_xz, wgllwgll_yz, &
+                                  ABSORBING_CONDITIONS, &
                                   abs_boundary_ispec, abs_boundary_ijk, &
                                   abs_boundary_normal, &
                                   abs_boundary_jacobian2Dw, &
-                                  b_absorb_field, num_abs_boundary_faces, b_num_abs_boundary_faces, &
+                                  num_abs_boundary_faces, &
                                   ispec_is_inner, &
                                   NSOURCES, sourcearrays, islice_selected_source, ispec_selected_source, &
                                   number_receiver_global, ispec_selected_rec, nrec, nrec_local, &
@@ -755,7 +772,10 @@
                                   ispec_is_acoustic, &
                                   NOISE_TOMOGRAPHY,num_free_surface_faces,free_surface_ispec,free_surface_ijk, &
                                   ABSORBING_CONDITIONS,b_reclen_potential,b_absorb_potential, &
-                                  SIMULATION_TYPE,rho_ac_kl,kappa_ac_kl)
+                                  SIMULATION_TYPE,rho_ac_kl,kappa_ac_kl, &
+                                  ELASTIC_SIMULATION, num_coupling_ac_el_faces, &
+                                  coupling_ac_el_ispec,coupling_ac_el_ijk, &
+                                  coupling_ac_el_normal,coupling_ac_el_jacobian2Dw)
   
   ! prepares fields on GPU for elastic simulations 
   if( ELASTIC_SIMULATION ) &
@@ -763,7 +783,7 @@
                                   rmass,rho_vp,rho_vs, &
                                   num_phase_ispec_elastic,phase_ispec_inner_elastic, &
                                   ispec_is_elastic, &
-                                  ABSORBING_CONDITIONS,b_absorb_field,b_num_abs_boundary_faces, &
+                                  ABSORBING_CONDITIONS,b_absorb_field,b_reclen_field, &
                                   SIMULATION_TYPE,rho_kl,mu_kl,kappa_kl, &
                                   COMPUTE_AND_STORE_STRAIN, &
                                   epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz, &
@@ -775,7 +795,9 @@
                                   b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz, &
                                   one_minus_sum_beta,factor_common, &
                                   alphaval,betaval,gammaval, &
-                                  b_alphaval,b_betaval,b_gammaval)
+                                  b_alphaval,b_betaval,b_gammaval, &
+                                  OCEANS,rmass_ocean_load, &
+                                  free_surface_normal,num_free_surface_faces)
 
   ! prepares needed receiver array for adjoint runs
   if( SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3 ) &
@@ -819,9 +841,9 @@
   ! outputs usage 
   if( myrank == 0 ) then
     call get_free_device_memory(free_mb,used_mb,total_mb)
-    write(IMAIN,*)"  GPU usage: free  =",free_mb," MB"
-    write(IMAIN,*)"             used  =",used_mb," MB"
-    write(IMAIN,*)"             total =",total_mb," MB"
+    write(IMAIN,*)"  GPU usage: free  =",free_mb," MB",nint(free_mb/total_mb*100.0),"%"
+    write(IMAIN,*)"             used  =",used_mb," MB",nint(used_mb/total_mb*100.0),"%"
+    write(IMAIN,*)"             total =",total_mb," MB",nint(total_mb/total_mb*100.0),"%"
     write(IMAIN,*)
   endif