[cig-commits] r19944 - in seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER: EXAMPLES/global_s362ani_small src/cuda src/specfem3D

joseph.charles at geodynamics.org joseph.charles at geodynamics.org
Fri Apr 13 08:18:05 PDT 2012


Author: joseph.charles
Date: 2012-04-13 08:18:05 -0700 (Fri, 13 Apr 2012)
New Revision: 19944

Modified:
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/global_s362ani_small/process.sh
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_constants_cuda.h
   seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90
Log:
fixes attenuation bug in cuda code by using 1st order Taylor expansion for epsilondev_loc variable

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/global_s362ani_small/process.sh
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/global_s362ani_small/process.sh	2012-04-13 02:00:32 UTC (rev 19943)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/global_s362ani_small/process.sh	2012-04-13 15:18:05 UTC (rev 19944)
@@ -11,7 +11,7 @@
 ##################################################
 # USER PARAMETER
 # source directory
-rootdir=~/SPECFEM3D_GLOBE_GPU/
+rootdir=~/SPECFEM3D_GLOBE_GPU
 
 ##################################################
 
@@ -40,11 +40,11 @@
 #./configure F90=ifort MPIF90=mpif90 FLAGS_CHECK="-O3 -assume byterecl" FLAGS_NO_CHECK="-O3 -assume byterecl"
 # compiles for a forward simulation
 cp $currentdir/DATA/Par_file DATA/Par_file
+make clean
 make 
 
 # backup of constants setup
 cp setup/* $currentdir/OUTPUT_FILES/
-cp OUTPUT_FILES/values_from_mesher.h $currentdir/OUTPUT_FILES/
 cp DATA/Par_file $currentdir/OUTPUT_FILES/
 
 cd $currentdir

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu	2012-04-13 02:00:32 UTC (rev 19943)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu	2012-04-13 15:18:05 UTC (rev 19944)
@@ -896,8 +896,10 @@
                                           int* d_phase_ispec_inner,
                                           int num_phase_ispec,
                                           int d_iphase,
+					  realw d_deltat, 
                                           int use_mesh_coloring_gpu,
                                           realw* d_displ,
+                                          realw* d_veloc,
                                           realw* d_accel,
                                           realw* d_xix, realw* d_xiy, realw* d_xiz,
                                           realw* d_etax, realw* d_etay, realw* d_etaz,
@@ -955,6 +957,10 @@
   reald duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl;
   reald duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl;
 
+  reald tempx1l_att,tempx2l_att,tempx3l_att,tempy1l_att,tempy2l_att,tempy3l_att,tempz1l_att,tempz2l_att,tempz3l_att;
+  reald duxdxl_att,duxdyl_att,duxdzl_att,duydxl_att,duydyl_att,duydzl_att,duzdxl_att,duzdyl_att,duzdzl_att;
+  reald duxdyl_plus_duydxl_att,duzdxl_plus_duxdzl_att,duzdyl_plus_duydzl_att;
+
   reald fac1,fac2,fac3;
   reald minus_sum_beta,one_minus_sum_beta_use;
 
@@ -976,6 +982,10 @@
     __shared__ reald s_dummyy_loc[NGLL3];
     __shared__ reald s_dummyz_loc[NGLL3];
 
+    __shared__ reald s_dummyx_loc_att[NGLL3];
+    __shared__ reald s_dummyy_loc_att[NGLL3];
+    __shared__ reald s_dummyz_loc_att[NGLL3];
+
     __shared__ reald s_tempx1[NGLL3];
     __shared__ reald s_tempx2[NGLL3];
     __shared__ reald s_tempx3[NGLL3];
@@ -1030,6 +1040,18 @@
 
     if (active) {
 
+      // use first order Taylor expansion of displacement for local storage of stresses 
+      // at this current time step, to fix attenuation in a consistent way
+#ifdef USE_TEXTURES
+      s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * tex1Dfetch(tex_veloc, iglob);
+      s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * tex1Dfetch(tex_veloc, iglob + NGLOB);
+      s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * tex1Dfetch(tex_veloc, iglob + 2*NGLOB);
+#else
+      s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * d_veloc[iglob*3];
+      s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * d_veloc[iglob*3 + 1];
+      s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * d_veloc[iglob*3 + 2];
+#endif
+
 #ifndef MANUALLY_UNROLLED_LOOPS
 
       tempx1l = 0.f;
@@ -1064,6 +1086,41 @@
           tempz3l += s_dummyz_loc[offset]*hp3;
 
       }
+
+      // temporary variables used for fixing attenuation in a consistent way
+
+      tempx1l_att = 0.f;
+      tempx2l_att = 0.f;
+      tempx3l_att = 0.f;
+
+      tempy1l_att = 0.f;
+      tempy2l_att = 0.f;
+      tempy3l_att = 0.f;
+
+      tempz1l_att = 0.f;
+      tempz2l_att = 0.f;
+      tempz3l_att = 0.f;
+
+      for (l=0;l<NGLLX;l++) {
+          hp1 = d_hprime_xx[l*NGLLX+I];
+          offset = K*NGLL2+J*NGLLX+l;
+          tempx1l_att += s_dummyx_loc_att[offset]*hp1;
+          tempy1l_att += s_dummyy_loc_att[offset]*hp1;
+          tempz1l_att += s_dummyz_loc_att[offset]*hp1;
+
+          hp2 = d_hprime_xx[l*NGLLX+J];
+          offset = K*NGLL2+l*NGLLX+I;
+          tempx2l_att += s_dummyx_loc_att[offset]*hp2;
+          tempy2l_att += s_dummyy_loc_att[offset]*hp2;
+          tempz2l_att += s_dummyz_loc_att[offset]*hp2;
+
+          hp3 = d_hprime_xx[l*NGLLX+K];
+          offset = l*NGLL2+J*NGLLX+I;
+          tempx3l_att += s_dummyx_loc_att[offset]*hp3;
+          tempy3l_att += s_dummyy_loc_att[offset]*hp3;
+          tempz3l_att += s_dummyz_loc_att[offset]*hp3;
+
+      }
 #else
 
       tempx1l = s_dummyx_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
@@ -1120,6 +1177,62 @@
               + s_dummyz_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
               + s_dummyz_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
 
+      // temporary variables used for fixing attenuation in a consistent way
+
+      tempx1l_att = s_dummyx_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+              + s_dummyx_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+              + s_dummyx_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+              + s_dummyx_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+              + s_dummyx_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+      tempy1l_att = s_dummyy_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+              + s_dummyy_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+              + s_dummyy_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+              + s_dummyy_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+              + s_dummyy_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+      tempz1l_att = s_dummyz_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+              + s_dummyz_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+              + s_dummyz_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+              + s_dummyz_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+              + s_dummyz_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+      tempx2l_att = s_dummyx_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+              + s_dummyx_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+              + s_dummyx_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+              + s_dummyx_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+              + s_dummyx_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+      tempy2l_att = s_dummyy_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+              + s_dummyy_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+              + s_dummyy_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+              + s_dummyy_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+              + s_dummyy_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+      tempz2l_att = s_dummyz_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+              + s_dummyz_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+              + s_dummyz_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+              + s_dummyz_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+              + s_dummyz_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+      tempx3l_att = s_dummyx_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+              + s_dummyx_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+              + s_dummyx_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+              + s_dummyx_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+              + s_dummyx_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+      tempy3l_att = s_dummyy_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+              + s_dummyy_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+              + s_dummyy_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+              + s_dummyy_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+              + s_dummyy_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+      tempz3l_att = s_dummyz_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+              + s_dummyz_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+              + s_dummyz_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+              + s_dummyz_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+              + s_dummyz_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
 #endif
 
       // compute derivatives of ux, uy and uz with respect to x, y and z
@@ -1155,16 +1268,35 @@
       duzdxl_plus_duxdzl = duzdxl + duxdzl;
       duzdyl_plus_duydzl = duzdyl + duydzl;
 
+      // temporary variables used for fixing attenuation in a consistent way
+
+      duxdxl_att = xixl*tempx1l_att + etaxl*tempx2l_att + gammaxl*tempx3l_att;
+      duxdyl_att = xiyl*tempx1l_att + etayl*tempx2l_att + gammayl*tempx3l_att;
+      duxdzl_att = xizl*tempx1l_att + etazl*tempx2l_att + gammazl*tempx3l_att;
+
+      duydxl_att = xixl*tempy1l_att + etaxl*tempy2l_att + gammaxl*tempy3l_att;
+      duydyl_att = xiyl*tempy1l_att + etayl*tempy2l_att + gammayl*tempy3l_att;
+      duydzl_att = xizl*tempy1l_att + etazl*tempy2l_att + gammazl*tempy3l_att;
+
+      duzdxl_att = xixl*tempz1l_att + etaxl*tempz2l_att + gammaxl*tempz3l_att;
+      duzdyl_att = xiyl*tempz1l_att + etayl*tempz2l_att + gammayl*tempz3l_att;
+      duzdzl_att = xizl*tempz1l_att + etazl*tempz2l_att + gammazl*tempz3l_att;
+
+      // precompute some sums to save CPU time
+      duxdyl_plus_duydxl_att = duxdyl_att + duydxl_att;
+      duzdxl_plus_duxdzl_att = duzdxl_att + duxdzl_att;
+      duzdyl_plus_duydzl_att = duzdyl_att + duydzl_att;
+
       // computes deviatoric strain attenuation and/or for kernel calculations
       if(COMPUTE_AND_STORE_STRAIN) {
-        realw templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
+	realw templ = 0.33333333333333333333f * (duxdxl_att + duydyl_att + duzdzl_att); // 1./3. = 0.33333
 
         // local storage: stresses at this current time step
-        epsilondev_xx_loc = duxdxl - templ;
-        epsilondev_yy_loc = duydyl - templ;
-        epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl;
-        epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl;
-        epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl;
+	epsilondev_xx_loc = duxdxl_att - templ;
+        epsilondev_yy_loc = duydyl_att - templ;
+        epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl_att;
+        epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl_att;
+        epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl_att;
 
         if(NSPEC_CRUST_MANTLE_STRAIN_ONLY == 1) {
           epsilon_trace_over_3[tx] = templ;
@@ -1459,6 +1591,7 @@
 /* ----------------------------------------------------------------------------------------------- */
 
 void Kernel_2_crust_mantle(int nb_blocks_to_compute,Mesh* mp,
+			  realw d_deltat, 
                           int d_iphase,
                           int* d_ibool,
                           int* d_ispec_is_tiso,
@@ -1534,8 +1667,10 @@
                                   mp->d_phase_ispec_inner_crust_mantle,
                                   mp->num_phase_ispec_crust_mantle,
                                   d_iphase,
+				  d_deltat,
                                   mp->use_mesh_coloring_gpu,
                                   mp->d_displ_crust_mantle,
+                                  mp->d_veloc_crust_mantle,
                                   mp->d_accel_crust_mantle,
                                   d_xix, d_xiy, d_xiz,
                                   d_etax, d_etay, d_etaz,
@@ -1578,8 +1713,10 @@
                                      mp->d_phase_ispec_inner_crust_mantle,
                                      mp->num_phase_ispec_crust_mantle,
                                      d_iphase,
+                                     d_deltat,
                                      mp->use_mesh_coloring_gpu,
                                      mp->d_b_displ_crust_mantle,
+                                     mp->d_b_veloc_crust_mantle,
                                      mp->d_b_accel_crust_mantle,
                                      d_xix, d_xiy, d_xiz,
                                      d_etax, d_etay, d_etaz,
@@ -1634,6 +1771,7 @@
 extern "C"
 void FC_FUNC_(compute_forces_crust_mantle_cuda,
               COMPUTE_FORCES_CRUST_MANTLE_CUDA)(long* Mesh_pointer_f,
+                                                realw* deltat,
                                                 int* iphase) {
 
   TRACE("compute_forces_crust_mantle_cuda");
@@ -1701,6 +1839,7 @@
       //}
 
       Kernel_2_crust_mantle(nb_blocks_to_compute,mp,
+			    *deltat,
                             *iphase,
                             mp->d_ibool_crust_mantle + color_offset_nonpadded,
                             mp->d_ispec_is_tiso_crust_mantle + color_offset_ispec,
@@ -1780,6 +1919,7 @@
     // no mesh coloring: uses atomic updates
 
     Kernel_2_crust_mantle(num_elements,mp,
+			  *deltat,
                           *iphase,
                           mp->d_ibool_crust_mantle,
                           mp->d_ispec_is_tiso_crust_mantle,

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_constants_cuda.h	2012-04-13 02:00:32 UTC (rev 19943)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_constants_cuda.h	2012-04-13 15:18:05 UTC (rev 19944)
@@ -57,6 +57,7 @@
 #ifdef USE_TEXTURES
   // declaration of textures
   texture<realw, 1, cudaReadModeElementType> tex_displ;
+  texture<realw, 1, cudaReadModeElementType> tex_veloc;
   texture<realw, 1, cudaReadModeElementType> tex_accel;
 
   texture<realw, 1, cudaReadModeElementType> tex_potential;
@@ -78,6 +79,20 @@
     }
   }
 
+  void bindTexturesVeloc(realw* d_veloc)
+  {
+    cudaError_t err;
+
+    cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
+
+    err = cudaBindTexture(NULL,tex_veloc, d_veloc, channelDescFloat, NDIM*NGLOB*sizeof(realw));
+    if (err != cudaSuccess)
+    {
+      fprintf(stderr, "Error in bindTexturesVeloc for veloc: %s\n", cudaGetErrorString(err));
+      exit(1);
+    }
+  }
+
   void bindTexturesAccel(realw* d_accel)
   {
     cudaError_t err;

Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90	2012-04-13 02:00:32 UTC (rev 19943)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90	2012-04-13 15:18:05 UTC (rev 19944)
@@ -204,7 +204,7 @@
       ! on GPU
       ! contains both forward SIM_TYPE==1 and backward SIM_TYPE==3 simulations
       ! for crust/mantle
-      call compute_forces_crust_mantle_cuda(Mesh_pointer,iphase)
+      call compute_forces_crust_mantle_cuda(Mesh_pointer,deltat,iphase)
       ! for inner core
       call compute_forces_inner_core_cuda(Mesh_pointer,iphase)
     endif ! GPU_MODE



More information about the CIG-COMMITS mailing list