[cig-commits] r21036 - in seismo/3D/SPECFEM3D/trunk/src: cuda specfem3D

joseph.charles at geodynamics.org joseph.charles at geodynamics.org
Thu Nov 15 05:41:58 PST 2012


Author: joseph.charles
Date: 2012-11-15 05:41:58 -0800 (Thu, 15 Nov 2012)
New Revision: 21036

Modified:
   seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_elastic_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h
   seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu
   seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic.F90
Log:
fixes attenuation bug in GPU code by using 1st order Taylor expansion for epsilondev_loc variable


Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_elastic_cuda.cu	2012-11-15 09:20:21 UTC (rev 21035)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_elastic_cuda.cu	2012-11-15 13:41:58 UTC (rev 21036)
@@ -655,7 +655,8 @@
                               int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic,
                               int d_iphase,
                               int use_mesh_coloring_gpu,
-                              realw* d_displ, realw* d_accel,
+			      realw d_deltat,
+                              realw* d_displ,realw* d_veloc,realw* d_accel,
                               realw* d_xix, realw* d_xiy, realw* d_xiz,
                               realw* d_etax, realw* d_etay, realw* d_etaz,
                               realw* d_gammax, realw* d_gammay, realw* d_gammaz,
@@ -722,6 +723,10 @@
   realw duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl;
   realw duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl;
 
+  realw tempx1l_att,tempx2l_att,tempx3l_att,tempy1l_att,tempy2l_att,tempy3l_att,tempz1l_att,tempz2l_att,tempz3l_att;
+  realw duxdxl_att,duxdyl_att,duxdzl_att,duydxl_att,duydyl_att,duydzl_att,duzdxl_att,duzdyl_att,duzdzl_att;
+  realw duxdyl_plus_duydxl_att,duzdxl_plus_duxdzl_att,duzdyl_plus_duydzl_att;
+
   realw fac1,fac2,fac3,lambdal,mul,lambdalplus2mul,kappal;
   realw sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz;
   realw epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc;
@@ -742,6 +747,10 @@
   __shared__ realw s_dummyy_loc[NGLL3];
   __shared__ realw s_dummyz_loc[NGLL3];
 
+  __shared__ realw s_dummyx_loc_att[NGLL3];
+  __shared__ realw s_dummyy_loc_att[NGLL3];
+  __shared__ realw s_dummyz_loc_att[NGLL3];
+
   __shared__ realw s_tempx1[NGLL3];
   __shared__ realw s_tempx2[NGLL3];
   __shared__ realw s_tempx3[NGLL3];
@@ -789,6 +798,20 @@
 #endif
   }
 
+  if(ATTENUATION){
+    // use first order Taylor expansion of displacement for local storage of stresses 
+    // at this current time step, to fix attenuation in a consistent way
+#ifdef USE_TEXTURES
+    s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * tex1Dfetch(d_veloc_tex, iglob);
+    s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * tex1Dfetch(d_veloc_tex, iglob + NGLOB);
+    s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * tex1Dfetch(d_veloc_tex, iglob + 2*NGLOB);
+#else
+    s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * d_veloc[iglob*3];
+    s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * d_veloc[iglob*3 + 1];
+    s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * d_veloc[iglob*3 + 2];
+#endif
+  }
+
   if (tx < NGLL2) {
     #ifdef USE_TEXTURES_CONSTANTS
     sh_hprime_xx[tx] = tex1Dfetch(d_hprime_xx_tex,tx);
@@ -839,6 +862,44 @@
         tempz3l += s_dummyz_loc[offset]*hp3;
 
     }
+
+    if( ATTENUATION){
+      // temporary variables used for fixing attenuation in a consistent way
+
+      tempx1l_att = 0.f;
+      tempx2l_att = 0.f;
+      tempx3l_att = 0.f;
+
+      tempy1l_att = 0.f;
+      tempy2l_att = 0.f;
+      tempy3l_att = 0.f;
+
+      tempz1l_att = 0.f;
+      tempz2l_att = 0.f;
+      tempz3l_att = 0.f;
+
+      for (l=0;l<NGLLX;l++) {
+	hp1 = sh_hprime_xx[l*NGLLX+I];
+	offset = K*NGLL2+J*NGLLX+l;
+	tempx1l_att += s_dummyx_loc_att[offset]*hp1;
+	tempy1l_att += s_dummyy_loc_att[offset]*hp1;
+	tempz1l_att += s_dummyz_loc_att[offset]*hp1;
+
+	hp2 = sh_hprime_xx[l*NGLLX+J];
+	offset = K*NGLL2+l*NGLLX+I;
+	tempx2l_att += s_dummyx_loc_att[offset]*hp2;
+	tempy2l_att += s_dummyy_loc_att[offset]*hp2;
+	tempz2l_att += s_dummyz_loc_att[offset]*hp2;
+
+	hp3 = sh_hprime_xx[l*NGLLX+K];
+	offset = l*NGLL2+J*NGLLX+I;
+	tempx3l_att += s_dummyx_loc_att[offset]*hp3;
+	tempy3l_att += s_dummyy_loc_att[offset]*hp3;
+	tempz3l_att += s_dummyz_loc_att[offset]*hp3;
+
+      }
+    }
+
 #else
 
     tempx1l = s_dummyx_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
@@ -895,6 +956,64 @@
             + s_dummyz_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
             + s_dummyz_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
 
+    if( ATTENUATION){
+      // temporary variables used for fixing attenuation in a consistent way
+
+      tempx1l_att = s_dummyx_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+	+ s_dummyx_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+	+ s_dummyx_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+	+ s_dummyx_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+	+ s_dummyx_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+      tempy1l_att = s_dummyy_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+	+ s_dummyy_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+	+ s_dummyy_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+	+ s_dummyy_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+	+ s_dummyy_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+      tempz1l_att = s_dummyz_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+	+ s_dummyz_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+	+ s_dummyz_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+	+ s_dummyz_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+	+ s_dummyz_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+      tempx2l_att = s_dummyx_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+	+ s_dummyx_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+	+ s_dummyx_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+	+ s_dummyx_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+	+ s_dummyx_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+      tempy2l_att = s_dummyy_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+	+ s_dummyy_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+	+ s_dummyy_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+	+ s_dummyy_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+	+ s_dummyy_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+      tempz2l_att = s_dummyz_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+	+ s_dummyz_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+	+ s_dummyz_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+	+ s_dummyz_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+	+ s_dummyz_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+      tempx3l_att = s_dummyx_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+	+ s_dummyx_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+	+ s_dummyx_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+	+ s_dummyx_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+	+ s_dummyx_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+      tempy3l_att = s_dummyy_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+	+ s_dummyy_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+	+ s_dummyy_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+	+ s_dummyy_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+	+ s_dummyy_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+      tempz3l_att = s_dummyz_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+	+ s_dummyz_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+	+ s_dummyz_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+	+ s_dummyz_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+	+ s_dummyz_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+    }
+
 #endif
 
 // compute derivatives of ux, uy and uz with respect to x, y and z
@@ -930,26 +1049,63 @@
     duzdxl_plus_duxdzl = duzdxl + duxdzl;
     duzdyl_plus_duydzl = duzdyl + duydzl;
 
-    // computes deviatoric strain attenuation and/or for kernel calculations
-    if(COMPUTE_AND_STORE_STRAIN) {
-      realw templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
-      /*
-      epsilondev_xx[offset] = duxdxl - templ;
-      epsilondev_yy[offset] = duydyl - templ;
-      epsilondev_xy[offset] = 0.5f * duxdyl_plus_duydxl;
-      epsilondev_xz[offset] = 0.5f * duzdxl_plus_duxdzl;
-      epsilondev_yz[offset] = 0.5f * duzdyl_plus_duydzl;
-            */
-      // local storage: stresses at this current time step
-      epsilondev_xx_loc = duxdxl - templ;
-      epsilondev_yy_loc = duydyl - templ;
-      epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl;
-      epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl;
-      epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl;
+    if( ATTENUATION){
+      // temporary variables used for fixing attenuation in a consistent way
 
-      if(SIMULATION_TYPE == 3) {
-        epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
+      duxdxl_att = xixl*tempx1l_att + etaxl*tempx2l_att + gammaxl*tempx3l_att;
+      duxdyl_att = xiyl*tempx1l_att + etayl*tempx2l_att + gammayl*tempx3l_att;
+      duxdzl_att = xizl*tempx1l_att + etazl*tempx2l_att + gammazl*tempx3l_att;
+
+      duydxl_att = xixl*tempy1l_att + etaxl*tempy2l_att + gammaxl*tempy3l_att;
+      duydyl_att = xiyl*tempy1l_att + etayl*tempy2l_att + gammayl*tempy3l_att;
+      duydzl_att = xizl*tempy1l_att + etazl*tempy2l_att + gammazl*tempy3l_att;
+
+      duzdxl_att = xixl*tempz1l_att + etaxl*tempz2l_att + gammaxl*tempz3l_att;
+      duzdyl_att = xiyl*tempz1l_att + etayl*tempz2l_att + gammayl*tempz3l_att;
+      duzdzl_att = xizl*tempz1l_att + etazl*tempz2l_att + gammazl*tempz3l_att;
+
+      // precompute some sums to save CPU time
+      duxdyl_plus_duydxl_att = duxdyl_att + duydxl_att;
+      duzdxl_plus_duxdzl_att = duzdxl_att + duxdzl_att;
+      duzdyl_plus_duydzl_att = duzdyl_att + duydzl_att;
+
+      // computes deviatoric strain attenuation and/or for kernel calculations
+      if(COMPUTE_AND_STORE_STRAIN) {
+	realw templ = 0.33333333333333333333f * (duxdxl_att + duydyl_att + duzdzl_att); // 1./3. = 0.33333
+
+	// local storage: stresses at this current time step
+	epsilondev_xx_loc = duxdxl_att - templ;
+	epsilondev_yy_loc = duydyl_att - templ;
+	epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl_att;
+	epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl_att;
+	epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl_att;
+
+	if(SIMULATION_TYPE == 3) {
+	  epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
+	}
       }
+    }else{
+      // computes deviatoric strain attenuation and/or for kernel calculations
+      if(COMPUTE_AND_STORE_STRAIN) {
+	realw templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
+	/*
+	  epsilondev_xx[offset] = duxdxl - templ;
+	  epsilondev_yy[offset] = duydyl - templ;
+	  epsilondev_xy[offset] = 0.5f * duxdyl_plus_duydxl;
+	  epsilondev_xz[offset] = 0.5f * duzdxl_plus_duxdzl;
+	  epsilondev_yz[offset] = 0.5f * duzdyl_plus_duydzl;
+	*/
+	// local storage: stresses at this current time step
+	epsilondev_xx_loc = duxdxl - templ;
+	epsilondev_yy_loc = duydyl - templ;
+	epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl;
+	epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl;
+	epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl;
+
+	if(SIMULATION_TYPE == 3) {
+	  epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
+	}
+      }
     }
 
     // compute elements with an elastic isotropic rheology
@@ -1246,7 +1402,7 @@
 
 /* ----------------------------------------------------------------------------------------------- */
 
-void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,
+void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,realw d_deltat,
               int COMPUTE_AND_STORE_STRAIN,
               int ATTENUATION,int ANISOTROPY,
               int* d_ibool,
@@ -1341,7 +1497,8 @@
                                                         mp->num_phase_ispec_elastic,
                                                         d_iphase,
                                                         mp->use_mesh_coloring_gpu,
-                                                        mp->d_displ, mp->d_accel,
+						        d_deltat,
+                                                        mp->d_displ,mp->d_veloc,mp->d_accel,
                                                         d_xix, d_xiy, d_xiz,
                                                         d_etax, d_etay, d_etaz,
                                                         d_gammax, d_gammay, d_gammaz,
@@ -1399,7 +1556,8 @@
                                                            mp->num_phase_ispec_elastic,
                                                            d_iphase,
                                                            mp->use_mesh_coloring_gpu,
-                                                           mp->d_b_displ, mp->d_b_accel,
+							   d_deltat,
+                                                           mp->d_b_displ,mp->d_b_veloc,mp->d_b_accel,
                                                            d_xix, d_xiy, d_xiz,
                                                            d_etax, d_etay, d_etaz,
                                                            d_gammax, d_gammay, d_gammaz,
@@ -1470,6 +1628,7 @@
 void FC_FUNC_(compute_forces_elastic_cuda,
               COMPUTE_FORCES_ELASTIC_CUDA)(long* Mesh_pointer_f,
                                            int* iphase,
+					   realw* deltat,
                                            int* nspec_outer_elastic,
                                            int* nspec_inner_elastic,
                                            int* COMPUTE_AND_STORE_STRAIN,
@@ -1536,7 +1695,7 @@
       //  exit(EXIT_FAILURE);
       //}
 
-      Kernel_2(nb_blocks_to_compute,mp,*iphase,
+      Kernel_2(nb_blocks_to_compute,mp,*iphase,*deltat,
                *COMPUTE_AND_STORE_STRAIN,
                *ATTENUATION,*ANISOTROPY,
                mp->d_ibool + color_offset_nonpadded,
@@ -1618,7 +1777,7 @@
 
     // no mesh coloring: uses atomic updates
 
-    Kernel_2(num_elements,mp,*iphase,
+    Kernel_2(num_elements,mp,*iphase,*deltat,
              *COMPUTE_AND_STORE_STRAIN,
              *ATTENUATION,*ANISOTROPY,
              mp->d_ibool,

Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h	2012-11-15 09:20:21 UTC (rev 21035)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h	2012-11-15 13:41:58 UTC (rev 21036)
@@ -274,6 +274,7 @@
 #ifdef USE_TEXTURES_FIELDS
   // Texture references for fast non-coalesced scattered access
   const textureReference* d_displ_tex_ref_ptr;
+  const textureReference* d_veloc_tex_ref_ptr;
   const textureReference* d_accel_tex_ref_ptr;
 #endif
 

Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu	2012-11-15 09:20:21 UTC (rev 21035)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu	2012-11-15 13:41:58 UTC (rev 21036)
@@ -45,6 +45,7 @@
 #else
   #ifdef USE_TEXTURES_FIELDS
 extern texture<realw, cudaTextureType1D, cudaReadModeElementType> d_displ_tex;
+extern texture<realw, cudaTextureType1D, cudaReadModeElementType> d_veloc_tex;
 extern texture<realw, cudaTextureType1D, cudaReadModeElementType> d_accel_tex;
   #endif
 
@@ -721,6 +722,17 @@
 
   {
     #ifdef USE_OLDER_CUDA4_GPU
+      print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_veloc_tex_ref_ptr, "d_veloc_tex"), 4002);
+      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+      print_CUDA_error_if_any(cudaBindTexture(0, mp->d_veloc_tex_ref_ptr, mp->d_veloc, &channelDesc, sizeof(realw)*(*size)), 4002);
+    #else
+      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+      print_CUDA_error_if_any(cudaBindTexture(0, &d_veloc_tex, mp->d_veloc, &channelDesc, sizeof(realw)*(*size)), 4002);
+    #endif
+  }
+
+  {
+    #ifdef USE_OLDER_CUDA4_GPU
       print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_accel_tex_ref_ptr, "d_accel_tex"), 4003);
       cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
       print_CUDA_error_if_any(cudaBindTexture(0, mp->d_accel_tex_ref_ptr, mp->d_accel, &channelDesc, sizeof(realw)*(*size)), 4003);

Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic.F90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic.F90	2012-11-15 09:20:21 UTC (rev 21035)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic.F90	2012-11-15 13:41:58 UTC (rev 21036)
@@ -121,7 +121,7 @@
     else
       ! on GPU
       ! contains both forward SIM_TYPE==1 and backward SIM_TYPE==3 simulations
-      call compute_forces_elastic_cuda(Mesh_pointer, iphase, &
+      call compute_forces_elastic_cuda(Mesh_pointer, iphase, deltat, &
                                       nspec_outer_elastic, &
                                       nspec_inner_elastic, &
                                       COMPUTE_AND_STORE_STRAIN,ATTENUATION,ANISOTROPY)



More information about the CIG-COMMITS mailing list