[cig-commits] r21036 - in seismo/3D/SPECFEM3D/trunk/src: cuda specfem3D
joseph.charles at geodynamics.org
joseph.charles at geodynamics.org
Thu Nov 15 05:41:58 PST 2012
Author: joseph.charles
Date: 2012-11-15 05:41:58 -0800 (Thu, 15 Nov 2012)
New Revision: 21036
Modified:
seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_elastic_cuda.cu
seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h
seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu
seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic.F90
Log:
fixes attenuation bug in GPU code by using 1st order Taylor expansion for epsilondev_loc variable
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_elastic_cuda.cu 2012-11-15 09:20:21 UTC (rev 21035)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_elastic_cuda.cu 2012-11-15 13:41:58 UTC (rev 21036)
@@ -655,7 +655,8 @@
int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic,
int d_iphase,
int use_mesh_coloring_gpu,
- realw* d_displ, realw* d_accel,
+ realw d_deltat,
+ realw* d_displ,realw* d_veloc,realw* d_accel,
realw* d_xix, realw* d_xiy, realw* d_xiz,
realw* d_etax, realw* d_etay, realw* d_etaz,
realw* d_gammax, realw* d_gammay, realw* d_gammaz,
@@ -722,6 +723,10 @@
realw duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl;
realw duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl;
+ realw tempx1l_att,tempx2l_att,tempx3l_att,tempy1l_att,tempy2l_att,tempy3l_att,tempz1l_att,tempz2l_att,tempz3l_att;
+ realw duxdxl_att,duxdyl_att,duxdzl_att,duydxl_att,duydyl_att,duydzl_att,duzdxl_att,duzdyl_att,duzdzl_att;
+ realw duxdyl_plus_duydxl_att,duzdxl_plus_duxdzl_att,duzdyl_plus_duydzl_att;
+
realw fac1,fac2,fac3,lambdal,mul,lambdalplus2mul,kappal;
realw sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz;
realw epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc;
@@ -742,6 +747,10 @@
__shared__ realw s_dummyy_loc[NGLL3];
__shared__ realw s_dummyz_loc[NGLL3];
+ __shared__ realw s_dummyx_loc_att[NGLL3];
+ __shared__ realw s_dummyy_loc_att[NGLL3];
+ __shared__ realw s_dummyz_loc_att[NGLL3];
+
__shared__ realw s_tempx1[NGLL3];
__shared__ realw s_tempx2[NGLL3];
__shared__ realw s_tempx3[NGLL3];
@@ -789,6 +798,20 @@
#endif
}
+ if(ATTENUATION){
+ // use first order Taylor expansion of displacement for local storage of stresses
+ // at this current time step, to fix attenuation in a consistent way
+#ifdef USE_TEXTURES
+ s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * tex1Dfetch(d_veloc_tex, iglob);
+ s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * tex1Dfetch(d_veloc_tex, iglob + NGLOB);
+ s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * tex1Dfetch(d_veloc_tex, iglob + 2*NGLOB);
+#else
+ s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * d_veloc[iglob*3];
+ s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * d_veloc[iglob*3 + 1];
+ s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * d_veloc[iglob*3 + 2];
+#endif
+ }
+
if (tx < NGLL2) {
#ifdef USE_TEXTURES_CONSTANTS
sh_hprime_xx[tx] = tex1Dfetch(d_hprime_xx_tex,tx);
@@ -839,6 +862,44 @@
tempz3l += s_dummyz_loc[offset]*hp3;
}
+
+ if( ATTENUATION){
+ // temporary variables used for fixing attenuation in a consistent way
+
+ tempx1l_att = 0.f;
+ tempx2l_att = 0.f;
+ tempx3l_att = 0.f;
+
+ tempy1l_att = 0.f;
+ tempy2l_att = 0.f;
+ tempy3l_att = 0.f;
+
+ tempz1l_att = 0.f;
+ tempz2l_att = 0.f;
+ tempz3l_att = 0.f;
+
+ for (l=0;l<NGLLX;l++) {
+ hp1 = sh_hprime_xx[l*NGLLX+I];
+ offset = K*NGLL2+J*NGLLX+l;
+ tempx1l_att += s_dummyx_loc_att[offset]*hp1;
+ tempy1l_att += s_dummyy_loc_att[offset]*hp1;
+ tempz1l_att += s_dummyz_loc_att[offset]*hp1;
+
+ hp2 = sh_hprime_xx[l*NGLLX+J];
+ offset = K*NGLL2+l*NGLLX+I;
+ tempx2l_att += s_dummyx_loc_att[offset]*hp2;
+ tempy2l_att += s_dummyy_loc_att[offset]*hp2;
+ tempz2l_att += s_dummyz_loc_att[offset]*hp2;
+
+ hp3 = sh_hprime_xx[l*NGLLX+K];
+ offset = l*NGLL2+J*NGLLX+I;
+ tempx3l_att += s_dummyx_loc_att[offset]*hp3;
+ tempy3l_att += s_dummyy_loc_att[offset]*hp3;
+ tempz3l_att += s_dummyz_loc_att[offset]*hp3;
+
+ }
+ }
+
#else
tempx1l = s_dummyx_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
@@ -895,6 +956,64 @@
+ s_dummyz_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ s_dummyz_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+ if( ATTENUATION){
+ // temporary variables used for fixing attenuation in a consistent way
+
+ tempx1l_att = s_dummyx_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyx_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyx_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyx_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyx_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+ tempy1l_att = s_dummyy_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyy_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyy_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyy_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyy_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+ tempz1l_att = s_dummyz_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyz_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyz_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyz_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyz_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+ tempx2l_att = s_dummyx_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyx_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyx_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyx_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyx_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+ tempy2l_att = s_dummyy_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyy_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyy_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyy_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyy_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+ tempz2l_att = s_dummyz_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyz_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyz_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyz_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyz_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+ tempx3l_att = s_dummyx_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyx_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyx_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyx_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyx_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+ tempy3l_att = s_dummyy_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyy_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyy_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyy_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyy_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+ tempz3l_att = s_dummyz_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyz_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyz_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyz_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyz_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+ }
+
#endif
// compute derivatives of ux, uy and uz with respect to x, y and z
@@ -930,26 +1049,63 @@
duzdxl_plus_duxdzl = duzdxl + duxdzl;
duzdyl_plus_duydzl = duzdyl + duydzl;
- // computes deviatoric strain attenuation and/or for kernel calculations
- if(COMPUTE_AND_STORE_STRAIN) {
- realw templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
- /*
- epsilondev_xx[offset] = duxdxl - templ;
- epsilondev_yy[offset] = duydyl - templ;
- epsilondev_xy[offset] = 0.5f * duxdyl_plus_duydxl;
- epsilondev_xz[offset] = 0.5f * duzdxl_plus_duxdzl;
- epsilondev_yz[offset] = 0.5f * duzdyl_plus_duydzl;
- */
- // local storage: stresses at this current time step
- epsilondev_xx_loc = duxdxl - templ;
- epsilondev_yy_loc = duydyl - templ;
- epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl;
- epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl;
- epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl;
+ if( ATTENUATION){
+ // temporary variables used for fixing attenuation in a consistent way
- if(SIMULATION_TYPE == 3) {
- epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
+ duxdxl_att = xixl*tempx1l_att + etaxl*tempx2l_att + gammaxl*tempx3l_att;
+ duxdyl_att = xiyl*tempx1l_att + etayl*tempx2l_att + gammayl*tempx3l_att;
+ duxdzl_att = xizl*tempx1l_att + etazl*tempx2l_att + gammazl*tempx3l_att;
+
+ duydxl_att = xixl*tempy1l_att + etaxl*tempy2l_att + gammaxl*tempy3l_att;
+ duydyl_att = xiyl*tempy1l_att + etayl*tempy2l_att + gammayl*tempy3l_att;
+ duydzl_att = xizl*tempy1l_att + etazl*tempy2l_att + gammazl*tempy3l_att;
+
+ duzdxl_att = xixl*tempz1l_att + etaxl*tempz2l_att + gammaxl*tempz3l_att;
+ duzdyl_att = xiyl*tempz1l_att + etayl*tempz2l_att + gammayl*tempz3l_att;
+ duzdzl_att = xizl*tempz1l_att + etazl*tempz2l_att + gammazl*tempz3l_att;
+
+ // precompute some sums to save CPU time
+ duxdyl_plus_duydxl_att = duxdyl_att + duydxl_att;
+ duzdxl_plus_duxdzl_att = duzdxl_att + duxdzl_att;
+ duzdyl_plus_duydzl_att = duzdyl_att + duydzl_att;
+
+ // computes deviatoric strain attenuation and/or for kernel calculations
+ if(COMPUTE_AND_STORE_STRAIN) {
+ realw templ = 0.33333333333333333333f * (duxdxl_att + duydyl_att + duzdzl_att); // 1./3. = 0.33333
+
+ // local storage: stresses at this current time step
+ epsilondev_xx_loc = duxdxl_att - templ;
+ epsilondev_yy_loc = duydyl_att - templ;
+ epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl_att;
+ epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl_att;
+ epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl_att;
+
+ if(SIMULATION_TYPE == 3) {
+ epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
+ }
}
+ }else{
+ // computes deviatoric strain attenuation and/or for kernel calculations
+ if(COMPUTE_AND_STORE_STRAIN) {
+ realw templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
+ /*
+ epsilondev_xx[offset] = duxdxl - templ;
+ epsilondev_yy[offset] = duydyl - templ;
+ epsilondev_xy[offset] = 0.5f * duxdyl_plus_duydxl;
+ epsilondev_xz[offset] = 0.5f * duzdxl_plus_duxdzl;
+ epsilondev_yz[offset] = 0.5f * duzdyl_plus_duydzl;
+ */
+ // local storage: stresses at this current time step
+ epsilondev_xx_loc = duxdxl - templ;
+ epsilondev_yy_loc = duydyl - templ;
+ epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl;
+ epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl;
+ epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl;
+
+ if(SIMULATION_TYPE == 3) {
+ epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
+ }
+ }
}
// compute elements with an elastic isotropic rheology
@@ -1246,7 +1402,7 @@
/* ----------------------------------------------------------------------------------------------- */
-void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,
+void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,realw d_deltat,
int COMPUTE_AND_STORE_STRAIN,
int ATTENUATION,int ANISOTROPY,
int* d_ibool,
@@ -1341,7 +1497,8 @@
mp->num_phase_ispec_elastic,
d_iphase,
mp->use_mesh_coloring_gpu,
- mp->d_displ, mp->d_accel,
+ d_deltat,
+ mp->d_displ,mp->d_veloc,mp->d_accel,
d_xix, d_xiy, d_xiz,
d_etax, d_etay, d_etaz,
d_gammax, d_gammay, d_gammaz,
@@ -1399,7 +1556,8 @@
mp->num_phase_ispec_elastic,
d_iphase,
mp->use_mesh_coloring_gpu,
- mp->d_b_displ, mp->d_b_accel,
+ d_deltat,
+ mp->d_b_displ,mp->d_b_veloc,mp->d_b_accel,
d_xix, d_xiy, d_xiz,
d_etax, d_etay, d_etaz,
d_gammax, d_gammay, d_gammaz,
@@ -1470,6 +1628,7 @@
void FC_FUNC_(compute_forces_elastic_cuda,
COMPUTE_FORCES_ELASTIC_CUDA)(long* Mesh_pointer_f,
int* iphase,
+ realw* deltat,
int* nspec_outer_elastic,
int* nspec_inner_elastic,
int* COMPUTE_AND_STORE_STRAIN,
@@ -1536,7 +1695,7 @@
// exit(EXIT_FAILURE);
//}
- Kernel_2(nb_blocks_to_compute,mp,*iphase,
+ Kernel_2(nb_blocks_to_compute,mp,*iphase,*deltat,
*COMPUTE_AND_STORE_STRAIN,
*ATTENUATION,*ANISOTROPY,
mp->d_ibool + color_offset_nonpadded,
@@ -1618,7 +1777,7 @@
// no mesh coloring: uses atomic updates
- Kernel_2(num_elements,mp,*iphase,
+ Kernel_2(num_elements,mp,*iphase,*deltat,
*COMPUTE_AND_STORE_STRAIN,
*ATTENUATION,*ANISOTROPY,
mp->d_ibool,
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h 2012-11-15 09:20:21 UTC (rev 21035)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h 2012-11-15 13:41:58 UTC (rev 21036)
@@ -274,6 +274,7 @@
#ifdef USE_TEXTURES_FIELDS
// Texture references for fast non-coalesced scattered access
const textureReference* d_displ_tex_ref_ptr;
+ const textureReference* d_veloc_tex_ref_ptr;
const textureReference* d_accel_tex_ref_ptr;
#endif
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu 2012-11-15 09:20:21 UTC (rev 21035)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu 2012-11-15 13:41:58 UTC (rev 21036)
@@ -45,6 +45,7 @@
#else
#ifdef USE_TEXTURES_FIELDS
extern texture<realw, cudaTextureType1D, cudaReadModeElementType> d_displ_tex;
+extern texture<realw, cudaTextureType1D, cudaReadModeElementType> d_veloc_tex;
extern texture<realw, cudaTextureType1D, cudaReadModeElementType> d_accel_tex;
#endif
@@ -721,6 +722,17 @@
{
#ifdef USE_OLDER_CUDA4_GPU
+ print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_veloc_tex_ref_ptr, "d_veloc_tex"), 4002);
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+ print_CUDA_error_if_any(cudaBindTexture(0, mp->d_veloc_tex_ref_ptr, mp->d_veloc, &channelDesc, sizeof(realw)*(*size)), 4002);
+ #else
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+ print_CUDA_error_if_any(cudaBindTexture(0, &d_veloc_tex, mp->d_veloc, &channelDesc, sizeof(realw)*(*size)), 4002);
+ #endif
+ }
+
+ {
+ #ifdef USE_OLDER_CUDA4_GPU
print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_accel_tex_ref_ptr, "d_accel_tex"), 4003);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
print_CUDA_error_if_any(cudaBindTexture(0, mp->d_accel_tex_ref_ptr, mp->d_accel, &channelDesc, sizeof(realw)*(*size)), 4003);
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic.F90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic.F90 2012-11-15 09:20:21 UTC (rev 21035)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic.F90 2012-11-15 13:41:58 UTC (rev 21036)
@@ -121,7 +121,7 @@
else
! on GPU
! contains both forward SIM_TYPE==1 and backward SIM_TYPE==3 simulations
- call compute_forces_elastic_cuda(Mesh_pointer, iphase, &
+ call compute_forces_elastic_cuda(Mesh_pointer, iphase, deltat, &
nspec_outer_elastic, &
nspec_inner_elastic, &
COMPUTE_AND_STORE_STRAIN,ATTENUATION,ANISOTROPY)
More information about the CIG-COMMITS
mailing list