[cig-commits] r19944 - in seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER: EXAMPLES/global_s362ani_small src/cuda src/specfem3D
joseph.charles at geodynamics.org
joseph.charles at geodynamics.org
Fri Apr 13 08:18:05 PDT 2012
Author: joseph.charles
Date: 2012-04-13 08:18:05 -0700 (Fri, 13 Apr 2012)
New Revision: 19944
Modified:
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/global_s362ani_small/process.sh
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_constants_cuda.h
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90
Log:
fixes attenuation bug in cuda code by using 1st order Taylor expansion for epsilondev_loc variable
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/global_s362ani_small/process.sh
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/global_s362ani_small/process.sh 2012-04-13 02:00:32 UTC (rev 19943)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/EXAMPLES/global_s362ani_small/process.sh 2012-04-13 15:18:05 UTC (rev 19944)
@@ -11,7 +11,7 @@
##################################################
# USER PARAMETER
# source directory
-rootdir=~/SPECFEM3D_GLOBE_GPU/
+rootdir=~/SPECFEM3D_GLOBE_GPU
##################################################
@@ -40,11 +40,11 @@
#./configure F90=ifort MPIF90=mpif90 FLAGS_CHECK="-O3 -assume byterecl" FLAGS_NO_CHECK="-O3 -assume byterecl"
# compiles for a forward simulation
cp $currentdir/DATA/Par_file DATA/Par_file
+make clean
make
# backup of constants setup
cp setup/* $currentdir/OUTPUT_FILES/
-cp OUTPUT_FILES/values_from_mesher.h $currentdir/OUTPUT_FILES/
cp DATA/Par_file $currentdir/OUTPUT_FILES/
cd $currentdir
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu 2012-04-13 02:00:32 UTC (rev 19943)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu 2012-04-13 15:18:05 UTC (rev 19944)
@@ -896,8 +896,10 @@
int* d_phase_ispec_inner,
int num_phase_ispec,
int d_iphase,
+ realw d_deltat,
int use_mesh_coloring_gpu,
realw* d_displ,
+ realw* d_veloc,
realw* d_accel,
realw* d_xix, realw* d_xiy, realw* d_xiz,
realw* d_etax, realw* d_etay, realw* d_etaz,
@@ -955,6 +957,10 @@
reald duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl;
reald duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl;
+ reald tempx1l_att,tempx2l_att,tempx3l_att,tempy1l_att,tempy2l_att,tempy3l_att,tempz1l_att,tempz2l_att,tempz3l_att;
+ reald duxdxl_att,duxdyl_att,duxdzl_att,duydxl_att,duydyl_att,duydzl_att,duzdxl_att,duzdyl_att,duzdzl_att;
+ reald duxdyl_plus_duydxl_att,duzdxl_plus_duxdzl_att,duzdyl_plus_duydzl_att;
+
reald fac1,fac2,fac3;
reald minus_sum_beta,one_minus_sum_beta_use;
@@ -976,6 +982,10 @@
__shared__ reald s_dummyy_loc[NGLL3];
__shared__ reald s_dummyz_loc[NGLL3];
+ __shared__ reald s_dummyx_loc_att[NGLL3];
+ __shared__ reald s_dummyy_loc_att[NGLL3];
+ __shared__ reald s_dummyz_loc_att[NGLL3];
+
__shared__ reald s_tempx1[NGLL3];
__shared__ reald s_tempx2[NGLL3];
__shared__ reald s_tempx3[NGLL3];
@@ -1030,6 +1040,18 @@
if (active) {
+ // use first order Taylor expansion of displacement for local storage of stresses
+ // at this current time step, to fix attenuation in a consistent way
+#ifdef USE_TEXTURES
+ s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * tex1Dfetch(tex_veloc, iglob);
+ s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * tex1Dfetch(tex_veloc, iglob + NGLOB);
+ s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * tex1Dfetch(tex_veloc, iglob + 2*NGLOB);
+#else
+ s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * d_veloc[iglob*3];
+ s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * d_veloc[iglob*3 + 1];
+ s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * d_veloc[iglob*3 + 2];
+#endif
+
#ifndef MANUALLY_UNROLLED_LOOPS
tempx1l = 0.f;
@@ -1064,6 +1086,41 @@
tempz3l += s_dummyz_loc[offset]*hp3;
}
+
+ // temporary variables used for fixing attenuation in a consistent way
+
+ tempx1l_att = 0.f;
+ tempx2l_att = 0.f;
+ tempx3l_att = 0.f;
+
+ tempy1l_att = 0.f;
+ tempy2l_att = 0.f;
+ tempy3l_att = 0.f;
+
+ tempz1l_att = 0.f;
+ tempz2l_att = 0.f;
+ tempz3l_att = 0.f;
+
+ for (l=0;l<NGLLX;l++) {
+ hp1 = d_hprime_xx[l*NGLLX+I];
+ offset = K*NGLL2+J*NGLLX+l;
+ tempx1l_att += s_dummyx_loc_att[offset]*hp1;
+ tempy1l_att += s_dummyy_loc_att[offset]*hp1;
+ tempz1l_att += s_dummyz_loc_att[offset]*hp1;
+
+ hp2 = d_hprime_xx[l*NGLLX+J];
+ offset = K*NGLL2+l*NGLLX+I;
+ tempx2l_att += s_dummyx_loc_att[offset]*hp2;
+ tempy2l_att += s_dummyy_loc_att[offset]*hp2;
+ tempz2l_att += s_dummyz_loc_att[offset]*hp2;
+
+ hp3 = d_hprime_xx[l*NGLLX+K];
+ offset = l*NGLL2+J*NGLLX+I;
+ tempx3l_att += s_dummyx_loc_att[offset]*hp3;
+ tempy3l_att += s_dummyy_loc_att[offset]*hp3;
+ tempz3l_att += s_dummyz_loc_att[offset]*hp3;
+
+ }
#else
tempx1l = s_dummyx_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
@@ -1120,6 +1177,62 @@
+ s_dummyz_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ s_dummyz_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+ // temporary variables used for fixing attenuation in a consistent way
+
+ tempx1l_att = s_dummyx_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyx_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyx_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyx_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyx_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+ tempy1l_att = s_dummyy_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyy_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyy_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyy_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyy_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+ tempz1l_att = s_dummyz_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyz_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyz_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyz_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyz_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+ tempx2l_att = s_dummyx_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyx_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyx_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyx_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyx_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+ tempy2l_att = s_dummyy_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyy_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyy_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyy_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyy_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+ tempz2l_att = s_dummyz_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyz_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyz_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyz_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyz_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+ tempx3l_att = s_dummyx_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyx_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyx_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyx_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyx_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+ tempy3l_att = s_dummyy_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyy_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyy_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyy_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyy_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+ tempz3l_att = s_dummyz_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyz_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyz_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyz_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyz_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
#endif
// compute derivatives of ux, uy and uz with respect to x, y and z
@@ -1155,16 +1268,35 @@
duzdxl_plus_duxdzl = duzdxl + duxdzl;
duzdyl_plus_duydzl = duzdyl + duydzl;
+ // temporary variables used for fixing attenuation in a consistent way
+
+ duxdxl_att = xixl*tempx1l_att + etaxl*tempx2l_att + gammaxl*tempx3l_att;
+ duxdyl_att = xiyl*tempx1l_att + etayl*tempx2l_att + gammayl*tempx3l_att;
+ duxdzl_att = xizl*tempx1l_att + etazl*tempx2l_att + gammazl*tempx3l_att;
+
+ duydxl_att = xixl*tempy1l_att + etaxl*tempy2l_att + gammaxl*tempy3l_att;
+ duydyl_att = xiyl*tempy1l_att + etayl*tempy2l_att + gammayl*tempy3l_att;
+ duydzl_att = xizl*tempy1l_att + etazl*tempy2l_att + gammazl*tempy3l_att;
+
+ duzdxl_att = xixl*tempz1l_att + etaxl*tempz2l_att + gammaxl*tempz3l_att;
+ duzdyl_att = xiyl*tempz1l_att + etayl*tempz2l_att + gammayl*tempz3l_att;
+ duzdzl_att = xizl*tempz1l_att + etazl*tempz2l_att + gammazl*tempz3l_att;
+
+ // precompute some sums to save CPU time
+ duxdyl_plus_duydxl_att = duxdyl_att + duydxl_att;
+ duzdxl_plus_duxdzl_att = duzdxl_att + duxdzl_att;
+ duzdyl_plus_duydzl_att = duzdyl_att + duydzl_att;
+
// computes deviatoric strain attenuation and/or for kernel calculations
if(COMPUTE_AND_STORE_STRAIN) {
- realw templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
+ realw templ = 0.33333333333333333333f * (duxdxl_att + duydyl_att + duzdzl_att); // 1./3. = 0.33333
// local storage: stresses at this current time step
- epsilondev_xx_loc = duxdxl - templ;
- epsilondev_yy_loc = duydyl - templ;
- epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl;
- epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl;
- epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl;
+ epsilondev_xx_loc = duxdxl_att - templ;
+ epsilondev_yy_loc = duydyl_att - templ;
+ epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl_att;
+ epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl_att;
+ epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl_att;
if(NSPEC_CRUST_MANTLE_STRAIN_ONLY == 1) {
epsilon_trace_over_3[tx] = templ;
@@ -1459,6 +1591,7 @@
/* ----------------------------------------------------------------------------------------------- */
void Kernel_2_crust_mantle(int nb_blocks_to_compute,Mesh* mp,
+ realw d_deltat,
int d_iphase,
int* d_ibool,
int* d_ispec_is_tiso,
@@ -1534,8 +1667,10 @@
mp->d_phase_ispec_inner_crust_mantle,
mp->num_phase_ispec_crust_mantle,
d_iphase,
+ d_deltat,
mp->use_mesh_coloring_gpu,
mp->d_displ_crust_mantle,
+ mp->d_veloc_crust_mantle,
mp->d_accel_crust_mantle,
d_xix, d_xiy, d_xiz,
d_etax, d_etay, d_etaz,
@@ -1578,8 +1713,10 @@
mp->d_phase_ispec_inner_crust_mantle,
mp->num_phase_ispec_crust_mantle,
d_iphase,
+ d_deltat,
mp->use_mesh_coloring_gpu,
mp->d_b_displ_crust_mantle,
+ mp->d_b_veloc_crust_mantle,
mp->d_b_accel_crust_mantle,
d_xix, d_xiy, d_xiz,
d_etax, d_etay, d_etaz,
@@ -1634,6 +1771,7 @@
extern "C"
void FC_FUNC_(compute_forces_crust_mantle_cuda,
COMPUTE_FORCES_CRUST_MANTLE_CUDA)(long* Mesh_pointer_f,
+ realw* deltat,
int* iphase) {
TRACE("compute_forces_crust_mantle_cuda");
@@ -1701,6 +1839,7 @@
//}
Kernel_2_crust_mantle(nb_blocks_to_compute,mp,
+ *deltat,
*iphase,
mp->d_ibool_crust_mantle + color_offset_nonpadded,
mp->d_ispec_is_tiso_crust_mantle + color_offset_ispec,
@@ -1780,6 +1919,7 @@
// no mesh coloring: uses atomic updates
Kernel_2_crust_mantle(num_elements,mp,
+ *deltat,
*iphase,
mp->d_ibool_crust_mantle,
mp->d_ispec_is_tiso_crust_mantle,
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_constants_cuda.h 2012-04-13 02:00:32 UTC (rev 19943)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_constants_cuda.h 2012-04-13 15:18:05 UTC (rev 19944)
@@ -57,6 +57,7 @@
#ifdef USE_TEXTURES
// declaration of textures
texture<realw, 1, cudaReadModeElementType> tex_displ;
+ texture<realw, 1, cudaReadModeElementType> tex_veloc;
texture<realw, 1, cudaReadModeElementType> tex_accel;
texture<realw, 1, cudaReadModeElementType> tex_potential;
@@ -78,6 +79,20 @@
}
}
+ void bindTexturesVeloc(realw* d_veloc)
+ {
+ cudaError_t err;
+
+ cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
+
+ err = cudaBindTexture(NULL,tex_veloc, d_veloc, channelDescFloat, NDIM*NGLOB*sizeof(realw));
+ if (err != cudaSuccess)
+ {
+ fprintf(stderr, "Error in bindTexturesVeloc for veloc: %s\n", cudaGetErrorString(err));
+ exit(1);
+ }
+ }
+
void bindTexturesAccel(realw* d_accel)
{
cudaError_t err;
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90 2012-04-13 02:00:32 UTC (rev 19943)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90 2012-04-13 15:18:05 UTC (rev 19944)
@@ -204,7 +204,7 @@
! on GPU
! contains both forward SIM_TYPE==1 and backward SIM_TYPE==3 simulations
! for crust/mantle
- call compute_forces_crust_mantle_cuda(Mesh_pointer,iphase)
+ call compute_forces_crust_mantle_cuda(Mesh_pointer,deltat,iphase)
! for inner core
call compute_forces_inner_core_cuda(Mesh_pointer,iphase)
endif ! GPU_MODE
More information about the CIG-COMMITS
mailing list