[cig-commits] r22992 - seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda
danielpeter at geodynamics.org
danielpeter at geodynamics.org
Fri Jan 17 07:36:14 PST 2014
Author: danielpeter
Date: 2014-01-17 07:36:14 -0800 (Fri, 17 Jan 2014)
New Revision: 22992
Modified:
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/check_fields_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_crust_mantle_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_inner_core_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_outer_core_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/initialize_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/mesh_constants_cuda.h
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/prepare_constants_cuda.h
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/prepare_mesh_constants_cuda.cu
Log:
adds textures as an option for GPU simulation; adds kernel launch optimization for kepler architectures
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/check_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/check_fields_cuda.cu 2014-01-13 16:06:21 UTC (rev 22991)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/check_fields_cuda.cu 2014-01-17 15:36:14 UTC (rev 22992)
@@ -212,6 +212,10 @@
/* ----------------------------------------------------------------------------------------------- */
+// CUDA synchronization
+
+/* ----------------------------------------------------------------------------------------------- */
+
void synchronize_cuda(){
#if CUDA_VERSION >= 4000
cudaDeviceSynchronize();
@@ -230,7 +234,39 @@
/* ----------------------------------------------------------------------------------------------- */
+// Timing helper functions
+/* ----------------------------------------------------------------------------------------------- */
+
+void start_timing_cuda(cudaEvent_t* start,cudaEvent_t* stop){
+ // creates & starts event
+ cudaEventCreate(start);
+ cudaEventCreate(stop);
+ cudaEventRecord( *start, 0 );
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void stop_timing_cuda(cudaEvent_t* start,cudaEvent_t* stop, char* info_str){
+ realw time;
+ // stops events
+ cudaEventRecord( *stop, 0 );
+ cudaEventSynchronize( *stop );
+ cudaEventElapsedTime( &time, *start, *stop );
+ cudaEventDestroy( *start );
+ cudaEventDestroy( *stop );
+ // user output
+ printf("%s: Execution Time = %f ms\n",info_str,time);
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// CUDA kernel setup functions
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
void get_blocks_xy(int num_blocks,int* num_blocks_x,int* num_blocks_y) {
// Initially sets the blocks_x to be the num_blocks, and adds rows as needed (block size limit of 65535).
@@ -251,6 +287,10 @@
/* ----------------------------------------------------------------------------------------------- */
+// GPU device memory functions
+
+/* ----------------------------------------------------------------------------------------------- */
+
void get_free_memory(double* free_db, double* used_db, double* total_db) {
// gets memory usage in byte
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_crust_mantle_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_crust_mantle_cuda.cu 2014-01-13 16:06:21 UTC (rev 22991)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_crust_mantle_cuda.cu 2014-01-17 15:36:14 UTC (rev 22992)
@@ -53,22 +53,23 @@
// templates definitions
template<int FORWARD_OR_ADJOINT> __device__ float texfetch_displ_cm(int x);
-template<int FORWARD_OR_ADJOINT> __device__ float texfetch_veloc_cm(int x);
template<int FORWARD_OR_ADJOINT> __device__ float texfetch_accel_cm(int x);
// templates for texture fetching
// FORWARD_OR_ADJOINT == 1 <- forward arrays
template<> __device__ float texfetch_displ_cm<1>(int x) { return tex1Dfetch(d_displ_cm_tex, x); }
-//template<> __device__ float texfetch_veloc_cm<1>(int x) { return tex1Dfetch(d_veloc_cm_tex, x); }
template<> __device__ float texfetch_accel_cm<1>(int x) { return tex1Dfetch(d_accel_cm_tex, x); }
// FORWARD_OR_ADJOINT == 3 <- backward/reconstructed arrays
template<> __device__ float texfetch_displ_cm<3>(int x) { return tex1Dfetch(d_b_displ_cm_tex, x); }
-//template<> __device__ float texfetch_veloc_cm<3>(int x) { return tex1Dfetch(d_b_veloc_cm_tex, x); }
template<> __device__ float texfetch_accel_cm<3>(int x) { return tex1Dfetch(d_b_accel_cm_tex, x); }
#endif
#ifdef USE_TEXTURES_CONSTANTS
-texture<realw, cudaTextureType1D, cudaReadModeElementType> d_hprime_xx_cm_tex;
+realw_texture d_hprime_xx_tex;
+__constant__ size_t d_hprime_xx_tex_offset;
+// weighted
+realw_texture d_hprimewgll_xx_tex;
+__constant__ size_t d_hprimewgll_xx_tex_offset;
#endif
@@ -81,11 +82,11 @@
// updates stress
__device__ void compute_element_cm_att_stress(int tx,int working_element,
- realw* R_xx,
- realw* R_yy,
- realw* R_xy,
- realw* R_xz,
- realw* R_yz,
+ realw_p R_xx,
+ realw_p R_yy,
+ realw_p R_xy,
+ realw_p R_xz,
+ realw_p R_yz,
realw* sigma_xx,
realw* sigma_yy,
realw* sigma_zz,
@@ -123,21 +124,22 @@
// updates R_memory
__device__ void compute_element_cm_att_memory(int tx,int working_element,
- realw* d_muvstore,
- realw* factor_common,
- realw* alphaval,realw* betaval,realw* gammaval,
- realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
- realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
- realw* epsilondev_xz,realw* epsilondev_yz,
+ realw_const_p d_muvstore,
+ realw_const_p factor_common,
+ realw_const_p alphaval,realw_const_p betaval,realw_const_p gammaval,
+ realw_p R_xx,realw_p R_yy,realw_p R_xy,realw_p R_xz,realw_p R_yz,
+ realw_p epsilondev_xx,realw_p epsilondev_yy,realw_p epsilondev_xy,
+ realw_p epsilondev_xz,realw_p epsilondev_yz,
realw epsilondev_xx_loc,realw epsilondev_yy_loc,realw epsilondev_xy_loc,
realw epsilondev_xz_loc,realw epsilondev_yz_loc,
- realw* d_c44store,
- int ANISOTROPY,
- int USE_3D_ATTENUATION_ARRAYS) {
+ realw_const_p d_c44store,
+ const int ANISOTROPY,
+ const int USE_3D_ATTENUATION_ARRAYS) {
realw fac;
+ realw factor_loc;
realw alphaval_loc,betaval_loc,gammaval_loc;
- realw factor_loc,Sn,Snp1;
+ realw Sn,Snp1;
int offset_sls;
// shear moduli for common factor (only Q_mu attenuation)
@@ -204,13 +206,13 @@
// pre-computes gravity term
-__device__ void compute_element_cm_gravity(int tx,int working_element,
- int* d_ibool,
- realw* d_xstore,realw* d_ystore,realw* d_zstore,
- realw* d_minus_gravity_table,
- realw* d_minus_deriv_gravity_table,
- realw* d_density_table,
- realw* wgll_cube,
+__device__ void compute_element_cm_gravity(int tx,
+ const int iglob,
+ realw_const_p d_xstore,realw_const_p d_ystore,realw_const_p d_zstore,
+ realw_const_p d_minus_gravity_table,
+ realw_const_p d_minus_deriv_gravity_table,
+ realw_const_p d_density_table,
+ realw_const_p wgll_cube,
realw jacobianl,
realw* s_dummyx_loc,
realw* s_dummyy_loc,
@@ -238,6 +240,7 @@
realw Hxxl,Hyyl,Hzzl,Hxyl,Hxzl,Hyzl;
realw sx_l,sy_l,sz_l;
realw factor;
+ int int_radius;
// R_EARTH_KM is the radius of the bottom of the oceans (radius of Earth in km)
//const realw R_EARTH_KM = 6371.0f;
@@ -248,13 +251,11 @@
// use mesh coordinates to get theta and phi
// x y z contain r theta phi
- int iglob = d_ibool[working_element*NGLL3 + tx]-1;
-
radius = d_xstore[iglob];
theta = d_ystore[iglob];
phi = d_zstore[iglob];
- if( sizeof( theta ) == sizeof( float ) ){
+ if( sizeof( realw ) == sizeof( float ) ){
// float operations
// sincos function return sinus and cosine for given value
sincosf(theta, &sin_theta, &cos_theta);
@@ -269,7 +270,7 @@
// for efficiency replace with lookup table every 100 m in radial direction
// note: radius in crust mantle should never be zero,
// and arrays in C start from 0, thus we need to subtract -1
- int int_radius = rint(radius * R_EARTH_KM * 10.0f ) - 1;
+ int_radius = rint(radius * R_EARTH_KM * 10.0f ) - 1;
// get g, rho and dg/dr=dg
// spherical components of the gravitational acceleration
@@ -332,14 +333,14 @@
// computes stresses for anisotropic element
__device__ void compute_element_cm_aniso(int offset,
- realw* d_c11store,realw* d_c12store,realw* d_c13store,
- realw* d_c14store,realw* d_c15store,realw* d_c16store,
- realw* d_c22store,realw* d_c23store,realw* d_c24store,
- realw* d_c25store,realw* d_c26store,realw* d_c33store,
- realw* d_c34store,realw* d_c35store,realw* d_c36store,
- realw* d_c44store,realw* d_c45store,realw* d_c46store,
- realw* d_c55store,realw* d_c56store,realw* d_c66store,
- int ATTENUATION,
+ realw_const_p d_c11store,realw_const_p d_c12store,realw_const_p d_c13store,
+ realw_const_p d_c14store,realw_const_p d_c15store,realw_const_p d_c16store,
+ realw_const_p d_c22store,realw_const_p d_c23store,realw_const_p d_c24store,
+ realw_const_p d_c25store,realw_const_p d_c26store,realw_const_p d_c33store,
+ realw_const_p d_c34store,realw_const_p d_c35store,realw_const_p d_c36store,
+ realw_const_p d_c44store,realw_const_p d_c45store,realw_const_p d_c46store,
+ realw_const_p d_c55store,realw_const_p d_c56store,realw_const_p d_c66store,
+ const int ATTENUATION,
realw one_minus_sum_beta_use,
realw duxdxl,realw duxdyl,realw duxdzl,
realw duydxl,realw duydyl,realw duydzl,
@@ -409,8 +410,8 @@
// computes stresses for isotropic element
__device__ void compute_element_cm_iso(int offset,
- realw* d_kappavstore,realw* d_muvstore,
- int ATTENUATION,
+ realw_const_p d_kappavstore,realw_const_p d_muvstore,
+ const int ATTENUATION,
realw one_minus_sum_beta_use,
realw duxdxl,realw duydyl,realw duzdzl,
realw duxdxl_plus_duydyl,realw duxdxl_plus_duzdzl,realw duydyl_plus_duzdzl,
@@ -448,16 +449,16 @@
// computes stresses for transversely isotropic element
__device__ void compute_element_cm_tiso(int offset,
- realw* d_kappavstore,realw* d_muvstore,
- realw* d_kappahstore,realw* d_muhstore,realw* d_eta_anisostore,
- int ATTENUATION,
+ realw_const_p d_kappavstore,realw_const_p d_muvstore,
+ realw_const_p d_kappahstore,realw_const_p d_muhstore,realw_const_p d_eta_anisostore,
+ const int ATTENUATION,
realw one_minus_sum_beta_use,
realw duxdxl,realw duxdyl,realw duxdzl,
realw duydxl,realw duydyl,realw duydzl,
realw duzdxl,realw duzdyl,realw duzdzl,
realw duxdyl_plus_duydxl,realw duzdxl_plus_duxdzl,realw duzdyl_plus_duydzl,
- int iglob,int NGLOB,
- realw* d_ystore, realw* d_zstore,
+ int iglob,
+ realw_const_p d_ystore, realw_const_p d_zstore,
realw* sigma_xx,realw* sigma_yy,realw* sigma_zz,
realw* sigma_xy,realw* sigma_xz,realw* sigma_yz){
@@ -472,7 +473,6 @@
realw two_rhovsvsq,two_rhovshsq;
realw four_rhovsvsq,four_rhovshsq;
realw c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66;
-
// cosine and sine function in CUDA only supported for float
realw theta,phi;
@@ -503,7 +503,7 @@
theta = d_ystore[iglob];
phi = d_zstore[iglob];
- if( sizeof( theta ) == sizeof( float ) ){
+ if( sizeof( realw ) == sizeof( float ) ){
// float operations
// sincos function return sinus and cosine for given value
@@ -704,62 +704,189 @@
c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl;
}
+/* ----------------------------------------------------------------------------------------------- */
+// loads displacement into shared memory for element
+
+template<int FORWARD_OR_ADJOINT>
+__device__ void load_shared_memory_cm(const int* tx, const int* iglob,
+ realw_const_p d_displ,
+ realw* s_dummyx_loc,
+ realw* s_dummyy_loc,
+ realw* s_dummyz_loc){
+
+ // copy from global memory to shared memory
+ // each thread writes one of the NGLL^3 = 125 data points
+#ifdef USE_TEXTURES_FIELDS
+ s_dummyx_loc[(*tx)] = texfetch_displ_cm<FORWARD_OR_ADJOINT>((*iglob)*3);
+ s_dummyy_loc[(*tx)] = texfetch_displ_cm<FORWARD_OR_ADJOINT>((*iglob)*3 + 1);
+ s_dummyz_loc[(*tx)] = texfetch_displ_cm<FORWARD_OR_ADJOINT>((*iglob)*3 + 2);
+#else
+ // changing iglob indexing to match fortran row changes fast style
+ s_dummyx_loc[(*tx)] = d_displ[(*iglob)*3];
+ s_dummyy_loc[(*tx)] = d_displ[(*iglob)*3 + 1];
+ s_dummyz_loc[(*tx)] = d_displ[(*iglob)*3 + 2];
+#endif
+}
+
/* ----------------------------------------------------------------------------------------------- */
+// loads hprime / hprimewgll coefficients into shared memory for element
+/*
+__device__ void load_shared_memory_hprime_hprimewgll(const int* tx,
+ realw_const_p d_hprime_xx,
+ realw_const_p d_hprimewgll_xx,
+ realw* sh_hprime_xx,
+ realw* sh_hprimewgll_xx ){
+
+ // way 1:
+ // each thread reads its corresponding value
+ // (might be faster sometimes...)
+ if( (*tx) < NGLL2 ) {
+#ifdef USE_TEXTURES_CONSTANTS
+ // debug
+ //printf("tex1dfetch offsets: %lu %lu\n",d_hprime_xx_tex_offset,d_hprimewgll_xx_tex_offset);
+ // hprime
+ sh_hprime_xx[(*tx)] = tex1Dfetch(d_hprime_xx_tex,tx + d_hprime_xx_tex_offset);
+ // weighted hprime
+ sh_hprimewgll_xx[(*tx)] = tex1Dfetch(d_hprimewgll_xx_tex,tx + d_hprimewgll_xx_tex_offset);
+#else
+ // hprime
+ sh_hprime_xx[(*tx)] = d_hprime_xx[(*tx)];
+ // weighted hprime
+ sh_hprimewgll_xx[(*tx)] = d_hprimewgll_xx[(*tx)];
+#endif
+ }
+
+ // way 2:
+ // gets constant arrays into shared memory
+ // (only ghost threads which would be idle anyway)
+ // slightly slower on Kepler...
+ if( (*tx) == NGLL3_PADDED-1 ) {
+ for(int m=0; m < NGLL2; m++){
+#ifdef USE_TEXTURES_CONSTANTS
+ // debug
+ //printf("tex1dfetch offsets: %lu %lu\n",d_hprime_xx_tex_offset,d_hprimewgll_xx_tex_offset);
+ // hprime
+ sh_hprime_xx[m] = tex1Dfetch(d_hprime_xx_tex,m + d_hprime_xx_tex_offset);
+ // weighted hprime
+ sh_hprimewgll_xx[m] = tex1Dfetch(d_hprimewgll_xx_tex,m + d_hprimewgll_xx_tex_offset);
+#else
+ // hprime
+ sh_hprime_xx[m] = d_hprime_xx[m];
+ // weighted hprime
+ sh_hprimewgll_xx[m] = d_hprimewgll_xx[m];
+#endif
+ }
+ }
+
+}
+*/
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// loads hprime coefficients into shared memory for element
+
+__device__ void load_shared_memory_hprime(const int* tx,
+ realw_const_p d_hprime_xx,
+ realw* sh_hprime_xx){
+
+ // each thread reads its corresponding value
+ // (might be faster sometimes...)
+#ifdef USE_TEXTURES_CONSTANTS
+ // hprime
+  sh_hprime_xx[(*tx)] = tex1Dfetch(d_hprime_xx_tex,(*tx) + d_hprime_xx_tex_offset);
+#else
+ // hprime
+ sh_hprime_xx[(*tx)] = d_hprime_xx[(*tx)];
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// loads weighted hprime coefficients into shared memory for element
+
+__device__ void load_shared_memory_hprimewgll(const int* tx,
+ realw_const_p d_hprimewgll_xx,
+ realw* sh_hprimewgll_xx ){
+
+ // each thread reads its corresponding value
+#ifdef USE_TEXTURES_CONSTANTS
+ // weighted hprime
+  sh_hprimewgll_xx[(*tx)] = tex1Dfetch(d_hprimewgll_xx_tex,(*tx) + d_hprimewgll_xx_tex_offset);
+#else
+ // weighted hprime
+ sh_hprimewgll_xx[(*tx)] = d_hprimewgll_xx[(*tx)];
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
// KERNEL 2
//
// for crust_mantle
/* ----------------------------------------------------------------------------------------------- */
-__global__ void Kernel_2_crust_mantle_impl(int nb_blocks_to_compute,
- int NGLOB,
- int* d_ibool,
- int* d_ispec_is_tiso,
- int* d_phase_ispec_inner,
- int num_phase_ispec,
- int d_iphase,
- realw deltat,
- int use_mesh_coloring_gpu,
- realw* d_displ,
- realw* d_veloc,
- realw* d_accel,
- realw* d_xix, realw* d_xiy, realw* d_xiz,
- realw* d_etax, realw* d_etay, realw* d_etaz,
- realw* d_gammax, realw* d_gammay, realw* d_gammaz,
- realw* d_hprime_xx,
- realw* d_hprimewgll_xx,
- realw* d_wgllwgll_xy,realw* d_wgllwgll_xz,realw* d_wgllwgll_yz,
- realw* d_kappavstore, realw* d_muvstore,
- realw* d_kappahstore, realw* d_muhstore,
- realw* d_eta_anisostore,
- int COMPUTE_AND_STORE_STRAIN,
- realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
- realw* epsilondev_xz,realw* epsilondev_yz,
- realw* epsilon_trace_over_3,
- int ATTENUATION,
- int PARTIAL_PHYS_DISPERSION_ONLY,
- int USE_3D_ATTENUATION_ARRAYS,
- realw* one_minus_sum_beta,realw* factor_common,
- realw* R_xx, realw* R_yy, realw* R_xy, realw* R_xz, realw* R_yz,
- realw* alphaval,realw* betaval,realw* gammaval,
- int ANISOTROPY,
- realw* d_c11store,realw* d_c12store,realw* d_c13store,
- realw* d_c14store,realw* d_c15store,realw* d_c16store,
- realw* d_c22store,realw* d_c23store,realw* d_c24store,
- realw* d_c25store,realw* d_c26store,realw* d_c33store,
- realw* d_c34store,realw* d_c35store,realw* d_c36store,
- realw* d_c44store,realw* d_c45store,realw* d_c46store,
- realw* d_c55store,realw* d_c56store,realw* d_c66store,
- int GRAVITY,
- realw* d_xstore,realw* d_ystore,realw* d_zstore,
- realw* d_minus_gravity_table,
- realw* d_minus_deriv_gravity_table,
- realw* d_density_table,
- realw* wgll_cube,
- int NSPEC_CRUST_MANTLE_STRAIN_ONLY){
+template<int FORWARD_OR_ADJOINT> __global__ void
+#ifdef USE_LAUNCH_BOUNDS
+// adds compiler specification
+__launch_bounds__(NGLL3_PADDED,LAUNCH_MIN_BLOCKS)
+#endif
+// main kernel
+Kernel_2_crust_mantle_impl( int nb_blocks_to_compute,
+ const int* d_ibool,
+ const int* d_ispec_is_tiso,
+ const int* d_phase_ispec_inner,
+ int num_phase_ispec,
+ const int d_iphase,
+ realw deltat,
+ const int use_mesh_coloring_gpu,
+ realw_const_p d_displ,
+ realw_p d_accel,
+ realw_const_p d_xix, realw_const_p d_xiy, realw_const_p d_xiz,
+ realw_const_p d_etax, realw_const_p d_etay, realw_const_p d_etaz,
+ realw_const_p d_gammax, realw_const_p d_gammay, realw_const_p d_gammaz,
+ realw_const_p d_hprime_xx,
+ realw_const_p d_hprimewgll_xx,
+ realw_const_p d_wgllwgll_xy,
+ realw_const_p d_wgllwgll_xz,
+ realw_const_p d_wgllwgll_yz,
+ realw_const_p d_kappavstore,
+ realw_const_p d_muvstore,
+ realw_const_p d_kappahstore,
+ realw_const_p d_muhstore,
+ realw_const_p d_eta_anisostore,
+ const int COMPUTE_AND_STORE_STRAIN,
+ realw_p epsilondev_xx,realw_p epsilondev_yy,realw_p epsilondev_xy,
+ realw_p epsilondev_xz,realw_p epsilondev_yz,
+ realw_p epsilon_trace_over_3,
+ const int ATTENUATION,
+ const int PARTIAL_PHYS_DISPERSION_ONLY,
+ const int USE_3D_ATTENUATION_ARRAYS,
+ realw_const_p one_minus_sum_beta,
+ realw_const_p factor_common,
+ realw_p R_xx, realw_p R_yy, realw_p R_xy, realw_p R_xz, realw_p R_yz,
+ realw_const_p alphaval,
+ realw_const_p betaval,
+ realw_const_p gammaval,
+ const int ANISOTROPY,
+ realw_const_p d_c11store,realw_const_p d_c12store,realw_const_p d_c13store,
+ realw_const_p d_c14store,realw_const_p d_c15store,realw_const_p d_c16store,
+ realw_const_p d_c22store,realw_const_p d_c23store,realw_const_p d_c24store,
+ realw_const_p d_c25store,realw_const_p d_c26store,realw_const_p d_c33store,
+ realw_const_p d_c34store,realw_const_p d_c35store,realw_const_p d_c36store,
+ realw_const_p d_c44store,realw_const_p d_c45store,realw_const_p d_c46store,
+ realw_const_p d_c55store,realw_const_p d_c56store,realw_const_p d_c66store,
+ const int GRAVITY,
+ realw_const_p d_xstore,realw_const_p d_ystore,realw_const_p d_zstore,
+ realw_const_p d_minus_gravity_table,
+ realw_const_p d_minus_deriv_gravity_table,
+ realw_const_p d_density_table,
+ realw_const_p wgll_cube,
+ const int NSPEC_CRUST_MANTLE_STRAIN_ONLY ){
// block id == spectral-element id
int bx = blockIdx.y*gridDim.x+blockIdx.x;
@@ -770,8 +897,8 @@
int J = ((tx-K*NGLL2)/NGLLX);
int I = (tx-K*NGLL2-J*NGLLX);
- int active,offset;
- int iglob;
+ unsigned short int active;
+ int iglob,offset;
int working_element;
realw tempx1l,tempx2l,tempx3l,tempy1l,tempy2l,tempy3l,tempz1l,tempz2l,tempz3l;
@@ -793,9 +920,10 @@
realw rho_s_H1,rho_s_H2,rho_s_H3;
#ifndef MANUALLY_UNROLLED_LOOPS
- int l;
+ int l;
#endif
+ // shared memory arrays
__shared__ realw s_dummyx_loc[NGLL3];
__shared__ realw s_dummyy_loc[NGLL3];
__shared__ realw s_dummyz_loc[NGLL3];
@@ -812,17 +940,17 @@
__shared__ realw s_tempz2[NGLL3];
__shared__ realw s_tempz3[NGLL3];
+ // note: using shared memory for hprime's improves performance
+ // (but could tradeoff with occupancy)
__shared__ realw sh_hprime_xx[NGLL2];
__shared__ realw sh_hprimewgll_xx[NGLL2];
-// use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads,
-// because we used memory padding from NGLL^3 = 125 to 128 to get coalescent memory accesses
+ // use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads,
+ // because we used memory padding from NGLL^3 = 125 to 128 to get coalescent memory accesses
active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0;
-// copy from global memory to shared memory
-// each thread writes one of the NGLL^3 = 125 data points
- if (active) {
-
+ // determines spectral element to work on
+ if( active ) {
#ifdef USE_MESH_COLORING_GPU
working_element = bx;
#else
@@ -834,57 +962,30 @@
working_element = d_phase_ispec_inner[bx + num_phase_ispec*(d_iphase-1)]-1;
}
#endif
+ // local padded index
+ offset = working_element*NGLL3_PADDED + tx;
- // iglob = d_ibool[working_element*NGLL3_PADDED + tx]-1;
+ // global index
iglob = d_ibool[working_element*NGLL3 + tx]-1;
-#ifdef USE_TEXTURES_FIELDS
- s_dummyx_loc[tx] = tex1Dfetch(d_displ_cm_tex, iglob*3);
- s_dummyy_loc[tx] = tex1Dfetch(d_displ_cm_tex, iglob*3 + 1);
- s_dummyz_loc[tx] = tex1Dfetch(d_displ_cm_tex, iglob*3 + 2);
-#else
- // changing iglob indexing to match fortran row changes fast style
- s_dummyx_loc[tx] = d_displ[iglob*3];
- s_dummyy_loc[tx] = d_displ[iglob*3 + 1];
- s_dummyz_loc[tx] = d_displ[iglob*3 + 2];
-#endif
-
+ // copy displacement from global memory to shared memory
+ load_shared_memory_cm<FORWARD_OR_ADJOINT>(&tx,&iglob,d_displ,s_dummyx_loc,s_dummyy_loc,s_dummyz_loc);
} // active
- // gets constant arrays into shared memory
- // (only ghost threads which would be idle anyway)
- if (tx == NGLL3_PADDED-1) {
- for(int m=0; m < NGLL2; m++){
- // hprime
-#ifdef USE_TEXTURES_CONSTANTS
- sh_hprime_xx[m] = tex1Dfetch(d_hprime_xx_cm_tex,m);
-#else
- sh_hprime_xx[m] = d_hprime_xx[m];
-#endif
- // weighted hprime
- sh_hprimewgll_xx[m] = d_hprimewgll_xx[m];
- }
+ // loads hprime's into shared memory
+ if( tx < NGLL2 ) {
+ // copy hprime from global memory to shared memory
+ load_shared_memory_hprime(&tx,d_hprime_xx,sh_hprime_xx);
+ // copy hprime from global memory to shared memory
+ load_shared_memory_hprimewgll(&tx,d_hprimewgll_xx,sh_hprimewgll_xx);
}
-/*
- if (tx < NGLL2) {
- // hprime
-#ifdef USE_TEXTURES_CONSTANTS
- sh_hprime_xx[tx] = tex1Dfetch(d_hprime_xx_cm_tex,tx);
-#else
- sh_hprime_xx[tx] = d_hprime_xx[tx];
-#endif
- // weighted hprime
- sh_hprimewgll_xx[tx] = d_hprimewgll_xx[tx];
- }
-*/
-
// synchronize all the threads (one thread for each of the NGLL grid points of the
// current spectral element) because we need the whole element to be ready in order
// to be able to compute the matrix products along cut planes of the 3D element below
__syncthreads();
- if (active) {
+ if( active ) {
#ifndef MANUALLY_UNROLLED_LOOPS
tempx1l = 0.f;
@@ -975,8 +1076,6 @@
#endif
// compute derivatives of ux, uy and uz with respect to x, y and z
- offset = working_element*NGLL3_PADDED + tx;
-
xixl = d_xix[offset];
xiyl = d_xiy[offset];
xizl = d_xiz[offset];
@@ -1072,7 +1171,7 @@
duydxl,duydyl,duydzl,
duzdxl,duzdyl,duzdzl,
duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl,
- iglob, NGLOB,
+ iglob,
d_ystore,d_zstore,
&sigma_xx,&sigma_yy,&sigma_zz,
&sigma_xy,&sigma_xz,&sigma_yz);
@@ -1098,8 +1197,8 @@
if( GRAVITY ){
// computes non-symmetric terms for gravity
- compute_element_cm_gravity(tx,working_element,
- d_ibool,d_xstore,d_ystore,d_zstore,
+ compute_element_cm_gravity(tx,iglob,
+ d_xstore,d_ystore,d_zstore,
d_minus_gravity_table,d_minus_deriv_gravity_table,d_density_table,
wgll_cube,jacobianl,
s_dummyx_loc,s_dummyy_loc,s_dummyz_loc,
@@ -1120,7 +1219,6 @@
s_tempx3[tx] = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl);
s_tempy3[tx] = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl);
s_tempz3[tx] = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl);
-
}
// synchronize all the threads (one thread for each of the NGLL grid points of the
@@ -1128,7 +1226,7 @@
// to be able to compute the matrix products along cut planes of the 3D element below
__syncthreads();
- if (active) {
+ if( active ) {
#ifndef MANUALLY_UNROLLED_LOOPS
tempx1l = 0.f;
@@ -1233,14 +1331,13 @@
sum_terms3 += rho_s_H3;
}
-
#ifdef USE_MESH_COLORING_GPU
// no atomic operation needed, colors don't share global points between elements
#ifdef USE_TEXTURES_FIELDS
- d_accel[iglob*3] = tex1Dfetch(d_accel_cm_tex, iglob*3) + sum_terms1;
- d_accel[iglob*3 + 1] = tex1Dfetch(d_accel_cm_tex, iglob*3 + 1) + sum_terms2;
- d_accel[iglob*3 + 2] = tex1Dfetch(d_accel_cm_tex, iglob*3 + 2) + sum_terms3;
+ d_accel[iglob*3] = texfetch_accel_cm<FORWARD_OR_ADJOINT>(iglob*3) + sum_terms1;
+ d_accel[iglob*3 + 1] = texfetch_accel_cm<FORWARD_OR_ADJOINT>(iglob*3 + 1) + sum_terms2;
+ d_accel[iglob*3 + 2] = texfetch_accel_cm<FORWARD_OR_ADJOINT>(iglob*3 + 2) + sum_terms3;
#else
d_accel[iglob*3] += sum_terms1;
d_accel[iglob*3 + 1] += sum_terms2;
@@ -1254,9 +1351,9 @@
// no atomic operation needed, colors don't share global points between elements
#ifdef USE_TEXTURES_FIELDS
- d_accel[iglob*3] = tex1Dfetch(d_accel_cm_tex, iglob*3) + sum_terms1;
- d_accel[iglob*3 + 1] = tex1Dfetch(d_accel_cm_tex, iglob*3 + 1) + sum_terms2;
- d_accel[iglob*3 + 2] = tex1Dfetch(d_accel_cm_tex, iglob*3 + 2) + sum_terms3;
+ d_accel[iglob*3] = texfetch_accel_cm<FORWARD_OR_ADJOINT>(iglob*3) + sum_terms1;
+ d_accel[iglob*3 + 1] = texfetch_accel_cm<FORWARD_OR_ADJOINT>(iglob*3 + 1) + sum_terms2;
+ d_accel[iglob*3 + 2] = texfetch_accel_cm<FORWARD_OR_ADJOINT>(iglob*3 + 2) + sum_terms3;
#else
d_accel[iglob*3] += sum_terms1;
d_accel[iglob*3 + 1] += sum_terms2;
@@ -1364,15 +1461,13 @@
dim3 threads(blocksize,1,1);
// Cuda timing
- // cudaEvent_t start, stop;
- // realw time;
- // cudaEventCreate(&start);
- // cudaEventCreate(&stop);
- // cudaEventRecord( start, 0 );
+ //cudaEvent_t start, stop;
+ //start_timing_cuda(&start,&stop);
+ // calls kernel functions on GPU device
if( FORWARD_OR_ADJOINT == 1 ){
- Kernel_2_crust_mantle_impl<<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
- mp->NGLOB_CRUST_MANTLE,
+ // forward wavefields -> FORWARD_OR_ADJOINT == 1
+ Kernel_2_crust_mantle_impl<1><<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
d_ibool,
d_ispec_is_tiso,
mp->d_phase_ispec_inner_crust_mantle,
@@ -1381,7 +1476,6 @@
mp->deltat,
mp->use_mesh_coloring_gpu,
mp->d_displ_crust_mantle,
- mp->d_veloc_crust_mantle,
mp->d_accel_crust_mantle,
d_xix, d_xiy, d_xiz,
d_etax, d_etay, d_etaz,
@@ -1418,11 +1512,11 @@
mp->d_wgll_cube,
mp->NSPEC_CRUST_MANTLE_STRAIN_ONLY);
}else if( FORWARD_OR_ADJOINT == 3 ){
+ // backward/reconstructed wavefields -> FORWARD_OR_ADJOINT == 3
// debug
DEBUG_BACKWARD_FORCES();
- Kernel_2_crust_mantle_impl<<< grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
- mp->NGLOB_CRUST_MANTLE,
+ Kernel_2_crust_mantle_impl<3><<< grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
d_ibool,
d_ispec_is_tiso,
mp->d_phase_ispec_inner_crust_mantle,
@@ -1431,7 +1525,6 @@
mp->b_deltat,
mp->use_mesh_coloring_gpu,
mp->d_b_displ_crust_mantle,
- mp->d_b_veloc_crust_mantle,
mp->d_b_accel_crust_mantle,
d_xix, d_xiy, d_xiz,
d_etax, d_etay, d_etaz,
@@ -1469,15 +1562,9 @@
mp->NSPEC_CRUST_MANTLE_STRAIN_ONLY);
}
- // cudaEventRecord( stop, 0 );
- // cudaEventSynchronize( stop );
- // cudaEventElapsedTime( &time, start, stop );
- // cudaEventDestroy( start );
- // cudaEventDestroy( stop );
- // printf("Kernel2 Execution Time: %f ms\n",time);
+ // Cuda timing
+ //stop_timing_cuda(&start,&stop,"Kernel_2_crust_mantle");
- /* cudaThreadSynchronize(); */
- /* LOG("Kernel 2 finished"); */
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("Kernel_2_crust_mantle");
#endif
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_inner_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_inner_core_cuda.cu 2014-01-13 16:06:21 UTC (rev 22991)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_inner_core_cuda.cu 2014-01-17 15:36:14 UTC (rev 22992)
@@ -38,15 +38,25 @@
#include "mesh_constants_cuda.h"
#ifdef USE_TEXTURES_FIELDS
-texture<realw, cudaTextureType1D, cudaReadModeElementType> d_displ_ic_tex;
-texture<realw, cudaTextureType1D, cudaReadModeElementType> d_accel_ic_tex;
+//forward
+realw_texture d_displ_ic_tex;
+realw_texture d_accel_ic_tex;
+//backward/reconstructed
+realw_texture d_b_displ_ic_tex;
+realw_texture d_b_accel_ic_tex;
+// templates definitions
+template<int FORWARD_OR_ADJOINT> __device__ float texfetch_displ_ic(int x);
+template<int FORWARD_OR_ADJOINT> __device__ float texfetch_accel_ic(int x);
+// templates for texture fetching
+// FORWARD_OR_ADJOINT == 1 <- forward arrays
+template<> __device__ float texfetch_displ_ic<1>(int x) { return tex1Dfetch(d_displ_ic_tex, x); }
+template<> __device__ float texfetch_accel_ic<1>(int x) { return tex1Dfetch(d_accel_ic_tex, x); }
+// FORWARD_OR_ADJOINT == 3 <- backward/reconstructed arrays
+template<> __device__ float texfetch_displ_ic<3>(int x) { return tex1Dfetch(d_b_displ_ic_tex, x); }
+template<> __device__ float texfetch_accel_ic<3>(int x) { return tex1Dfetch(d_b_accel_ic_tex, x); }
#endif
-#ifdef USE_TEXTURES_CONSTANTS
-texture<realw, cudaTextureType1D, cudaReadModeElementType> d_hprime_xx_ic_tex;
-#endif
-
/* ----------------------------------------------------------------------------------------------- */
// elemental routines
@@ -56,11 +66,11 @@
// updates stress
__device__ void compute_element_ic_att_stress(int tx,int working_element,
- realw* R_xx,
- realw* R_yy,
- realw* R_xy,
- realw* R_xz,
- realw* R_yz,
+ realw_p R_xx,
+ realw_p R_yy,
+ realw_p R_xy,
+ realw_p R_xz,
+ realw_p R_yz,
realw* sigma_xx,
realw* sigma_yy,
realw* sigma_zz,
@@ -98,12 +108,12 @@
// updates R_memory
__device__ void compute_element_ic_att_memory(int tx,int working_element,
- realw* d_muv,
- realw* factor_common,
- realw* alphaval,realw* betaval,realw* gammaval,
- realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
- realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
- realw* epsilondev_xz,realw* epsilondev_yz,
+ realw_const_p d_muv,
+ realw_const_p factor_common,
+ realw_const_p alphaval,realw_const_p betaval,realw_const_p gammaval,
+ realw_p R_xx,realw_p R_yy,realw_p R_xy,realw_p R_xz,realw_p R_yz,
+ realw_p epsilondev_xx,realw_p epsilondev_yy,realw_p epsilondev_xy,
+ realw_p epsilondev_xz,realw_p epsilondev_yz,
realw epsilondev_xx_loc,realw epsilondev_yy_loc,realw epsilondev_xy_loc,
realw epsilondev_xz_loc,realw epsilondev_yz_loc,
int USE_3D_ATTENUATION_ARRAYS
@@ -167,12 +177,12 @@
// pre-computes gravity term
__device__ void compute_element_ic_gravity(int tx,int working_element,
- int* d_ibool,
- realw* d_xstore,realw* d_ystore,realw* d_zstore,
- realw* d_minus_gravity_table,
- realw* d_minus_deriv_gravity_table,
- realw* d_density_table,
- realw* wgll_cube,
+ const int* d_ibool,
+ realw_const_p d_xstore,realw_const_p d_ystore,realw_const_p d_zstore,
+ realw_const_p d_minus_gravity_table,
+ realw_const_p d_minus_deriv_gravity_table,
+ realw_const_p d_density_table,
+ realw_const_p wgll_cube,
realw jacobianl,
realw* s_dummyx_loc,
realw* s_dummyy_loc,
@@ -306,47 +316,51 @@
/* ----------------------------------------------------------------------------------------------- */
-__global__ void Kernel_2_inner_core_impl(int nb_blocks_to_compute,
- int NGLOB,
- int* d_ibool,
- int* d_idoubling,
- int* d_phase_ispec_inner,
- int num_phase_ispec,
- int d_iphase,
+template<int FORWARD_OR_ADJOINT> __global__ void
+#ifdef USE_LAUNCH_BOUNDS
+// adds compiler specification
+__launch_bounds__(NGLL3_PADDED,LAUNCH_MIN_BLOCKS)
+#endif
+// main kernel
+Kernel_2_inner_core_impl(int nb_blocks_to_compute,
+ const int* d_ibool,
+ const int* d_idoubling,
+ const int* d_phase_ispec_inner,
+ const int num_phase_ispec,
+ const int d_iphase,
realw deltat,
- int use_mesh_coloring_gpu,
- realw* d_displ,
- realw* d_veloc,
- realw* d_accel,
- realw* d_xix, realw* d_xiy, realw* d_xiz,
- realw* d_etax, realw* d_etay, realw* d_etaz,
- realw* d_gammax, realw* d_gammay, realw* d_gammaz,
- realw* d_hprime_xx,
- realw* d_hprimewgll_xx,
- realw* d_wgllwgll_xy,realw* d_wgllwgll_xz,realw* d_wgllwgll_yz,
- realw* d_kappav,
- realw* d_muv,
- int COMPUTE_AND_STORE_STRAIN,
- realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
- realw* epsilondev_xz,realw* epsilondev_yz,
- realw* epsilon_trace_over_3,
- int ATTENUATION,
- int PARTIAL_PHYS_DISPERSION_ONLY,
- int USE_3D_ATTENUATION_ARRAYS,
- realw* one_minus_sum_beta,realw* factor_common,
- realw* R_xx, realw* R_yy, realw* R_xy, realw* R_xz, realw* R_yz,
- realw* alphaval,realw* betaval,realw* gammaval,
- int ANISOTROPY,
- realw* d_c11store,realw* d_c12store,realw* d_c13store,
- realw* d_c33store,realw* d_c44store,
- int GRAVITY,
- realw* d_xstore,realw* d_ystore,realw* d_zstore,
- realw* d_minus_gravity_table,
- realw* d_minus_deriv_gravity_table,
- realw* d_density_table,
- realw* wgll_cube,
- int NSPEC_INNER_CORE_STRAIN_ONLY,
- int NSPEC_INNER_CORE){
+ const int use_mesh_coloring_gpu,
+ realw_const_p d_displ,
+ realw_p d_accel,
+ realw_const_p d_xix, realw_const_p d_xiy, realw_const_p d_xiz,
+ realw_const_p d_etax, realw_const_p d_etay, realw_const_p d_etaz,
+ realw_const_p d_gammax, realw_const_p d_gammay, realw_const_p d_gammaz,
+ realw_const_p d_hprime_xx,
+ realw_const_p d_hprimewgll_xx,
+ realw_const_p d_wgllwgll_xy,realw_const_p d_wgllwgll_xz,realw_const_p d_wgllwgll_yz,
+ realw_const_p d_kappav,
+ realw_const_p d_muv,
+ const int COMPUTE_AND_STORE_STRAIN,
+ realw_p epsilondev_xx,realw_p epsilondev_yy,realw_p epsilondev_xy,
+ realw_p epsilondev_xz,realw_p epsilondev_yz,
+ realw_p epsilon_trace_over_3,
+ const int ATTENUATION,
+ const int PARTIAL_PHYS_DISPERSION_ONLY,
+ const int USE_3D_ATTENUATION_ARRAYS,
+ realw_const_p one_minus_sum_beta,realw_const_p factor_common,
+ realw_p R_xx, realw_p R_yy, realw_p R_xy, realw_p R_xz, realw_p R_yz,
+ realw_const_p alphaval,realw_const_p betaval,realw_const_p gammaval,
+ const int ANISOTROPY,
+ realw_const_p d_c11store,realw_const_p d_c12store,realw_const_p d_c13store,
+ realw_const_p d_c33store,realw_const_p d_c44store,
+ const int GRAVITY,
+ realw_const_p d_xstore,realw_const_p d_ystore,realw_const_p d_zstore,
+ realw_const_p d_minus_gravity_table,
+ realw_const_p d_minus_deriv_gravity_table,
+ realw_const_p d_density_table,
+ realw_const_p wgll_cube,
+ const int NSPEC_INNER_CORE_STRAIN_ONLY,
+ const int NSPEC_INNER_CORE){
// block id
int bx = blockIdx.y*gridDim.x+blockIdx.x;
@@ -357,8 +371,8 @@
int J = ((tx-K*NGLL2)/NGLLX);
int I = (tx-K*NGLL2-J*NGLLX);
- int active,offset;
- int iglob;
+ unsigned short int active;
+ int iglob,offset;
int working_element;
realw tempx1l,tempx2l,tempx3l,tempy1l,tempy2l,tempy3l,tempz1l,tempz2l,tempz3l;
@@ -432,9 +446,9 @@
iglob = d_ibool[working_element*NGLL3 + tx]-1;
#ifdef USE_TEXTURES_FIELDS
- s_dummyx_loc[tx] = tex1Dfetch(d_displ_ic_tex, iglob*3);
- s_dummyy_loc[tx] = tex1Dfetch(d_displ_ic_tex, iglob*3 + 1);
- s_dummyz_loc[tx] = tex1Dfetch(d_displ_ic_tex, iglob*3 + 2);
+ s_dummyx_loc[tx] = texfetch_displ_ic<FORWARD_OR_ADJOINT>(iglob*3);
+ s_dummyy_loc[tx] = texfetch_displ_ic<FORWARD_OR_ADJOINT>(iglob*3 + 1);
+ s_dummyz_loc[tx] = texfetch_displ_ic<FORWARD_OR_ADJOINT>(iglob*3 + 2);
#else
// changing iglob indexing to match fortran row changes fast style
s_dummyx_loc[tx] = d_displ[iglob*3];
@@ -446,11 +460,7 @@
if (tx < NGLL2) {
// hprime
-#ifdef USE_TEXTURES_CONSTANTS
- sh_hprime_xx[tx] = tex1Dfetch(d_hprime_xx_ic_tex,tx);
-#else
sh_hprime_xx[tx] = d_hprime_xx[tx];
-#endif
// weighted hprime
sh_hprimewgll_xx[tx] = d_hprimewgll_xx[tx];
}
@@ -846,9 +856,9 @@
// no atomic operation needed, colors don't share global points between elements
#ifdef USE_TEXTURES_FIELDS
- d_accel[iglob*3] = tex1Dfetch(d_accel_ic_tex, iglob*3) + sum_terms1;
- d_accel[iglob*3 + 1] = tex1Dfetch(d_accel_ic_tex, iglob*3 + 1) + sum_terms2;
- d_accel[iglob*3 + 2] = tex1Dfetch(d_accel_ic_tex, iglob*3 + 2) + sum_terms3;
+ d_accel[iglob*3] = texfetch_accel_ic<FORWARD_OR_ADJOINT>(iglob*3) + sum_terms1;
+ d_accel[iglob*3 + 1] = texfetch_accel_ic<FORWARD_OR_ADJOINT>(iglob*3 + 1) + sum_terms2;
+ d_accel[iglob*3 + 2] = texfetch_accel_ic<FORWARD_OR_ADJOINT>(iglob*3 + 2) + sum_terms3;
#else
d_accel[iglob*3] += sum_terms1;
d_accel[iglob*3 + 1] += sum_terms2;
@@ -863,9 +873,9 @@
if( NSPEC_INNER_CORE > COLORING_MIN_NSPEC_INNER_CORE ){
// no atomic operation needed, colors don't share global points between elements
#ifdef USE_TEXTURES_FIELDS
- d_accel[iglob*3] = tex1Dfetch(d_accel_ic_tex, iglob*3) + sum_terms1;
- d_accel[iglob*3 + 1] = tex1Dfetch(d_accel_ic_tex, iglob*3 + 1) + sum_terms2;
- d_accel[iglob*3 + 2] = tex1Dfetch(d_accel_ic_tex, iglob*3 + 2) + sum_terms3;
+ d_accel[iglob*3] = texfetch_accel_ic<FORWARD_OR_ADJOINT>(iglob*3) + sum_terms1;
+ d_accel[iglob*3 + 1] = texfetch_accel_ic<FORWARD_OR_ADJOINT>(iglob*3 + 1) + sum_terms2;
+ d_accel[iglob*3 + 2] = texfetch_accel_ic<FORWARD_OR_ADJOINT>(iglob*3 + 2) + sum_terms3;
#else
d_accel[iglob*3] += sum_terms1;
d_accel[iglob*3 + 1] += sum_terms2;
@@ -972,14 +982,13 @@
dim3 threads(blocksize,1,1);
// Cuda timing
- // cudaEvent_t start, stop;
- // realw time;
- // cudaEventCreate(&start);
- // cudaEventCreate(&stop);
- // cudaEventRecord( start, 0 );
+ //cudaEvent_t start, stop;
+ //start_timing_cuda(&start,&stop);
+
+ // calls kernel functions on device
if( FORWARD_OR_ADJOINT == 1 ){
- Kernel_2_inner_core_impl<<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
- mp->NGLOB_INNER_CORE,
+ // forward wavefields -> FORWARD_OR_ADJOINT == 1
+ Kernel_2_inner_core_impl<1><<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
d_ibool,
d_idoubling,
mp->d_phase_ispec_inner_inner_core,
@@ -988,7 +997,6 @@
mp->deltat,
mp->use_mesh_coloring_gpu,
mp->d_displ_inner_core,
- mp->d_veloc_inner_core,
mp->d_accel_inner_core,
d_xix, d_xiy, d_xiz,
d_etax, d_etay, d_etaz,
@@ -1023,11 +1031,11 @@
mp->NSPEC_INNER_CORE_STRAIN_ONLY,
mp->NSPEC_INNER_CORE);
}else if( FORWARD_OR_ADJOINT == 3 ){
+ // backward/reconstructed wavefields -> FORWARD_OR_ADJOINT == 3
// debug
DEBUG_BACKWARD_FORCES();
- Kernel_2_inner_core_impl<<< grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
- mp->NGLOB_INNER_CORE,
+ Kernel_2_inner_core_impl<3><<< grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
d_ibool,
d_idoubling,
mp->d_phase_ispec_inner_inner_core,
@@ -1036,7 +1044,6 @@
mp->b_deltat,
mp->use_mesh_coloring_gpu,
mp->d_b_displ_inner_core,
- mp->d_b_veloc_inner_core,
mp->d_b_accel_inner_core,
d_xix, d_xiy, d_xiz,
d_etax, d_etay, d_etaz,
@@ -1072,15 +1079,9 @@
mp->NSPEC_INNER_CORE);
}
- // cudaEventRecord( stop, 0 );
- // cudaEventSynchronize( stop );
- // cudaEventElapsedTime( &time, start, stop );
- // cudaEventDestroy( start );
- // cudaEventDestroy( stop );
- // printf("Kernel2 Execution Time: %f ms\n",time);
+ // Cuda timing
+ //stop_timing_cuda(&start,&stop,"Kernel_2_inner_core");
- /* cudaThreadSynchronize(); */
- /* LOG("Kernel 2 finished"); */
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("Kernel_2_inner_core");
#endif
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_outer_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_outer_core_cuda.cu 2014-01-13 16:06:21 UTC (rev 22991)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_outer_core_cuda.cu 2014-01-17 15:36:14 UTC (rev 22992)
@@ -38,13 +38,24 @@
#include "mesh_constants_cuda.h"
#ifdef USE_TEXTURES_FIELDS
-texture<realw, cudaTextureType1D, cudaReadModeElementType> d_displ_oc_tex;
-texture<realw, cudaTextureType1D, cudaReadModeElementType> d_accel_oc_tex;
+//forward
+realw_texture d_displ_oc_tex;
+realw_texture d_accel_oc_tex;
+//backward/reconstructed
+realw_texture d_b_displ_oc_tex;
+realw_texture d_b_accel_oc_tex;
+// templates definitions
+template<int FORWARD_OR_ADJOINT> __device__ float texfetch_displ_oc(int x);
+template<int FORWARD_OR_ADJOINT> __device__ float texfetch_accel_oc(int x);
+// templates for texture fetching
+// FORWARD_OR_ADJOINT == 1 <- forward arrays
+template<> __device__ float texfetch_displ_oc<1>(int x) { return tex1Dfetch(d_displ_oc_tex, x); }
+template<> __device__ float texfetch_accel_oc<1>(int x) { return tex1Dfetch(d_accel_oc_tex, x); }
+// FORWARD_OR_ADJOINT == 3 <- backward/reconstructed arrays
+template<> __device__ float texfetch_displ_oc<3>(int x) { return tex1Dfetch(d_b_displ_oc_tex, x); }
+template<> __device__ float texfetch_accel_oc<3>(int x) { return tex1Dfetch(d_b_accel_oc_tex, x); }
#endif
-#ifdef USE_TEXTURES_CONSTANTS
-texture<realw, cudaTextureType1D, cudaReadModeElementType> d_hprime_xx_oc_tex;
-#endif
/* ----------------------------------------------------------------------------------------------- */
@@ -58,8 +69,8 @@
realw time,
realw two_omega_earth,
realw deltat,
- realw* d_A_array_rotation,
- realw* d_B_array_rotation,
+ realw_p d_A_array_rotation,
+ realw_p d_B_array_rotation,
realw dpotentialdxl,
realw dpotentialdyl,
realw* dpotentialdx_with_rot,
@@ -105,49 +116,44 @@
/* ----------------------------------------------------------------------------------------------- */
-__global__ void Kernel_2_outer_core_impl(int nb_blocks_to_compute,
- int NGLOB,
- int* d_ibool,
- int* d_phase_ispec_inner,
- int num_phase_ispec,
- int d_iphase,
- int use_mesh_coloring_gpu,
- realw* d_potential, realw* d_potential_dot_dot,
- realw* d_xix, realw* d_xiy, realw* d_xiz,
- realw* d_etax, realw* d_etay, realw* d_etaz,
- realw* d_gammax, realw* d_gammay, realw* d_gammaz,
- realw* d_hprime_xx,
- realw* d_hprimewgll_xx,
- realw* wgllwgll_xy,realw* wgllwgll_xz,realw* wgllwgll_yz,
- int GRAVITY,
- realw* d_xstore, realw* d_ystore, realw* d_zstore,
- realw* d_d_ln_density_dr_table,
- realw* d_minus_rho_g_over_kappa_fluid,
- realw* wgll_cube,
- int ROTATION,
+template<int FORWARD_OR_ADJOINT> __global__ void Kernel_2_outer_core_impl(int nb_blocks_to_compute,
+ const int* d_ibool,
+ const int* d_phase_ispec_inner,
+ const int num_phase_ispec,
+ const int d_iphase,
+ const int use_mesh_coloring_gpu,
+ realw_const_p d_potential,
+ realw_p d_potential_dot_dot,
+ realw_const_p d_xix, realw_const_p d_xiy, realw_const_p d_xiz,
+ realw_const_p d_etax, realw_const_p d_etay, realw_const_p d_etaz,
+ realw_const_p d_gammax, realw_const_p d_gammay, realw_const_p d_gammaz,
+ realw_const_p d_hprime_xx,
+ realw_const_p d_hprimewgll_xx,
+ realw_const_p wgllwgll_xy,realw_const_p wgllwgll_xz,realw_const_p wgllwgll_yz,
+ const int GRAVITY,
+ realw_const_p d_xstore, realw_const_p d_ystore, realw_const_p d_zstore,
+ realw_const_p d_d_ln_density_dr_table,
+ realw_const_p d_minus_rho_g_over_kappa_fluid,
+ realw_const_p wgll_cube,
+ const int ROTATION,
realw time,
realw two_omega_earth,
realw deltat,
- realw* d_A_array_rotation,
- realw* d_B_array_rotation,
- int NSPEC_OUTER_CORE){
+ realw_p d_A_array_rotation,
+ realw_p d_B_array_rotation,
+ const int NSPEC_OUTER_CORE){
// block id == spectral-element id
int bx = blockIdx.y*gridDim.x+blockIdx.x;
// thread id == GLL point id
int tx = threadIdx.x;
- // R_EARTH_KM is the radius of the bottom of the oceans (radius of Earth in km)
- //const realw R_EARTH_KM = 6371.0f;
- // uncomment line below for PREM with oceans
- //const realw R_EARTH_KM = 6368.0f;
-
int K = (tx/NGLL2);
int J = ((tx-K*NGLL2)/NGLLX);
int I = (tx-K*NGLL2-J*NGLLX);
- int active,offset;
- int iglob;
+ unsigned short int active;
+ int iglob,offset;
int working_element;
realw temp1l,temp2l,temp3l;
@@ -201,7 +207,7 @@
iglob = d_ibool[working_element*NGLL3 + tx]-1;
#ifdef USE_TEXTURES_FIELDS
- s_dummy_loc[tx] = tex1Dfetch(d_displ_oc_tex, iglob);
+ s_dummy_loc[tx] = texfetch_displ_oc<FORWARD_OR_ADJOINT>(iglob);
#else
// changing iglob indexing to match fortran row changes fast style
s_dummy_loc[tx] = d_potential[iglob];
@@ -210,11 +216,7 @@
if (tx < NGLL2) {
// hprime
-#ifdef USE_TEXTURES_CONSTANTS
- sh_hprime_xx[tx] = tex1Dfetch(d_hprime_xx_oc_tex,tx);
-#else
sh_hprime_xx[tx] = d_hprime_xx[tx];
-#endif
// weighted hprime
sh_hprimewgll_xx[tx] = d_hprimewgll_xx[tx];
}
@@ -423,7 +425,7 @@
// no atomic operation needed, colors don't share global points between elements
#ifdef USE_TEXTURES_FIELDS
- d_potential_dot_dot[iglob] = tex1Dfetch(d_accel_oc_tex, iglob) + sum_terms;
+ d_potential_dot_dot[iglob] = texfetch_accel_oc<FORWARD_OR_ADJOINT>(iglob) + sum_terms;
#else
d_potential_dot_dot[iglob] += sum_terms;
#endif // USE_TEXTURES_FIELDS
@@ -436,7 +438,7 @@
if( NSPEC_OUTER_CORE > COLORING_MIN_NSPEC_OUTER_CORE ){
// no atomic operation needed, colors don't share global points between elements
#ifdef USE_TEXTURES_FIELDS
- d_potential_dot_dot[iglob] = tex1Dfetch(d_accel_oc_tex, iglob) + sum_terms;
+ d_potential_dot_dot[iglob] = texfetch_accel_oc<FORWARD_OR_ADJOINT>(iglob) + sum_terms;
#else
d_potential_dot_dot[iglob] += sum_terms;
#endif // USE_TEXTURES_FIELDS
@@ -486,16 +488,14 @@
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
- // Cuda timing
- // cudaEvent_t start, stop;
- // realw time;
- // cudaEventCreate(&start);
- // cudaEventCreate(&stop);
- // cudaEventRecord( start, 0 );
+ // CUDA timing
+ //cudaEvent_t start, stop;
+ //start_timing_cuda(&start,&stop);
+ // calls kernel functions on GPU device
if( FORWARD_OR_ADJOINT == 1 ){
- Kernel_2_outer_core_impl<<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
- mp->NGLOB_OUTER_CORE,
+ // forward wavefields -> FORWARD_OR_ADJOINT == 1
+ Kernel_2_outer_core_impl<1><<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
d_ibool,
mp->d_phase_ispec_inner_outer_core,
mp->num_phase_ispec_outer_core,
@@ -522,11 +522,11 @@
d_B_array_rotation,
mp->NSPEC_OUTER_CORE);
}else if( FORWARD_OR_ADJOINT == 3 ){
+ // backward/reconstructed wavefields -> FORWARD_OR_ADJOINT == 3
// debug
DEBUG_BACKWARD_FORCES();
- Kernel_2_outer_core_impl<<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
- mp->NGLOB_OUTER_CORE,
+ Kernel_2_outer_core_impl<3><<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
d_ibool,
mp->d_phase_ispec_inner_outer_core,
mp->num_phase_ispec_outer_core,
@@ -554,14 +554,9 @@
mp->NSPEC_OUTER_CORE);
}
- // cudaEventRecord( stop, 0 );
- // cudaEventSynchronize( stop );
- // cudaEventElapsedTime( &time, start, stop );
- // cudaEventDestroy( start );
- // cudaEventDestroy( stop );
- // printf("Kernel2 Execution Time: %f ms\n",time);
- /* cudaThreadSynchronize(); */
- /* TRACE("Kernel 2 finished"); */
+ // Cuda timing
+ //stop_timing_cuda(&start,&stop,"Kernel_2_outer_core");
+
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
//printf("Tried to start with %dx1 blocks\n",nb_blocks_to_compute);
exit_on_cuda_error("kernel Kernel_2_outer_core");
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/initialize_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/initialize_cuda.cu 2014-01-13 16:06:21 UTC (rev 22991)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/initialize_cuda.cu 2014-01-17 15:36:14 UTC (rev 22992)
@@ -178,35 +178,46 @@
// output to file
if( do_output_info ){
- fp = fopen(filename,"a+");
+ fp = fopen(filename,"w");
if (fp != NULL){
// display device properties
fprintf(fp,"Device Name = %s\n",deviceProp.name);
- fprintf(fp,"multiProcessorCount: %d\n",deviceProp.multiProcessorCount);
- fprintf(fp,"totalGlobalMem (in MB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f));
- fprintf(fp,"totalGlobalMem (in GB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f * 1024.f));
- fprintf(fp,"sharedMemPerBlock (in bytes): %lu\n",(unsigned long) deviceProp.sharedMemPerBlock);
- fprintf(fp,"Maximum number of threads per block: %d\n",deviceProp.maxThreadsPerBlock);
- fprintf(fp,"Maximum size of each dimension of a block: %d x %d x %d\n",
+ fprintf(fp,"memory:\n");
+ fprintf(fp," totalGlobalMem (in MB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f));
+ fprintf(fp," totalGlobalMem (in GB): %f\n",(unsigned long) deviceProp.totalGlobalMem / (1024.f * 1024.f * 1024.f));
+ fprintf(fp," totalConstMem (in bytes): %lu\n",(unsigned long) deviceProp.totalConstMem);
+ fprintf(fp," Maximum 1D texture size (in bytes): %lu\n",(unsigned long) deviceProp.maxTexture1D);
+ fprintf(fp," sharedMemPerBlock (in bytes): %lu\n",(unsigned long) deviceProp.sharedMemPerBlock);
+ fprintf(fp," regsPerBlock (in bytes): %lu\n",(unsigned long) deviceProp.regsPerBlock);
+ fprintf(fp,"blocks:\n");
+ fprintf(fp," Maximum number of threads per block: %d\n",deviceProp.maxThreadsPerBlock);
+ fprintf(fp," Maximum size of each dimension of a block: %d x %d x %d\n",
deviceProp.maxThreadsDim[0],deviceProp.maxThreadsDim[1],deviceProp.maxThreadsDim[2]);
- fprintf(fp,"Maximum sizes of each dimension of a grid: %d x %d x %d\n",
+ fprintf(fp," Maximum sizes of each dimension of a grid: %d x %d x %d\n",
deviceProp.maxGridSize[0],deviceProp.maxGridSize[1],deviceProp.maxGridSize[2]);
- fprintf(fp,"Compute capability of the device = %d.%d\n", deviceProp.major, deviceProp.minor);
+ fprintf(fp,"features:\n");
+ fprintf(fp," Compute capability of the device = %d.%d\n", deviceProp.major, deviceProp.minor);
+ fprintf(fp," multiProcessorCount: %d\n",deviceProp.multiProcessorCount);
if(deviceProp.canMapHostMemory){
- fprintf(fp,"canMapHostMemory: TRUE\n");
+ fprintf(fp," canMapHostMemory: TRUE\n");
}else{
- fprintf(fp,"canMapHostMemory: FALSE\n");
+ fprintf(fp," canMapHostMemory: FALSE\n");
}
if(deviceProp.deviceOverlap){
- fprintf(fp,"deviceOverlap: TRUE\n");
+ fprintf(fp," deviceOverlap: TRUE\n");
}else{
- fprintf(fp,"deviceOverlap: FALSE\n");
+ fprintf(fp," deviceOverlap: FALSE\n");
}
-
+ if(deviceProp.concurrentKernels){
+ fprintf(fp," concurrentKernels: TRUE\n");
+ }else{
+ fprintf(fp," concurrentKernels: FALSE\n");
+ }
// outputs initial memory infos via cudaMemGetInfo()
double free_db,used_db,total_db;
get_free_memory(&free_db,&used_db,&total_db);
- fprintf(fp,"%d: GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n",myrank,
+ fprintf(fp,"memory usage:\n");
+ fprintf(fp," rank %d: GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n",myrank,
used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
// closes output file
@@ -223,9 +234,38 @@
fprintf(stderr,"Compute capability should be at least 1.3, exiting...\n");
exit_on_error("CUDA Compute capability major number should be at least 1.3\n");
}
+
// we use pinned memory for asynchronous copy
- if( ! deviceProp.canMapHostMemory){
- fprintf(stderr,"Device capability should allow to map host memory, exiting...\n");
- exit_on_error("CUDA Device capability canMapHostMemory should be TRUE\n");
+ if( GPU_ASYNC_COPY ){
+ if( ! deviceProp.canMapHostMemory){
+ fprintf(stderr,"Device capability should allow to map host memory, exiting...\n");
+ exit_on_error("CUDA Device capability canMapHostMemory should be TRUE\n");
+ }
}
+
+ // checks kernel optimization setting
+#ifdef USE_LAUNCH_BOUNDS
+ // see: mesh_constants_cuda.h
+ // performance statistics: main kernel Kernel_2_crust_mantle_impl():
+ // shared memory per block = 6200 for Kepler: total = 49152 -> limits active blocks to 7
+ // registers per thread = 72 (limited by LAUNCH_MIN_BLOCKS 7)
+ // registers per block = 9216 total = 65536 (limited by LAUNCH_MIN_BLOCKS 7)
+
+ // shared memory
+ if( deviceProp.sharedMemPerBlock > 49152 && LAUNCH_MIN_BLOCKS <= 7 ){
+ if(myrank == 0 ){
+ printf("GPU non-optimal settings: your setting of using LAUNCH_MIN_BLOCK %i is too low and limits the register usage\n",
+ LAUNCH_MIN_BLOCKS);
+ }
+ }
+
+ // registers
+ if( deviceProp.regsPerBlock > 65536 && LAUNCH_MIN_BLOCKS <= 7 ){
+ if(myrank == 0 ){
+ printf("GPU non-optimal settings: your setting of using LAUNCH_MIN_BLOCK %i is too low and limits the register usage\n",
+ LAUNCH_MIN_BLOCKS);
+ }
+ }
+#endif
+
}
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/mesh_constants_cuda.h 2014-01-13 16:06:21 UTC (rev 22991)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/mesh_constants_cuda.h 2014-01-17 15:36:14 UTC (rev 22992)
@@ -164,7 +164,7 @@
// Texture memory usage:
// requires CUDA version >= 4.0, see check below
-// Use textures for d_displ and d_accel -- 10% performance boost
+// Use textures for d_displ and d_accel ~ 1% performance boost
//#define USE_TEXTURES_FIELDS
// Using texture memory for the hprime-style constants is slower on
@@ -191,6 +191,24 @@
// leads up to ~1% performance increase
//#define MANUALLY_UNROLLED_LOOPS
+// compiler specifications
+// (optional) use launch_bounds specification to increase compiler optimization
+//
+// note: main kernel is Kernel_2_crust_mantle_impl() which is limited by register usage to only 5 active blocks
+// while shared memory usage would allow up to 7 blocks (see profiling with nvcc...)
+// here we specify to launch 7 blocks to increase occupancy and let the compiler reduce registers
+// (depending on GPU type, register spilling might slow down the performance)
+//
+// performance statistics: main kernel Kernel_2_crust_mantle_impl():
+// shared memory per block = 6200 for Kepler: total = 49152 -> limits active blocks to 7
+// registers per thread = 72 (limited by LAUNCH_MIN_BLOCKS 7)
+// registers per block = 9216 total = 65536 (limited by LAUNCH_MIN_BLOCKS 7)
+//
+// using launch_bounds leads to ~ 20% performance increase on Kepler GPUs
+// (uncomment if not desired)
+#define USE_LAUNCH_BOUNDS
+#define LAUNCH_MIN_BLOCKS 7
+
/* ----------------------------------------------------------------------------------------------- */
// cuda kernel block size for updating displacements/potential (newmark time scheme)
@@ -224,6 +242,10 @@
// textures
typedef texture<float, cudaTextureType1D, cudaReadModeElementType> realw_texture;
+// restricted pointers: improves performance on Kepler ~ 10%
+// see: http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#restrict
+typedef const float* __restrict__ realw_const_p; // otherwise use: //typedef const float* realw_const_p;
+typedef float* __restrict__ realw_p; // otherwise use: //typedef float* realw_p;
/* ----------------------------------------------------------------------------------------------- */
@@ -239,11 +261,11 @@
void exit_on_error(char* info);
void synchronize_cuda();
void synchronize_mpi();
+void start_timing_cuda(cudaEvent_t* start,cudaEvent_t* stop);
+void stop_timing_cuda(cudaEvent_t* start,cudaEvent_t* stop, char* info_str);
void get_blocks_xy(int num_blocks,int* num_blocks_x,int* num_blocks_y);
realw get_device_array_maximum_value(realw* array,int size);
-
-
/* ----------------------------------------------------------------------------------------------- */
// mesh pointer wrapper structure
@@ -765,4 +787,5 @@
} Mesh;
-#endif // CUDA_MESH_H
+#endif
+// CUDA_MESH_H
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/prepare_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/prepare_constants_cuda.h 2014-01-13 16:06:21 UTC (rev 22991)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/prepare_constants_cuda.h 2014-01-17 15:36:14 UTC (rev 22992)
@@ -80,10 +80,12 @@
// note: we use definition __device__ to use global memory rather than constant memory registers
// to avoid over-loading registers; this should help increasing the occupancy on the GPU
+// note: using __constant__ here makes hardly any performance improvement < 0.1%
__device__ realw d_hprime_xx[NGLL2];
//__device__ realw d_hprime_yy[NGLL2]; // only needed if NGLLX != NGLLY != NGLLZ
//__device__ realw d_hprime_zz[NGLL2]; // only needed if NGLLX != NGLLY != NGLLZ
+// note: using __constant__ here makes hardly any performance improvement < 0.1%
__device__ realw d_hprimewgll_xx[NGLL2];
//__device__ realw d_hprimewgll_yy[NGLL2]; // only needed if NGLLX != NGLLY != NGLLZ
//__device__ realw d_hprimewgll_zz[NGLL2]; // only needed if NGLLX != NGLLY != NGLLZ
@@ -95,6 +97,7 @@
// wgll_cube: needed only for gravity case
__device__ realw d_wgll_cube[NGLL3];
+/* ----------------------------------------------------------------------------------------------- */
// setup functions
void setConst_hprime_xx(realw* array,Mesh* mp)
@@ -104,7 +107,7 @@
if (err != cudaSuccess)
{
fprintf(stderr, "Error in setConst_hprime_xx: %s\n", cudaGetErrorString(err));
- fprintf(stderr, "The problem is maybe the target architecture: -arch sm_** in src/specfem3D/Makefile\n");
+ fprintf(stderr, "The problem is maybe the target architecture: -arch sm_** in your Makefile\n");
fprintf(stderr, "Please double-check with your GPU card\n");
exit(1);
}
@@ -302,5 +305,5 @@
}
-
-#endif //CUDA_PREPARE_H
+#endif
+//CUDA_PREPARE_H
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/prepare_mesh_constants_cuda.cu 2014-01-13 16:06:21 UTC (rev 22991)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/prepare_mesh_constants_cuda.cu 2014-01-17 15:36:14 UTC (rev 22992)
@@ -62,9 +62,12 @@
#endif
#ifdef USE_TEXTURES_CONSTANTS
- extern realw_texture d_hprime_xx_cm_tex;
- extern realw_texture d_hprime_xx_oc_tex;
- extern realw_texture d_hprime_xx_ic_tex;
+ // hprime
+ extern realw_texture d_hprime_xx_tex;
+ extern __constant__ size_t d_hprime_xx_tex_offset;
+ // weighted hprime
+ extern realw_texture d_hprimewgll_xx_tex;
+ extern __constant__ size_t d_hprimewgll_xx_tex_offset;
#endif
#endif
@@ -159,6 +162,9 @@
exit_on_error("NGLLX must be 5 for CUDA devices");
}
+ // mpi process rank
+ mp->myrank = *myrank_f;
+
// sets constant arrays
setConst_hprime_xx(h_hprime_xx,mp);
//setConst_hprime_yy(h_hprime_yy,mp); // only needed if NGLLX != NGLLY != NGLLZ
@@ -178,35 +184,46 @@
// in the code with #USE_TEXTURES_FIELDS not-defined.
#ifdef USE_TEXTURES_CONSTANTS
{
+ // checks that realw is a float
+ if( sizeof(realw) != sizeof(float)) exit_on_error("TEXTURES only work with realw selected as float");
+
+ // note: device memory returned by cudaMalloc guarantee that the offset is 0,
+ // however here we use the global memory array d_hprime_xx and need to provide an offset variable
+ size_t offset;
+
+ // binds textures
#ifdef USE_OLDER_CUDA4_GPU
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<realw>();
- const textureReference* d_hprime_xx_cm_tex_ptr;
- print_CUDA_error_if_any(cudaGetTextureReference(&d_hprime_xx_cm_tex_ptr, "d_hprime_xx_cm_tex"), 1101);
- print_CUDA_error_if_any(cudaBindTexture(0, d_hprime_xx_cm_tex_ptr, mp->d_hprime_xx,
+ const textureReference* d_hprime_xx_tex_ptr;
+ print_CUDA_error_if_any(cudaGetTextureReference(&d_hprime_xx_tex_ptr, "d_hprime_xx_tex"), 1101);
+ print_CUDA_error_if_any(cudaBindTexture(&offset, d_hprime_xx_tex_ptr, mp->d_hprime_xx,
&channelDesc, sizeof(realw)*(NGLL2)), 1102);
+ print_CUDA_error_if_any(cudaMemcpyToSymbol(d_hprime_xx_tex_offset,&offset,sizeof(offset)),11202);
- const textureReference* d_hprime_xx_oc_tex_ptr;
- print_CUDA_error_if_any(cudaGetTextureReference(&d_hprime_xx_oc_tex_ptr, "d_hprime_xx_oc_tex"), 1103);
- print_CUDA_error_if_any(cudaBindTexture(0, d_hprime_xx_oc_tex_ptr, mp->d_hprime_xx,
- &channelDesc, sizeof(realw)*(NGLL2)), 1104);
-
- const textureReference* d_hprime_xx_ic_tex_ptr;
- print_CUDA_error_if_any(cudaGetTextureReference(&d_hprime_xx_ic_tex_ptr, "d_hprime_xx_ic_tex"), 1105);
- print_CUDA_error_if_any(cudaBindTexture(0, d_hprime_xx_ic_tex_ptr, mp->d_hprime_xx,
- &channelDesc, sizeof(realw)*(NGLL2)), 1106);
+ // weighted
+ const textureReference* d_hprimewgll_xx_tex_ptr;
+ print_CUDA_error_if_any(cudaGetTextureReference(&d_hprimewgll_xx_tex_ptr, "d_hprimewgll_xx_tex"), 1107);
+ print_CUDA_error_if_any(cudaBindTexture(&offset, d_hprimewgll_xx_tex_ptr, mp->d_hprimewgll_xx,
+ &channelDesc, sizeof(realw)*(NGLL2)), 1108);
+ print_CUDA_error_if_any(cudaMemcpyToSymbol(d_hprimewgll_xx_tex_offset,&offset,sizeof(offset)),11205);
#else
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<realw>();
- print_CUDA_error_if_any(cudaBindTexture(0, &d_hprime_xx_cm_tex, mp->d_hprime_xx,
- &channelDesc, sizeof(realw)*(NGLL2)), 1102);
- print_CUDA_error_if_any(cudaBindTexture(0, &d_hprime_xx_oc_tex, mp->d_hprime_xx,
- &channelDesc, sizeof(realw)*(NGLL2)), 1104);
- print_CUDA_error_if_any(cudaBindTexture(0, &d_hprime_xx_ic_tex, mp->d_hprime_xx,
- &channelDesc, sizeof(realw)*(NGLL2)), 1106);
+ print_CUDA_error_if_any(cudaBindTexture(&offset, &d_hprime_xx_tex, mp->d_hprime_xx,
+ &channelDesc, sizeof(realw)*(NGLL2)), 11201);
+ print_CUDA_error_if_any(cudaMemcpyToSymbol(d_hprime_xx_tex_offset,&offset,sizeof(offset)),11202);
+ // debug
+ //if( mp->myrank == 0 ) printf("texture constants hprime_xx: offset = %lu \n",offset);
+
+ // weighted
+ print_CUDA_error_if_any(cudaBindTexture(&offset, &d_hprimewgll_xx_tex, mp->d_hprimewgll_xx,
+ &channelDesc, sizeof(realw)*(NGLL2)), 11204);
+ print_CUDA_error_if_any(cudaMemcpyToSymbol(d_hprimewgll_xx_tex_offset,&offset,sizeof(offset)),11205);
+ // debug
+ //if( mp->myrank == 0 ) printf("texture constants hprimewgll_xx: offset = %lu \n",offset);
#endif
}
#endif
-
// sets global parameters
mp->NSPEC_CRUST_MANTLE = *NSPEC_CRUST_MANTLE;
mp->NGLOB_CRUST_MANTLE = *NGLOB_CRUST_MANTLE;
@@ -244,9 +261,6 @@
mp->anisotropic_kl = *ANISOTROPIC_KL_f;
mp->approximate_hess_kl = *APPROXIMATE_HESS_KL_f;
- // mpi process rank
- mp->myrank = *myrank_f;
-
// mesh coloring flag
#ifdef USE_MESH_COLORING_GPU
mp->use_mesh_coloring_gpu = 1;
@@ -1236,10 +1250,6 @@
// global indexing
copy_todevice_int((void**)&mp->d_ibool_crust_mantle,h_ibool,NGLL3*(mp->NSPEC_CRUST_MANTLE));
-// print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_crust_mantle, size_padded*sizeof(int)),1021);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_crust_mantle, h_ibool,
-// NGLL3*(mp->NSPEC_CRUST_MANTLE)*sizeof(int),cudaMemcpyHostToDevice),1022);
-
// transverse isotropic elements
// only needed if not anisotropic 3D mantle
if( ! mp->anisotropic_3D_mantle ){
@@ -1422,6 +1432,9 @@
#ifdef USE_TEXTURES_FIELDS
{
+ // checks single precision
+ if( sizeof(realw) != sizeof(float)) exit_on_error("TEXTURES only work with realw selected as float");
+ // binds textures
#ifdef USE_OLDER_CUDA4_GPU
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<realw>();
const textureReference* d_displ_cm_tex_ref_ptr;
@@ -2029,6 +2042,34 @@
// frees memory on GPU
//------------------------------------------
+ // textures
+ //------------------------------------------
+#ifdef USE_TEXTURES_CONSTANTS
+ cudaUnbindTexture(d_hprime_xx_tex);
+ cudaUnbindTexture(d_hprimewgll_xx_tex);
+#endif
+#ifdef USE_TEXTURES_FIELDS
+ cudaUnbindTexture(d_displ_cm_tex);
+ cudaUnbindTexture(d_accel_cm_tex);
+ if( mp->simulation_type == 3 ){
+ cudaUnbindTexture(d_b_displ_cm_tex);
+ cudaUnbindTexture(d_b_accel_cm_tex);
+ }
+ cudaUnbindTexture(d_displ_ic_tex);
+ cudaUnbindTexture(d_accel_ic_tex);
+ if( mp->simulation_type == 3 ){
+ cudaUnbindTexture(d_b_displ_ic_tex);
+ cudaUnbindTexture(d_b_accel_ic_tex);
+ }
+ cudaUnbindTexture(d_displ_oc_tex);
+ cudaUnbindTexture(d_accel_oc_tex);
+ if( mp->simulation_type == 3 ){
+ cudaUnbindTexture(d_b_displ_oc_tex);
+ cudaUnbindTexture(d_b_accel_oc_tex);
+ }
+#endif
+
+ //------------------------------------------
// sources
//------------------------------------------
if( mp->simulation_type == 1 || mp->simulation_type == 3 ){
@@ -2321,15 +2362,15 @@
cudaFree(mp->d_gammay_crust_mantle);
cudaFree(mp->d_gammaz_crust_mantle);
- cudaFree(mp->d_muvstore_crust_mantle);
cudaFree(mp->d_ibool_crust_mantle);
if( ! mp->anisotropic_3D_mantle ){
+ cudaFree(mp->d_ispec_is_tiso_crust_mantle);
cudaFree(mp->d_kappavstore_crust_mantle);
cudaFree(mp->d_kappahstore_crust_mantle);
+ cudaFree(mp->d_muvstore_crust_mantle);
cudaFree(mp->d_muhstore_crust_mantle);
cudaFree(mp->d_eta_anisostore_crust_mantle);
- cudaFree(mp->d_ispec_is_tiso_crust_mantle);
}else{
cudaFree(mp->d_c11store_crust_mantle);
cudaFree(mp->d_c12store_crust_mantle);
@@ -2367,6 +2408,7 @@
cudaFree(mp->d_phase_ispec_inner_crust_mantle);
cudaFree(mp->d_ibelm_bottom_crust_mantle);
+ // wavefield
cudaFree(mp->d_displ_crust_mantle);
cudaFree(mp->d_veloc_crust_mantle);
cudaFree(mp->d_accel_crust_mantle);
@@ -2374,22 +2416,32 @@
cudaFree(mp->d_b_displ_crust_mantle);
cudaFree(mp->d_b_veloc_crust_mantle);
cudaFree(mp->d_b_accel_crust_mantle);
- cudaFree(mp->d_rho_kl_crust_mantle);
- if(mp->anisotropic_kl){
- cudaFree(mp->d_cijkl_kl_crust_mantle);
- }else{
- cudaFree(mp->d_alpha_kl_crust_mantle);
- cudaFree(mp->d_beta_kl_crust_mantle);
- }
- if(mp->approximate_hess_kl){ cudaFree(mp->d_hess_kl_crust_mantle);}
}
+
// mass matrix
+ cudaFree(mp->d_rmassz_crust_mantle);
if( *NCHUNKS_VAL != 6 && mp->absorbing_conditions){
cudaFree(mp->d_rmassx_crust_mantle);
cudaFree(mp->d_rmassy_crust_mantle);
}
- cudaFree(mp->d_rmassz_crust_mantle);
+ // kernel simulations
+ if( mp->simulation_type == 3 ){
+ if( mp->rotation && mp->exact_mass_matrix_for_rotation ){
+ cudaFree(mp->d_b_rmassx_crust_mantle);
+ cudaFree(mp->d_b_rmassy_crust_mantle);
+ }
+ // kernels
+ cudaFree(mp->d_rho_kl_crust_mantle);
+ if(! mp->anisotropic_kl){
+ cudaFree(mp->d_alpha_kl_crust_mantle);
+ cudaFree(mp->d_beta_kl_crust_mantle);
+ }else{
+ cudaFree(mp->d_cijkl_kl_crust_mantle);
+ }
+ if(mp->approximate_hess_kl){ cudaFree(mp->d_hess_kl_crust_mantle);}
+ }
+
//------------------------------------------
// outer_core
//------------------------------------------
@@ -2408,11 +2460,12 @@
cudaFree(mp->d_rhostore_outer_core);
}
+ cudaFree(mp->d_ibool_outer_core);
+
cudaFree(mp->d_xstore_outer_core);
cudaFree(mp->d_ystore_outer_core);
cudaFree(mp->d_zstore_outer_core);
- cudaFree(mp->d_ibool_outer_core);
cudaFree(mp->d_phase_ispec_inner_outer_core);
cudaFree(mp->d_ibelm_top_outer_core);
@@ -2423,6 +2476,7 @@
cudaFree(mp->d_normal_bottom_outer_core);
cudaFree(mp->d_jacobian2D_bottom_outer_core);
+ // wavefield
cudaFree(mp->d_displ_outer_core);
cudaFree(mp->d_veloc_outer_core);
cudaFree(mp->d_accel_outer_core);
@@ -2430,12 +2484,15 @@
cudaFree(mp->d_b_displ_outer_core);
cudaFree(mp->d_b_veloc_outer_core);
cudaFree(mp->d_b_accel_outer_core);
- cudaFree(mp->d_rho_kl_outer_core);
- cudaFree(mp->d_alpha_kl_outer_core);
}
// mass matrix
cudaFree(mp->d_rmass_outer_core);
+ if( mp->simulation_type == 3 ){
+ cudaFree(mp->d_rho_kl_outer_core);
+ cudaFree(mp->d_alpha_kl_outer_core);
+ }
+
//------------------------------------------
// inner_core
//------------------------------------------
@@ -2450,17 +2507,6 @@
cudaFree(mp->d_gammaz_inner_core);
cudaFree(mp->d_muvstore_inner_core);
- cudaFree(mp->d_ibool_inner_core);
-
- // gravity
- if( mp->gravity ){
- cudaFree(mp->d_xstore_inner_core);
- cudaFree(mp->d_ystore_inner_core);
- cudaFree(mp->d_zstore_inner_core);
- }
-
- cudaFree(mp->d_ibelm_top_inner_core);
-
if( ! mp->anisotropic_inner_core ){
cudaFree(mp->d_kappavstore_inner_core);
}else{
@@ -2474,14 +2520,21 @@
if( mp->simulation_type == 3 && mp->save_boundary_mesh ){
cudaFree(mp->d_rhostore_inner_core);
}
+
+ cudaFree(mp->d_ibool_inner_core);
cudaFree(mp->d_idoubling_inner_core);
+
+ // gravity
if( mp->gravity ){
cudaFree(mp->d_xstore_inner_core);
cudaFree(mp->d_ystore_inner_core);
cudaFree(mp->d_zstore_inner_core);
}
+
cudaFree(mp->d_phase_ispec_inner_inner_core);
+ cudaFree(mp->d_ibelm_top_inner_core);
+ // wavefield
cudaFree(mp->d_displ_inner_core);
cudaFree(mp->d_veloc_inner_core);
cudaFree(mp->d_accel_inner_core);
@@ -2489,20 +2542,31 @@
cudaFree(mp->d_b_displ_inner_core);
cudaFree(mp->d_b_veloc_inner_core);
cudaFree(mp->d_b_accel_inner_core);
+ }
+ // mass matrix
+ cudaFree(mp->d_rmassz_inner_core);
+ if( mp->rotation && mp->exact_mass_matrix_for_rotation ){
+ cudaFree(mp->d_rmassx_inner_core);
+ cudaFree(mp->d_rmassy_inner_core);
+ }
+
+ // kernel simulations
+ if( mp->simulation_type == 3 ) {
+ if( mp->rotation && mp->exact_mass_matrix_for_rotation ){
+ cudaFree(mp->d_b_rmassx_inner_core);
+ cudaFree(mp->d_b_rmassy_inner_core);
+ }
+ // kernels
cudaFree(mp->d_rho_kl_inner_core);
cudaFree(mp->d_alpha_kl_inner_core);
cudaFree(mp->d_beta_kl_inner_core);
}
- // mass matrix
- cudaFree(mp->d_rmassx_inner_core);
- cudaFree(mp->d_rmassy_inner_core);
- cudaFree(mp->d_rmassz_inner_core);
// oceans
if( mp->oceans ){
+ cudaFree(mp->d_ibool_ocean_load);
cudaFree(mp->d_rmass_ocean_load);
- cudaFree(mp->d_ibool_ocean_load);
cudaFree(mp->d_normal_ocean_load);
}
More information about the CIG-COMMITS
mailing list