[cig-commits] r19053 - in seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src: cuda specfem3D
danielpeter at geodynamics.org
danielpeter at geodynamics.org
Sat Oct 8 17:52:56 PDT 2011
Author: danielpeter
Date: 2011-10-08 17:52:55 -0700 (Sat, 08 Oct 2011)
New Revision: 19053
Modified:
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_kernels.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90
Log:
updates hprime and hprimewgll in compute forces
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu 2011-10-08 23:49:07 UTC (rev 19052)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu 2011-10-09 00:52:55 UTC (rev 19053)
@@ -267,7 +267,10 @@
/* ----------------------------------------------------------------------------------------------- */
+// ACOUSTIC simulations
+/* ----------------------------------------------------------------------------------------------- */
+
__global__ void get_maximum_kernel(float* array, int size, float* d_max){
/* simplest version: uses only 1 thread
@@ -439,8 +442,110 @@
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
//double end_time = get_time();
//printf("Elapsed time: %e\n",end_time-start_time);
- exit_on_cuda_error("after get_maximum_kernel");
+ exit_on_cuda_error("after get_norm_acoustic_from_device_cuda");
#endif
}
+/* ----------------------------------------------------------------------------------------------- */
+// ELASTIC simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void get_maximum_vector_kernel(float* array, int size, float* d_max){
+
+ // reduction example:
+ __shared__ float sdata[256] ;
+
+ // load shared mem
+ unsigned int tid = threadIdx.x;
+ unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
+
+ // loads values into shared memory: assume array is a vector array
+ sdata[tid] = (i < size) ? sqrt(array[i*3]*array[i*3]
+ + array[i*3+1]*array[i*3+1]
+ + array[i*3+2]*array[i*3+2]) : 0.0 ;
+
+ __syncthreads();
+
+ // do reduction in shared mem
+ for(unsigned int s=blockDim.x/2; s>0; s>>=1)
+ {
+ if (tid < s){
+ // summation:
+ //sdata[tid] += sdata[tid + s];
+ // maximum:
+ if( sdata[tid] < sdata[tid + s] ) sdata[tid] = sdata[tid + s];
+ }
+ __syncthreads();
+ }
+
+ // write result for this block to global mem
+ if (tid == 0) d_max[blockIdx.x] = sdata[0];
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(get_norm_elastic_from_device_cuda,
+ GET_NORM_ELASTIC_FROM_DEVICE_CUDA)(float* norm,
+ long* Mesh_pointer_f,
+ int* SIMULATION_TYPE) {
+
+ TRACE("get_norm_elastic_from_device_cuda");
+ //double start_time = get_time();
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ float max;
+ float *d_max;
+
+ max = 0;
+
+ // launch simple reduction kernel
+ float* h_max;
+ int blocksize = 256;
+
+ int num_blocks_x = ceil(mp->NGLOB_AB/blocksize);
+ //printf("num_blocks_x %i \n",num_blocks_x);
+
+ h_max = (float*) calloc(num_blocks_x,sizeof(float));
+ cudaMalloc((void**)&d_max,num_blocks_x*sizeof(float));
+
+ dim3 grid(num_blocks_x,1);
+ dim3 threads(blocksize,1,1);
+
+ if(*SIMULATION_TYPE == 1 ){
+ get_maximum_vector_kernel<<<grid,threads>>>(mp->d_displ,
+ mp->NGLOB_AB,
+ d_max);
+ }
+
+ if(*SIMULATION_TYPE == 3 ){
+ get_maximum_vector_kernel<<<grid,threads>>>(mp->d_b_displ,
+ mp->NGLOB_AB,
+ d_max);
+ }
+
+ print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(float),cudaMemcpyDeviceToHost),222);
+
+ // determines max for all blocks
+ max = h_max[0];
+ for(int i=1;i<num_blocks_x;i++) {
+ if( max < h_max[i]) max = h_max[i];
+ }
+
+ cudaFree(d_max);
+ free(h_max);
+
+ // return result
+ *norm = max;
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ //double end_time = get_time();
+ //printf("Elapsed time: %e\n",end_time-start_time);
+ exit_on_cuda_error("after get_norm_elastic_from_device_cuda");
+#endif
+}
+
+
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu 2011-10-08 23:49:07 UTC (rev 19052)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu 2011-10-09 00:52:55 UTC (rev 19053)
@@ -247,7 +247,8 @@
float* d_potential_acoustic, float* d_potential_dot_dot_acoustic,
float* d_xix, float* d_xiy, float* d_xiz, float* d_etax, float* d_etay, float* d_etaz,
float* d_gammax, float* d_gammay, float* d_gammaz,
- float* hprime_xx, float* hprimewgll_xx,
+ float* hprime_xx, float* hprime_yy, float* hprime_zz,
+ float* hprimewgll_xx, float* hprimewgll_yy, float* hprimewgll_zz,
float* wgllwgll_xy,float* wgllwgll_xz,float* wgllwgll_yz,
float* d_rhostore);
@@ -335,7 +336,8 @@
mp->d_xix, mp->d_xiy, mp->d_xiz,
mp->d_etax, mp->d_etay, mp->d_etaz,
mp->d_gammax, mp->d_gammay, mp->d_gammaz,
- mp->d_hprime_xx, mp->d_hprimewgll_xx,
+ mp->d_hprime_xx, mp->d_hprime_yy, mp->d_hprime_zz,
+ mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
mp->d_rhostore);
@@ -346,7 +348,8 @@
mp->d_xix, mp->d_xiy, mp->d_xiz,
mp->d_etax, mp->d_etay, mp->d_etaz,
mp->d_gammax, mp->d_gammay, mp->d_gammaz,
- mp->d_hprime_xx, mp->d_hprimewgll_xx,
+ mp->d_hprime_xx, mp->d_hprime_yy, mp->d_hprime_zz,
+ mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
mp->d_rhostore);
}
@@ -379,7 +382,8 @@
float* d_potential_acoustic, float* d_potential_dot_dot_acoustic,
float* d_xix, float* d_xiy, float* d_xiz, float* d_etax, float* d_etay, float* d_etaz,
float* d_gammax, float* d_gammay, float* d_gammaz,
- float* hprime_xx, float* hprimewgll_xx,
+ float* hprime_xx, float* hprime_yy, float* hprime_zz,
+ float* hprimewgll_xx, float* hprimewgll_yy, float* hprimewgll_zz,
float* wgllwgll_xy,float* wgllwgll_xz,float* wgllwgll_yz,
float* d_rhostore){
@@ -458,12 +462,12 @@
offset1 = K*NGLL2+J*NGLLX+l;
temp1l += s_dummy_loc[offset1]*hp1;
- // daniel: assumes that hprime_xx = hprime_yy = hprime_zz
- hp2 = hprime_xx[l*NGLLX+J];
+ // daniel: not more assumes that hprime_xx = hprime_yy = hprime_zz
+ hp2 = hprime_yy[l*NGLLX+J];
offset2 = K*NGLL2+l*NGLLX+I;
temp2l += s_dummy_loc[offset2]*hp2;
- hp3 = hprime_xx[l*NGLLX+K];
+ hp3 = hprime_zz[l*NGLLX+K];
offset3 = l*NGLL2+J*NGLLX+I;
temp3l += s_dummy_loc[offset3]*hp3;
}
@@ -475,17 +479,17 @@
+ s_dummy_loc[K*NGLL2+J*NGLLX+3]*hprime_xx[3*NGLLX+I]
+ s_dummy_loc[K*NGLL2+J*NGLLX+4]*hprime_xx[4*NGLLX+I];
- temp2l = s_dummy_loc[K*NGLL2+I]*hprime_xx[J]
- + s_dummy_loc[K*NGLL2+NGLLX+I]*hprime_xx[NGLLX+J]
- + s_dummy_loc[K*NGLL2+2*NGLLX+I]*hprime_xx[2*NGLLX+J]
- + s_dummy_loc[K*NGLL2+3*NGLLX+I]*hprime_xx[3*NGLLX+J]
- + s_dummy_loc[K*NGLL2+4*NGLLX+I]*hprime_xx[4*NGLLX+J];
+ temp2l = s_dummy_loc[K*NGLL2+I]*hprime_yy[J]
+ + s_dummy_loc[K*NGLL2+NGLLX+I]*hprime_yy[NGLLX+J]
+ + s_dummy_loc[K*NGLL2+2*NGLLX+I]*hprime_yy[2*NGLLX+J]
+ + s_dummy_loc[K*NGLL2+3*NGLLX+I]*hprime_yy[3*NGLLX+J]
+ + s_dummy_loc[K*NGLL2+4*NGLLX+I]*hprime_yy[4*NGLLX+J];
- temp3l = s_dummy_loc[J*NGLLX+I]*hprime_xx[K]
- + s_dummy_loc[NGLL2+J*NGLLX+I]*hprime_xx[NGLLX+K]
- + s_dummy_loc[2*NGLL2+J*NGLLX+I]*hprime_xx[2*NGLLX+K]
- + s_dummy_loc[3*NGLL2+J*NGLLX+I]*hprime_xx[3*NGLLX+K]
- + s_dummy_loc[4*NGLL2+J*NGLLX+I]*hprime_xx[4*NGLLX+K];
+ temp3l = s_dummy_loc[J*NGLLX+I]*hprime_zz[K]
+ + s_dummy_loc[NGLL2+J*NGLLX+I]*hprime_zz[NGLLX+K]
+ + s_dummy_loc[2*NGLL2+J*NGLLX+I]*hprime_zz[2*NGLLX+K]
+ + s_dummy_loc[3*NGLL2+J*NGLLX+I]*hprime_zz[3*NGLLX+K]
+ + s_dummy_loc[4*NGLL2+J*NGLLX+I]*hprime_zz[4*NGLLX+K];
#endif
@@ -538,12 +542,12 @@
offset1 = K*NGLL2+J*NGLLX+l;
temp1l += s_temp1[offset1]*fac1;
- //daniel: assumes hprimewgll_xx = hprimewgll_yy = hprimewgll_zz
- fac2 = hprimewgll_xx[J*NGLLX+l];
+ //daniel: not more assumes hprimewgll_xx = hprimewgll_yy = hprimewgll_zz
+ fac2 = hprimewgll_yy[J*NGLLX+l];
offset2 = K*NGLL2+l*NGLLX+I;
temp2l += s_temp2[offset2]*fac2;
- fac3 = hprimewgll_xx[K*NGLLX+l];
+ fac3 = hprimewgll_zz[K*NGLLX+l];
offset3 = l*NGLL2+J*NGLLX+I;
temp3l += s_temp3[offset3]*fac3;
}
@@ -556,18 +560,18 @@
+ s_temp1[K*NGLL2+J*NGLLX+4]*hprimewgll_xx[I*NGLLX+4];
- temp2l = s_temp2[K*NGLL2+I]*hprimewgll_xx[J*NGLLX]
- + s_temp2[K*NGLL2+NGLLX+I]*hprimewgll_xx[J*NGLLX+1]
- + s_temp2[K*NGLL2+2*NGLLX+I]*hprimewgll_xx[J*NGLLX+2]
- + s_temp2[K*NGLL2+3*NGLLX+I]*hprimewgll_xx[J*NGLLX+3]
- + s_temp2[K*NGLL2+4*NGLLX+I]*hprimewgll_xx[J*NGLLX+4];
+ temp2l = s_temp2[K*NGLL2+I]*hprimewgll_yy[J*NGLLX]
+ + s_temp2[K*NGLL2+NGLLX+I]*hprimewgll_yy[J*NGLLX+1]
+ + s_temp2[K*NGLL2+2*NGLLX+I]*hprimewgll_yy[J*NGLLX+2]
+ + s_temp2[K*NGLL2+3*NGLLX+I]*hprimewgll_yy[J*NGLLX+3]
+ + s_temp2[K*NGLL2+4*NGLLX+I]*hprimewgll_yy[J*NGLLX+4];
- temp3l = s_temp3[J*NGLLX+I]*hprimewgll_xx[K*NGLLX]
- + s_temp3[NGLL2+J*NGLLX+I]*hprimewgll_xx[K*NGLLX+1]
- + s_temp3[2*NGLL2+J*NGLLX+I]*hprimewgll_xx[K*NGLLX+2]
- + s_temp3[3*NGLL2+J*NGLLX+I]*hprimewgll_xx[K*NGLLX+3]
- + s_temp3[4*NGLL2+J*NGLLX+I]*hprimewgll_xx[K*NGLLX+4];
+ temp3l = s_temp3[J*NGLLX+I]*hprimewgll_zz[K*NGLLX]
+ + s_temp3[NGLL2+J*NGLLX+I]*hprimewgll_zz[K*NGLLX+1]
+ + s_temp3[2*NGLL2+J*NGLLX+I]*hprimewgll_zz[K*NGLLX+2]
+ + s_temp3[3*NGLL2+J*NGLLX+I]*hprimewgll_zz[K*NGLLX+3]
+ + s_temp3[4*NGLL2+J*NGLLX+I]*hprimewgll_zz[K*NGLLX+4];
#endif
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu 2011-10-08 23:49:07 UTC (rev 19052)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu 2011-10-09 00:52:55 UTC (rev 19053)
@@ -41,9 +41,11 @@
// cuda constant arrays
__constant__ float d_hprime_xx[NGLL2];
-__constant__ float d_hprime_yy[NGLL2]; // daniel: check if hprime_yy == hprime_xx
+__constant__ float d_hprime_yy[NGLL2]; // daniel: remove only if certain: check if hprime_yy == hprime_xx
__constant__ float d_hprime_zz[NGLL2]; // daniel: check if hprime_zz == hprime_xx
__constant__ float d_hprimewgll_xx[NGLL2];
+__constant__ float d_hprimewgll_yy[NGLL2]; // daniel: check if hprimewgll_yy == hprimewgll_xx
+__constant__ float d_hprimewgll_zz[NGLL2]; // daniel: remove only if certain...
__constant__ float d_wgllwgll_xy[NGLL2];
__constant__ float d_wgllwgll_xz[NGLL2];
__constant__ float d_wgllwgll_yz[NGLL2];
@@ -52,8 +54,8 @@
void Kernel_2(int nb_blocks_to_compute, Mesh* mp, int d_iphase,
int COMPUTE_AND_STORE_STRAIN,int SIMULATION_TYPE,int ATTENUATION);
-__global__ void Kernel_test(float* d_debug_output,int* d_phase_ispec_inner_elastic,
- int num_phase_ispec_elastic, int d_iphase, int* d_ibool);
+//__global__ void Kernel_test(float* d_debug_output,int* d_phase_ispec_inner_elastic,
+// int num_phase_ispec_elastic, int d_iphase, int* d_ibool);
__global__ void Kernel_2_impl(int nb_blocks_to_compute,int NGLOB, int* d_ibool,
int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic, int d_iphase,
@@ -115,10 +117,9 @@
TRACE("transfer_boundary_accel_from_device");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
+
+ int blocksize = 256;
-
- int blocksize = 256;
int size_padded = ((int)ceil(((double)*max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
int num_blocks_x = size_padded/blocksize;
int num_blocks_y = 1;
@@ -459,10 +460,11 @@
// debugging
//printf("Starting with grid %dx%d for %d blocks\n",num_blocks_x,num_blocks_y,nb_blocks_to_compute);
- float* d_debug, *h_debug;
- h_debug = (float*)calloc(128,sizeof(float));
- cudaMalloc((void**)&d_debug,128*sizeof(float));
- cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
+ float* d_debug;
+// float* h_debug;
+// h_debug = (float*)calloc(128,sizeof(float));
+// cudaMalloc((void**)&d_debug,128*sizeof(float));
+// cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
// Cuda timing
// cudaEvent_t start, stop;
@@ -503,8 +505,8 @@
// printf("cudadebug[%d] = %e\n",i,h_debug[i]);
// }
// }
- free(h_debug);
- cudaFree(d_debug);
+// free(h_debug);
+// cudaFree(d_debug);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("Kernel_2_impl");
#endif
@@ -556,28 +558,28 @@
/* ----------------------------------------------------------------------------------------------- */
-__global__ void Kernel_test(float* d_debug_output,int* d_phase_ispec_inner_elastic,
- int num_phase_ispec_elastic, int d_iphase, int* d_ibool) {
- int bx = blockIdx.x;
- int tx = threadIdx.x;
- int working_element;
- //int ispec;
- //int NGLL3_ALIGN = 128;
- if(tx==0 && bx==0) {
+//__global__ void Kernel_test(float* d_debug_output,int* d_phase_ispec_inner_elastic,
+// int num_phase_ispec_elastic, int d_iphase, int* d_ibool) {
+// int bx = blockIdx.x;
+// int tx = threadIdx.x;
+// int working_element;
+// //int ispec;
+// //int NGLL3_ALIGN = 128;
+// if(tx==0 && bx==0) {
+//
+// d_debug_output[0] = 420.0;
+//
+// d_debug_output[2] = num_phase_ispec_elastic;
+// d_debug_output[3] = d_iphase;
+// working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)]-1;
+// d_debug_output[4] = working_element;
+// d_debug_output[5] = d_phase_ispec_inner_elastic[0];
+// /* d_debug_output[1] = d_ibool[working_element*NGLL3_ALIGN + tx]-1; */
+// }
+// /* d_debug_output[1+tx+128*bx] = 69.0; */
+//
+//}
- d_debug_output[0] = 420.0;
-
- d_debug_output[2] = num_phase_ispec_elastic;
- d_debug_output[3] = d_iphase;
- working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)]-1;
- d_debug_output[4] = working_element;
- d_debug_output[5] = d_phase_ispec_inner_elastic[0];
- /* d_debug_output[1] = d_ibool[working_element*NGLL3_ALIGN + tx]-1; */
- }
- /* d_debug_output[1+tx+128*bx] = 69.0; */
-
-}
-
/* ----------------------------------------------------------------------------------------------- */
// double precision temporary variables leads to 10% performance
@@ -917,13 +919,13 @@
tempy1l += s_tempy1[offset]*fac1;
tempz1l += s_tempz1[offset]*fac1;
- fac2 = d_hprimewgll_xx[J*NGLLX+l];
+ fac2 = d_hprimewgll_yy[J*NGLLX+l];
offset = K*NGLL2+l*NGLLX+I;
tempx2l += s_tempx2[offset]*fac2;
tempy2l += s_tempy2[offset]*fac2;
tempz2l += s_tempz2[offset]*fac2;
- fac3 = d_hprimewgll_xx[K*NGLLX+l];
+ fac3 = d_hprimewgll_zz[K*NGLLX+l];
offset = l*NGLL2+J*NGLLX+I;
tempx3l += s_tempx3[offset]*fac3;
tempy3l += s_tempy3[offset]*fac3;
@@ -958,41 +960,41 @@
+ s_tempz1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+ s_tempz1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
- tempx2l = s_tempx2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
- + s_tempx2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
- + s_tempx2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
- + s_tempx2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
- + s_tempx2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
+ tempx2l = s_tempx2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
+ + s_tempx2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
+ + s_tempx2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
+ + s_tempx2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
+ + s_tempx2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
- tempy2l = s_tempy2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
- + s_tempy2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
- + s_tempy2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
- + s_tempy2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
- + s_tempy2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
+ tempy2l = s_tempy2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
+ + s_tempy2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
+ + s_tempy2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
+ + s_tempy2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
+ + s_tempy2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
- tempz2l = s_tempz2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
- + s_tempz2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
- + s_tempz2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
- + s_tempz2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
- + s_tempz2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
+ tempz2l = s_tempz2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
+ + s_tempz2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
+ + s_tempz2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
+ + s_tempz2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
+ + s_tempz2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
- tempx3l = s_tempx3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
- + s_tempx3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
- + s_tempx3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
- + s_tempx3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
- + s_tempx3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
+ tempx3l = s_tempx3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
+ + s_tempx3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
+ + s_tempx3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
+ + s_tempx3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
+ + s_tempx3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
- tempy3l = s_tempy3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
- + s_tempy3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
- + s_tempy3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
- + s_tempy3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
- + s_tempy3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
+ tempy3l = s_tempy3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
+ + s_tempy3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
+ + s_tempy3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
+ + s_tempy3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
+ + s_tempy3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
- tempz3l = s_tempz3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
- + s_tempz3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
- + s_tempz3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
- + s_tempz3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
- + s_tempz3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
+ tempz3l = s_tempz3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
+ + s_tempz3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
+ + s_tempz3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
+ + s_tempz3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
+ + s_tempz3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
#endif
@@ -1301,7 +1303,7 @@
cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_xx, array, NGLL2*sizeof(float));
if (err != cudaSuccess)
{
- fprintf(stderr, "Error in setConst_hprime_xx: %s\n", cudaGetErrorString(err));
+ fprintf(stderr, "Error in setConst_hprimewgll_xx: %s\n", cudaGetErrorString(err));
exit(1);
}
@@ -1312,6 +1314,38 @@
}
}
+void setConst_hprimewgll_yy(float* array,Mesh* mp)
+{
+ cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_yy, array, NGLL2*sizeof(float));
+ if (err != cudaSuccess)
+ {
+ fprintf(stderr, "Error in setConst_hprimewgll_yy: %s\n", cudaGetErrorString(err));
+ exit(1);
+ }
+
+ err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_yy),"d_hprimewgll_yy");
+ if(err != cudaSuccess) {
+ fprintf(stderr, "Error with d_hprimewgll_yy: %s\n", cudaGetErrorString(err));
+ exit(1);
+ }
+}
+
+void setConst_hprimewgll_zz(float* array,Mesh* mp)
+{
+ cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_zz, array, NGLL2*sizeof(float));
+ if (err != cudaSuccess)
+ {
+ fprintf(stderr, "Error in setConst_hprimewgll_zz: %s\n", cudaGetErrorString(err));
+ exit(1);
+ }
+
+ err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_zz),"d_hprimewgll_zz");
+ if(err != cudaSuccess) {
+ fprintf(stderr, "Error with d_hprimewgll_zz: %s\n", cudaGetErrorString(err));
+ exit(1);
+ }
+}
+
void setConst_wgllwgll_xy(float* array,Mesh* mp)
{
cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_xy, array, NGLL2*sizeof(float));
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu 2011-10-08 23:49:07 UTC (rev 19052)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu 2011-10-09 00:52:55 UTC (rev 19053)
@@ -46,70 +46,78 @@
/* ----------------------------------------------------------------------------------------------- */
__global__ void compute_kernels_cudakernel(int* ispec_is_elastic, int* ibool,
- float* accel,
- float* b_displ,
- float* epsilondev_xx,
- float* epsilondev_yy,
- float* epsilondev_xy,
- float* epsilondev_xz,
- float* epsilondev_yz,
- float* b_epsilondev_xx,
- float* b_epsilondev_yy,
- float* b_epsilondev_xy,
- float* b_epsilondev_xz,
- float* b_epsilondev_yz,
- float* rho_kl,
- float deltat,
- float* mu_kl,
- float* kappa_kl,
- float* epsilon_trace_over_3,
- float* b_epsilon_trace_over_3,
- int NSPEC_AB,
- float* d_debug) {
+ float* accel,
+ float* b_displ,
+ float* epsilondev_xx,
+ float* epsilondev_yy,
+ float* epsilondev_xy,
+ float* epsilondev_xz,
+ float* epsilondev_yz,
+ float* b_epsilondev_xx,
+ float* b_epsilondev_yy,
+ float* b_epsilondev_xy,
+ float* b_epsilondev_xz,
+ float* b_epsilondev_yz,
+ float* rho_kl,
+ float deltat,
+ float* mu_kl,
+ float* kappa_kl,
+ float* epsilon_trace_over_3,
+ float* b_epsilon_trace_over_3,
+ int NSPEC_AB,
+ float* d_debug) {
int ispec = blockIdx.x + blockIdx.y*gridDim.x;
- if(ispec<NSPEC_AB) { // handles case when there is 1 extra block (due to rectangular grid)
- int ijk = threadIdx.x;
- int ijk_ispec = ijk + 125*ispec;
- int iglob = ibool[ijk_ispec]-1;
- // if(ispec_is_elastic[ispec]) { // leave out until have acoustic coupling
- if(1) {
+ // handles case when there is 1 extra block (due to rectangular grid)
+ if(ispec < NSPEC_AB) {
+
+ // elastic elements only
+ if(ispec_is_elastic[ispec] == 1) {
+
+ int ijk = threadIdx.x;
+ int ijk_ispec = ijk + 125*ispec;
+ int iglob = ibool[ijk_ispec] - 1 ;
+
+ // debug
+// if(ijk_ispec == 9480531) {
+// d_debug[0] = rho_kl[ijk_ispec];
+// d_debug[1] = accel[3*iglob];
+// d_debug[2] = b_displ[3*iglob];
+// d_debug[3] = deltat * (accel[3*iglob]*b_displ[3*iglob]+
+// accel[3*iglob+1]*b_displ[3*iglob+1]+
+// accel[3*iglob+2]*b_displ[3*iglob+2]);
+// }
- if(ijk_ispec == 9480531) {
- d_debug[0] = rho_kl[ijk_ispec];
- d_debug[1] = accel[3*iglob];
- d_debug[2] = b_displ[3*iglob];
- d_debug[3] = deltat * (accel[3*iglob]*b_displ[3*iglob]+
- accel[3*iglob+1]*b_displ[3*iglob+1]+
- accel[3*iglob+2]*b_displ[3*iglob+2]);
- }
-
+ // isotropic kernels:
+ // density kernel
rho_kl[ijk_ispec] += deltat * (accel[3*iglob]*b_displ[3*iglob]+
- accel[3*iglob+1]*b_displ[3*iglob+1]+
- accel[3*iglob+2]*b_displ[3*iglob+2]);
+ accel[3*iglob+1]*b_displ[3*iglob+1]+
+ accel[3*iglob+2]*b_displ[3*iglob+2]);
-
+ // debug
// if(rho_kl[ijk_ispec] < 1.9983e+18) {
// atomicAdd(&d_debug[3],1.0);
// d_debug[4] = ijk_ispec;
- // d_debug[0] = rho_kl[ijk_ispec];
- // d_debug[1] = accel[3*iglob];
- // d_debug[2] = b_displ[3*iglob];
+ // d_debug[0] = rho_kl[ijk_ispec];
+ // d_debug[1] = accel[3*iglob];
+ // d_debug[2] = b_displ[3*iglob];
// }
+ // shear modulus kernel
mu_kl[ijk_ispec] += deltat * (epsilondev_xx[ijk_ispec]*b_epsilondev_xx[ijk_ispec]+ // 1*b1
- epsilondev_yy[ijk_ispec]*b_epsilondev_yy[ijk_ispec]+ // 2*b2
- (epsilondev_xx[ijk_ispec]+epsilondev_yy[ijk_ispec])*
- (b_epsilondev_xx[ijk_ispec]+b_epsilondev_yy[ijk_ispec])+
- 2*(epsilondev_xy[ijk_ispec]*b_epsilondev_xy[ijk_ispec]+
- epsilondev_xz[ijk_ispec]*b_epsilondev_xz[ijk_ispec]+
- epsilondev_yz[ijk_ispec]*b_epsilondev_yz[ijk_ispec]));
+ epsilondev_yy[ijk_ispec]*b_epsilondev_yy[ijk_ispec]+ // 2*b2
+ (epsilondev_xx[ijk_ispec]+epsilondev_yy[ijk_ispec])*
+ (b_epsilondev_xx[ijk_ispec]+b_epsilondev_yy[ijk_ispec])+
+ 2*(epsilondev_xy[ijk_ispec]*b_epsilondev_xy[ijk_ispec]+
+ epsilondev_xz[ijk_ispec]*b_epsilondev_xz[ijk_ispec]+
+ epsilondev_yz[ijk_ispec]*b_epsilondev_yz[ijk_ispec]));
+ // bulk modulus kernel
kappa_kl[ijk_ispec] += deltat*(9*epsilon_trace_over_3[ijk_ispec]*
- b_epsilon_trace_over_3[ijk_ispec]);
+ b_epsilon_trace_over_3[ijk_ispec]);
}
}
@@ -118,57 +126,63 @@
/* ----------------------------------------------------------------------------------------------- */
extern "C"
-void FC_FUNC_(compute_kernels_cuda,
- COMPUTE_KERNELS_CUDA)(long* Mesh_pointer, int* NOISE_TOMOGRAPHY,
- int* ELASTIC_SIMULATION, int* SAVE_MOHO_MESH,float* deltat) {
+void FC_FUNC_(compute_kernels_elastic_cuda,
+ COMPUTE_KERNELS_ELASTIC_CUDA)(long* Mesh_pointer,
+ float* deltat) {
+TRACE("compute_kernels_elastic_cuda");
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
int blocksize = 125; // NGLLX*NGLLY*NGLLZ
+
int num_blocks_x = mp->NSPEC_AB;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
num_blocks_x = ceil(num_blocks_x/2.0);
num_blocks_y = num_blocks_y*2;
}
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
float* d_debug;
+ /*
float* h_debug;
h_debug = (float*)calloc(128,sizeof(float));
cudaMalloc((void**)&d_debug,128*sizeof(float));
cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
+ */
-
compute_kernels_cudakernel<<<grid,threads>>>(mp->d_ispec_is_elastic,mp->d_ibool,
- mp->d_accel, mp->d_b_displ,
- mp->d_epsilondev_xx,
- mp->d_epsilondev_yy,
- mp->d_epsilondev_xy,
- mp->d_epsilondev_xz,
- mp->d_epsilondev_yz,
- mp->d_b_epsilondev_xx,
- mp->d_b_epsilondev_yy,
- mp->d_b_epsilondev_xy,
- mp->d_b_epsilondev_xz,
- mp->d_b_epsilondev_yz,
- mp->d_rho_kl,
- *deltat,
- mp->d_mu_kl,
- mp->d_kappa_kl,
- mp->d_epsilon_trace_over_3,
- mp->d_b_epsilon_trace_over_3,
- mp->NSPEC_AB,
- d_debug);
-
+ mp->d_accel, mp->d_b_displ,
+ mp->d_epsilondev_xx,
+ mp->d_epsilondev_yy,
+ mp->d_epsilondev_xy,
+ mp->d_epsilondev_xz,
+ mp->d_epsilondev_yz,
+ mp->d_b_epsilondev_xx,
+ mp->d_b_epsilondev_yy,
+ mp->d_b_epsilondev_xy,
+ mp->d_b_epsilondev_xz,
+ mp->d_b_epsilondev_yz,
+ mp->d_rho_kl,
+ *deltat,
+ mp->d_mu_kl,
+ mp->d_kappa_kl,
+ mp->d_epsilon_trace_over_3,
+ mp->d_b_epsilon_trace_over_3,
+ mp->NSPEC_AB,
+ d_debug);
+ /*
cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
cudaFree(d_debug);
+ */
// for(int i=0;i<5;i++) {
// printf("d_debug[%d]=%e\n",i,h_debug[i]);
// }
+ /*
free(h_debug);
-
+ */
// float* h_rho = (float*)malloc(sizeof(float)*mp->NSPEC_AB*125);
// float maxval = 0;
// cudaMemcpy(h_rho,mp->d_rho_kl,sizeof(float)*mp->NSPEC_AB*125,cudaMemcpyDeviceToHost);
@@ -183,7 +197,7 @@
// printf("maval rho = %e, number>1e10 = %d vs. %d\n",maxval,number_big_values,mp->NSPEC_AB*125);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("compute_kernels_cudakernel");
+ exit_on_cuda_error("compute_kernels_elastic_cuda");
#endif
}
@@ -208,16 +222,17 @@
int num_free_surface_faces,
float* d_debug) {
int iface = blockIdx.x + blockIdx.y*gridDim.x;
- if(iface<num_free_surface_faces) {
+
+ if(iface < num_free_surface_faces) {
int ispec = free_surface_ispec[iface]-1;
int igll = threadIdx.x;
int ipoin = igll + 25*iface;
- int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
- int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
- int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
+ int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1 ;
+ int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
+ int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
- int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+ int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1 ;
float eta = ( noise_surface_movie[INDEX3(NDIM,NGLL2,0,igll,iface)]*normal_x_noise[ipoin]+
noise_surface_movie[INDEX3(NDIM,NGLL2,1,igll,iface)]*normal_y_noise[ipoin]+
@@ -235,8 +250,8 @@
// }
Sigma_kl[INDEX4(5,5,5,i,j,k,ispec)] += deltat*eta*(normal_x_noise[ipoin]*displ[3*iglob]+
- normal_y_noise[ipoin]*displ[1+3*iglob]+
- normal_z_noise[ipoin]*displ[2+3*iglob]);
+ normal_y_noise[ipoin]*displ[1+3*iglob]+
+ normal_z_noise[ipoin]*displ[2+3*iglob]);
}
}
@@ -274,16 +289,16 @@
// cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
compute_kernels_strength_noise_cuda_kernel<<<grid,threads>>>(mp->d_displ,
- mp->d_free_surface_ispec,
- mp->d_free_surface_ijk,
- mp->d_ibool,
- mp->d_noise_surface_movie,
- mp->d_normal_x_noise,
- mp->d_normal_y_noise,
- mp->d_normal_z_noise,
- mp->d_Sigma_kl,*deltat,
- num_free_surface_faces,
- d_debug);
+ mp->d_free_surface_ispec,
+ mp->d_free_surface_ijk,
+ mp->d_ibool,
+ mp->d_noise_surface_movie,
+ mp->d_normal_x_noise,
+ mp->d_normal_y_noise,
+ mp->d_normal_z_noise,
+ mp->d_Sigma_kl,*deltat,
+ num_free_surface_faces,
+ d_debug);
// cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
// for(int i=0;i<8;i++) {
@@ -410,24 +425,28 @@
int NSPEC_AB) {
int ispec = blockIdx.x + blockIdx.y*gridDim.x;
- int ijk = threadIdx.x;
+
+ // handles case when there is 1 extra block (due to rectangular grid)
+ if( ispec < NSPEC_AB ){
- // local and global indices
- int ijk_ispec = ijk + 125*ispec;
- int ijk_ispec_padded = ijk + 128*ispec;
- int iglob = ibool[ijk_ispec]-1;
-
- float accel_elm[3];
- float b_displ_elm[3];
- float rhol,kappal;
-
- // shared memory between all threads within this block
- __shared__ float scalar_field_displ[125];
- __shared__ float scalar_field_accel[125];
-
- if( ispec < NSPEC_AB ){
+ // acoustic elements only
if( ispec_is_acoustic[ispec] == 1) {
+
+ int ijk = threadIdx.x;
+ // local and global indices
+ int ijk_ispec = ijk + 125*ispec;
+ int ijk_ispec_padded = ijk + 128*ispec;
+ int iglob = ibool[ijk_ispec] - 1;
+
+ float accel_elm[3];
+ float b_displ_elm[3];
+ float rhol,kappal;
+
+ // shared memory between all threads within this block
+ __shared__ float scalar_field_displ[125];
+ __shared__ float scalar_field_accel[125];
+
// copy field values
scalar_field_displ[ijk] = b_potential_acoustic[iglob];
scalar_field_accel[ijk] = potential_dot_dot_acoustic[iglob];
@@ -456,7 +475,7 @@
// bulk modulus kernel
kappal = kappastore[ijk_ispec];
kappa_ac_kl[ijk_ispec] -= deltat / kappal * potential_dot_dot_acoustic[iglob]
- * b_potential_dot_dot_acoustic[iglob];
+ * b_potential_dot_dot_acoustic[iglob];
}
}
}
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h 2011-10-08 23:49:07 UTC (rev 19052)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h 2011-10-09 00:52:55 UTC (rev 19053)
@@ -132,10 +132,8 @@
// pointers to constant memory arrays
float* d_hprime_xx; float* d_hprime_yy; float* d_hprime_zz;
- float* d_hprimewgll_xx;
- float* d_wgllwgll_xy;
- float* d_wgllwgll_xz;
- float* d_wgllwgll_yz;
+ float* d_hprimewgll_xx; float* d_hprimewgll_yy; float* d_hprimewgll_zz;
+ float* d_wgllwgll_xy; float* d_wgllwgll_xz; float* d_wgllwgll_yz;
// ------------------------------------------------------------------ //
// elastic wavefield parameters
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h 2011-10-08 23:49:07 UTC (rev 19052)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h 2011-10-09 00:52:55 UTC (rev 19053)
@@ -29,6 +29,25 @@
#ifndef CUDA_HEADER_H
#define CUDA_HEADER_H
+/* ----------------------------------------------------------------------------------------------- */
+
+// setters for these const arrays (very ugly hack, but will have to do)
+
+// elastic
+void setConst_hprime_xx(float* array,Mesh* mp);
+void setConst_hprime_yy(float* array,Mesh* mp);
+void setConst_hprime_zz(float* array,Mesh* mp);
+
+void setConst_hprimewgll_xx(float* array,Mesh* mp);
+void setConst_hprimewgll_yy(float* array,Mesh* mp);
+void setConst_hprimewgll_zz(float* array,Mesh* mp);
+
+void setConst_wgllwgll_xy(float* array,Mesh* mp);
+void setConst_wgllwgll_xz(float* array, Mesh* mp);
+void setConst_wgllwgll_yz(float* array, Mesh* mp);
+
+/* ----------------------------------------------------------------------------------------------- */
+
/* CUDA specific things from specfem3D_kernels.cu */
#ifdef USE_TEXTURES
@@ -99,22 +118,7 @@
}
}
-#endif
+#endif // USE_TEXTURES
-/* ----------------------------------------------------------------------------------------------- */
-// setters for these const arrays (very ugly hack, but will have to do)
-
-// elastic
-void setConst_hprime_xx(float* array,Mesh* mp);
-void setConst_hprime_yy(float* array,Mesh* mp);
-void setConst_hprime_zz(float* array,Mesh* mp);
-
-void setConst_hprimewgll_xx(float* array,Mesh* mp);
-
-void setConst_wgllwgll_xy(float* array,Mesh* mp);
-void setConst_wgllwgll_xz(float* array, Mesh* mp);
-void setConst_wgllwgll_yz(float* array, Mesh* mp);
-
-
#endif //CUDA_HEADER_H
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu 2011-10-08 23:49:07 UTC (rev 19052)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu 2011-10-09 00:52:55 UTC (rev 19053)
@@ -239,13 +239,9 @@
int* h_ibool,
int* num_interfaces_ext_mesh, int* max_nibool_interfaces_ext_mesh,
int* h_nibool_interfaces_ext_mesh, int* h_ibool_interfaces_ext_mesh,
- float* h_hprime_xx,
- float* h_hprime_yy,
- float* h_hprime_zz,
- float* h_hprimewgll_xx,
- float* h_wgllwgll_xy,
- float* h_wgllwgll_xz,
- float* h_wgllwgll_yz,
+ float* h_hprime_xx,float* h_hprime_yy,float* h_hprime_zz,
+ float* h_hprimewgll_xx,float* h_hprimewgll_yy,float* h_hprimewgll_zz,
+ float* h_wgllwgll_xy,float* h_wgllwgll_xz,float* h_wgllwgll_yz,
int* ABSORBING_CONDITIONS,
int* h_abs_boundary_ispec, int* h_abs_boundary_ijk,
float* h_abs_boundary_normal,
@@ -309,6 +305,8 @@
setConst_hprime_yy(h_hprime_yy,mp);
setConst_hprime_zz(h_hprime_zz,mp);
setConst_hprimewgll_xx(h_hprimewgll_xx,mp);
+ setConst_hprimewgll_yy(h_hprimewgll_yy,mp);
+ setConst_hprimewgll_zz(h_hprimewgll_zz,mp);
setConst_wgllwgll_xy(h_wgllwgll_xy,mp);
setConst_wgllwgll_xz(h_wgllwgll_xz,mp);
setConst_wgllwgll_yz(h_wgllwgll_yz,mp);
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu 2011-10-08 23:49:07 UTC (rev 19052)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu 2011-10-09 00:52:55 UTC (rev 19053)
@@ -503,9 +503,39 @@
#endif
}
+/* ----------------------------------------------------------------------------------------------- */
+extern "C"
+void FC_FUNC_(transfer_potential_dot_dot_from_device,
+ TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, float* potential_dot_dot_acoustic,long* Mesh_pointer_f) {
+
+ TRACE("transfer_potential_dot_dot_from_device");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+ print_CUDA_error_if_any(cudaMemcpy(potential_dot_dot_acoustic,mp->d_potential_dot_dot_acoustic,
+ sizeof(float)*(*size),cudaMemcpyDeviceToHost),50041);
+
+}
+
/* ----------------------------------------------------------------------------------------------- */
+extern "C"
+void FC_FUNC_(transfer_b_potential_dot_dot_from_device,
+ TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, float* b_potential_dot_dot_acoustic,long* Mesh_pointer_f) {
+
+ TRACE("transfer_b_potential_dot_dot_from_device");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+ print_CUDA_error_if_any(cudaMemcpy(b_potential_dot_dot_acoustic,mp->d_b_potential_dot_dot_acoustic,
+ sizeof(float)*(*size),cudaMemcpyDeviceToHost),50042);
+
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
extern "C"
void FC_FUNC_(transfer_sensitivity_kernels_acoustic_to_host,
TRANSFER_SENSITIVITY_KERNELS_ACOUSTIC_TO_HOST)(long* Mesh_pointer,
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_kernels.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_kernels.f90 2011-10-08 23:49:07 UTC (rev 19052)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_kernels.f90 2011-10-09 00:52:55 UTC (rev 19053)
@@ -71,8 +71,7 @@
! updates kernels on GPU
if(GPU_MODE) then
- call compute_kernels_cuda(Mesh_pointer,NOISE_TOMOGRAPHY,ELASTIC_SIMULATION,SAVE_MOHO_MESH, &
- deltat)
+ call compute_kernels_elastic_cuda(Mesh_pointer,deltat)
! for noise simulations --- source strength kernel
if (NOISE_TOMOGRAPHY == 3) &
@@ -188,13 +187,6 @@
! kernels are done
return
-
- !daniel
- !call transfer_fields_acoustic_from_device(NGLOB_AB,potential_acoustic, &
- ! potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
- !call transfer_b_fields_acoustic_from_device(NGLOB_AB,b_potential_acoustic, &
- ! b_potential_dot_acoustic, b_potential_dot_dot_acoustic, Mesh_pointer)
-
endif
! updates kernels
@@ -257,6 +249,18 @@
real(kind=CUSTOM_REAL),dimension(NDIM,NGLLX,NGLLY,NGLLZ):: b_accel_elm,accel_elm
integer :: i,j,k,ispec,iglob
+ !daniel: todo - workaround to do this on GPU?
+ if( GPU_MODE) then
+ if( ACOUSTIC_SIMULATION) then
+ call transfer_potential_dot_dot_from_device(NGLOB_AB,potential_dot_dot_acoustic, Mesh_pointer)
+ call transfer_b_potential_dot_dot_from_device(NGLOB_AB,b_potential_dot_dot_acoustic, Mesh_pointer)
+ endif
+ if( ELASTIC_SIMULATION ) then
+ call transfer_accel_from_device(NGLOB_AB*NDIM,accel,Mesh_pointer)
+ call transfer_b_accel_from_device(NGLOB_AB*NDIM,b_accel,Mesh_pointer)
+ endif
+ endif
+
! loops over all elements
do ispec = 1, NSPEC_AB
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90 2011-10-08 23:49:07 UTC (rev 19052)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90 2011-10-09 00:52:55 UTC (rev 19053)
@@ -149,21 +149,25 @@
ihours_remain,iminutes_remain,iseconds_remain,int_t_remain, &
ihours_total,iminutes_total,iseconds_total,int_t_total
- if(GPU_MODE) then
- ! way 1: copy whole fields
- ! elastic wavefield
- if( ELASTIC_SIMULATION ) then
- call transfer_fields_from_device(NDIM*NGLOB_AB,displ,veloc, accel, Mesh_pointer)
- ! backward/reconstructed wavefield
- if(SIMULATION_TYPE==3) &
- call transfer_b_fields_from_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel, Mesh_pointer)
- endif
-
- endif
+! if(GPU_MODE) then
+! ! way 1: copy whole fields
+! ! elastic wavefield
+! if( ELASTIC_SIMULATION ) then
+! call transfer_fields_from_device(NDIM*NGLOB_AB,displ,veloc, accel, Mesh_pointer)
+! ! backward/reconstructed wavefield
+! if(SIMULATION_TYPE==3) &
+! call transfer_b_fields_from_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel, Mesh_pointer)
+! endif
+! endif
! compute maximum of norm of displacement in each slice
if( ELASTIC_SIMULATION ) then
- Usolidnorm = maxval(sqrt(displ(1,:)**2 + displ(2,:)**2 + displ(3,:)**2))
+ if( GPU_MODE) then
+ ! way 2: just get maximum of field from GPU
+ call get_norm_elastic_from_device_cuda(Usolidnorm,Mesh_pointer,1)
+ else
+ Usolidnorm = maxval(sqrt(displ(1,:)**2 + displ(2,:)**2 + displ(3,:)**2))
+ endif
else
if( ACOUSTIC_SIMULATION ) then
if(GPU_MODE) then
@@ -181,7 +185,12 @@
! adjoint simulations
if( SIMULATION_TYPE == 3 ) then
if( ELASTIC_SIMULATION ) then
- b_Usolidnorm = maxval(sqrt(b_displ(1,:)**2 + b_displ(2,:)**2 + b_displ(3,:)**2))
+ ! way 2
+ if(GPU_MODE) then
+ call get_norm_elastic_from_device_cuda(b_Usolidnorm,Mesh_pointer,3)
+ else
+ b_Usolidnorm = maxval(sqrt(b_displ(1,:)**2 + b_displ(2,:)**2 + b_displ(3,:)**2))
+ endif
else
if( ACOUSTIC_SIMULATION ) then
! way 2
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90 2011-10-08 23:49:07 UTC (rev 19052)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90 2011-10-09 00:52:55 UTC (rev 19053)
@@ -754,7 +754,8 @@
num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh, ibool_interfaces_ext_mesh, &
hprime_xx, hprime_yy, hprime_zz, &
- hprimewgll_xx, wgllwgll_xy, wgllwgll_xz, wgllwgll_yz, &
+ hprimewgll_xx, hprimewgll_yy, hprimewgll_zz, &
+ wgllwgll_xy, wgllwgll_xz, wgllwgll_yz, &
ABSORBING_CONDITIONS, &
abs_boundary_ispec, abs_boundary_ijk, &
abs_boundary_normal, &
More information about the CIG-COMMITS
mailing list