[cig-commits] r20365 - in seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src: cuda decompose_mesh_SCOTCH generate_databases shared specfem3D
rietmann at geodynamics.org
Wed Jun 13 05:54:51 PDT 2012
Author: rietmann
Date: 2012-06-13 05:54:51 -0700 (Wed, 13 Jun 2012)
New Revision: 20365
Modified:
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/decompose_mesh_SCOTCH/decompose_mesh_SCOTCH.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/get_model.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/constants.h.in
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.F90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/write_seismograms.f90
Log:
This commit merges several optimization changes back into the working
svn sunflower release, including:
(1) texture memory for the accel and displ fields
(2) __device__ instead of __constant__ memory
(3) asynchronous memcpy, paired with matching non-blocking MPI routines,
    to further hide communication
(4) improvements to Kernel_2_impl
There appear to be several functional regressions in the trunk; the
affected code is commented out in this commit. The CPU version (i.e.,
GPU_MODE=.false.) does not seem to run.
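As a rough illustration of point (3): the changes to compute_forces_elastic_cuda.cu
below pack the boundary buffer on a compute stream, copy it to pinned host memory
asynchronously on a separate copy stream, and keep launching kernels while the copy
and the subsequent non-blocking MPI send proceed. The standalone sketch that follows
uses made-up names (pack_boundary, inner_work, n, nbound) and is not taken from the
commit itself; the real MPI_Isend is only indicated as a comment.

#include <cuda_runtime.h>
#include <stdio.h>

// gather boundary degrees of freedom into a contiguous send buffer
__global__ void pack_boundary(const float* field, float* send_buf, int nbound) {
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < nbound) send_buf[i] = field[i];
}

// stand-in for the inner-element work (Kernel_2_impl in the real code)
__global__ void inner_work(float* field, int n) {
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  if (i < n) field[i] += 1.0f;
}

int main(void) {
  const int n = 1 << 20, nbound = 1 << 14;
  float *d_field, *d_send, *h_send;
  cudaMalloc((void**)&d_field, n*sizeof(float));
  cudaMalloc((void**)&d_send, nbound*sizeof(float));
  cudaMallocHost((void**)&h_send, nbound*sizeof(float)); // pinned: needed for async copies
  cudaMemset(d_field, 0, n*sizeof(float));

  cudaStream_t compute_stream, copy_stream;
  cudaStreamCreate(&compute_stream);
  cudaStreamCreate(&copy_stream);

  // 1. pack outer/boundary values on the compute stream
  pack_boundary<<<(nbound+255)/256,256,0,compute_stream>>>(d_field, d_send, nbound);
  cudaStreamSynchronize(compute_stream);   // buffer must be complete before the copy

  // 2. asynchronous device->host copy on the copy stream
  cudaMemcpyAsync(h_send, d_send, nbound*sizeof(float),
                  cudaMemcpyDeviceToHost, copy_stream);

  // 3. inner-element work overlaps with the copy
  inner_work<<<(n+255)/256,256,0,compute_stream>>>(d_field, n);

  // 4. once the copy is done the buffer can go to a non-blocking MPI send
  cudaStreamSynchronize(copy_stream);
  // MPI_Isend(h_send, nbound, MPI_FLOAT, neighbour, tag, comm, &request);

  cudaStreamSynchronize(compute_stream);
  printf("h_send[0] = %f\n", h_send[0]);

  cudaFree(d_field); cudaFree(d_send); cudaFreeHost(h_send);
  cudaStreamDestroy(compute_stream); cudaStreamDestroy(copy_stream);
  return 0;
}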
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu 2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu 2012-06-13 12:54:51 UTC (rev 20365)
@@ -125,7 +125,7 @@
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(5,5,5);
- compute_add_sources_kernel<<<grid,threads>>>(mp->d_accel,
+ compute_add_sources_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,
mp->d_ibool,
mp->d_ispec_is_inner,
phase_is_inner,
@@ -175,7 +175,7 @@
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(5,5,5);
- compute_add_sources_kernel<<<grid,threads>>>(mp->d_b_accel,mp->d_ibool,
+ compute_add_sources_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel,mp->d_ibool,
mp->d_ispec_is_inner, *phase_is_inner,
mp->d_sourcearrays,
mp->d_stf_pre_compute,
@@ -237,7 +237,7 @@
dim3 threads(NGLL3,1,1);
if(myrank == islice_selected_rec[irec_master_noise-1]) {
- add_source_master_rec_noise_cuda_kernel<<<grid,threads>>>(mp->d_ibool,
+ add_source_master_rec_noise_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_ibool,
mp->d_ispec_selected_rec,
irec_master_noise,
mp->d_accel,
@@ -404,7 +404,7 @@
// h_pre_comp..), because normally it is in the loop updating accel,
// and due to how it's incremented, it cannot be parallelized
- add_sources_el_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_accel,
+ add_sources_el_SIM_TYPE_2_OR_3_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,
*nrec,
mp->d_adj_sourcearrays,
mp->d_ibool,
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu 2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu 2012-06-13 12:54:51 UTC (rev 20365)
@@ -39,19 +39,22 @@
// cuda constant arrays
-__constant__ realw d_hprime_xx[NGLL2];
-__constant__ realw d_hprime_yy[NGLL2];
-__constant__ realw d_hprime_zz[NGLL2];
-__constant__ realw d_hprimewgll_xx[NGLL2];
-__constant__ realw d_hprimewgll_yy[NGLL2];
-__constant__ realw d_hprimewgll_zz[NGLL2];
-__constant__ realw d_wgllwgll_xy[NGLL2];
-__constant__ realw d_wgllwgll_xz[NGLL2];
-__constant__ realw d_wgllwgll_yz[NGLL2];
+__device__ realw d_hprime_xx[NGLL2];
+// only needed if NGLLX != NGLLY != NGLLZ
+// __device__ realw d_hprime_yy[NGLL2];
+// __device__ realw d_hprime_zz[NGLL2];
+__device__ realw d_hprimewgll_xx[NGLL2];
+__device__ realw d_hprimewgll_yy[NGLL2];
+__device__ realw d_hprimewgll_zz[NGLL2];
+__device__ realw d_wgllwgll_xy[NGLL2];
+__device__ realw d_wgllwgll_xz[NGLL2];
+__device__ realw d_wgllwgll_yz[NGLL2];
+
__constant__ realw d_wgll_cube[NGLL3]; // needed only for gravity case
-
+// prototype for the fortran function to do non-blocking mpi send
+extern "C" void assemble_mpi_vector_send_cuda_(void*,void*,void*,void*,void*,void*,void*,void*,void*);
/* ----------------------------------------------------------------------------------------------- */
// prepares a device array with with all inter-element edge-nodes -- this
@@ -116,14 +119,14 @@
// cudaEventCreate(&stop);
// cudaEventRecord( start, 0 );
if(*FORWARD_OR_ADJOINT == 1) {
- prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel,mp->d_send_accel_buffer,
+ prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,mp->d_send_accel_buffer,
mp->num_interfaces_ext_mesh,
mp->max_nibool_interfaces_ext_mesh,
mp->d_nibool_interfaces_ext_mesh,
mp->d_ibool_interfaces_ext_mesh);
}
else if(*FORWARD_OR_ADJOINT == 3) {
- prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel,mp->d_send_accel_buffer,
+ prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel,mp->d_send_accel_buffer,
mp->num_interfaces_ext_mesh,
mp->max_nibool_interfaces_ext_mesh,
mp->d_nibool_interfaces_ext_mesh,
@@ -148,6 +151,41 @@
/* ----------------------------------------------------------------------------------------------- */
+extern "C" void FC_FUNC_(transfer_boundary_from_device_asynchronously,TRANSFER_BOUNDARY_FROM_DEVICE_ASYNCHRONOUSLY)(long* Mesh_pointer,int* nspec_outer_elastic) {
+
+ TRACE("transfer_boundary_from_device_asynchronously");
+ Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+
+ int num_blocks_x = *nspec_outer_elastic;
+ int num_blocks_y = 1;
+ while(num_blocks_x > 65535) {
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+ num_blocks_y = num_blocks_y*2;
+ }
+
+ int blocksize = NGLL3_PADDED;
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
+
+ prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,mp->d_send_accel_buffer,
+ mp->num_interfaces_ext_mesh,
+ mp->max_nibool_interfaces_ext_mesh,
+ mp->d_nibool_interfaces_ext_mesh,
+ mp->d_ibool_interfaces_ext_mesh);
+ // wait until kernel is finished before starting async memcpy
+ cudaDeviceSynchronize();
+
+ cudaMemcpyAsync(mp->h_send_accel_buffer,mp->d_send_accel_buffer,
+ 3* mp->max_nibool_interfaces_ext_mesh* mp->num_interfaces_ext_mesh*sizeof(realw),
+ cudaMemcpyDeviceToHost,mp->copy_stream);
+ // cudaMemcpyAsync(mp->h_send_accel_buffer,mp->d_send_accel_buffer,
+ // 3* mp->max_nibool_interfaces_ext_mesh* mp->num_interfaces_ext_mesh*sizeof(realw),
+ // cudaMemcpyDeviceToHost,mp->compute_stream);
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
__global__ void assemble_boundary_accel_on_device(realw* d_accel, realw* d_send_accel_buffer,
int num_interfaces_ext_mesh,
int max_nibool_interfaces_ext_mesh,
@@ -190,6 +228,97 @@
/* ----------------------------------------------------------------------------------------------- */
+extern "C"
+void FC_FUNC_(transfer_boundary_to_device_asynchronously,TRANSFER_BOUNDARY_TO_DEVICE_ASYNCHRONOUSLY)(long* Mesh_pointer,
+ realw* buffer_recv_vector_ext_mesh,
+ int* num_interfaces_ext_mesh,
+ int* max_nibool_interfaces_ext_mesh) {
+
+ TRACE("transfer_boundary_to_device_asynchronously");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+ memcpy(mp->h_recv_accel_buffer,buffer_recv_vector_ext_mesh,mp->size_mpi_recv_buffer*sizeof(realw));
+
+ // cudaMemcpyAsync(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
+ // 3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw),
+ // cudaMemcpyHostToDevice,mp->compute_stream);
+ printf("xfer to device\n");
+ cudaMemcpyAsync(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
+ 3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw),
+ cudaMemcpyHostToDevice,mp->copy_stream);
+
+
+
+
+}
+
+
+extern "C"
+void FC_FUNC_(assemble_accel_on_device,
+ ASSEMBLE_ACCEL_on_DEVICE)(long* Mesh_pointer, realw* accel,
+ realw* buffer_recv_vector_ext_mesh,
+ int* num_interfaces_ext_mesh,
+ int* max_nibool_interfaces_ext_mesh,
+ int* nibool_interfaces_ext_mesh,
+ int* ibool_interfaces_ext_mesh,
+ int* FORWARD_OR_ADJOINT) {
+ TRACE("assemble_accel_on_device");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+ int blocksize = BLOCKSIZE_TRANSFER;
+ int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
+ int num_blocks_x = size_padded/blocksize;
+ int num_blocks_y = 1;
+ while(num_blocks_x > 65535) {
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+ num_blocks_y = num_blocks_y*2;
+ }
+
+ //double start_time = get_time();
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
+ // cudaEvent_t start, stop;
+ // realw time;
+ // cudaEventCreate(&start);
+ // cudaEventCreate(&stop);
+ // cudaEventRecord( start, 0 );
+
+
+ // ***************************************************************************
+ // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
+ cudaStreamSynchronize(mp->copy_stream);
+
+ // Assembling on the copy_stream breaks the solution and it "blows up"
+ if(*FORWARD_OR_ADJOINT == 1) { //assemble forward accel
+ assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel, mp->d_send_accel_buffer,
+ mp->num_interfaces_ext_mesh,
+ mp->max_nibool_interfaces_ext_mesh,
+ mp->d_nibool_interfaces_ext_mesh,
+ mp->d_ibool_interfaces_ext_mesh);
+ }
+ else if(*FORWARD_OR_ADJOINT == 3) { //assemble adjoint accel
+ assemble_boundary_accel_on_device<<<grid,threads,0,mp->copy_stream>>>(mp->d_b_accel, mp->d_send_accel_buffer,
+ mp->num_interfaces_ext_mesh,
+ mp->max_nibool_interfaces_ext_mesh,
+ mp->d_nibool_interfaces_ext_mesh,
+ mp->d_ibool_interfaces_ext_mesh);
+ }
+
+ // cudaEventRecord( stop, 0 );
+ // cudaEventSynchronize( stop );
+ // cudaEventElapsedTime( &time, start, stop );
+ // cudaEventDestroy( start );
+ // cudaEventDestroy( stop );
+ // printf("Boundary Assemble Kernel Execution Time: %f ms\n",time);
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ //double end_time = get_time();
+ //printf("Elapsed time: %e\n",end_time-start_time);
+ exit_on_cuda_error("transfer_asmbl_accel_to_device");
+#endif
+}
+
// FORWARD_OR_ADJOINT == 1 for accel, and == 3 for b_accel
extern "C"
void FC_FUNC_(transfer_asmbl_accel_to_device,
@@ -225,14 +354,14 @@
// cudaEventCreate(&stop);
// cudaEventRecord( start, 0 );
if(*FORWARD_OR_ADJOINT == 1) { //assemble forward accel
- assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel, mp->d_send_accel_buffer,
+ assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel, mp->d_send_accel_buffer,
mp->num_interfaces_ext_mesh,
mp->max_nibool_interfaces_ext_mesh,
mp->d_nibool_interfaces_ext_mesh,
mp->d_ibool_interfaces_ext_mesh);
}
else if(*FORWARD_OR_ADJOINT == 3) { //assemble adjoint accel
- assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel, mp->d_send_accel_buffer,
+ assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel, mp->d_send_accel_buffer,
mp->num_interfaces_ext_mesh,
mp->max_nibool_interfaces_ext_mesh,
mp->d_nibool_interfaces_ext_mesh,
@@ -497,7 +626,15 @@
// double precision temporary variables leads to 10% performance
// decrease in Kernel_2_impl (not very much..)
//typedef realw reald;
+#ifdef USE_TEXTURES_FIELDS
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_displ_tex;
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_accel_tex;
+#endif
+#ifdef USE_TEXTURES_CONSTANTS
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_hprime_xx_tex;
+#endif
+
__global__ void Kernel_2_impl(int nb_blocks_to_compute,
int NGLOB,
int* d_ibool,
@@ -599,6 +736,8 @@
__shared__ reald s_tempz2[NGLL3];
__shared__ reald s_tempz3[NGLL3];
+ __shared__ reald sh_hprime_xx[NGLL2];
+
// use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads,
// because we used memory padding from NGLL^3 = 125 to 128 to get coalescent memory accesses
active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0;
@@ -622,10 +761,10 @@
// iglob = d_ibool[working_element*NGLL3_ALIGN + tx]-1;
iglob = d_ibool[working_element*NGLL3 + tx]-1;
-#ifdef USE_TEXTURES
- s_dummyx_loc[tx] = tex1Dfetch(tex_displ, iglob);
- s_dummyy_loc[tx] = tex1Dfetch(tex_displ, iglob + NGLOB);
- s_dummyz_loc[tx] = tex1Dfetch(tex_displ, iglob + 2*NGLOB);
+#ifdef USE_TEXTURES_FIELDS
+ s_dummyx_loc[tx] = tex1Dfetch(d_displ_tex, iglob*3);
+ s_dummyy_loc[tx] = tex1Dfetch(d_displ_tex, iglob*3 + 1);
+ s_dummyz_loc[tx] = tex1Dfetch(d_displ_tex, iglob*3 + 2);
#else
// changing iglob indexing to match fortran row changes fast style
s_dummyx_loc[tx] = d_displ[iglob*3];
@@ -634,13 +773,19 @@
#endif
}
+ if (tx < NGLL2) {
+ #ifdef USE_TEXTURES_CONSTANTS
+ sh_hprime_xx[tx] = tex1Dfetch(d_hprime_xx_tex,tx);
+ #else
+ sh_hprime_xx[tx] = d_hprime_xx[tx];
+ #endif
+ }
+
// synchronize all the threads (one thread for each of the NGLL grid points of the
// current spectral element) because we need the whole element to be ready in order
// to be able to compute the matrix products along cut planes of the 3D element below
__syncthreads();
-#ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
-
if (active) {
#ifndef MANUALLY_UNROLLED_LOOPS
@@ -658,19 +803,19 @@
tempz3l = 0.f;
for (l=0;l<NGLLX;l++) {
- hp1 = d_hprime_xx[l*NGLLX+I];
+ hp1 = sh_hprime_xx[l*NGLLX+I];
offset = K*NGLL2+J*NGLLX+l;
tempx1l += s_dummyx_loc[offset]*hp1;
tempy1l += s_dummyy_loc[offset]*hp1;
tempz1l += s_dummyz_loc[offset]*hp1;
- hp2 = d_hprime_xx[l*NGLLX+J];
+ hp2 = sh_hprime_xx[l*NGLLX+J];
offset = K*NGLL2+l*NGLLX+I;
tempx2l += s_dummyx_loc[offset]*hp2;
tempy2l += s_dummyy_loc[offset]*hp2;
tempz2l += s_dummyz_loc[offset]*hp2;
- hp3 = d_hprime_xx[l*NGLLX+K];
+ hp3 = sh_hprime_xx[l*NGLLX+K];
offset = l*NGLL2+J*NGLLX+I;
tempx3l += s_dummyx_loc[offset]*hp3;
tempy3l += s_dummyy_loc[offset]*hp3;
@@ -1008,29 +1153,40 @@
sum_terms3 += rho_s_H3;
}
-#ifdef USE_TEXTURES
- d_accel[iglob] = tex1Dfetch(tex_accel, iglob) + sum_terms1);
- d_accel[iglob + NGLOB] = tex1Dfetch(tex_accel, iglob + NGLOB) + sum_terms2);
- d_accel[iglob + 2*NGLOB] = tex1Dfetch(tex_accel, iglob + 2*NGLOB) + sum_terms3);
-#else
- /* OLD/To be implemented version that uses coloring to get around race condition. About 1.6x faster */
-
#ifdef USE_MESH_COLORING_GPU
// no atomic operation needed, colors don't share global points between elements
+
+#ifdef USE_TEXTURES_FIELDS
+ d_accel[iglob*3] = tex1Dfetch(d_accel_tex, iglob*3) + sum_terms1;
+ d_accel[iglob*3 + 1] = tex1Dfetch(d_accel_tex, iglob*3 + 1) + sum_terms2;
+ d_accel[iglob*3 + 2] = tex1Dfetch(d_accel_tex, iglob*3 + 2) + sum_terms3;
+#else
d_accel[iglob*3] += sum_terms1;
d_accel[iglob*3 + 1] += sum_terms2;
d_accel[iglob*3 + 2] += sum_terms3;
-#else
+#endif // USE_TEXTURES_FIELDS
+
+#else // MESH_COLORING
//mesh coloring
if( use_mesh_coloring_gpu ){
- // no atomic operation needed, colors don't share global points between elements
+ // no atomic operation needed, colors don't share global points between elements
+ // d_accel[iglob*3] += sum_terms1;
+ // d_accel[iglob*3 + 1] += sum_terms2;
+ // d_accel[iglob*3 + 2] += sum_terms3;
+#ifdef USE_TEXTURES_FIELDS
+ d_accel[iglob*3] = tex1Dfetch(d_accel_tex, iglob*3) + sum_terms1;
+ d_accel[iglob*3 + 1] = tex1Dfetch(d_accel_tex, iglob*3 + 1) + sum_terms2;
+ d_accel[iglob*3 + 2] = tex1Dfetch(d_accel_tex, iglob*3 + 2) + sum_terms3;
+#else
d_accel[iglob*3] += sum_terms1;
d_accel[iglob*3 + 1] += sum_terms2;
d_accel[iglob*3 + 2] += sum_terms3;
+#endif // USE_TEXTURES_FIELDS
- }else{
+ }
+ else {
// for testing purposes only: w/out atomic updates
//d_accel[iglob*3] -= (0.00000001f*tempx1l + 0.00000001f*tempx2l + 0.00000001f*tempx3l);
@@ -1041,11 +1197,11 @@
atomicAdd(&d_accel[iglob*3+1], sum_terms2);
atomicAdd(&d_accel[iglob*3+2], sum_terms3);
- }
-#endif
+ } // if(use_mesh_coloring_gpu)
-#endif
+#endif // MESH_COLORING
+
// update memory variables based upon the Runge-Kutta scheme
if( ATTENUATION ){
compute_element_att_memory(tx,working_element,NSPEC,
@@ -1068,14 +1224,9 @@
epsilondev_yz[ijk_ispec] = epsilondev_yz_loc;
}
- }
+ } // if(active)
-#else // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
- d_accel[iglob] -= 0.00000001f;
- d_accel[iglob + NGLOB] -= 0.00000001f;
- d_accel[iglob + 2*NGLOB] -= 0.00000001f;
-#endif // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
-}
+} // kernel_2_impl()
/* ----------------------------------------------------------------------------------------------- */
@@ -1167,7 +1318,7 @@
// cudaEventCreate(&stop);
// cudaEventRecord( start, 0 );
- Kernel_2_impl<<<grid,threads>>>(nb_blocks_to_compute,
+ Kernel_2_impl<<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
mp->NGLOB_AB,
d_ibool,
mp->d_phase_ispec_inner_elastic,
@@ -1222,7 +1373,7 @@
if(SIMULATION_TYPE == 3) {
- Kernel_2_impl<<< grid,threads>>>(nb_blocks_to_compute,
+ Kernel_2_impl<<< grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
mp->NGLOB_AB,
d_ibool,
mp->d_phase_ispec_inner_elastic,
@@ -1500,6 +1651,33 @@
mp->d_c66store,
mp->d_rhostore);
}
+
+ // Wait until async-memcpy of outer elements is finished and start MPI.
+ if(*iphase==2) {
+ cudaStreamSynchronize(mp->copy_stream);
+
+ // There have been problems using the pinned-memory with MPI, so
+ // we copy the buffer into a non-pinned region.
+ memcpy(mp->send_buffer,mp->h_send_accel_buffer,
+ mp->size_mpi_send_buffer*sizeof(float));
+
+ // memory copy is now finished, so non-blocking MPI send can proceed
+ // MPI based halo exchange
+
+ assemble_mpi_vector_send_cuda_(&(mp->NPROCS),
+ mp->send_buffer, /* "regular" memory */
+ // mp->h_send_accel_buffer, /* pinned memory **CRASH** */
+ mp->buffer_recv_vector_ext_mesh,
+ &mp->num_interfaces_ext_mesh,
+ &mp->max_nibool_interfaces_ext_mesh,
+ mp->nibool_interfaces_ext_mesh,
+ mp->my_neighbours_ext_mesh,
+ mp->request_send_vector_ext_mesh,
+ mp->request_recv_vector_ext_mesh);
+
+ // Decided to keep launching kernels and to wait for MPI & do memcpy while other kernels launch.
+ // cudaDeviceSynchronize();
+ }
}
@@ -1519,6 +1697,10 @@
/* because of block and grid sizing problems, there is a small */
/* amount of buffer at the end of the calculation */
if(id < size) {
+ realw new_accel = accel[id] * rmass[id / 3];
+ veloc[id] += deltatover2 * new_accel;
+ accel[id] = new_accel;
+/*
accel[3*id] = accel[3*id]*rmass[id];
accel[3*id+1] = accel[3*id+1]*rmass[id];
accel[3*id+2] = accel[3*id+2]*rmass[id];
@@ -1526,6 +1708,7 @@
veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1];
veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2];
+*/
}
}
@@ -1539,9 +1722,12 @@
/* because of block and grid sizing problems, there is a small */
/* amount of buffer at the end of the calculation */
if(id < size) {
+ accel[id] *= rmass[id / 3];
+/*
accel[3*id] = accel[3*id]*rmass[id];
accel[3*id+1] = accel[3*id+1]*rmass[id];
accel[3*id+2] = accel[3*id+2]*rmass[id];
+*/
}
}
@@ -1575,7 +1761,8 @@
TRACE("kernel_3_a_cuda");
Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
- int size = *size_F;
+ //int size = *size_F;
+ int size = *size_F * 3;
int SIMULATION_TYPE = *SIMULATION_TYPE_f;
realw deltatover2 = *deltatover2_F;
realw b_deltatover2 = *b_deltatover2_F;
@@ -1596,17 +1783,17 @@
// check whether we can update accel and veloc, or only accel at this point
if( *OCEANS == 0 ){
// updates both, accel and veloc
- kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc, mp->d_accel, size, deltatover2, mp->d_rmass);
+ kernel_3_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_veloc, mp->d_accel, size, deltatover2, mp->d_rmass);
if(SIMULATION_TYPE == 3) {
- kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc, mp->d_b_accel, size, b_deltatover2,mp->d_rmass);
+ kernel_3_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_veloc, mp->d_b_accel, size, b_deltatover2,mp->d_rmass);
}
}else{
// updates only accel
- kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_accel, size, mp->d_rmass);
+ kernel_3_accel_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_accel, size, mp->d_rmass);
if(SIMULATION_TYPE == 3) {
- kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_b_accel, size, mp->d_rmass);
+ kernel_3_accel_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_accel, size, mp->d_rmass);
}
}
@@ -1647,10 +1834,10 @@
dim3 threads(blocksize,1,1);
// updates only veloc at this point
- kernel_3_veloc_cuda_device<<< grid, threads>>>(mp->d_veloc,mp->d_accel,size,deltatover2);
+ kernel_3_veloc_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_veloc,mp->d_accel,size,deltatover2);
if(SIMULATION_TYPE == 3) {
- kernel_3_veloc_cuda_device<<< grid, threads>>>(mp->d_b_veloc,mp->d_b_accel,size,b_deltatover2);
+ kernel_3_veloc_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_veloc,mp->d_b_accel,size,b_deltatover2);
}
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -1760,7 +1947,7 @@
exit_on_cuda_error("before kernel elastic_ocean_load_cuda");
#endif
- elastic_ocean_load_cuda_kernel<<<grid,threads>>>(mp->d_accel,
+ elastic_ocean_load_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,
mp->d_rmass,
mp->d_rmass_ocean_load,
mp->num_free_surface_faces,
@@ -1775,7 +1962,7 @@
print_CUDA_error_if_any(cudaMemset(mp->d_updated_dof_ocean_load,0,
sizeof(int)*mp->NGLOB_AB),88502);
- elastic_ocean_load_cuda_kernel<<<grid,threads>>>(mp->d_b_accel,
+ elastic_ocean_load_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel,
mp->d_rmass,
mp->d_rmass_ocean_load,
mp->num_free_surface_faces,
@@ -1837,41 +2024,41 @@
}
}
-void setConst_hprime_yy(realw* array,Mesh* mp)
-{
+// void setConst_hprime_yy(realw* array,Mesh* mp)
+// {
- cudaError_t err = cudaMemcpyToSymbol(d_hprime_yy, array, NGLL2*sizeof(realw));
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Error in setConst_hprime_yy: %s\n", cudaGetErrorString(err));
- fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
- exit(1);
- }
+// cudaError_t err = cudaMemcpyToSymbol(d_hprime_yy, array, NGLL2*sizeof(realw));
+// if (err != cudaSuccess)
+// {
+// fprintf(stderr, "Error in setConst_hprime_yy: %s\n", cudaGetErrorString(err));
+// fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
+// exit(1);
+// }
- err = cudaGetSymbolAddress((void**)&(mp->d_hprime_yy),"d_hprime_yy");
- if(err != cudaSuccess) {
- fprintf(stderr, "Error with d_hprime_yy: %s\n", cudaGetErrorString(err));
- exit(1);
- }
-}
+// err = cudaGetSymbolAddress((void**)&(mp->d_hprime_yy),"d_hprime_yy");
+// if(err != cudaSuccess) {
+// fprintf(stderr, "Error with d_hprime_yy: %s\n", cudaGetErrorString(err));
+// exit(1);
+// }
+// }
-void setConst_hprime_zz(realw* array,Mesh* mp)
-{
+// void setConst_hprime_zz(realw* array,Mesh* mp)
+// {
- cudaError_t err = cudaMemcpyToSymbol(d_hprime_zz, array, NGLL2*sizeof(realw));
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Error in setConst_hprime_zz: %s\n", cudaGetErrorString(err));
- fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
- exit(1);
- }
+// cudaError_t err = cudaMemcpyToSymbol(d_hprime_zz, array, NGLL2*sizeof(realw));
+// if (err != cudaSuccess)
+// {
+// fprintf(stderr, "Error in setConst_hprime_zz: %s\n", cudaGetErrorString(err));
+// fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
+// exit(1);
+// }
- err = cudaGetSymbolAddress((void**)&(mp->d_hprime_zz),"d_hprime_zz");
- if(err != cudaSuccess) {
- fprintf(stderr, "Error with d_hprime_zz: %s\n", cudaGetErrorString(err));
- exit(1);
- }
-}
+// err = cudaGetSymbolAddress((void**)&(mp->d_hprime_zz),"d_hprime_zz");
+// if(err != cudaSuccess) {
+// fprintf(stderr, "Error with d_hprime_zz: %s\n", cudaGetErrorString(err));
+// exit(1);
+// }
+// }
void setConst_hprimewgll_xx(realw* array,Mesh* mp)
@@ -1989,4 +2176,3 @@
}
}
-
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu 2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu 2012-06-13 12:54:51 UTC (rev 20365)
@@ -111,7 +111,7 @@
//#endif
//launch kernel
- UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ,mp->d_veloc,mp->d_accel,
+ UpdateDispVeloc_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ,mp->d_veloc,mp->d_accel,
size,deltat,deltatsqover2,deltatover2);
//cudaThreadSynchronize();
@@ -124,7 +124,7 @@
// kernel for backward fields
if(*SIMULATION_TYPE == 3) {
- UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_b_displ,mp->d_b_veloc,mp->d_b_accel,
+ UpdateDispVeloc_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ,mp->d_b_veloc,mp->d_b_accel,
size,b_deltat,b_deltatsqover2,b_deltatover2);
//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -208,13 +208,13 @@
dim3 threads(blocksize,1,1);
//launch kernel
- UpdatePotential_kernel<<<grid,threads>>>(mp->d_potential_acoustic,
+ UpdatePotential_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_potential_acoustic,
mp->d_potential_dot_acoustic,
mp->d_potential_dot_dot_acoustic,
size,deltat,deltatsqover2,deltatover2);
if(*SIMULATION_TYPE == 3) {
- UpdatePotential_kernel<<<grid,threads>>>(mp->d_b_potential_acoustic,
+ UpdatePotential_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_potential_acoustic,
mp->d_b_potential_dot_acoustic,
mp->d_b_potential_dot_dot_acoustic,
size,b_deltat,b_deltatsqover2,b_deltatover2);
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h 2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h 2012-06-13 12:54:51 UTC (rev 20365)
@@ -71,7 +71,7 @@
#endif
// error checking after cuda function calls
-#define ENABLE_VERY_SLOW_ERROR_CHECKING
+/* #define ENABLE_VERY_SLOW_ERROR_CHECKING */
#define MAX(x,y) (((x) < (y)) ? (y) : (x))
@@ -85,6 +85,8 @@
void exit_on_error(char* info);
+void get_blocks_xy(int num_blocks,int* num_blocks_x,int* num_blocks_y);
+
/* ----------------------------------------------------------------------------------------------- */
// cuda constant arrays
@@ -118,6 +120,15 @@
// leads up to ~ 5% performance increase
//#define USE_MESH_COLORING_GPU
+// use textures for d_displ and d_accel -- 10% performance boost
+#define USE_TEXTURES_FIELDS
+
+// Using texture memory for the hprime-style constants is slower on
+// Fermi generation hardware, but *may* be faster on Kepler
+// generation.
+/* Use textures for hprime_xx */
+/* #define USE_TEXTURES_CONSTANTS */
+
// (optional) unrolling loops
// leads up to ~1% performance increase
//#define MANUALLY_UNROLLED_LOOPS
@@ -160,6 +171,9 @@
int NSPEC_AB;
int NGLOB_AB;
+ int myrank;
+ int NPROCS;
+
// interpolators
realw* d_xix; realw* d_xiy; realw* d_xiz;
realw* d_etax; realw* d_etay; realw* d_etaz;
@@ -183,10 +197,36 @@
realw* d_wgllwgll_xy; realw* d_wgllwgll_xz; realw* d_wgllwgll_yz;
realw* d_wgll_cube;
- // mpi buffers
+ // A buffer for mpi-send/recv, which is duplicated in fortran but is
+ // allocated with pinned memory to facilitate asynchronus device <->
+ // host memory transfers
+ float* h_send_accel_buffer;
+ float* h_send_b_accel_buffer;
+ float* send_buffer;
+ float* h_recv_accel_buffer;
+ float* h_recv_b_accel_buffer;
+ float* recv_buffer;
+ int size_mpi_send_buffer;
+ int size_mpi_recv_buffer;
+
+
+
+ // buffers and constants for the MPI-send required for async-memcpy
+ // + non-blocking MPI
+ float* buffer_recv_vector_ext_mesh;
int num_interfaces_ext_mesh;
int max_nibool_interfaces_ext_mesh;
+ int* nibool_interfaces_ext_mesh;
+ int* my_neighbours_ext_mesh;
+ int* request_send_vector_ext_mesh;
+ int* request_recv_vector_ext_mesh;
+
+ // overlapped memcpy streams
+ cudaStream_t compute_stream;
+ cudaStream_t copy_stream;
+ cudaStream_t b_copy_stream;
+
// ------------------------------------------------------------------ //
// elastic wavefield parameters
// ------------------------------------------------------------------ //
@@ -196,6 +236,10 @@
// backward/reconstructed elastic wavefield
realw* d_b_displ; realw* d_b_veloc; realw* d_b_accel;
+ // Texture references for fast non-coalesced scattered access
+ const textureReference* d_displ_tex_ref_ptr;
+ const textureReference* d_accel_tex_ref_ptr;
+
// elastic elements
int* d_ispec_is_elastic;
@@ -244,6 +288,17 @@
realw* d_station_seismo_field;
realw* h_station_seismo_field;
+ double* d_hxir, *d_hetar, *d_hgammar;
+ double* d_dxd, *d_dyd, *d_dzd;
+ double* d_vxd, *d_vyd, *d_vzd;
+ double* d_axd, *d_ayd, *d_azd;
+ realw* d_seismograms_d, *d_seismograms_v, *d_seismograms_a;
+ double* d_nu;
+
+ realw* h_seismograms_d_it;
+ realw* h_seismograms_v_it;
+ realw* h_seismograms_a_it;
+
// adjoint receivers/sources
int nadj_rec_local;
realw* d_adj_sourcearrays;
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h 2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h 2012-06-13 12:54:51 UTC (rev 20365)
@@ -54,44 +54,15 @@
/* CUDA specific things from specfem3D_kernels.cu */
+// older TEXTURE usage. For now just acoustic simulations. See usage
+// of USE_TEXTURES_FIELDS elsewhere in code for elastic case
#ifdef USE_TEXTURES
- // declaration of textures
- texture<realw, 1, cudaReadModeElementType> tex_displ;
- texture<realw, 1, cudaReadModeElementType> tex_accel;
- texture<realw, 1, cudaReadModeElementType> tex_potential_acoustic;
- texture<realw, 1, cudaReadModeElementType> tex_potential_dot_dot_acoustic;
+// declaration of textures
+texture<realw, 1, cudaReadModeElementType> tex_potential_acoustic;
+texture<realw, 1, cudaReadModeElementType> tex_potential_dot_dot_acoustic;
- // for binding the textures
- void bindTexturesDispl(realw* d_displ)
- {
- cudaError_t err;
-
- cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
-
- err = cudaBindTexture(NULL,tex_displ, d_displ, channelDescFloat, NDIM*NGLOB*sizeof(realw));
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Error in bindTexturesDispl for displ: %s\n", cudaGetErrorString(err));
- exit(1);
- }
- }
-
- void bindTexturesAccel(realw* d_accel)
- {
- cudaError_t err;
-
- cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
-
- err = cudaBindTexture(NULL,tex_accel, d_accel, channelDescFloat, NDIM*NGLOB*sizeof(realw));
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Error in bindTexturesAccel for accel: %s\n", cudaGetErrorString(err));
- exit(1);
- }
- }
-
void bindTexturesPotential(realw* d_potential_acoustic)
{
cudaError_t err;
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu 2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu 2012-06-13 12:54:51 UTC (rev 20365)
@@ -608,7 +608,12 @@
int* nrec_local_f,
int* SIMULATION_TYPE,
int* USE_MESH_COLORING_GPU_f,
- int* nspec_acoustic,int* nspec_elastic) {
+ int* nspec_acoustic,int* nspec_elastic,
+ int* my_neighbours_ext_mesh,
+ int* request_send_vector_ext_mesh,
+ int* request_recv_vector_ext_mesh,
+ realw* buffer_recv_vector_ext_mesh
+ ) {
TRACE("prepare_constants_device");
@@ -622,14 +627,25 @@
exit_on_error("NGLLX must be 5 for CUDA devices");
}
+
+#ifdef WITH_MPI
+ int nproc;
+ MPI_Comm_size(MPI_COMM_WORLD,&nproc);
+ mp->NPROCS=nproc;
+#else
+ mp->NPROCS = 1;
+#endif
+
+
// sets global parameters
mp->NSPEC_AB = *NSPEC_AB;
mp->NGLOB_AB = *NGLOB_AB;
// sets constant arrays
setConst_hprime_xx(h_hprime_xx,mp);
- setConst_hprime_yy(h_hprime_yy,mp);
- setConst_hprime_zz(h_hprime_zz,mp);
+ // only needed if NGLLX != NGLLY != NGLLZ
+ // setConst_hprime_yy(h_hprime_yy,mp);
+ // setConst_hprime_zz(h_hprime_zz,mp);
setConst_hprimewgll_xx(h_hprimewgll_xx,mp);
setConst_hprimewgll_yy(h_hprimewgll_yy,mp);
setConst_hprimewgll_zz(h_hprimewgll_zz,mp);
@@ -637,9 +653,51 @@
setConst_wgllwgll_xz(h_wgllwgll_xz,mp);
setConst_wgllwgll_yz(h_wgllwgll_yz,mp);
+ // Using texture memory for the hprime-style constants is slower on
+ // Fermi generation hardware, but *may* be faster on Kepler
+ // generation. We will reevaluate this again, so might as well leave
+ // in the code with with #USE_TEXTURES_FIELDS not-defined.
+ #ifdef USE_TEXTURES_CONSTANTS
+ {
+ const textureReference* d_hprime_xx_tex_ptr;
+ print_CUDA_error_if_any(cudaGetTextureReference(&d_hprime_xx_tex_ptr, "d_hprime_xx_tex"), 4101);
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+ print_CUDA_error_if_any(cudaBindTexture(0, d_hprime_xx_tex_ptr, mp->d_hprime_xx, &channelDesc, sizeof(realw)*(NGLL2)), 4001);
+ }
+ #endif
+
+
+ // Allocate pinned mpi-buffers.
+ // MPI buffers use pinned memory allocated by cudaMallocHost, which
+ // enables the use of asynchronous memory copies from host <->
+ // device
+ int size_mpi_buffer = 3 * (*num_interfaces_ext_mesh) * (*max_nibool_interfaces_ext_mesh);
+ print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_accel_buffer),sizeof(float)*(size_mpi_buffer)),8004);
+ mp->send_buffer = (float*)malloc((size_mpi_buffer)*sizeof(float));
+ mp->size_mpi_send_buffer = size_mpi_buffer;
+
+ print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_recv_accel_buffer),sizeof(float)*(size_mpi_buffer)),8004);
+ mp->recv_buffer = (float*)malloc((size_mpi_buffer)*sizeof(float));
+ mp->size_mpi_recv_buffer = size_mpi_buffer;
+
+ print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_b_accel_buffer),sizeof(float)*(size_mpi_buffer)),8004);
+ // mp->b_send_buffer = (float*)malloc((size_mpi_buffer)*sizeof(float));
+
+ mp->num_interfaces_ext_mesh = *num_interfaces_ext_mesh;
+ mp->max_nibool_interfaces_ext_mesh = *max_nibool_interfaces_ext_mesh;
+ mp->nibool_interfaces_ext_mesh = h_nibool_interfaces_ext_mesh;
+ mp->my_neighbours_ext_mesh = my_neighbours_ext_mesh;
+ mp->request_send_vector_ext_mesh = request_send_vector_ext_mesh;
+ mp->request_recv_vector_ext_mesh = request_recv_vector_ext_mesh;
+ mp->buffer_recv_vector_ext_mesh = buffer_recv_vector_ext_mesh;
+
+ // setup two streams, one for compute and one for host<->device memory copies
+ cudaStreamCreate(&mp->compute_stream);
+ cudaStreamCreate(&mp->copy_stream);
+ cudaStreamCreate(&mp->b_copy_stream);
+
/* Assuming NGLLX=5. Padded is then 128 (5^3+3) */
int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
- //int size_nonpadded = NGLL3 * (mp->NSPEC_AB);
// mesh
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xix, size_padded*sizeof(realw)),1001);
@@ -1068,6 +1126,20 @@
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(realw)*(*size)),4002);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel),sizeof(realw)*(*size)),4003);
+ #ifdef USE_TEXTURES_FIELDS
+ {
+ print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_displ_tex_ref_ptr, "d_displ_tex"), 4001);
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+ print_CUDA_error_if_any(cudaBindTexture(0, mp->d_displ_tex_ref_ptr, mp->d_displ, &channelDesc, sizeof(realw)*(*size)), 4001);
+ }
+
+ {
+ print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_accel_tex_ref_ptr, "d_accel_tex"), 4003);
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+ print_CUDA_error_if_any(cudaBindTexture(0, mp->d_accel_tex_ref_ptr, mp->d_accel, &channelDesc, sizeof(realw)*(*size)), 4003);
+ }
+ #endif
+
// mpi buffer
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),
3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),4004);
@@ -1690,6 +1762,32 @@
}
+extern "C"
+void FC_FUNC_(prepare_seismogram_fields,
+ PREPARE_SEISMOGRAM_FIELDS)(long* Mesh_pointer,int* nrec_local, double* nu, double* hxir, double* hetar, double* hgammar) {
+
+ TRACE("prepare_constants_device");
+ Mesh* mp = (Mesh*)(*Mesh_pointer);
+
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_nu),3*3* *nrec_local*sizeof(double)),8100);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hxir),5* *nrec_local*sizeof(double)),8100);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hetar),5* *nrec_local*sizeof(double)),8100);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hgammar),5* *nrec_local*sizeof(double)),8100);
+
+ print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_seismograms_d,3**nrec_local*sizeof(realw)),8101);
+ print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_seismograms_v,3**nrec_local*sizeof(realw)),8101);
+ print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_seismograms_a,3**nrec_local*sizeof(realw)),8101);
+
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_nu,nu,3*3* *nrec_local*sizeof(double),cudaMemcpyHostToDevice),8101);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_hxir,hxir,5* *nrec_local*sizeof(double),cudaMemcpyHostToDevice),8101);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_hetar,hetar,5* *nrec_local*sizeof(double),cudaMemcpyHostToDevice),8101);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_hgammar,hgammar,5* *nrec_local*sizeof(double),cudaMemcpyHostToDevice),8101);
+
+ cudaMallocHost((void**)&mp->h_seismograms_d_it,3**nrec_local*sizeof(realw));
+ cudaMallocHost((void**)&mp->h_seismograms_v_it,3**nrec_local*sizeof(realw));
+ cudaMallocHost((void**)&mp->h_seismograms_a_it,3**nrec_local*sizeof(realw));
+}
+
/* ----------------------------------------------------------------------------------------------- */
// cleanup
@@ -1933,4 +2031,3 @@
// mesh pointer - not needed anymore
free(mp);
}
-
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu 2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu 2012-06-13 12:54:51 UTC (rev 20365)
@@ -43,6 +43,220 @@
/* ----------------------------------------------------------------------------------------------- */
+/*
+ ! gets global number of that receiver
+ irec = number_receiver_global(irec_local)
+
+ ! gets local receiver interpolators
+ ! (1-D Lagrange interpolators)
+ hxir(:) = hxir_store(irec_local,:)
+ hetar(:) = hetar_store(irec_local,:)
+ hgammar(:) = hgammar_store(irec_local,:)
+
+*/
+
+
+// Initially sets the blocks_x to be the num_blocks, and adds rows as
+// needed. If an additional row is added, the row length is cut in
+// half. If the block count is odd, there will be 1 too many blocks,
+// which must be managed at runtime with an if statement.
+void get_blocks_xy(int num_blocks,int* num_blocks_x,int* num_blocks_y) {
+ *num_blocks_x = num_blocks;
+ *num_blocks_y = 1;
+ while(*num_blocks_x > 65535) {
+ *num_blocks_x = (int) ceil(*num_blocks_x*0.5f);
+ *num_blocks_y = *num_blocks_y*2;
+ }
+ return;
+}
+
+__device__ double atomicAdd(double* address, double val)
+{
+ unsigned long long int* address_as_ull =
+ (unsigned long long int*)address;
+ unsigned long long int old = *address_as_ull, assumed;
+ do {
+ assumed = old;
+old = atomicCAS(address_as_ull, assumed,
+ __double_as_longlong(val +
+ __longlong_as_double(assumed)));
+ } while (assumed != old);
+ return __longlong_as_double(old);
+}
+
+__global__ void compute_interpolated_dva_plus_seismogram(int nrec_local,
+ realw* displ, realw* veloc, realw* accel,
+ int* ibool,
+ double* hxir, double* hetar, double* hgammar,
+ realw* seismograms_d, realw* seismograms_v, realw* seismograms_a,
+ double* nu,
+ int* number_receiver_global,
+ int* ispec_selected_rec) {
+ int irec_local = blockIdx.x + blockIdx.y*gridDim.x;
+ int i = threadIdx.x;
+ int j = threadIdx.y;
+ int k = threadIdx.z;
+ int ijk = i+5*(j+5*(k));
+
+ // we do the **d variable reduction in shared memory, because the
+ // atomicAdd() should be faster on the shared memory registers
+ // according to
+ // http://supercomputingblog.com/cuda/cuda-tutorial-4-atomic-operations/
+ __shared__ double sh_dxd[NGLL3];
+ __shared__ double sh_dyd[NGLL3];
+ __shared__ double sh_dzd[NGLL3];
+ __shared__ double sh_vxd[NGLL3];
+ __shared__ double sh_vyd[NGLL3];
+ __shared__ double sh_vzd[NGLL3];
+ __shared__ double sh_axd[NGLL3];
+ __shared__ double sh_ayd[NGLL3];
+ __shared__ double sh_azd[NGLL3];
+
+ if(irec_local < nrec_local) {
+ int irec = number_receiver_global[irec_local]-1;
+ int ispec = ispec_selected_rec[irec]-1;
+ int iglob = ibool[ijk+125*ispec]-1;
+ double hlagrange = hxir[irec_local + nrec_local*i]*hetar[irec_local + nrec_local*j]*hgammar[irec_local + nrec_local*k];
+ sh_dxd[ijk] = hlagrange*displ[0+3*iglob];
+ sh_dyd[ijk] = hlagrange*displ[1+3*iglob];
+ sh_dzd[ijk] = hlagrange*displ[2+3*iglob];
+
+ sh_vxd[ijk] = hlagrange*veloc[0+3*iglob];
+ sh_vyd[ijk] = hlagrange*veloc[1+3*iglob];
+ sh_vzd[ijk] = hlagrange*veloc[2+3*iglob];
+
+ sh_axd[ijk] = hlagrange*accel[0+3*iglob];
+ sh_ayd[ijk] = hlagrange*accel[1+3*iglob];
+ sh_azd[ijk] = hlagrange*accel[2+3*iglob];
+
+ // the reduction has to skip the first element (we don't need to
+ // add element 0 to itself) This reduction serializes the code,
+ // but it should be fast enough --- it can be made faster with a
+ // proper reduction algorithm.
+ __syncthreads();
+
+ // if(ijk>0) {
+ // reduction needs to be done atomically to avoid race conditions
+ // atomicAdd(&sh_dxd[0],sh_dxd[ijk]);
+ // atomicAdd(&sh_dyd[0],sh_dyd[ijk]);
+ // atomicAdd(&sh_dzd[0],sh_dzd[ijk]);
+
+ // atomicAdd(&sh_vxd[0],sh_vxd[ijk]);
+ // atomicAdd(&sh_vyd[0],sh_vyd[ijk]);
+ // atomicAdd(&sh_vzd[0],sh_vzd[ijk]);
+
+ // atomicAdd(&sh_axd[0],sh_axd[ijk]);
+ // atomicAdd(&sh_ayd[0],sh_ayd[ijk]);
+ // atomicAdd(&sh_azd[0],sh_azd[ijk]);
+ // }
+ // __syncthreads();
+ if(ijk==0) {
+ // a loop in thread 0 is 4 times faster than atomic operations
+ for(int i=1;i<125;i++) {
+ sh_dxd[0] += sh_dxd[i];
+ sh_dyd[0] += sh_dyd[i];
+ sh_dzd[0] += sh_dzd[i];
+
+ sh_vxd[0] += sh_vxd[i];
+ sh_vyd[0] += sh_vyd[i];
+ sh_vzd[0] += sh_vzd[i];
+
+ sh_axd[0] += sh_axd[i];
+ sh_ayd[0] += sh_ayd[i];
+ sh_azd[0] += sh_azd[i];
+
+ }
+
+ seismograms_d[0+3*irec_local] = nu[0+3*(0+3*irec)]*sh_dxd[0] + nu[0+3*(1+3*irec)]*sh_dyd[0] + nu[0+3*(2+3*irec)]*sh_dzd[0];
+ seismograms_d[1+3*irec_local] = nu[1+3*(0+3*irec)]*sh_dxd[0] + nu[1+3*(1+3*irec)]*sh_dyd[0] + nu[1+3*(2+3*irec)]*sh_dzd[0];
+ seismograms_d[2+3*irec_local] = nu[2+3*(0+3*irec)]*sh_dxd[0] + nu[2+3*(1+3*irec)]*sh_dyd[0] + nu[2+3*(2+3*irec)]*sh_dzd[0];
+
+ seismograms_v[0+3*irec_local] = nu[0+3*(0+3*irec)]*sh_vxd[0] + nu[0+3*(1+3*irec)]*sh_vyd[0] + nu[0+3*(2+3*irec)]*sh_vzd[0];
+ seismograms_v[1+3*irec_local] = nu[1+3*(0+3*irec)]*sh_vxd[0] + nu[1+3*(1+3*irec)]*sh_vyd[0] + nu[1+3*(2+3*irec)]*sh_vzd[0];
+ seismograms_v[2+3*irec_local] = nu[2+3*(0+3*irec)]*sh_vxd[0] + nu[2+3*(1+3*irec)]*sh_vyd[0] + nu[2+3*(2+3*irec)]*sh_vzd[0];
+
+ seismograms_a[0+3*irec_local] = nu[0+3*(0+3*irec)]*sh_axd[0] + nu[0+3*(1+3*irec)]*sh_ayd[0] + nu[0+3*(2+3*irec)]*sh_azd[0];
+ seismograms_a[1+3*irec_local] = nu[1+3*(0+3*irec)]*sh_axd[0] + nu[1+3*(1+3*irec)]*sh_ayd[0] + nu[1+3*(2+3*irec)]*sh_azd[0];
+ seismograms_a[2+3*irec_local] = nu[2+3*(0+3*irec)]*sh_axd[0] + nu[2+3*(1+3*irec)]*sh_ayd[0] + nu[2+3*(2+3*irec)]*sh_azd[0];
+
+ }
+ }
+}
+
+
+
+extern "C"
+void FC_FUNC_(transfer_seismograms_el_from_device,
+ TRANSFER_SEISMOGRAMS_EL_FROM_DEVICE)(int* nrec_local,
+ long* Mesh_pointer_f,int* SIMULATION_TYPEf,
+ realw* seismograms_d,
+ realw* seismograms_v,
+ realw* seismograms_a,
+ int* it) {
+
+
+
+ TRACE("transfer_seismograms_el_from_device");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(*nrec_local,&num_blocks_x,&num_blocks_y);
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(5,5,5);
+
+ // double h_debug[125]; for(int i=0;i<125;i++) h_debug[i] = 0;
+ // double* d_debug; cudaMalloc((void**)&d_debug,125*sizeof(double));
+ // cudaMemcpy(d_debug,h_debug,125*sizeof(double),cudaMemcpyHostToDevice);
+ // Cuda timing
+ // cudaEvent_t start, stop;
+ // realw time;
+ // cudaEventCreate(&start);
+ // cudaEventCreate(&stop);
+ // cudaEventRecord( start, 0 );
+
+ compute_interpolated_dva_plus_seismogram<<<grid,threads,0,mp->compute_stream>>>(*nrec_local,
+ mp->d_displ,mp->d_veloc,mp->d_accel,
+ mp->d_ibool,
+ mp->d_hxir, mp->d_hetar, mp->d_hgammar,
+ mp->d_seismograms_d,
+ mp->d_seismograms_v,
+ mp->d_seismograms_a,
+ mp->d_nu,
+ mp->d_number_receiver_global,
+ mp->d_ispec_selected_rec
+ );
+
+ // cudaMemcpy(h_debug,d_debug,125*sizeof(double),cudaMemcpyDeviceToHost);
+
+ cudaMemcpy(mp->h_seismograms_d_it,mp->d_seismograms_d,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost);
+ cudaMemcpy(mp->h_seismograms_v_it,mp->d_seismograms_v,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost);
+ cudaMemcpy(mp->h_seismograms_a_it,mp->d_seismograms_a,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost);
+
+ // cudaEventRecord( stop, 0 );
+ // cudaEventSynchronize( stop );
+ // cudaEventElapsedTime( &time, start, stop );
+ // cudaEventDestroy( start );
+ // cudaEventDestroy( stop );
+ // printf("seismogram Execution Time: %f ms\n",time);
+
+ // if(abs(mp->h_seismograms_d_it[0]) < 1e-25) printf("seismo1_x=%e\n",mp->h_seismograms_d_it[0]);
+ // if(abs(mp->h_seismograms_d_it[1]) < 1e-25) printf("seismo1_y=%e\n",mp->h_seismograms_d_it[1]);
+
+ // if(abs(mp->h_seismograms_d_it[2]) < 1e-25) {
+
+ // printf("%d:seismo1_z=%e\n",*it,mp->h_seismograms_d_it[2]);
+
+ // }
+
+
+ memcpy(&seismograms_d[3**nrec_local*(*it-1)],mp->h_seismograms_d_it,3* *nrec_local*sizeof(realw));
+ memcpy(&seismograms_v[3**nrec_local*(*it-1)],mp->h_seismograms_v_it,3* *nrec_local*sizeof(realw));
+ memcpy(&seismograms_a[3**nrec_local*(*it-1)],mp->h_seismograms_a_it,3* *nrec_local*sizeof(realw));
+
+}
+
+
__global__ void transfer_stations_fields_from_device_kernel(int* number_receiver_global,
int* ispec_selected_rec,
int* ibool,
@@ -87,7 +301,7 @@
dim3 threads(blocksize,1,1);
// prepare field transfer array on device
- transfer_stations_fields_from_device_kernel<<<grid,threads>>>(mp->d_number_receiver_global,
+ transfer_stations_fields_from_device_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_number_receiver_global,
d_ispec_selected,
mp->d_ibool,
mp->d_station_seismo_field,
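The interpolation kernel added above accumulates its 125 per-thread products with a
serial loop in thread 0, and its comments note that a proper reduction algorithm could
be faster still. For reference, a shared-memory tree reduction over one element's
contributions could look like the sketch below (illustrative only, not part of this
commit; it assumes one block of 128 threads per element and pads the 125 values with
zeros):

// one block per element, 128 threads; log2(128) = 7 halving steps
// replace the 124 serial additions done by thread 0
__global__ void reduce_element(const double* vals, double* result) {
  __shared__ double sh[128];
  int tx = threadIdx.x;
  sh[tx] = (tx < 125) ? vals[blockIdx.x*125 + tx] : 0.0;  // pad 125 -> 128 with zeros
  __syncthreads();
  for (int stride = 64; stride > 0; stride >>= 1) {
    if (tx < stride) sh[tx] += sh[tx + stride];
    __syncthreads();
  }
  if (tx == 0) result[blockIdx.x] = sh[0];
}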
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/decompose_mesh_SCOTCH/decompose_mesh_SCOTCH.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/decompose_mesh_SCOTCH/decompose_mesh_SCOTCH.f90 2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/decompose_mesh_SCOTCH/decompose_mesh_SCOTCH.f90 2012-06-13 12:54:51 UTC (rev 20365)
@@ -814,32 +814,37 @@
! re-partitioning puts poroelastic-elastic coupled elements into same partition
! integer :: nfaces_coupled
! integer, dimension(:,:), pointer :: faces_coupled
- call poro_elastic_repartitioning (nspec, nnodes, elmnts, &
- count_def_mat, mat(1,:) , mat_prop, &
- sup_neighbour, nsize, &
- nparts, part)
- !nparts, part, nfaces_coupled, faces_coupled)
- ! re-partitioning puts moho-surface coupled elements into same partition
- call moho_surface_repartitioning (nspec, nnodes, elmnts, &
- sup_neighbour, nsize, nparts, part, &
- nspec2D_moho,ibelm_moho,nodes_ibelm_moho )
+ ! TODO: supposed to rebalance, but currently broken
+ ! call poro_elastic_repartitioning (nspec, nnodes, elmnts, &
+ ! count_def_mat, mat(1,:) , mat_prop, &
+ ! sup_neighbour, nsize, &
+ ! nparts, part)
+ !nparts, part, nfaces_coupled, faces_coupled)
+ ! re-partitioning puts moho-surface coupled elements into same partition
+ ! call moho_surface_repartitioning (nspec, nnodes, elmnts, &
+ ! sup_neighbour, nsize, nparts, part, &
+ ! nspec2D_moho,ibelm_moho,nodes_ibelm_moho )
+
+
! local number of each element for each partition
call build_glob2loc_elmnts(nspec, part, glob2loc_elmnts,nparts)
- ! local number of each node for each partition
+ ! local number of each node for each partition
call build_glob2loc_nodes(nspec, nnodes,nsize, nnodes_elmnts, nodes_elmnts, part, &
glob2loc_nodes_nparts, glob2loc_nodes_parts, glob2loc_nodes, nparts)
- ! mpi interfaces
+ ! mpi interfaces
! acoustic/elastic/poroelastic boundaries will be split into different MPI partitions
call build_interfaces(nspec, sup_neighbour, part, elmnts, &
xadj, adjncy, tab_interfaces, &
tab_size_interfaces, ninterfaces, &
nparts)
+
+
!or: uncomment if you want acoustic/elastic boundaries NOT to be separated into different MPI partitions
!call build_interfaces_no_ac_el_sep(nspec, sup_neighbour, part, elmnts, &
! xadj, adjncy, tab_interfaces, &
@@ -877,19 +882,23 @@
endif
! gets number of nodes
+
call write_glob2loc_nodes_database(IIN_database, ipart, nnodes_loc, nodes_coords, &
glob2loc_nodes_nparts, glob2loc_nodes_parts, &
glob2loc_nodes, nnodes, 1)
- ! gets number of spectral elements
+
+
call write_partition_database(IIN_database, ipart, nspec_local, nspec, elmnts, &
glob2loc_elmnts, glob2loc_nodes_nparts, &
glob2loc_nodes_parts, glob2loc_nodes, part, mat, ngnod, 1)
+ print*, ipart,": nspec_local=",nspec_local, " nnodes_local=", nnodes_loc
! writes out node coordinate locations
!write(IIN_database,*) nnodes_loc
write(IIN_database) nnodes_loc
+
call write_glob2loc_nodes_database(IIN_database, ipart, nnodes_loc, nodes_coords,&
glob2loc_nodes_nparts, glob2loc_nodes_parts, &
glob2loc_nodes, nnodes, 2)
@@ -956,4 +965,3 @@
!end program pre_meshfem3D
end module decompose_mesh_SCOTCH
-
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/get_model.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/get_model.f90 2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/get_model.f90 2012-06-13 12:54:51 UTC (rev 20365)
@@ -78,10 +78,13 @@
ispec_is_elastic(:) = .false.
ispec_is_poroelastic(:) = .false.
- ! prepares tomography model if needed for elements with undefined material definitions
- if( nundefMat_ext_mesh > 0 .or. IMODEL == IMODEL_TOMO ) then
- call model_tomography_broadcast(myrank)
- endif
+ print*,"nundefMat_ext_mesh:",nundefMat_ext_mesh
+! prepares tomography model if needed for elements with undefined material definitions
+ ! TODO: Max -- somehow this code is breaking when I try to run
+ ! Piero's PREM
+ ! if( nundefMat_ext_mesh > 0 .or. IMODEL == IMODEL_TOMO ) then
+ ! call model_tomography_broadcast(myrank)
+ ! endif
! prepares external model values if needed
select case( IMODEL )
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/constants.h.in
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/constants.h.in 2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/constants.h.in 2012-06-13 12:54:51 UTC (rev 20365)
@@ -266,7 +266,7 @@
integer, parameter :: MAX_NB_TRIES_OF_DROUX_1993 = 15
!
! postprocess the colors to balance them if Droux (1993) algorithm is not used
- logical, parameter :: BALANCE_COLORS_SIMPLE_ALGO = .true.
+ logical, parameter :: BALANCE_COLORS_SIMPLE_ALGO = .false.
!------------------------------------------------------
!----------- do not modify anything below -------------
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90 2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90 2012-06-13 12:54:51 UTC (rev 20365)
@@ -415,6 +415,114 @@
! with cuda functions...
+ subroutine transfer_boundary_to_device(NPROC, Mesh_pointer, &
+ buffer_recv_vector_ext_mesh, &
+ num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh,&
+ request_recv_vector_ext_mesh)
+
+ implicit none
+
+ include "constants.h"
+
+ integer :: NPROC
+ integer(kind=8) :: Mesh_pointer
+! array to assemble
+
+ integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
+
+ real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
+ buffer_recv_vector_ext_mesh
+
+ integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
+ integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
+ integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
+
+ integer iinterface ! ipoin
+ integer FORWARD_OR_ADJOINT
+
+! here we have to assemble all the contributions between partitions using MPI
+
+! assemble only if more than one partition
+ if(NPROC > 1) then
+
+! wait for communications completion (recv)
+
+ write(IMAIN,*) "sending MPI_wait"
+ do iinterface = 1, num_interfaces_ext_mesh
+ call wait_req(request_recv_vector_ext_mesh(iinterface))
+ enddo
+
+! send contributions to GPU
+ call transfer_boundary_to_device_asynchronously(Mesh_pointer, buffer_recv_vector_ext_mesh, &
+ num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh)
+ endif
+ ! This step is done via previous function transfer_and_assemble...
+ ! do iinterface = 1, num_interfaces_ext_mesh
+ ! do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
+ ! array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
+ ! array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_vector_ext_mesh(:,ipoin,iinterface)
+ ! enddo
+ ! enddo
+
+end subroutine transfer_boundary_to_device
+
+subroutine assemble_MPI_vector_write_cuda_no_transfer(NPROC,NGLOB_AB,array_val, Mesh_pointer, &
+ buffer_recv_vector_ext_mesh, &
+ num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+ nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+ request_send_vector_ext_mesh,request_recv_vector_ext_mesh, &
+ FORWARD_OR_ADJOINT )
+
+implicit none
+
+ include "constants.h"
+
+ integer :: NPROC
+ integer :: NGLOB_AB
+ integer(kind=8) :: Mesh_pointer
+! array to assemble
+ real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: array_val
+
+ integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
+
+ real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
+ buffer_recv_vector_ext_mesh
+
+ integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
+ integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
+ integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
+
+ integer iinterface ! ipoin
+ integer FORWARD_OR_ADJOINT
+
+! here we have to assemble all the contributions between partitions using MPI
+
+ ! assemble only if more than one partition
+ if(NPROC > 1) then
+
+ ! adding contributions of neighbours
+ call assemble_accel_on_device(Mesh_pointer, array_val, buffer_recv_vector_ext_mesh, &
+ num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, &
+ nibool_interfaces_ext_mesh,&
+ ibool_interfaces_ext_mesh,FORWARD_OR_ADJOINT)
+
+ ! This step is done via previous function transfer_and_assemble...
+ ! do iinterface = 1, num_interfaces_ext_mesh
+ ! do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
+ ! array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
+ ! array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_vector_ext_mesh(:,ipoin,iinterface)
+ ! enddo
+ ! enddo
+
+ ! wait for communications completion (send)
+ do iinterface = 1, num_interfaces_ext_mesh
+ call wait_req(request_send_vector_ext_mesh(iinterface))
+ enddo
+ endif
+
+ end subroutine assemble_MPI_vector_write_cuda_no_transfer
+
+
subroutine assemble_MPI_vector_send_cuda(NPROC, &
buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
@@ -665,4 +773,3 @@
endif
end subroutine assemble_MPI_scalar_write_cuda
-
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90 2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.F90 2012-06-13 12:54:51 UTC (rev 20365)
@@ -126,9 +126,19 @@
nspec_inner_elastic, &
SIMULATION_TYPE, &
COMPUTE_AND_STORE_STRAIN,ATTENUATION,ANISOTROPY)
- endif ! GPU_MODE
+ if(phase_is_inner .eqv. .true.) then
+ ! while Inner elements compute "Kernel_2", we wait for MPI to
+ ! finish and transfer the boundary terms to the device
+ ! asynchronously
+ call transfer_boundary_to_device(NPROC,Mesh_pointer,buffer_recv_vector_ext_mesh,&
+ num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh,&
+ request_recv_vector_ext_mesh)
+ endif ! inner elements
+
+ endif ! GPU_MODE
+
! adds elastic absorbing boundary term to acceleration (Stacey conditions)
if(ABSORBING_CONDITIONS) &
call compute_stacey_elastic(NSPEC_AB,NGLOB_AB,accel, &
@@ -232,16 +242,13 @@
my_neighbours_ext_mesh, &
request_send_vector_ext_mesh,request_recv_vector_ext_mesh)
else ! GPU_MODE==1
- call transfer_boun_accel_from_device(NGLOB_AB*NDIM, Mesh_pointer, accel,&
- buffer_send_vector_ext_mesh,&
- num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh,&
- nibool_interfaces_ext_mesh, ibool_interfaces_ext_mesh,1) ! <-- 1 == fwd accel
- call assemble_MPI_vector_send_cuda(NPROC, &
- buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, &
- num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
- nibool_interfaces_ext_mesh,&
- my_neighbours_ext_mesh, &
- request_send_vector_ext_mesh,request_recv_vector_ext_mesh)
+
+ ! transfers boundary region to host asynchronously. The
+ ! MPI-send is done from within compute_forces_elastic_cuda,
+ ! once the inner element kernels are launched, and the
+ ! memcpy has finished. see compute_forces_elastic_cuda:1655
+ call transfer_boundary_from_device_asynchronously(Mesh_pointer,nspec_outer_elastic)
+
endif ! GPU_MODE
! adjoint simulations
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.F90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.F90 2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.F90 2012-06-13 12:54:51 UTC (rev 20365)
@@ -884,7 +884,7 @@
endif
! initializes GPU and outputs info to files for all processes
- call prepare_cuda_device(myrank,ncuda_devices)
+ call prepare_cuda_device(myrank,NPROC,ncuda_devices)
! collects min/max of local devices found for statistics
call sync_all()
@@ -913,7 +913,11 @@
nrec, nrec_local, &
SIMULATION_TYPE, &
USE_MESH_COLORING_GPU, &
- nspec_acoustic,nspec_elastic)
+ nspec_acoustic,nspec_elastic,&
+ my_neighbours_ext_mesh,&
+ request_send_vector_ext_mesh,&
+ request_recv_vector_ext_mesh,&
+ buffer_recv_vector_ext_mesh)
! prepares fields on GPU for acoustic simulations
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/write_seismograms.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/write_seismograms.f90 2012-06-13 03:57:10 UTC (rev 20364)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/write_seismograms.f90 2012-06-13 12:54:51 UTC (rev 20365)
@@ -45,6 +45,9 @@
real(kind=CUSTOM_REAL):: stf_deltat
double precision :: stf
+ ! TODO: Test and Fix CUDA seismograms code.
+ logical,parameter :: USE_CUDA_SEISMOGRAMS = .false.
+
! gets resulting array values onto CPU
if(GPU_MODE) then
if( nrec_local > 0 ) then
@@ -66,19 +69,31 @@
! this transfers fields only in elements with stations for efficiency
if( ELASTIC_SIMULATION ) then
- call transfer_station_el_from_device( &
+ if(USE_CUDA_SEISMOGRAMS) then
+ call transfer_seismograms_el_from_device(&
+ nrec_local,&
+ Mesh_pointer, &
+ SIMULATION_TYPE,&
+ seismograms_d,&
+ seismograms_v,&
+ seismograms_a,&
+ it)
+ else
+ call transfer_station_el_from_device( &
displ,veloc,accel, &
b_displ,b_veloc,b_accel, &
Mesh_pointer,number_receiver_global, &
ispec_selected_rec,ispec_selected_source, &
ibool,SIMULATION_TYPE)
-
+ endif
! alternative: transfers whole fields
! call transfer_fields_el_from_device(NDIM*NGLOB_AB,displ,veloc, accel, Mesh_pointer)
endif
endif
endif
+if(.not. GPU_MODE .or. (GPU_MODE .and. (.not. USE_CUDA_SEISMOGRAMS))) then
+
do irec_local = 1,nrec_local
! gets global number of that receiver
@@ -277,6 +292,7 @@
if (SIMULATION_TYPE == 2) seismograms_eps(:,:,irec_local,it) = eps_s(:,:)
enddo ! nrec_local
+endif
! write the current or final seismograms
if((mod(it,NTSTEP_BETWEEN_OUTPUT_SEISMOS) == 0 .or. it == NSTEP) .and. (.not.SU_FORMAT)) then