[cig-commits] [commit] devel, master: add events around copy queue operations (b2c7b97)
cig_noreply at geodynamics.org
cig_noreply at geodynamics.org
Thu Nov 6 08:15:35 PST 2014
Repository : https://github.com/geodynamics/specfem3d_globe
On branches: devel,master
Link : https://github.com/geodynamics/specfem3d_globe/compare/bc58e579b3b0838a0968725a076f5904845437ca...be63f20cbb6f462104e949894dbe205d2398cd7f
>---------------------------------------------------------------
commit b2c7b970c5fe4e11c97b7f32d0d64b63e2d63df7
Author: Kevin Pouget <kevin.pouget at imag.fr>
Date: Mon May 12 16:42:27 2014 +0200
add events around copy queue operations
>---------------------------------------------------------------
b2c7b970c5fe4e11c97b7f32d0d64b63e2d63df7
src/gpu/assemble_MPI_scalar_gpu.c | 38 +++++++++-----
src/gpu/assemble_MPI_vector_gpu.c | 87 +++++++++++++++++++++++--------
src/gpu/compute_add_sources_elastic_gpu.c | 43 ++++++++++-----
src/gpu/mesh_constants_gpu.h | 17 +++---
src/gpu/prepare_mesh_constants_gpu.c | 5 ++
src/gpu/write_seismograms_gpu.c | 20 +++++--
6 files changed, 152 insertions(+), 58 deletions(-)
diff --git a/src/gpu/assemble_MPI_scalar_gpu.c b/src/gpu/assemble_MPI_scalar_gpu.c
index 6addb7b..794ad8c 100644
--- a/src/gpu/assemble_MPI_scalar_gpu.c
+++ b/src/gpu/assemble_MPI_scalar_gpu.c
@@ -87,10 +87,13 @@ void FC_FUNC_ (transfer_boun_pot_from_device,
// copies buffer to CPU
if (GPU_ASYNC_COPY) {
+ if (mp->has_last_copy_evt) {
+ clCheck (clReleaseEvent (mp->last_copy_evt));
+ }
clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_send_accel_buffer_outer_core.ocl, CL_TRUE,
0, size_mpi_buffer * sizeof (realw),
- mp->h_send_accel_buffer_oc, 1, &kernel_evt, NULL));
-
+ mp->h_send_accel_buffer_oc, 1, &kernel_evt, &mp->last_copy_evt));
+ mp->has_last_copy_evt = 1;
} else {
clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_send_accel_buffer_outer_core.ocl, CL_FALSE,
0, size_mpi_buffer * sizeof (realw),
@@ -117,10 +120,13 @@ void FC_FUNC_ (transfer_boun_pot_from_device,
global_work_size, local_work_size, 0, NULL, &kernel_evt));
// copies buffer to CPU
if (GPU_ASYNC_COPY) {
+ if (mp->has_last_copy_evt) {
+ clCheck (clReleaseEvent (mp->last_copy_evt));
+ }
clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_b_send_accel_buffer_outer_core.ocl, CL_TRUE,
0, size_mpi_buffer * sizeof (realw),
- mp->h_b_send_accel_buffer_oc, 1, &kernel_evt, NULL));
-
+ mp->h_b_send_accel_buffer_oc, 1, &kernel_evt, &mp->last_copy_evt));
+ mp->has_last_copy_evt = 1;
} else {
clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_b_send_accel_buffer_outer_core.ocl, CL_FALSE,
0, size_mpi_buffer * sizeof (realw),
@@ -209,7 +215,6 @@ void FC_FUNC_ (transfer_asmbl_pot_to_device,
// buffer size
size_mpi_buffer = (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core);
-
// checks if anything to do
if( size_mpi_buffer <= 0 ) return;
@@ -225,11 +230,16 @@ void FC_FUNC_ (transfer_asmbl_pot_to_device,
size_t global_work_size[2];
size_t local_work_size[2];
cl_uint idx = 0;
+ cl_event *copy_evt = NULL;
+ cl_uint num_evt = 0;
+
+ if (GPU_ASYNC_COPY && mp->has_last_copy_evt) {
+ copy_evt = &mp->last_copy_evt;
+ num_evt = 1;
+ }
if (*FORWARD_OR_ADJOINT == 1) {
- if (GPU_ASYNC_COPY) {
- clCheck (clFinish (mocl.copy_queue));
- } else {
+ if (!GPU_ASYNC_COPY) {
// copies scalar buffer onto GPU
clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_send_accel_buffer_outer_core.ocl, CL_FALSE, 0,
mp->max_nibool_interfaces_oc * mp->num_interfaces_outer_core * sizeof (realw),
@@ -250,16 +260,14 @@ void FC_FUNC_ (transfer_asmbl_pot_to_device,
global_work_size[1] = num_blocks_y;
clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.assemble_boundary_potential_on_device, 2, NULL,
- global_work_size, local_work_size, 0, NULL, NULL));
+ global_work_size, local_work_size, num_evt, copy_evt, NULL));
}
else if (*FORWARD_OR_ADJOINT == 3) {
// debug
DEBUG_BACKWARD_ASSEMBLY ();
// copies scalar buffer onto GPU
- if (GPU_ASYNC_COPY) {
- clCheck (clFinish (mocl.copy_queue));
- } else {
+ if (!GPU_ASYNC_COPY) {
clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_b_send_accel_buffer_outer_core.ocl, CL_FALSE, 0,
mp->max_nibool_interfaces_oc * mp->num_interfaces_outer_core * sizeof (realw),
buffer_recv_scalar, 0, NULL, NULL));
@@ -278,7 +286,11 @@ void FC_FUNC_ (transfer_asmbl_pot_to_device,
global_work_size[1] = num_blocks_y;
clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.assemble_boundary_potential_on_device, 2, NULL,
- global_work_size, local_work_size, 0, NULL, NULL));
+ global_work_size, local_work_size, num_evt, copy_evt, NULL));
+ }
+ if (GPU_ASYNC_COPY && mp->has_last_copy_evt) {
+ clCheck (clReleaseEvent (mp->last_copy_evt));
+ mp->has_last_copy_evt = 0;
}
}
#endif
diff --git a/src/gpu/assemble_MPI_vector_gpu.c b/src/gpu/assemble_MPI_vector_gpu.c
index d3b522f..7f29d0d 100644
--- a/src/gpu/assemble_MPI_vector_gpu.c
+++ b/src/gpu/assemble_MPI_vector_gpu.c
@@ -96,9 +96,13 @@ void FC_FUNC_(transfer_boun_from_device,
// copies buffer to CPU
if (GPU_ASYNC_COPY) {
+ if (mp->has_last_copy_evt) {
+ clCheck (clReleaseEvent (mp->last_copy_evt));
+ }
clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_send_accel_buffer_crust_mantle.ocl, CL_FALSE, 0,
size_mpi_buffer * sizeof (realw),
- mp->h_send_accel_buffer_cm, 1, &kernel_evt, NULL));
+ mp->h_send_accel_buffer_cm, 1, &kernel_evt, &mp->last_copy_evt));
+ mp->has_last_copy_evt = 1;
} else {
clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_send_accel_buffer_crust_mantle.ocl, CL_TRUE, 0,
size_mpi_buffer * sizeof (realw),
@@ -119,9 +123,14 @@ void FC_FUNC_(transfer_boun_from_device,
// copies buffer to CPU
if (GPU_ASYNC_COPY) {
+ if (mp->has_last_copy_evt) {
+ clCheck (clReleaseEvent (mp->last_copy_evt));
+ }
+
clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_b_send_accel_buffer_crust_mantle.ocl, CL_TRUE, 0,
size_mpi_buffer * sizeof (realw),
- mp->h_b_send_accel_buffer_cm, 1, &kernel_evt, NULL));
+ mp->h_b_send_accel_buffer_cm, 1, &kernel_evt, &mp->last_copy_evt));
+ mp->has_last_copy_evt = 1;
} else {
clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_send_accel_buffer_crust_mantle.ocl, CL_TRUE, 0,
size_mpi_buffer * sizeof (realw),
@@ -219,9 +228,14 @@ void FC_FUNC_(transfer_boun_from_device,
// copies buffer to CPU
if (GPU_ASYNC_COPY) {
+ if (mp->has_last_copy_evt) {
+ clCheck (clReleaseEvent (mp->last_copy_evt));
+ }
+
clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_send_accel_buffer_inner_core.ocl, CL_TRUE, 0,
size_mpi_buffer * sizeof (realw),
- mp->h_send_accel_buffer_ic, 1, &kernel_evt, NULL));
+ mp->h_send_accel_buffer_ic, 1, &kernel_evt, &mp->last_copy_evt));
+ mp->has_last_copy_evt = 1;
} else {
clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_send_accel_buffer_inner_core.ocl, CL_TRUE, 0,
size_mpi_buffer * sizeof (realw),
@@ -248,10 +262,14 @@ void FC_FUNC_(transfer_boun_from_device,
// copies buffer to CPU
if (GPU_ASYNC_COPY) {
+ if (mp->has_last_copy_evt) {
+ clCheck (clReleaseEvent (mp->last_copy_evt));
+ }
+
clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_b_send_accel_buffer_inner_core.ocl, CL_FALSE, 0,
size_mpi_buffer * sizeof (realw),
- mp->h_b_send_accel_buffer_ic, 1, &kernel_evt, NULL));
-
+ mp->h_b_send_accel_buffer_ic, 1, &kernel_evt, &mp->last_copy_evt));
+ mp->has_last_copy_evt = 1;
} else {
clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_b_send_accel_buffer_inner_core.ocl, CL_TRUE, 0,
size_mpi_buffer * sizeof (realw),
@@ -336,6 +354,8 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
size_t global_work_size[2];
size_t local_work_size[2];
cl_uint idx = 0;
+ cl_event *copy_evt = NULL;
+ cl_uint num_evt = 0;
#endif
#ifdef USE_CUDA
dim3 grid,threads;
@@ -356,11 +376,14 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
#ifdef USE_OPENCL
if (run_opencl) {
+ if (GPU_ASYNC_COPY && mp->has_last_copy_evt) {
+ copy_evt = &mp->last_copy_evt;
+ num_evt = 1;
+ }
+
if (*FORWARD_OR_ADJOINT == 1) {
// copies vector buffer values to GPU
- if (GPU_ASYNC_COPY) {
- clCheck (clFinish (mocl.copy_queue));
- } else {
+ if (!GPU_ASYNC_COPY) {
clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_send_accel_buffer_crust_mantle.ocl, CL_FALSE, 0,
NDIM * (mp->max_nibool_interfaces_cm) * (mp->num_interfaces_crust_mantle)*sizeof (realw),
buffer_recv_vector, 0, NULL, NULL));
@@ -378,19 +401,18 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
global_work_size[0] = num_blocks_x * blocksize;
global_work_size[1] = num_blocks_y;
- clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.assemble_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.assemble_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, num_evt, copy_evt, NULL));
} else if (*FORWARD_OR_ADJOINT == 3) {
// debug
DEBUG_BACKWARD_ASSEMBLY ();
- if (GPU_ASYNC_COPY) {
- clCheck (clFinish (mocl.command_queue));
- } else {
+ if (!GPU_ASYNC_COPY) {
// copies vector buffer values to GPU
clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_b_send_accel_buffer_crust_mantle.ocl, CL_FALSE, 0,
NDIM * (mp->max_nibool_interfaces_cm) * (mp->num_interfaces_crust_mantle) * sizeof (realw),
buffer_recv_vector, 0, NULL, NULL));
}
+
//assemble adjoint accel
clCheck (clSetKernelArg (mocl.kernels.assemble_boundary_accel_on_device, idx++, sizeof (cl_mem), (void *) &mp->d_b_accel_crust_mantle.ocl));
clCheck (clSetKernelArg (mocl.kernels.assemble_boundary_accel_on_device, idx++, sizeof (cl_mem), (void *) &mp->d_b_send_accel_buffer_crust_mantle.ocl));
@@ -404,7 +426,7 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
global_work_size[0] = num_blocks_x * blocksize;
global_work_size[1] = num_blocks_y;
- clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.assemble_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.assemble_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, num_evt, copy_evt, NULL));
}
}
#endif
@@ -470,7 +492,7 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
if (*IREGION == IREGION_INNER_CORE) {
size_mpi_buffer = NDIM*(mp->max_nibool_interfaces_ic)*(mp->num_interfaces_inner_core);
- if( size_mpi_buffer > 0 ){
+ if (size_mpi_buffer > 0) {
// assembles values
blocksize = BLOCKSIZE_TRANSFER;
size_padded = ((int) ceil (((double) mp->max_nibool_interfaces_ic) / ((double) blocksize))) * blocksize;
@@ -480,11 +502,15 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
#ifdef USE_OPENCL
if (run_opencl) {
idx = 0;
+
+ if (GPU_ASYNC_COPY && mp->has_last_copy_evt) {
+ copy_evt = &mp->last_copy_evt;
+ num_evt = 1;
+ }
+
if (*FORWARD_OR_ADJOINT == 1) {
// copies buffer values to GPU
- if (GPU_ASYNC_COPY) {
- clCheck (clFinish (mocl.copy_queue));
- } else {
+ if (!GPU_ASYNC_COPY) {
clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_send_accel_buffer_inner_core.ocl, CL_FALSE, 0,
NDIM * (mp->max_nibool_interfaces_ic) * (mp->num_interfaces_inner_core)*sizeof (realw),
buffer_recv_vector, 0, NULL, NULL));
@@ -507,9 +533,7 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
// debug
DEBUG_BACKWARD_ASSEMBLY ();
- if (GPU_ASYNC_COPY) {
- clCheck (clFinish (mocl.copy_queue));
- } else {
+ if (!GPU_ASYNC_COPY) {
// copies buffer values to GPU
clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_b_send_accel_buffer_inner_core.ocl, CL_FALSE, 0,
NDIM * (mp->max_nibool_interfaces_ic) * (mp->num_interfaces_inner_core) * sizeof (realw),
@@ -528,7 +552,12 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
global_work_size[0] = num_blocks_x * blocksize;
global_work_size[1] = num_blocks_y;
- clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.assemble_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.assemble_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, num_evt, copy_evt, NULL));
+ }
+
+ if (GPU_ASYNC_COPY && mp->has_last_copy_evt) {
+ clCheck (clReleaseEvent (mp->last_copy_evt));
+ mp->has_last_copy_evt = 0;
}
}
#endif
@@ -774,7 +803,6 @@ void FC_FUNC_(sync_copy_from_device,
TRACE("sync_copy_from_device");
int size_mpi_buffer;
-
Mesh *mp = (Mesh *)(*Mesh_pointer); // get Mesh from Fortran integer wrapper
// checks async-memcpy
@@ -796,6 +824,11 @@ void FC_FUNC_(sync_copy_from_device,
// waits for asynchronous copy to finish
#ifdef USE_OPENCL
if (run_opencl) {
+ if (mp->has_last_copy_evt) {
+ clCheck (clReleaseEvent (mp->last_copy_evt));
+ mp->has_last_copy_evt = 0;
+ }
+
clCheck (clFinish (mocl.copy_queue));
}
#endif
@@ -822,6 +855,11 @@ void FC_FUNC_(sync_copy_from_device,
// waits for asynchronous copy to finish
#ifdef USE_OPENCL
if (run_opencl) {
+ if (mp->has_last_copy_evt) {
+ clCheck (clReleaseEvent (mp->last_copy_evt));
+ mp->has_last_copy_evt = 0;
+ }
+
clCheck (clFinish (mocl.copy_queue));
}
#endif
@@ -849,6 +887,11 @@ void FC_FUNC_(sync_copy_from_device,
// waits for asynchronous copy to finish
#ifdef USE_OPENCL
if (run_opencl) {
+ if (mp->has_last_copy_evt) {
+ clCheck (clReleaseEvent (mp->last_copy_evt));
+ mp->has_last_copy_evt = 0;
+ }
+
clCheck (clFinish (mocl.copy_queue));
}
#endif
diff --git a/src/gpu/compute_add_sources_elastic_gpu.c b/src/gpu/compute_add_sources_elastic_gpu.c
index 1ccc2f1..e285c0c 100644
--- a/src/gpu/compute_add_sources_elastic_gpu.c
+++ b/src/gpu/compute_add_sources_elastic_gpu.c
@@ -218,9 +218,12 @@ void FC_FUNC_ (compute_add_sources_adjoint_gpu,
size_t global_work_size[3];
size_t local_work_size[3];
cl_uint idx = 0;
+ cl_event *copy_evt = NULL;
+ cl_uint num_evt = 0;
- if (GPU_ASYNC_COPY) {
- clCheck (clFinish (mocl.copy_queue));
+ if (GPU_ASYNC_COPY && mp->has_last_copy_evt) {
+ copy_evt = &mp->last_copy_evt;
+ num_evt = 1;
}
clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_adjoint_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_accel_crust_mantle.ocl));
@@ -240,7 +243,12 @@ void FC_FUNC_ (compute_add_sources_adjoint_gpu,
global_work_size[2] = NGLLX;
clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.compute_add_sources_adjoint_kernel, 3, NULL,
- global_work_size, local_work_size, 0, NULL, NULL));
+ global_work_size, local_work_size, num_evt, copy_evt, NULL));
+
+ if (GPU_ASYNC_COPY && mp->has_last_copy_evt) {
+ clCheck (clReleaseEvent (mp->last_copy_evt));
+ mp->has_last_copy_evt = 0;
+ }
}
#endif
#ifdef USE_CUDA
@@ -286,7 +294,7 @@ void FC_FUNC_(transfer_adj_to_device,
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of Fortran integer container
// check if anything to do
- if(mp->nadj_rec_local == 0)
+ if (mp->nadj_rec_local == 0)
return;
// total number of receivers/adjoint sources
@@ -304,8 +312,8 @@ void FC_FUNC_(transfer_adj_to_device,
int i,j,k,irec,irec_local;
irec_local = 0;
- for(irec = 0; irec < nrec; irec++) {
- if(mp->myrank == h_islice_selected_rec[irec]) {
+ for (irec = 0; irec < nrec; irec++) {
+ if (mp->myrank == h_islice_selected_rec[irec]) {
// takes only local sources
for (k = 0; k < NGLLX; k++) {
for (j = 0; j < NGLLX; j++) {
@@ -342,9 +350,6 @@ void FC_FUNC_(transfer_adj_to_device,
if (run_cuda) {
print_CUDA_error_if_any(cudaMemcpy(mp->d_adj_sourcearrays.cuda, mp->h_adj_sourcearrays_slice,
(mp->nadj_rec_local)*NDIM*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),71000);
-
-
-
}
#endif
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -370,12 +375,12 @@ void FC_FUNC_(transfer_adj_to_device_async,
Mesh *mp = (Mesh *)(*Mesh_pointer); //get mesh pointer out of Fortran integer container
// check if anything to do
- if( mp->nadj_rec_local == 0 ) {
+ if (mp->nadj_rec_local == 0) {
return;
}
// checks async-memcpy
- if( GPU_ASYNC_COPY == 0 ){
+ if (GPU_ASYNC_COPY == 0) {
exit_on_error("transfer_adj_to_device_async must be called with GPU_ASYNC_COPY == 1, \
please check mesh_constants_cuda.h");
}
@@ -395,6 +400,11 @@ please check mesh_constants_cuda.h");
#if USE_OPENCL
if (run_opencl) {
+ if (mp->has_last_copy_evt) {
+ clCheck (clReleaseEvent (mp->last_copy_evt));
+ mp->has_last_copy_evt = 0;
+ }
+
clCheck (clFinish (mocl.copy_queue));
}
#endif
@@ -434,10 +444,17 @@ please check mesh_constants_cuda.h");
}
#if USE_OPENCL
if (run_opencl) {
- clCheck (clFinish (mocl.command_queue));
+ cl_event *copy_evt = NULL;
+ cl_uint num_evt = 0;
+
+ if (mp->has_last_copy_evt) {
+ clCheck (clReleaseEvent (mp->last_copy_evt));
+ }
+
clCheck (clEnqueueWriteBuffer (mocl.copy_queue, mp->d_adj_sourcearrays.ocl, CL_FALSE, 0,
mp->nadj_rec_local * NDIM * NGLL3 * sizeof (realw),
- mp->h_adj_sourcearrays_slice, 0, NULL, NULL));
+ mp->h_adj_sourcearrays_slice, num_evt, copy_evt, &mp->last_copy_evt));
+ mp->has_last_copy_evt = 1;
}
#endif
#if USE_CUDA
diff --git a/src/gpu/mesh_constants_gpu.h b/src/gpu/mesh_constants_gpu.h
index 70830aa..246bcb4 100644
--- a/src/gpu/mesh_constants_gpu.h
+++ b/src/gpu/mesh_constants_gpu.h
@@ -875,12 +875,6 @@ typedef struct mesh_ {
// optimizations
// ------------------------------------------------------------------ //
-#if USE_CUDA
- // overlapped memcpy streams
- cudaStream_t compute_stream;
- cudaStream_t copy_stream;
-#endif
-
// A buffer for MPI send/recv, which is duplicated in Fortran but is
// allocated with pinned memory to facilitate asynchronous device <->
// host memory transfers
@@ -924,6 +918,17 @@ typedef struct mesh_ {
cl_mem h_pinned_b_send_accel_buffer_oc;
cl_mem h_pinned_b_recv_accel_buffer_oc;
#endif
+
+#if USE_CUDA
+ // overlapped memcpy streams
+ cudaStream_t compute_stream;
+ cudaStream_t copy_stream;
+#endif
+#if USE_OPENCL
+ cl_event last_copy_evt;
+ int has_last_copy_evt;
+#endif
+
} Mesh;
diff --git a/src/gpu/prepare_mesh_constants_gpu.c b/src/gpu/prepare_mesh_constants_gpu.c
index fc2eeaa..bd3c0ec 100644
--- a/src/gpu/prepare_mesh_constants_gpu.c
+++ b/src/gpu/prepare_mesh_constants_gpu.c
@@ -624,6 +624,11 @@ void FC_FUNC_ (prepare_constants_device,
cudaStreamCreate(&mp->copy_stream);
}
#endif
+#if USE_OPENCL
+ if (run_opencl) {
+ mp->has_last_copy_evt = 0;
+ }
+#endif
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_gpu_error ("prepare_constants_device");
#endif
diff --git a/src/gpu/write_seismograms_gpu.c b/src/gpu/write_seismograms_gpu.c
index 12f0bb3..2e0019d 100644
--- a/src/gpu/write_seismograms_gpu.c
+++ b/src/gpu/write_seismograms_gpu.c
@@ -60,9 +60,12 @@ void write_seismograms_transfer_from_device (Mesh *mp,
size_t local_work_size[2];
cl_uint idx = 0;
cl_event kernel_evt;
+ cl_event *copy_evt = NULL;
+ cl_uint num_evt = 0;
- if (GPU_ASYNC_COPY) {
- clCheck (clFinish (mocl.copy_queue));
+ if (GPU_ASYNC_COPY && mp->has_last_copy_evt) {
+ copy_evt = &mp->last_copy_evt;
+ num_evt = 1;
}
clCheck (clSetKernelArg (mocl.kernels.write_seismograms_transfer_from_device_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_number_receiver_global.ocl));
@@ -77,13 +80,18 @@ void write_seismograms_transfer_from_device (Mesh *mp,
global_work_size[0] = num_blocks_x * blocksize;
global_work_size[1] = num_blocks_y;
- clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.write_seismograms_transfer_from_device_kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, &kernel_evt));
+ clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.write_seismograms_transfer_from_device_kernel, 2, NULL, global_work_size, local_work_size, num_evt, copy_evt, &kernel_evt));
//copies array to CPU
if (GPU_ASYNC_COPY) {
+ if (mp->has_last_copy_evt) {
+ clCheck (clReleaseEvent (mp->last_copy_evt));
+ }
+
clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_station_seismo_field.ocl, CL_FALSE, 0,
3 * NGLL3 * mp->nrec_local * sizeof (realw),
- mp->h_station_seismo_field, 1, &kernel_evt, NULL));
+ mp->h_station_seismo_field, 1, &kernel_evt, &mp->last_copy_evt));
+ mp->has_last_copy_evt = 1;
} else {
clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_station_seismo_field.ocl, CL_TRUE, 0,
3 * NGLL3 * mp->nrec_local * sizeof (realw),
@@ -368,6 +376,10 @@ void FC_FUNC_(transfer_seismo_from_device_async,
// waits for previous copy call to be finished
#ifdef USE_OPENCL
if (run_opencl) {
+ if (mp->has_last_copy_evt) {
+ clCheck (clReleaseEvent (mp->last_copy_evt));
+ mp->has_last_copy_evt = 0;
+ }
clCheck (clFinish (mocl.copy_queue));
}
#endif
More information about the CIG-COMMITS
mailing list