[cig-commits] [commit] devel: use ocl events for async dependencies (5cfc0d4)
cig_noreply at geodynamics.org
cig_noreply at geodynamics.org
Thu May 15 15:55:48 PDT 2014
Repository : https://github.com/geodynamics/specfem3d_globe
On branch : devel
Link : https://github.com/geodynamics/specfem3d_globe/compare/f2189843b45ae850da30e7a2f870e7c7a590f993...6acd7dc85c5d44360a32125f46ea3abb864fbe5e
>---------------------------------------------------------------
commit 5cfc0d40af32a20d89c334427dd3e34440b73a65
Author: Kevin Pouget <kevin.pouget at st.com>
Date: Mon May 12 14:46:56 2014 +0200
use ocl events for async dependencies
>---------------------------------------------------------------
5cfc0d40af32a20d89c334427dd3e34440b73a65
src/gpu/assemble_MPI_scalar_gpu.c | 19 ++++++-------
src/gpu/assemble_MPI_vector_gpu.c | 46 ++++++++++++++++---------------
src/gpu/compute_add_sources_elastic_gpu.c | 46 +++++++++++++++----------------
src/gpu/write_seismograms_gpu.c | 25 +++++++++--------
4 files changed, 69 insertions(+), 67 deletions(-)
diff --git a/src/gpu/assemble_MPI_scalar_gpu.c b/src/gpu/assemble_MPI_scalar_gpu.c
index 985989e..6addb7b 100644
--- a/src/gpu/assemble_MPI_scalar_gpu.c
+++ b/src/gpu/assemble_MPI_scalar_gpu.c
@@ -68,6 +68,7 @@ void FC_FUNC_ (transfer_boun_pot_from_device,
size_t global_work_size[2];
size_t local_work_size[2];
cl_uint idx = 0;
+ cl_event kernel_evt;
if (*FORWARD_OR_ADJOINT == 1) {
clCheck (clSetKernelArg (mocl.kernels.prepare_boundary_potential_on_device, idx++, sizeof (cl_mem), (void *) &mp->d_accel_outer_core.ocl));
@@ -82,21 +83,18 @@ void FC_FUNC_ (transfer_boun_pot_from_device,
global_work_size[0] = num_blocks_x * blocksize;
global_work_size[1] = num_blocks_y;
- clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.prepare_boundary_potential_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.prepare_boundary_potential_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, &kernel_evt));
// copies buffer to CPU
if (GPU_ASYNC_COPY) {
- clCheck(clFlush(mocl.command_queue));
-
clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_send_accel_buffer_outer_core.ocl, CL_TRUE,
0, size_mpi_buffer * sizeof (realw),
- mp->h_send_accel_buffer_oc, 0, NULL, NULL));
+ mp->h_send_accel_buffer_oc, 1, &kernel_evt, NULL));
} else {
clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_send_accel_buffer_outer_core.ocl, CL_FALSE,
0, size_mpi_buffer * sizeof (realw),
send_buffer, 0, NULL, NULL));
-
}
}
else if (*FORWARD_OR_ADJOINT == 3) {
@@ -116,14 +114,12 @@ void FC_FUNC_ (transfer_boun_pot_from_device,
global_work_size[1] = num_blocks_y;
clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.prepare_boundary_potential_on_device, 2, NULL,
- global_work_size, local_work_size, 0, NULL, NULL));
+ global_work_size, local_work_size, 0, NULL, &kernel_evt));
// copies buffer to CPU
if (GPU_ASYNC_COPY) {
- clCheck(clFlush(mocl.command_queue));
-
clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_b_send_accel_buffer_outer_core.ocl, CL_TRUE,
0, size_mpi_buffer * sizeof (realw),
- mp->h_b_send_accel_buffer_oc, 0, NULL, NULL));
+ mp->h_b_send_accel_buffer_oc, 1, &kernel_evt, NULL));
} else {
clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_b_send_accel_buffer_outer_core.ocl, CL_FALSE,
@@ -131,6 +127,7 @@ void FC_FUNC_ (transfer_boun_pot_from_device,
send_buffer, 0, NULL, NULL));
}
}
+ clReleaseEvent (kernel_evt);
}
#endif
#ifdef USE_CUDA
@@ -231,7 +228,7 @@ void FC_FUNC_ (transfer_asmbl_pot_to_device,
if (*FORWARD_OR_ADJOINT == 1) {
if (GPU_ASYNC_COPY) {
- clCheck(clFlush(mocl.copy_queue));
+ clCheck (clFinish (mocl.copy_queue));
} else {
// copies scalar buffer onto GPU
clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_send_accel_buffer_outer_core.ocl, CL_FALSE, 0,
@@ -261,7 +258,7 @@ void FC_FUNC_ (transfer_asmbl_pot_to_device,
// copies scalar buffer onto GPU
if (GPU_ASYNC_COPY) {
- clCheck(clFlush(mocl.copy_queue));
+ clCheck (clFinish (mocl.copy_queue));
} else {
clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_b_send_accel_buffer_outer_core.ocl, CL_FALSE, 0,
mp->max_nibool_interfaces_oc * mp->num_interfaces_outer_core * sizeof (realw),
diff --git a/src/gpu/assemble_MPI_vector_gpu.c b/src/gpu/assemble_MPI_vector_gpu.c
index e280a7d..d3b522f 100644
--- a/src/gpu/assemble_MPI_vector_gpu.c
+++ b/src/gpu/assemble_MPI_vector_gpu.c
@@ -76,12 +76,15 @@ void FC_FUNC_(transfer_boun_from_device,
#ifdef USE_OPENCL
if (run_opencl) {
+ cl_event kernel_evt;
+
local_work_size[0] = blocksize;
local_work_size[1] = 1;
global_work_size[0] = num_blocks_x * blocksize;
global_work_size[1] = num_blocks_y;
if (*FORWARD_OR_ADJOINT == 1) {
+
clCheck (clSetKernelArg (mocl.kernels.prepare_boundary_accel_on_device, idx++, sizeof (cl_mem), (void *) &mp->d_accel_crust_mantle.ocl));
clCheck (clSetKernelArg (mocl.kernels.prepare_boundary_accel_on_device, idx++, sizeof (cl_mem), (void *) &mp->d_send_accel_buffer_crust_mantle.ocl));
clCheck (clSetKernelArg (mocl.kernels.prepare_boundary_accel_on_device, idx++, sizeof (int), (void *) &mp->num_interfaces_crust_mantle));
@@ -89,20 +92,18 @@ void FC_FUNC_(transfer_boun_from_device,
clCheck (clSetKernelArg (mocl.kernels.prepare_boundary_accel_on_device, idx++, sizeof (cl_mem), (void *) &mp->d_nibool_interfaces_crust_mantle.ocl));
clCheck (clSetKernelArg (mocl.kernels.prepare_boundary_accel_on_device, idx++, sizeof (cl_mem), (void *) &mp->d_ibool_interfaces_crust_mantle.ocl));
- clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.prepare_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.prepare_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, &kernel_evt));
// copies buffer to CPU
if (GPU_ASYNC_COPY) {
- clCheck(clFlush(mocl.command_queue));
clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_send_accel_buffer_crust_mantle.ocl, CL_FALSE, 0,
size_mpi_buffer * sizeof (realw),
- mp->h_send_accel_buffer_cm, 0, NULL, NULL));
+ mp->h_send_accel_buffer_cm, 1, &kernel_evt, NULL));
} else {
clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_send_accel_buffer_crust_mantle.ocl, CL_TRUE, 0,
size_mpi_buffer * sizeof (realw),
send_accel_buffer, 0, NULL, NULL));
}
-
} else if (*FORWARD_OR_ADJOINT == 3) {
// debug
DEBUG_BACKWARD_ASSEMBLY ();
@@ -114,20 +115,20 @@ void FC_FUNC_(transfer_boun_from_device,
clCheck (clSetKernelArg (mocl.kernels.prepare_boundary_accel_on_device, idx++, sizeof (cl_mem), (void *) &mp->d_nibool_interfaces_crust_mantle.ocl));
clCheck (clSetKernelArg (mocl.kernels.prepare_boundary_accel_on_device, idx++, sizeof (cl_mem), (void *) &mp->d_ibool_interfaces_crust_mantle.ocl));
- clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.prepare_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.prepare_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, &kernel_evt));
// copies buffer to CPU
if (GPU_ASYNC_COPY) {
- clCheck(clFlush(mocl.command_queue));
clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_b_send_accel_buffer_crust_mantle.ocl, CL_TRUE, 0,
size_mpi_buffer * sizeof (realw),
- mp->h_b_send_accel_buffer_cm, 0, NULL, NULL));
+ mp->h_b_send_accel_buffer_cm, 1, &kernel_evt, NULL));
} else {
clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_send_accel_buffer_crust_mantle.ocl, CL_TRUE, 0,
size_mpi_buffer * sizeof (realw),
send_accel_buffer, 0, NULL, NULL));
}
}
+ clReleaseEvent (kernel_evt);
}
#endif
#ifdef USE_CUDA
@@ -197,6 +198,8 @@ void FC_FUNC_(transfer_boun_from_device,
#ifdef USE_OPENCL
if (run_opencl) {
+ cl_event kernel_evt;
+
idx = 0;
if (*FORWARD_OR_ADJOINT == 1) {
@@ -212,14 +215,13 @@ void FC_FUNC_(transfer_boun_from_device,
global_work_size[0] = num_blocks_x * blocksize;
global_work_size[1] = num_blocks_y;
- clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.prepare_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.prepare_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, &kernel_evt));
// copies buffer to CPU
if (GPU_ASYNC_COPY) {
- clCheck(clFlush(mocl.command_queue));
clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_send_accel_buffer_inner_core.ocl, CL_TRUE, 0,
size_mpi_buffer * sizeof (realw),
- mp->h_send_accel_buffer_ic, 0, NULL, NULL));
+ mp->h_send_accel_buffer_ic, 1, &kernel_evt, NULL));
} else {
clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_send_accel_buffer_inner_core.ocl, CL_TRUE, 0,
size_mpi_buffer * sizeof (realw),
@@ -242,21 +244,21 @@ void FC_FUNC_(transfer_boun_from_device,
global_work_size[0] = num_blocks_x * blocksize;
global_work_size[1] = num_blocks_y;
- clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.prepare_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.prepare_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, &kernel_evt));
// copies buffer to CPU
if (GPU_ASYNC_COPY) {
- clCheck(clFlush(mocl.command_queue));
clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_b_send_accel_buffer_inner_core.ocl, CL_FALSE, 0,
size_mpi_buffer * sizeof (realw),
- mp->h_b_send_accel_buffer_ic, 0, NULL, NULL));
- } else {
+ mp->h_b_send_accel_buffer_ic, 1, &kernel_evt, NULL));
+ } else {
clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_b_send_accel_buffer_inner_core.ocl, CL_TRUE, 0,
size_mpi_buffer * sizeof (realw),
send_accel_buffer, 0, NULL, NULL));
}
}
+ clReleaseEvent (kernel_evt);
}
#endif
#ifdef USE_CUDA
@@ -357,7 +359,7 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
if (*FORWARD_OR_ADJOINT == 1) {
// copies vector buffer values to GPU
if (GPU_ASYNC_COPY) {
- clCheck(clFlush(mocl.copy_queue));
+ clCheck (clFinish (mocl.copy_queue));
} else {
clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_send_accel_buffer_crust_mantle.ocl, CL_FALSE, 0,
NDIM * (mp->max_nibool_interfaces_cm) * (mp->num_interfaces_crust_mantle)*sizeof (realw),
@@ -382,7 +384,7 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
DEBUG_BACKWARD_ASSEMBLY ();
if (GPU_ASYNC_COPY) {
- clCheck(clFlush(mocl.command_queue));
+ clCheck (clFinish (mocl.command_queue));
} else {
// copies vector buffer values to GPU
clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_b_send_accel_buffer_crust_mantle.ocl, CL_FALSE, 0,
@@ -481,7 +483,7 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
if (*FORWARD_OR_ADJOINT == 1) {
// copies buffer values to GPU
if (GPU_ASYNC_COPY) {
- clCheck(clFlush(mocl.copy_queue));
+ clCheck (clFinish (mocl.copy_queue));
} else {
clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_send_accel_buffer_inner_core.ocl, CL_FALSE, 0,
NDIM * (mp->max_nibool_interfaces_ic) * (mp->num_interfaces_inner_core)*sizeof (realw),
@@ -506,7 +508,7 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
DEBUG_BACKWARD_ASSEMBLY ();
if (GPU_ASYNC_COPY) {
- clCheck(clFlush(mocl.copy_queue));
+ clCheck (clFinish (mocl.copy_queue));
} else {
// copies buffer values to GPU
clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_b_send_accel_buffer_inner_core.ocl, CL_FALSE, 0,
@@ -640,7 +642,7 @@ void FC_FUNC_(transfer_buffer_to_device_async,
DEBUG_BACKWARD_ASSEMBLY();
// copy on host memory
- memcpy(mp->h_b_recv_accel_buffer_cm, buffer, size_mpi_buffer * sizeof(realw));
+ memcpy (mp->h_b_recv_accel_buffer_cm, buffer, size_mpi_buffer * sizeof(realw));
// asynchronous copy to GPU using copy_stream
#ifdef USE_OPENCL
@@ -794,7 +796,7 @@ void FC_FUNC_(sync_copy_from_device,
// waits for asynchronous copy to finish
#ifdef USE_OPENCL
if (run_opencl) {
- clCheck(clFlush(mocl.copy_queue));
+ clCheck (clFinish (mocl.copy_queue));
}
#endif
#ifdef USE_CUDA
@@ -820,7 +822,7 @@ void FC_FUNC_(sync_copy_from_device,
// waits for asynchronous copy to finish
#ifdef USE_OPENCL
if (run_opencl) {
- clCheck(clFlush(mocl.copy_queue));
+ clCheck (clFinish (mocl.copy_queue));
}
#endif
#ifdef USE_CUDA
@@ -847,7 +849,7 @@ void FC_FUNC_(sync_copy_from_device,
// waits for asynchronous copy to finish
#ifdef USE_OPENCL
if (run_opencl) {
- clCheck(clFlush(mocl.copy_queue));
+ clCheck (clFinish (mocl.copy_queue));
}
#endif
#ifdef USE_CUDA
diff --git a/src/gpu/compute_add_sources_elastic_gpu.c b/src/gpu/compute_add_sources_elastic_gpu.c
index 1b1c7e9..1ccc2f1 100644
--- a/src/gpu/compute_add_sources_elastic_gpu.c
+++ b/src/gpu/compute_add_sources_elastic_gpu.c
@@ -213,26 +213,6 @@ void FC_FUNC_ (compute_add_sources_adjoint_gpu,
// the irec_local variable needs to be precomputed (as
// h_pre_comp..), because normally it is in the loop updating accel,
// and due to how it's incremented, it cannot be parallelized
-#ifdef USE_CUDA
- if (run_cuda) {
- // waits for previous transfer_** calls to be finished
- if (GPU_ASYNC_COPY ){
- // waits for asynchronous copy to finish
- cudaStreamSynchronize(mp->copy_stream);
- }
-
- dim3 grid(num_blocks_x,num_blocks_y,1);
- dim3 threads(NGLLX,NGLLX,NGLLX);
-
- compute_add_sources_adjoint_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_crust_mantle.cuda,
- nrec,
- mp->d_adj_sourcearrays.cuda,
- mp->d_ibool_crust_mantle.cuda,
- mp->d_ispec_selected_rec.cuda,
- mp->d_pre_computed_irec.cuda,
- mp->nadj_rec_local);
- }
-#endif
#if USE_OPENCL
if (run_opencl) {
size_t global_work_size[3];
@@ -240,7 +220,7 @@ void FC_FUNC_ (compute_add_sources_adjoint_gpu,
cl_uint idx = 0;
if (GPU_ASYNC_COPY) {
- clCheck(clFlush(mocl.copy_queue));
+ clCheck (clFinish (mocl.copy_queue));
}
clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_adjoint_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_accel_crust_mantle.ocl));
@@ -263,6 +243,26 @@ void FC_FUNC_ (compute_add_sources_adjoint_gpu,
global_work_size, local_work_size, 0, NULL, NULL));
}
#endif
+#ifdef USE_CUDA
+ if (run_cuda) {
+ // waits for previous transfer_** calls to be finished
+ if (GPU_ASYNC_COPY ){
+ // waits for asynchronous copy to finish
+ cudaStreamSynchronize(mp->copy_stream);
+ }
+
+ dim3 grid(num_blocks_x,num_blocks_y,1);
+ dim3 threads(NGLLX,NGLLX,NGLLX);
+
+ compute_add_sources_adjoint_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_crust_mantle.cuda,
+ nrec,
+ mp->d_adj_sourcearrays.cuda,
+ mp->d_ibool_crust_mantle.cuda,
+ mp->d_ispec_selected_rec.cuda,
+ mp->d_pre_computed_irec.cuda,
+ mp->nadj_rec_local);
+ }
+#endif
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_gpu_error("compute_add_sources_adjoint_cuda");
#endif
@@ -395,7 +395,7 @@ please check mesh_constants_cuda.h");
#if USE_OPENCL
if (run_opencl) {
- clCheck(clFlush(mocl.copy_queue));
+ clCheck (clFinish (mocl.copy_queue));
}
#endif
#if USE_CUDA
@@ -434,7 +434,7 @@ please check mesh_constants_cuda.h");
}
#if USE_OPENCL
if (run_opencl) {
- clCheck(clFlush(mocl.command_queue));
+ clCheck (clFinish (mocl.command_queue));
clCheck (clEnqueueWriteBuffer (mocl.copy_queue, mp->d_adj_sourcearrays.ocl, CL_FALSE, 0,
mp->nadj_rec_local * NDIM * NGLL3 * sizeof (realw),
mp->h_adj_sourcearrays_slice, 0, NULL, NULL));
diff --git a/src/gpu/write_seismograms_gpu.c b/src/gpu/write_seismograms_gpu.c
index 240d8b1..12f0bb3 100644
--- a/src/gpu/write_seismograms_gpu.c
+++ b/src/gpu/write_seismograms_gpu.c
@@ -56,13 +56,14 @@ void write_seismograms_transfer_from_device (Mesh *mp,
#ifdef USE_OPENCL
if (run_opencl) {
- if (GPU_ASYNC_COPY) {
- clCheck(clFlush(mocl.copy_queue));
- }
-
size_t global_work_size[2];
size_t local_work_size[2];
cl_uint idx = 0;
+ cl_event kernel_evt;
+
+ if (GPU_ASYNC_COPY) {
+ clCheck (clFinish (mocl.copy_queue));
+ }
clCheck (clSetKernelArg (mocl.kernels.write_seismograms_transfer_from_device_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_number_receiver_global.ocl));
clCheck (clSetKernelArg (mocl.kernels.write_seismograms_transfer_from_device_kernel, idx++, sizeof (cl_mem), (void *) &d_ispec_selected->ocl));
@@ -76,20 +77,20 @@ void write_seismograms_transfer_from_device (Mesh *mp,
global_work_size[0] = num_blocks_x * blocksize;
global_work_size[1] = num_blocks_y;
-
- clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.write_seismograms_transfer_from_device_kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+ clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.write_seismograms_transfer_from_device_kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, &kernel_evt));
//copies array to CPU
if (GPU_ASYNC_COPY) {
- clCheck(clFlush(mocl.command_queue));
clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_station_seismo_field.ocl, CL_FALSE, 0,
3 * NGLL3 * mp->nrec_local * sizeof (realw),
- mp->h_station_seismo_field, 0, NULL, NULL));
+ mp->h_station_seismo_field, 1, &kernel_evt, NULL));
} else {
clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_station_seismo_field.ocl, CL_TRUE, 0,
3 * NGLL3 * mp->nrec_local * sizeof (realw),
mp->h_station_seismo_field, 0, NULL, NULL));
}
+
+ clReleaseEvent (kernel_evt);
}
#endif
#if USE_CUDA
@@ -190,7 +191,6 @@ void write_seismograms_transfer_strain_from_device (Mesh *mp,
clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.write_seismograms_transfer_strain_from_device_kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-
//copies array to CPU
clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_station_seismo_field.ocl, CL_TRUE, 0,
NGLL3 * mp->nrec_local * sizeof (realw),
@@ -334,9 +334,7 @@ void FC_FUNC_ (write_seismograms_transfer_gpu,
}
/* ----------------------------------------------------------------------------------------------- */
-
// data transfer to CPU host
-
/* ----------------------------------------------------------------------------------------------- */
extern EXTERN_LANG
@@ -368,6 +366,11 @@ void FC_FUNC_(transfer_seismo_from_device_async,
}
// waits for previous copy call to be finished
+#ifdef USE_OPENCL
+ if (run_opencl) {
+ clCheck (clFinish (mocl.copy_queue));
+ }
+#endif
#ifdef USE_CUDA
if (run_cuda) {
cudaStreamSynchronize(mp->copy_stream);
More information about the CIG-COMMITS
mailing list