[cig-commits] [commit] devel, master: improve speed with clFlush instead of clFinish and fix bugs (c6d2aa9)
cig_noreply at geodynamics.org
cig_noreply at geodynamics.org
Thu Nov 6 08:14:04 PST 2014
Repository : https://github.com/geodynamics/specfem3d_globe
On branches: devel,master
Link : https://github.com/geodynamics/specfem3d_globe/compare/bc58e579b3b0838a0968725a076f5904845437ca...be63f20cbb6f462104e949894dbe205d2398cd7f
>---------------------------------------------------------------
commit c6d2aa94887846fc63c4498315e437d33709dfd8
Author: Kevin Pouget <kevin.pouget at imag.fr>
Date: Tue May 6 16:03:55 2014 +0200
improve speed with clFlush instead of clFinish and fix bugs
>---------------------------------------------------------------
c6d2aa94887846fc63c4498315e437d33709dfd8
src/gpu/assemble_MPI_scalar_gpu.c | 12 ++--
src/gpu/assemble_MPI_vector_gpu.c | 22 +++----
src/gpu/compute_add_sources_elastic_gpu.c | 12 ++--
src/gpu/prepare_mesh_constants_gpu.c | 104 +++++++++++++++---------------
src/gpu/write_seismograms_gpu.c | 4 +-
5 files changed, 81 insertions(+), 73 deletions(-)
diff --git a/src/gpu/assemble_MPI_scalar_gpu.c b/src/gpu/assemble_MPI_scalar_gpu.c
index d96367b..985989e 100644
--- a/src/gpu/assemble_MPI_scalar_gpu.c
+++ b/src/gpu/assemble_MPI_scalar_gpu.c
@@ -53,7 +53,9 @@ void FC_FUNC_ (transfer_boun_pot_from_device,
// MPI buffer size
size_mpi_buffer = mp->max_nibool_interfaces_oc * mp->num_interfaces_outer_core;
// checks if anything to do
- if( size_mpi_buffer <= 0 ) return;
+ if (size_mpi_buffer <= 0) {
+ return;
+ }
int blocksize = BLOCKSIZE_TRANSFER;
int size_padded = ((int) ceil ((double) mp->max_nibool_interfaces_oc / (double) blocksize)) * blocksize;
@@ -84,7 +86,7 @@ void FC_FUNC_ (transfer_boun_pot_from_device,
// copies buffer to CPU
if (GPU_ASYNC_COPY) {
- clCheck(clFinish(mocl.command_queue));
+ clCheck(clFlush(mocl.command_queue));
clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_send_accel_buffer_outer_core.ocl, CL_TRUE,
0, size_mpi_buffer * sizeof (realw),
@@ -117,7 +119,7 @@ void FC_FUNC_ (transfer_boun_pot_from_device,
global_work_size, local_work_size, 0, NULL, NULL));
// copies buffer to CPU
if (GPU_ASYNC_COPY) {
- clCheck(clFinish(mocl.command_queue));
+ clCheck(clFlush(mocl.command_queue));
clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_b_send_accel_buffer_outer_core.ocl, CL_TRUE,
0, size_mpi_buffer * sizeof (realw),
@@ -229,7 +231,7 @@ void FC_FUNC_ (transfer_asmbl_pot_to_device,
if (*FORWARD_OR_ADJOINT == 1) {
if (GPU_ASYNC_COPY) {
- clCheck(clFinish(mocl.copy_queue));
+ clCheck(clFlush(mocl.copy_queue));
} else {
// copies scalar buffer onto GPU
clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_send_accel_buffer_outer_core.ocl, CL_FALSE, 0,
@@ -259,7 +261,7 @@ void FC_FUNC_ (transfer_asmbl_pot_to_device,
// copies scalar buffer onto GPU
if (GPU_ASYNC_COPY) {
- clCheck(clFinish(mocl.copy_queue));
+ clCheck(clFlush(mocl.copy_queue));
} else {
clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_b_send_accel_buffer_outer_core.ocl, CL_FALSE, 0,
mp->max_nibool_interfaces_oc * mp->num_interfaces_outer_core * sizeof (realw),
diff --git a/src/gpu/assemble_MPI_vector_gpu.c b/src/gpu/assemble_MPI_vector_gpu.c
index 071d172..e280a7d 100644
--- a/src/gpu/assemble_MPI_vector_gpu.c
+++ b/src/gpu/assemble_MPI_vector_gpu.c
@@ -93,7 +93,7 @@ void FC_FUNC_(transfer_boun_from_device,
// copies buffer to CPU
if (GPU_ASYNC_COPY) {
- clCheck(clFinish(mocl.command_queue));
+ clCheck(clFlush(mocl.command_queue));
clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_send_accel_buffer_crust_mantle.ocl, CL_FALSE, 0,
size_mpi_buffer * sizeof (realw),
mp->h_send_accel_buffer_cm, 0, NULL, NULL));
@@ -118,7 +118,7 @@ void FC_FUNC_(transfer_boun_from_device,
// copies buffer to CPU
if (GPU_ASYNC_COPY) {
- clCheck(clFinish(mocl.command_queue));
+ clCheck(clFlush(mocl.command_queue));
clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_b_send_accel_buffer_crust_mantle.ocl, CL_TRUE, 0,
size_mpi_buffer * sizeof (realw),
mp->h_b_send_accel_buffer_cm, 0, NULL, NULL));
@@ -216,7 +216,7 @@ void FC_FUNC_(transfer_boun_from_device,
// copies buffer to CPU
if (GPU_ASYNC_COPY) {
- clCheck(clFinish(mocl.command_queue));
+ clCheck(clFlush(mocl.command_queue));
clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_send_accel_buffer_inner_core.ocl, CL_TRUE, 0,
size_mpi_buffer * sizeof (realw),
mp->h_send_accel_buffer_ic, 0, NULL, NULL));
@@ -246,7 +246,7 @@ void FC_FUNC_(transfer_boun_from_device,
// copies buffer to CPU
if (GPU_ASYNC_COPY) {
- clCheck(clFinish(mocl.command_queue));
+ clCheck(clFlush(mocl.command_queue));
clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_b_send_accel_buffer_inner_core.ocl, CL_FALSE, 0,
size_mpi_buffer * sizeof (realw),
mp->h_b_send_accel_buffer_ic, 0, NULL, NULL));
@@ -357,7 +357,7 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
if (*FORWARD_OR_ADJOINT == 1) {
// copies vector buffer values to GPU
if (GPU_ASYNC_COPY) {
- clCheck(clFinish(mocl.copy_queue));
+ clCheck(clFlush(mocl.copy_queue));
} else {
clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_send_accel_buffer_crust_mantle.ocl, CL_FALSE, 0,
NDIM * (mp->max_nibool_interfaces_cm) * (mp->num_interfaces_crust_mantle)*sizeof (realw),
@@ -382,7 +382,7 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
DEBUG_BACKWARD_ASSEMBLY ();
if (GPU_ASYNC_COPY) {
- clCheck(clFinish(mocl.command_queue));
+ clCheck(clFlush(mocl.command_queue));
} else {
// copies vector buffer values to GPU
clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_b_send_accel_buffer_crust_mantle.ocl, CL_FALSE, 0,
@@ -481,7 +481,7 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
if (*FORWARD_OR_ADJOINT == 1) {
// copies buffer values to GPU
if (GPU_ASYNC_COPY) {
- clCheck(clFinish(mocl.copy_queue));
+ clCheck(clFlush(mocl.copy_queue));
} else {
clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_send_accel_buffer_inner_core.ocl, CL_FALSE, 0,
NDIM * (mp->max_nibool_interfaces_ic) * (mp->num_interfaces_inner_core)*sizeof (realw),
@@ -506,7 +506,7 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
DEBUG_BACKWARD_ASSEMBLY ();
if (GPU_ASYNC_COPY) {
- clCheck(clFinish(mocl.copy_queue));
+ clCheck(clFlush(mocl.copy_queue));
} else {
// copies buffer values to GPU
clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_b_send_accel_buffer_inner_core.ocl, CL_FALSE, 0,
@@ -794,7 +794,7 @@ void FC_FUNC_(sync_copy_from_device,
// waits for asynchronous copy to finish
#ifdef USE_OPENCL
if (run_opencl) {
- clCheck(clFinish(mocl.copy_queue));
+ clCheck(clFlush(mocl.copy_queue));
}
#endif
#ifdef USE_CUDA
@@ -820,7 +820,7 @@ void FC_FUNC_(sync_copy_from_device,
// waits for asynchronous copy to finish
#ifdef USE_OPENCL
if (run_opencl) {
- clCheck(clFinish(mocl.copy_queue));
+ clCheck(clFlush(mocl.copy_queue));
}
#endif
#ifdef USE_CUDA
@@ -847,7 +847,7 @@ void FC_FUNC_(sync_copy_from_device,
// waits for asynchronous copy to finish
#ifdef USE_OPENCL
if (run_opencl) {
- clCheck(clFinish(mocl.copy_queue));
+ clCheck(clFlush(mocl.copy_queue));
}
#endif
#ifdef USE_CUDA
diff --git a/src/gpu/compute_add_sources_elastic_gpu.c b/src/gpu/compute_add_sources_elastic_gpu.c
index 6d8c4ae..86b8ff7 100644
--- a/src/gpu/compute_add_sources_elastic_gpu.c
+++ b/src/gpu/compute_add_sources_elastic_gpu.c
@@ -250,7 +250,7 @@ void FC_FUNC_ (compute_add_sources_adjoint_gpu,
cl_uint idx = 0;
if (GPU_ASYNC_COPY) {
- clCheck(clFinish(mocl.copy_queue));
+ clCheck(clFlush(mocl.copy_queue));
}
clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_adjoint_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_accel_crust_mantle.ocl));
@@ -342,10 +342,14 @@ void FC_FUNC_(transfer_adj_to_device,
// copies extracted array values onto GPU
#ifdef USE_OPENCL
+ if (run_opencl) {
+ clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_adj_sourcearrays.ocl, CL_TRUE, 0,
+ mp->nadj_rec_local * NDIM * NGLL3 * sizeof (realw),
+ mp->h_adj_sourcearrays_slice, 0, NULL, NULL));
+ }
#endif
#ifdef USE_CUDA
if (run_cuda) {
- // copies extracted array values onto GPU
print_CUDA_error_if_any(cudaMemcpy(mp->d_adj_sourcearrays.cuda, mp->h_adj_sourcearrays_slice,
(mp->nadj_rec_local)*NDIM*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),71000);
@@ -401,7 +405,7 @@ please check mesh_constants_cuda.h");
#if USE_OPENCL
if (run_opencl) {
- clCheck(clFinish(mocl.copy_queue));
+ clCheck(clFlush(mocl.copy_queue));
}
#endif
#if USE_CUDA
@@ -440,7 +444,7 @@ please check mesh_constants_cuda.h");
}
#if USE_OPENCL
if (run_opencl) {
- clCheck(clFinish(mocl.command_queue));
+ clCheck(clFlush(mocl.command_queue));
clCheck (clEnqueueWriteBuffer (mocl.copy_queue, mp->d_adj_sourcearrays.ocl, CL_FALSE, 0,
mp->nadj_rec_local * NDIM * NGLL3 * sizeof (realw),
mp->h_adj_sourcearrays_slice, 0, NULL, NULL));
diff --git a/src/gpu/prepare_mesh_constants_gpu.c b/src/gpu/prepare_mesh_constants_gpu.c
index f4819f1..73cecc4 100644
--- a/src/gpu/prepare_mesh_constants_gpu.c
+++ b/src/gpu/prepare_mesh_constants_gpu.c
@@ -3359,7 +3359,7 @@ void FC_FUNC_ (prepare_cleanup_device,
Mesh *mp = (Mesh *) *Mesh_pointer_f;
#ifdef USE_CUDA
- if(run_cuda) {
+ if (run_cuda) {
// synchronizes device
synchronize_cuda();
}
@@ -3372,7 +3372,6 @@ void FC_FUNC_ (prepare_cleanup_device,
//------------------------------------------
#ifdef USE_CUDA
if (run_cuda) {
-
#ifdef USE_TEXTURES_CONSTANTS
cudaUnbindTexture(d_hprime_xx_tex);
cudaUnbindTexture(d_hprimewgll_xx_tex);
@@ -3418,7 +3417,7 @@ void FC_FUNC_ (prepare_cleanup_device,
if (mp->nadj_rec_local > 0) {
#ifdef USE_OPENCL
- if (run_cuda && GPU_ASYNC_COPY) {
+ if (run_opencl && GPU_ASYNC_COPY) {
RELEASE_PINNED_BUFFER_OCL (adj_sourcearrays_slice);
}
#endif
@@ -3433,68 +3432,72 @@ void FC_FUNC_ (prepare_cleanup_device,
}
#ifdef USE_OPENCL
- if (mp->num_interfaces_crust_mantle > 0) {
- if (GPU_ASYNC_COPY) {
- RELEASE_PINNED_BUFFER_OCL (send_accel_buffer_cm);
- RELEASE_PINNED_BUFFER_OCL (recv_accel_buffer_cm);
+ if (run_opencl) {
+ if (mp->num_interfaces_crust_mantle > 0) {
+ if (GPU_ASYNC_COPY) {
+ RELEASE_PINNED_BUFFER_OCL (send_accel_buffer_cm);
+ RELEASE_PINNED_BUFFER_OCL (recv_accel_buffer_cm);
- if (mp->simulation_type == 3) {
- RELEASE_PINNED_BUFFER_OCL (b_send_accel_buffer_cm);
- RELEASE_PINNED_BUFFER_OCL (b_recv_accel_buffer_cm);
+ if (mp->simulation_type == 3) {
+ RELEASE_PINNED_BUFFER_OCL (b_send_accel_buffer_cm);
+ RELEASE_PINNED_BUFFER_OCL (b_recv_accel_buffer_cm);
+ }
}
}
- }
- if (mp->num_interfaces_inner_core > 0) {
- if (GPU_ASYNC_COPY) {
- RELEASE_PINNED_BUFFER_OCL (send_accel_buffer_ic);
- RELEASE_PINNED_BUFFER_OCL (recv_accel_buffer_ic);
+ if (mp->num_interfaces_inner_core > 0) {
+ if (GPU_ASYNC_COPY) {
+ RELEASE_PINNED_BUFFER_OCL (send_accel_buffer_ic);
+ RELEASE_PINNED_BUFFER_OCL (recv_accel_buffer_ic);
- if (mp->simulation_type == 3) {
- RELEASE_PINNED_BUFFER_OCL (b_send_accel_buffer_ic);
- RELEASE_PINNED_BUFFER_OCL (b_recv_accel_buffer_ic);
+ if (mp->simulation_type == 3) {
+ RELEASE_PINNED_BUFFER_OCL (b_send_accel_buffer_ic);
+ RELEASE_PINNED_BUFFER_OCL (b_recv_accel_buffer_ic);
+ }
}
}
- }
- if (mp->num_interfaces_outer_core > 0) {
- if (GPU_ASYNC_COPY) {
- RELEASE_PINNED_BUFFER_OCL (send_accel_buffer_oc);
- RELEASE_PINNED_BUFFER_OCL (recv_accel_buffer_oc);
+ if (mp->num_interfaces_outer_core > 0) {
+ if (GPU_ASYNC_COPY) {
+ RELEASE_PINNED_BUFFER_OCL (send_accel_buffer_oc);
+ RELEASE_PINNED_BUFFER_OCL (recv_accel_buffer_oc);
- if (mp->simulation_type == 3) {
- RELEASE_PINNED_BUFFER_OCL (b_send_accel_buffer_oc);
- RELEASE_PINNED_BUFFER_OCL (b_recv_accel_buffer_oc);
+ if (mp->simulation_type == 3) {
+ RELEASE_PINNED_BUFFER_OCL (b_send_accel_buffer_oc);
+ RELEASE_PINNED_BUFFER_OCL (b_recv_accel_buffer_oc);
+ }
}
}
}
#endif
#ifdef USE_CUDA
- if( mp->num_interfaces_crust_mantle > 0 ){
- if( GPU_ASYNC_COPY){
- cudaFreeHost(mp->h_send_accel_buffer_cm);
- cudaFreeHost(mp->h_recv_accel_buffer_cm);
- if( mp->simulation_type == 3 ){
- cudaFreeHost(mp->h_b_send_accel_buffer_cm);
- cudaFreeHost(mp->h_b_recv_accel_buffer_cm);
+ if (run_cuda) {
+ if (mp->num_interfaces_crust_mantle > 0) {
+ if( GPU_ASYNC_COPY){
+ cudaFreeHost(mp->h_send_accel_buffer_cm);
+ cudaFreeHost(mp->h_recv_accel_buffer_cm);
+ if( mp->simulation_type == 3 ){
+ cudaFreeHost(mp->h_b_send_accel_buffer_cm);
+ cudaFreeHost(mp->h_b_recv_accel_buffer_cm);
+ }
}
}
- }
- if( mp->num_interfaces_inner_core > 0 ){
- if( GPU_ASYNC_COPY){
- cudaFreeHost(mp->h_send_accel_buffer_ic);
- cudaFreeHost(mp->h_recv_accel_buffer_ic);
- if( mp->simulation_type == 3 ){
- cudaFreeHost(mp->h_b_send_accel_buffer_ic);
- cudaFreeHost(mp->h_b_recv_accel_buffer_ic);
+ if( mp->num_interfaces_inner_core > 0 ){
+ if( GPU_ASYNC_COPY){
+ cudaFreeHost(mp->h_send_accel_buffer_ic);
+ cudaFreeHost(mp->h_recv_accel_buffer_ic);
+ if( mp->simulation_type == 3 ){
+ cudaFreeHost(mp->h_b_send_accel_buffer_ic);
+ cudaFreeHost(mp->h_b_recv_accel_buffer_ic);
+ }
}
}
- }
- if( mp->num_interfaces_outer_core > 0 ){
- if( GPU_ASYNC_COPY){
- cudaFreeHost(mp->h_send_accel_buffer_oc);
- cudaFreeHost(mp->h_recv_accel_buffer_oc);
- if( mp->simulation_type == 3 ){
- cudaFreeHost(mp->h_b_send_accel_buffer_oc);
- cudaFreeHost(mp->h_b_recv_accel_buffer_oc);
+ if( mp->num_interfaces_outer_core > 0 ){
+ if( GPU_ASYNC_COPY){
+ cudaFreeHost(mp->h_send_accel_buffer_oc);
+ cudaFreeHost(mp->h_recv_accel_buffer_oc);
+ if( mp->simulation_type == 3 ){
+ cudaFreeHost(mp->h_b_send_accel_buffer_oc);
+ cudaFreeHost(mp->h_b_recv_accel_buffer_oc);
+ }
}
}
}
@@ -3504,7 +3507,6 @@ void FC_FUNC_ (prepare_cleanup_device,
//------------------------------------------
#ifdef USE_OPENCL
if (run_opencl) {
-
if (mp->simulation_type == 1 || mp->simulation_type == 3) {
clReleaseMemObject (mp->d_sourcearrays.ocl);
clReleaseMemObject (mp->d_stf_pre_compute.ocl);
@@ -3547,7 +3549,7 @@ void FC_FUNC_ (prepare_cleanup_device,
//------------------------------------------
// gravity arrays
//------------------------------------------
- if (! mp->gravity) {
+ if (!mp->gravity) {
clReleaseMemObject (mp->d_d_ln_density_dr_table.ocl);
} else {
diff --git a/src/gpu/write_seismograms_gpu.c b/src/gpu/write_seismograms_gpu.c
index f09dabf..26d465f 100644
--- a/src/gpu/write_seismograms_gpu.c
+++ b/src/gpu/write_seismograms_gpu.c
@@ -57,7 +57,7 @@ void write_seismograms_transfer_from_device (Mesh *mp,
#ifdef USE_OPENCL
if (run_opencl) {
if (GPU_ASYNC_COPY) {
- clCheck(clFinish(mocl.copy_queue));
+ clCheck(clFlush(mocl.copy_queue));
}
size_t global_work_size[2];
@@ -81,7 +81,7 @@ void write_seismograms_transfer_from_device (Mesh *mp,
//copies array to CPU
if (GPU_ASYNC_COPY) {
- clCheck(clFinish(mocl.command_queue));
+ clCheck(clFlush(mocl.command_queue));
clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_station_seismo_field.ocl, CL_FALSE, 0,
3 * NGLL3 * mp->nrec_local * sizeof (realw),
mp->h_station_seismo_field, 0, NULL, NULL));
More information about the CIG-COMMITS
mailing list