[cig-commits] [commit] devel, master: add events around copy queue operations (b2c7b97)

Thu Nov 6 08:15:35 PST 2014

Repository : https://github.com/geodynamics/specfem3d_globe

On branches: devel,master
Link       : https://github.com/geodynamics/specfem3d_globe/compare/bc58e579b3b0838a0968725a076f5904845437ca...be63f20cbb6f462104e949894dbe205d2398cd7f

>---------------------------------------------------------------

commit b2c7b970c5fe4e11c97b7f32d0d64b63e2d63df7
Author: Kevin Pouget <kevin.pouget at imag.fr>
Date:   Mon May 12 16:42:27 2014 +0200

    add events around copy queue operations


>---------------------------------------------------------------

b2c7b970c5fe4e11c97b7f32d0d64b63e2d63df7
 src/gpu/assemble_MPI_scalar_gpu.c         | 38 +++++++++-----
 src/gpu/assemble_MPI_vector_gpu.c         | 87 +++++++++++++++++++++++--------
 src/gpu/compute_add_sources_elastic_gpu.c | 43 ++++++++++-----
 src/gpu/mesh_constants_gpu.h              | 17 +++---
 src/gpu/prepare_mesh_constants_gpu.c      |  5 ++
 src/gpu/write_seismograms_gpu.c           | 20 +++++--
 6 files changed, 152 insertions(+), 58 deletions(-)

diff --git a/src/gpu/assemble_MPI_scalar_gpu.c b/src/gpu/assemble_MPI_scalar_gpu.c
index 6addb7b..794ad8c 100644
--- a/src/gpu/assemble_MPI_scalar_gpu.c
+++ b/src/gpu/assemble_MPI_scalar_gpu.c
@@ -87,10 +87,13 @@ void FC_FUNC_ (transfer_boun_pot_from_device,
 
       // copies buffer to CPU
       if (GPU_ASYNC_COPY) {
+        if (mp->has_last_copy_evt) {
+          clCheck (clReleaseEvent (mp->last_copy_evt));
+        }
         clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_send_accel_buffer_outer_core.ocl, CL_TRUE,
                                       0, size_mpi_buffer * sizeof (realw),
-                                      mp->h_send_accel_buffer_oc, 1, &kernel_evt, NULL));
-
+                                      mp->h_send_accel_buffer_oc, 1, &kernel_evt, &mp->last_copy_evt));
+        mp->has_last_copy_evt = 1;
       } else {
         clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_send_accel_buffer_outer_core.ocl, CL_FALSE,
                                       0, size_mpi_buffer * sizeof (realw),
@@ -117,10 +120,13 @@ void FC_FUNC_ (transfer_boun_pot_from_device,
                                        global_work_size, local_work_size, 0, NULL, &kernel_evt));
       // copies buffer to CPU
       if (GPU_ASYNC_COPY) {
+        if (mp->has_last_copy_evt) {
+          clCheck (clReleaseEvent (mp->last_copy_evt));
+        }
         clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_b_send_accel_buffer_outer_core.ocl, CL_TRUE,
                                       0, size_mpi_buffer * sizeof (realw),
-                                      mp->h_b_send_accel_buffer_oc, 1, &kernel_evt, NULL));
-
+                                      mp->h_b_send_accel_buffer_oc, 1, &kernel_evt, &mp->last_copy_evt));
+        mp->has_last_copy_evt = 1;
       } else {
         clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_b_send_accel_buffer_outer_core.ocl, CL_FALSE,
                                       0, size_mpi_buffer * sizeof (realw),
@@ -209,7 +215,6 @@ void FC_FUNC_ (transfer_asmbl_pot_to_device,
   // buffer size
   size_mpi_buffer = (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core);
 
-
   // checks if anything to do
   if( size_mpi_buffer <= 0 ) return;
 
@@ -225,11 +230,16 @@ void FC_FUNC_ (transfer_asmbl_pot_to_device,
     size_t global_work_size[2];
     size_t local_work_size[2];
     cl_uint idx = 0;
+    cl_event *copy_evt = NULL;
+    cl_uint num_evt = 0;
+    
+    if (GPU_ASYNC_COPY && mp->has_last_copy_evt) {
+      copy_evt = &mp->last_copy_evt;
+      num_evt = 1;
+    }
     
     if (*FORWARD_OR_ADJOINT == 1) {      
-      if (GPU_ASYNC_COPY) {
-        clCheck (clFinish (mocl.copy_queue));
-      } else {
+      if (!GPU_ASYNC_COPY) {
         // copies scalar buffer onto GPU
         clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_send_accel_buffer_outer_core.ocl, CL_FALSE, 0,
                                        mp->max_nibool_interfaces_oc * mp->num_interfaces_outer_core * sizeof (realw),
@@ -250,16 +260,14 @@ void FC_FUNC_ (transfer_asmbl_pot_to_device,
       global_work_size[1] = num_blocks_y;
 
       clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.assemble_boundary_potential_on_device, 2, NULL,
-                                       global_work_size, local_work_size, 0, NULL, NULL));
+                                       global_work_size, local_work_size, num_evt, copy_evt, NULL));
     }
     else if (*FORWARD_OR_ADJOINT == 3) {
       // debug
       DEBUG_BACKWARD_ASSEMBLY ();
 
       // copies scalar buffer onto GPU
-      if (GPU_ASYNC_COPY) {
-        clCheck (clFinish (mocl.copy_queue));
-      } else {
+      if (!GPU_ASYNC_COPY) {
         clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_b_send_accel_buffer_outer_core.ocl, CL_FALSE, 0,
                                        mp->max_nibool_interfaces_oc * mp->num_interfaces_outer_core * sizeof (realw),
                                        buffer_recv_scalar, 0, NULL, NULL));
@@ -278,7 +286,11 @@ void FC_FUNC_ (transfer_asmbl_pot_to_device,
       global_work_size[1] = num_blocks_y;
 
       clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.assemble_boundary_potential_on_device, 2, NULL,
-                                       global_work_size, local_work_size, 0, NULL, NULL));
+                                       global_work_size, local_work_size, num_evt, copy_evt, NULL));
+    }
+    if (GPU_ASYNC_COPY && mp->has_last_copy_evt) {
+      clCheck (clReleaseEvent (mp->last_copy_evt));
+      mp->has_last_copy_evt = 0;
     }
   }
 #endif
diff --git a/src/gpu/assemble_MPI_vector_gpu.c b/src/gpu/assemble_MPI_vector_gpu.c
index d3b522f..7f29d0d 100644
--- a/src/gpu/assemble_MPI_vector_gpu.c
+++ b/src/gpu/assemble_MPI_vector_gpu.c
@@ -96,9 +96,13 @@ void FC_FUNC_(transfer_boun_from_device,
 
           // copies buffer to CPU
           if (GPU_ASYNC_COPY) {
+            if (mp->has_last_copy_evt) {
+              clCheck (clReleaseEvent (mp->last_copy_evt));
+            }
             clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_send_accel_buffer_crust_mantle.ocl, CL_FALSE, 0,
                                           size_mpi_buffer * sizeof (realw),
-                                          mp->h_send_accel_buffer_cm, 1, &kernel_evt, NULL));
+                                          mp->h_send_accel_buffer_cm, 1, &kernel_evt, &mp->last_copy_evt));
+            mp->has_last_copy_evt = 1;
           } else {
             clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_send_accel_buffer_crust_mantle.ocl, CL_TRUE, 0,
                                           size_mpi_buffer * sizeof (realw),
@@ -119,9 +123,14 @@ void FC_FUNC_(transfer_boun_from_device,
 
           // copies buffer to CPU
           if (GPU_ASYNC_COPY) {
+            if (mp->has_last_copy_evt) {
+              clCheck (clReleaseEvent (mp->last_copy_evt));
+            }
+            
             clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_b_send_accel_buffer_crust_mantle.ocl, CL_TRUE, 0,
                                           size_mpi_buffer * sizeof (realw),
-                                          mp->h_b_send_accel_buffer_cm, 1, &kernel_evt, NULL));
+                                          mp->h_b_send_accel_buffer_cm, 1, &kernel_evt, &mp->last_copy_evt));
+            mp->has_last_copy_evt = 1;
           } else {
             clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_send_accel_buffer_crust_mantle.ocl, CL_TRUE, 0,
                                           size_mpi_buffer * sizeof (realw),
@@ -219,9 +228,14 @@ void FC_FUNC_(transfer_boun_from_device,
 
           // copies buffer to CPU
           if (GPU_ASYNC_COPY) {
+            if (mp->has_last_copy_evt) {
+              clCheck (clReleaseEvent (mp->last_copy_evt));
+            }
+            
             clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_send_accel_buffer_inner_core.ocl, CL_TRUE, 0,
                                           size_mpi_buffer * sizeof (realw),
-                                          mp->h_send_accel_buffer_ic, 1, &kernel_evt, NULL));
+                                          mp->h_send_accel_buffer_ic, 1, &kernel_evt, &mp->last_copy_evt));
+            mp->has_last_copy_evt = 1;
           } else {
             clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_send_accel_buffer_inner_core.ocl, CL_TRUE, 0,
                                           size_mpi_buffer * sizeof (realw),
@@ -248,10 +262,14 @@ void FC_FUNC_(transfer_boun_from_device,
 
           // copies buffer to CPU
           if (GPU_ASYNC_COPY) {
+            if (mp->has_last_copy_evt) {
+              clCheck (clReleaseEvent (mp->last_copy_evt));
+            }
+            
             clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_b_send_accel_buffer_inner_core.ocl, CL_FALSE, 0,
                                           size_mpi_buffer * sizeof (realw),
-                                          mp->h_b_send_accel_buffer_ic, 1, &kernel_evt, NULL));
-            
+                                          mp->h_b_send_accel_buffer_ic, 1, &kernel_evt, &mp->last_copy_evt));
+            mp->has_last_copy_evt = 1;
           } else {            
             clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_b_send_accel_buffer_inner_core.ocl, CL_TRUE, 0,
                                           size_mpi_buffer * sizeof (realw),
@@ -336,6 +354,8 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
   size_t global_work_size[2];
   size_t local_work_size[2];
   cl_uint idx = 0;
+  cl_event *copy_evt = NULL;
+  cl_uint num_evt = 0;
 #endif
 #ifdef USE_CUDA
   dim3 grid,threads;
@@ -356,11 +376,14 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
 
 #ifdef USE_OPENCL
       if (run_opencl) {        
+        if (GPU_ASYNC_COPY && mp->has_last_copy_evt) {
+          copy_evt = &mp->last_copy_evt;
+          num_evt = 1;
+        }
+        
         if (*FORWARD_OR_ADJOINT == 1) {
           // copies vector buffer values to GPU
-          if (GPU_ASYNC_COPY) {
-            clCheck (clFinish (mocl.copy_queue));
-          } else {
+          if (!GPU_ASYNC_COPY) {
             clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_send_accel_buffer_crust_mantle.ocl, CL_FALSE, 0,
                                            NDIM * (mp->max_nibool_interfaces_cm) * (mp->num_interfaces_crust_mantle)*sizeof (realw),
                                            buffer_recv_vector, 0, NULL, NULL));
@@ -378,19 +401,18 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
           global_work_size[0] = num_blocks_x * blocksize;
           global_work_size[1] = num_blocks_y;
 
-          clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.assemble_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+          clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.assemble_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, num_evt, copy_evt, NULL));
         } else if (*FORWARD_OR_ADJOINT == 3) {
           // debug
           DEBUG_BACKWARD_ASSEMBLY ();
           
-          if (GPU_ASYNC_COPY) {
-            clCheck (clFinish (mocl.command_queue));
-          } else {
+          if (!GPU_ASYNC_COPY) {
             // copies vector buffer values to GPU
             clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_b_send_accel_buffer_crust_mantle.ocl, CL_FALSE, 0,
                                            NDIM * (mp->max_nibool_interfaces_cm) * (mp->num_interfaces_crust_mantle) * sizeof (realw),
                                            buffer_recv_vector, 0, NULL, NULL));
           }
+          
           //assemble adjoint accel
           clCheck (clSetKernelArg (mocl.kernels.assemble_boundary_accel_on_device, idx++, sizeof (cl_mem), (void *) &mp->d_b_accel_crust_mantle.ocl));
           clCheck (clSetKernelArg (mocl.kernels.assemble_boundary_accel_on_device, idx++, sizeof (cl_mem), (void *) &mp->d_b_send_accel_buffer_crust_mantle.ocl));
@@ -404,7 +426,7 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
           global_work_size[0] = num_blocks_x * blocksize;
           global_work_size[1] = num_blocks_y;
 
-          clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.assemble_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+          clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.assemble_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, num_evt, copy_evt, NULL));
         }
       }
 #endif
@@ -470,7 +492,7 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
   if (*IREGION == IREGION_INNER_CORE) {
     size_mpi_buffer = NDIM*(mp->max_nibool_interfaces_ic)*(mp->num_interfaces_inner_core);
 
-    if( size_mpi_buffer > 0 ){
+    if (size_mpi_buffer > 0) {
       // assembles values
       blocksize = BLOCKSIZE_TRANSFER;
       size_padded = ((int) ceil (((double) mp->max_nibool_interfaces_ic) / ((double) blocksize))) * blocksize;
@@ -480,11 +502,15 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
 #ifdef USE_OPENCL
       if (run_opencl) {
         idx = 0;
+
+        if (GPU_ASYNC_COPY && mp->has_last_copy_evt) {
+          copy_evt = &mp->last_copy_evt;
+          num_evt = 1;
+        }
+        
         if (*FORWARD_OR_ADJOINT == 1) {
           // copies buffer values to GPU
-          if (GPU_ASYNC_COPY) {
-            clCheck (clFinish (mocl.copy_queue));
-          } else {
+          if (!GPU_ASYNC_COPY) {
             clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_send_accel_buffer_inner_core.ocl, CL_FALSE, 0,
                                            NDIM * (mp->max_nibool_interfaces_ic) * (mp->num_interfaces_inner_core)*sizeof (realw),
                                            buffer_recv_vector, 0, NULL, NULL));
@@ -507,9 +533,7 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
           // debug
           DEBUG_BACKWARD_ASSEMBLY ();
           
-          if (GPU_ASYNC_COPY) {
-            clCheck (clFinish (mocl.copy_queue));
-          } else {
+          if (!GPU_ASYNC_COPY) {
             // copies buffer values to GPU
             clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_b_send_accel_buffer_inner_core.ocl, CL_FALSE, 0,
                                            NDIM * (mp->max_nibool_interfaces_ic) * (mp->num_interfaces_inner_core) * sizeof (realw),
@@ -528,7 +552,12 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
           global_work_size[0] = num_blocks_x * blocksize;
           global_work_size[1] = num_blocks_y;
 
-          clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.assemble_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+          clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.assemble_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, num_evt, copy_evt, NULL));
+        }
+
+        if (GPU_ASYNC_COPY && mp->has_last_copy_evt) {
+          clCheck (clReleaseEvent (mp->last_copy_evt));
+          mp->has_last_copy_evt = 0;
         }
       }
 #endif
@@ -774,7 +803,6 @@ void FC_FUNC_(sync_copy_from_device,
   TRACE("sync_copy_from_device");
 
   int size_mpi_buffer;
-
   Mesh *mp = (Mesh *)(*Mesh_pointer); // get Mesh from Fortran integer wrapper
 
   // checks async-memcpy
@@ -796,6 +824,11 @@ void FC_FUNC_(sync_copy_from_device,
       // waits for asynchronous copy to finish
 #ifdef USE_OPENCL
       if (run_opencl) {
+        if (mp->has_last_copy_evt) {
+          clCheck (clReleaseEvent (mp->last_copy_evt));
+          mp->has_last_copy_evt = 0;
+        }
+        
         clCheck (clFinish (mocl.copy_queue));
       }
 #endif
@@ -822,6 +855,11 @@ void FC_FUNC_(sync_copy_from_device,
       // waits for asynchronous copy to finish
 #ifdef USE_OPENCL
       if (run_opencl) {
+        if (mp->has_last_copy_evt) {
+          clCheck (clReleaseEvent (mp->last_copy_evt));
+          mp->has_last_copy_evt = 0;
+        }
+        
         clCheck (clFinish (mocl.copy_queue));
       }
 #endif
@@ -849,6 +887,11 @@ void FC_FUNC_(sync_copy_from_device,
       // waits for asynchronous copy to finish
 #ifdef USE_OPENCL
       if (run_opencl) {
+        if (mp->has_last_copy_evt) {
+          clCheck (clReleaseEvent (mp->last_copy_evt));
+          mp->has_last_copy_evt = 0;
+        }
+        
         clCheck (clFinish (mocl.copy_queue));
       }
 #endif
diff --git a/src/gpu/compute_add_sources_elastic_gpu.c b/src/gpu/compute_add_sources_elastic_gpu.c
index 1ccc2f1..e285c0c 100644
--- a/src/gpu/compute_add_sources_elastic_gpu.c
+++ b/src/gpu/compute_add_sources_elastic_gpu.c
@@ -218,9 +218,12 @@ void FC_FUNC_ (compute_add_sources_adjoint_gpu,
     size_t global_work_size[3];
     size_t local_work_size[3];
     cl_uint idx = 0;
+    cl_event *copy_evt = NULL;
+    cl_uint num_evt = 0;
     
-    if (GPU_ASYNC_COPY) {
-      clCheck (clFinish (mocl.copy_queue));
+    if (GPU_ASYNC_COPY && mp->has_last_copy_evt) {
+      copy_evt = &mp->last_copy_evt;
+      num_evt = 1;
     }
 
     clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_adjoint_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_accel_crust_mantle.ocl));
@@ -240,7 +243,12 @@ void FC_FUNC_ (compute_add_sources_adjoint_gpu,
     global_work_size[2] = NGLLX;
 
     clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.compute_add_sources_adjoint_kernel, 3, NULL,
-                                     global_work_size, local_work_size, 0, NULL, NULL));
+                                     global_work_size, local_work_size, num_evt, copy_evt, NULL));
+
+    if (GPU_ASYNC_COPY && mp->has_last_copy_evt) {
+      clCheck (clReleaseEvent (mp->last_copy_evt));
+      mp->has_last_copy_evt = 0;
+    }
   }
 #endif
 #ifdef USE_CUDA
@@ -286,7 +294,7 @@ void FC_FUNC_(transfer_adj_to_device,
   Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of Fortran integer container
 
   // check if anything to do
-  if(mp->nadj_rec_local == 0)
+  if (mp->nadj_rec_local == 0)
     return;
 
   // total number of receivers/adjoint sources
@@ -304,8 +312,8 @@ void FC_FUNC_(transfer_adj_to_device,
   int i,j,k,irec,irec_local;
 
   irec_local = 0;
-  for(irec = 0; irec < nrec; irec++) {
-    if(mp->myrank == h_islice_selected_rec[irec]) {
+  for (irec = 0; irec < nrec; irec++) {
+    if (mp->myrank == h_islice_selected_rec[irec]) {
       // takes only local sources
       for (k = 0; k < NGLLX; k++) {
         for (j = 0; j < NGLLX; j++) {
@@ -342,9 +350,6 @@ void FC_FUNC_(transfer_adj_to_device,
   if (run_cuda) {
     print_CUDA_error_if_any(cudaMemcpy(mp->d_adj_sourcearrays.cuda, mp->h_adj_sourcearrays_slice,
                                        (mp->nadj_rec_local)*NDIM*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),71000);
-
-
-
   }
 #endif
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -370,12 +375,12 @@ void FC_FUNC_(transfer_adj_to_device_async,
   Mesh *mp = (Mesh *)(*Mesh_pointer); //get mesh pointer out of Fortran integer container
 
   // check if anything to do
-  if( mp->nadj_rec_local == 0 ) {
+  if (mp->nadj_rec_local == 0) {
     return;
   }
 
   // checks async-memcpy
-  if( GPU_ASYNC_COPY == 0 ){
+  if (GPU_ASYNC_COPY == 0) {
     exit_on_error("transfer_adj_to_device_async must be called with GPU_ASYNC_COPY == 1, \
 please check mesh_constants_cuda.h");
   }
@@ -395,6 +400,11 @@ please check mesh_constants_cuda.h");
 
 #if USE_OPENCL
   if (run_opencl) {
+    if (mp->has_last_copy_evt) {
+      clCheck (clReleaseEvent (mp->last_copy_evt));
+      mp->has_last_copy_evt = 0;
+    }
+    
     clCheck (clFinish (mocl.copy_queue));
   }
 #endif
@@ -434,10 +444,17 @@ please check mesh_constants_cuda.h");
   }
 #if USE_OPENCL
   if (run_opencl) {
-    clCheck (clFinish (mocl.command_queue));
+    cl_event *copy_evt = NULL;
+    cl_uint num_evt = 0;
+    
+    if (mp->has_last_copy_evt) {
+      clCheck (clReleaseEvent (mp->last_copy_evt));
+    }
+    
     clCheck (clEnqueueWriteBuffer (mocl.copy_queue, mp->d_adj_sourcearrays.ocl, CL_FALSE, 0,
                                    mp->nadj_rec_local * NDIM * NGLL3 * sizeof (realw),
-                                   mp->h_adj_sourcearrays_slice, 0, NULL, NULL));
+                                   mp->h_adj_sourcearrays_slice, num_evt, copy_evt, &mp->last_copy_evt));
+    mp->has_last_copy_evt = 1;
   }
 #endif
 #if USE_CUDA
diff --git a/src/gpu/mesh_constants_gpu.h b/src/gpu/mesh_constants_gpu.h
index 70830aa..246bcb4 100644
--- a/src/gpu/mesh_constants_gpu.h
+++ b/src/gpu/mesh_constants_gpu.h
@@ -875,12 +875,6 @@ typedef struct mesh_ {
   // optimizations
   // ------------------------------------------------------------------ //
 
-#if USE_CUDA
-  // overlapped memcpy streams
-  cudaStream_t compute_stream;
-  cudaStream_t copy_stream;
-#endif
-
   // A buffer for MPI send/recv, which is duplicated in Fortran but is
   // allocated with pinned memory to facilitate asynchronous device <->
   // host memory transfers
@@ -924,6 +918,17 @@ typedef struct mesh_ {
   cl_mem h_pinned_b_send_accel_buffer_oc;
   cl_mem h_pinned_b_recv_accel_buffer_oc;
 #endif
+
+#if USE_CUDA
+  // overlapped memcpy streams
+  cudaStream_t compute_stream;
+  cudaStream_t copy_stream;
+#endif
+#if USE_OPENCL
+  cl_event last_copy_evt;
+  int has_last_copy_evt;
+#endif
+  
 } Mesh;
 
 
diff --git a/src/gpu/prepare_mesh_constants_gpu.c b/src/gpu/prepare_mesh_constants_gpu.c
index fc2eeaa..bd3c0ec 100644
--- a/src/gpu/prepare_mesh_constants_gpu.c
+++ b/src/gpu/prepare_mesh_constants_gpu.c
@@ -624,6 +624,11 @@ void FC_FUNC_ (prepare_constants_device,
     cudaStreamCreate(&mp->copy_stream);
   }
 #endif
+#if USE_OPENCL
+  if (run_opencl) {
+    mp->has_last_copy_evt = 0;
+  }
+#endif
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_gpu_error ("prepare_constants_device");
 #endif
diff --git a/src/gpu/write_seismograms_gpu.c b/src/gpu/write_seismograms_gpu.c
index 12f0bb3..2e0019d 100644
--- a/src/gpu/write_seismograms_gpu.c
+++ b/src/gpu/write_seismograms_gpu.c
@@ -60,9 +60,12 @@ void write_seismograms_transfer_from_device (Mesh *mp,
     size_t local_work_size[2];
     cl_uint idx = 0;
     cl_event kernel_evt;
+    cl_event *copy_evt = NULL;
+    cl_uint num_evt = 0;
     
-    if (GPU_ASYNC_COPY) {
-      clCheck (clFinish (mocl.copy_queue));
+    if (GPU_ASYNC_COPY && mp->has_last_copy_evt) {
+      copy_evt = &mp->last_copy_evt;
+      num_evt = 1;
     }
     
     clCheck (clSetKernelArg (mocl.kernels.write_seismograms_transfer_from_device_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_number_receiver_global.ocl));
@@ -77,13 +80,18 @@ void write_seismograms_transfer_from_device (Mesh *mp,
     global_work_size[0] = num_blocks_x * blocksize;
     global_work_size[1] = num_blocks_y;
 
-    clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.write_seismograms_transfer_from_device_kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, &kernel_evt));
+    clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.write_seismograms_transfer_from_device_kernel, 2, NULL, global_work_size, local_work_size, num_evt, copy_evt, &kernel_evt));
 
     //copies array to CPU
     if (GPU_ASYNC_COPY) {
+      if (mp->has_last_copy_evt) {
+        clCheck (clReleaseEvent (mp->last_copy_evt));
+      }
+      
       clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_station_seismo_field.ocl, CL_FALSE, 0,
                                     3 * NGLL3 * mp->nrec_local * sizeof (realw),
-                                    mp->h_station_seismo_field, 1, &kernel_evt, NULL));
+                                    mp->h_station_seismo_field, 1, &kernel_evt, &mp->last_copy_evt));
+      mp->has_last_copy_evt = 1;
     } else {
       clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_station_seismo_field.ocl, CL_TRUE, 0,
                                     3 * NGLL3 * mp->nrec_local * sizeof (realw),
@@ -368,6 +376,10 @@ void FC_FUNC_(transfer_seismo_from_device_async,
   // waits for previous copy call to be finished
 #ifdef USE_OPENCL
   if (run_opencl) {
+    if (mp->has_last_copy_evt) {
+      clCheck (clReleaseEvent (mp->last_copy_evt));
+      mp->has_last_copy_evt = 0;
+    }
     clCheck (clFinish (mocl.copy_queue));
   }
 #endif