[cig-commits] [commit] devel, master: use ocl events for async dependencies (5cfc0d4)

Thu Nov 6 08:15:33 PST 2014

Repository : https://github.com/geodynamics/specfem3d_globe

On branches: devel,master
Link       : https://github.com/geodynamics/specfem3d_globe/compare/bc58e579b3b0838a0968725a076f5904845437ca...be63f20cbb6f462104e949894dbe205d2398cd7f

>---------------------------------------------------------------

commit 5cfc0d40af32a20d89c334427dd3e34440b73a65
Author: Kevin Pouget <kevin.pouget at st.com>
Date:   Mon May 12 14:46:56 2014 +0200

    use ocl events for async dependencies


>---------------------------------------------------------------

5cfc0d40af32a20d89c334427dd3e34440b73a65
 src/gpu/assemble_MPI_scalar_gpu.c         | 19 ++++++-------
 src/gpu/assemble_MPI_vector_gpu.c         | 46 ++++++++++++++++---------------
 src/gpu/compute_add_sources_elastic_gpu.c | 46 +++++++++++++++----------------
 src/gpu/write_seismograms_gpu.c           | 25 +++++++++--------
 4 files changed, 69 insertions(+), 67 deletions(-)

diff --git a/src/gpu/assemble_MPI_scalar_gpu.c b/src/gpu/assemble_MPI_scalar_gpu.c
index 985989e..6addb7b 100644
--- a/src/gpu/assemble_MPI_scalar_gpu.c
+++ b/src/gpu/assemble_MPI_scalar_gpu.c
@@ -68,6 +68,7 @@ void FC_FUNC_ (transfer_boun_pot_from_device,
     size_t global_work_size[2];
     size_t local_work_size[2];
     cl_uint idx = 0;
+    cl_event kernel_evt;
     
     if (*FORWARD_OR_ADJOINT == 1) {
       clCheck (clSetKernelArg (mocl.kernels.prepare_boundary_potential_on_device, idx++, sizeof (cl_mem), (void *) &mp->d_accel_outer_core.ocl));
@@ -82,21 +83,18 @@ void FC_FUNC_ (transfer_boun_pot_from_device,
       global_work_size[0] = num_blocks_x * blocksize;
       global_work_size[1] = num_blocks_y;
       
-      clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.prepare_boundary_potential_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+      clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.prepare_boundary_potential_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, &kernel_evt));
 
       // copies buffer to CPU
       if (GPU_ASYNC_COPY) {
-        clCheck(clFlush(mocl.command_queue));
-        
         clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_send_accel_buffer_outer_core.ocl, CL_TRUE,
                                       0, size_mpi_buffer * sizeof (realw),
-                                      mp->h_send_accel_buffer_oc, 0, NULL, NULL));
+                                      mp->h_send_accel_buffer_oc, 1, &kernel_evt, NULL));
 
       } else {
         clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_send_accel_buffer_outer_core.ocl, CL_FALSE,
                                       0, size_mpi_buffer * sizeof (realw),
                                       send_buffer, 0, NULL, NULL));
-
       }
     }
     else if (*FORWARD_OR_ADJOINT == 3) {
@@ -116,14 +114,12 @@ void FC_FUNC_ (transfer_boun_pot_from_device,
       global_work_size[1] = num_blocks_y;
 
       clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.prepare_boundary_potential_on_device, 2, NULL,
-                                       global_work_size, local_work_size, 0, NULL, NULL));
+                                       global_work_size, local_work_size, 0, NULL, &kernel_evt));
       // copies buffer to CPU
       if (GPU_ASYNC_COPY) {
-        clCheck(clFlush(mocl.command_queue));
-        
         clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_b_send_accel_buffer_outer_core.ocl, CL_TRUE,
                                       0, size_mpi_buffer * sizeof (realw),
-                                      mp->h_b_send_accel_buffer_oc, 0, NULL, NULL));
+                                      mp->h_b_send_accel_buffer_oc, 1, &kernel_evt, NULL));
 
       } else {
         clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_b_send_accel_buffer_outer_core.ocl, CL_FALSE,
@@ -131,6 +127,7 @@ void FC_FUNC_ (transfer_boun_pot_from_device,
                                       send_buffer, 0, NULL, NULL));
       }
     }
+    clReleaseEvent (kernel_evt);
   }
 #endif
 #ifdef USE_CUDA
@@ -231,7 +228,7 @@ void FC_FUNC_ (transfer_asmbl_pot_to_device,
     
     if (*FORWARD_OR_ADJOINT == 1) {      
       if (GPU_ASYNC_COPY) {
-        clCheck(clFlush(mocl.copy_queue));
+        clCheck (clFinish (mocl.copy_queue));
       } else {
         // copies scalar buffer onto GPU
         clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_send_accel_buffer_outer_core.ocl, CL_FALSE, 0,
@@ -261,7 +258,7 @@ void FC_FUNC_ (transfer_asmbl_pot_to_device,
 
       // copies scalar buffer onto GPU
       if (GPU_ASYNC_COPY) {
-        clCheck(clFlush(mocl.copy_queue));
+        clCheck (clFinish (mocl.copy_queue));
       } else {
         clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_b_send_accel_buffer_outer_core.ocl, CL_FALSE, 0,
                                        mp->max_nibool_interfaces_oc * mp->num_interfaces_outer_core * sizeof (realw),
diff --git a/src/gpu/assemble_MPI_vector_gpu.c b/src/gpu/assemble_MPI_vector_gpu.c
index e280a7d..d3b522f 100644
--- a/src/gpu/assemble_MPI_vector_gpu.c
+++ b/src/gpu/assemble_MPI_vector_gpu.c
@@ -76,12 +76,15 @@ void FC_FUNC_(transfer_boun_from_device,
 
 #ifdef USE_OPENCL
       if (run_opencl) {
+        cl_event kernel_evt;
+        
         local_work_size[0] = blocksize;
         local_work_size[1] = 1;
         global_work_size[0] = num_blocks_x * blocksize;
         global_work_size[1] = num_blocks_y;
 
         if (*FORWARD_OR_ADJOINT == 1) {
+            
           clCheck (clSetKernelArg (mocl.kernels.prepare_boundary_accel_on_device, idx++, sizeof (cl_mem), (void *) &mp->d_accel_crust_mantle.ocl));
           clCheck (clSetKernelArg (mocl.kernels.prepare_boundary_accel_on_device, idx++, sizeof (cl_mem), (void *) &mp->d_send_accel_buffer_crust_mantle.ocl));
           clCheck (clSetKernelArg (mocl.kernels.prepare_boundary_accel_on_device, idx++, sizeof (int), (void *) &mp->num_interfaces_crust_mantle));
@@ -89,20 +92,18 @@ void FC_FUNC_(transfer_boun_from_device,
           clCheck (clSetKernelArg (mocl.kernels.prepare_boundary_accel_on_device, idx++, sizeof (cl_mem), (void *) &mp->d_nibool_interfaces_crust_mantle.ocl));
           clCheck (clSetKernelArg (mocl.kernels.prepare_boundary_accel_on_device, idx++, sizeof (cl_mem), (void *) &mp->d_ibool_interfaces_crust_mantle.ocl));
 
-          clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.prepare_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+          clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.prepare_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, &kernel_evt));
 
           // copies buffer to CPU
           if (GPU_ASYNC_COPY) {
-            clCheck(clFlush(mocl.command_queue));
             clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_send_accel_buffer_crust_mantle.ocl, CL_FALSE, 0,
                                           size_mpi_buffer * sizeof (realw),
-                                          mp->h_send_accel_buffer_cm, 0, NULL, NULL));
+                                          mp->h_send_accel_buffer_cm, 1, &kernel_evt, NULL));
           } else {
             clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_send_accel_buffer_crust_mantle.ocl, CL_TRUE, 0,
                                           size_mpi_buffer * sizeof (realw),
                                           send_accel_buffer, 0, NULL, NULL));
           }
-
         } else if (*FORWARD_OR_ADJOINT == 3) {
           // debug
           DEBUG_BACKWARD_ASSEMBLY ();
@@ -114,20 +115,20 @@ void FC_FUNC_(transfer_boun_from_device,
           clCheck (clSetKernelArg (mocl.kernels.prepare_boundary_accel_on_device, idx++, sizeof (cl_mem), (void *) &mp->d_nibool_interfaces_crust_mantle.ocl));
           clCheck (clSetKernelArg (mocl.kernels.prepare_boundary_accel_on_device, idx++, sizeof (cl_mem), (void *) &mp->d_ibool_interfaces_crust_mantle.ocl));
 
-          clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.prepare_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+          clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.prepare_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, &kernel_evt));
 
           // copies buffer to CPU
           if (GPU_ASYNC_COPY) {
-            clCheck(clFlush(mocl.command_queue));
             clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_b_send_accel_buffer_crust_mantle.ocl, CL_TRUE, 0,
                                           size_mpi_buffer * sizeof (realw),
-                                          mp->h_b_send_accel_buffer_cm, 0, NULL, NULL));
+                                          mp->h_b_send_accel_buffer_cm, 1, &kernel_evt, NULL));
           } else {
             clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_send_accel_buffer_crust_mantle.ocl, CL_TRUE, 0,
                                           size_mpi_buffer * sizeof (realw),
                                           send_accel_buffer, 0, NULL, NULL));
           }
         }
+        clReleaseEvent (kernel_evt);
       }
 #endif
 #ifdef USE_CUDA
@@ -197,6 +198,8 @@ void FC_FUNC_(transfer_boun_from_device,
 
 #ifdef USE_OPENCL
       if (run_opencl) {
+        cl_event kernel_evt;
+        
         idx = 0;
         
         if (*FORWARD_OR_ADJOINT == 1) {
@@ -212,14 +215,13 @@ void FC_FUNC_(transfer_boun_from_device,
           global_work_size[0] = num_blocks_x * blocksize;
           global_work_size[1] = num_blocks_y;
 
-          clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.prepare_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+          clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.prepare_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, &kernel_evt));
 
           // copies buffer to CPU
           if (GPU_ASYNC_COPY) {
-            clCheck(clFlush(mocl.command_queue));
             clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_send_accel_buffer_inner_core.ocl, CL_TRUE, 0,
                                           size_mpi_buffer * sizeof (realw),
-                                          mp->h_send_accel_buffer_ic, 0, NULL, NULL));
+                                          mp->h_send_accel_buffer_ic, 1, &kernel_evt, NULL));
           } else {
             clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_send_accel_buffer_inner_core.ocl, CL_TRUE, 0,
                                           size_mpi_buffer * sizeof (realw),
@@ -242,21 +244,21 @@ void FC_FUNC_(transfer_boun_from_device,
           global_work_size[0] = num_blocks_x * blocksize;
           global_work_size[1] = num_blocks_y;
 
-          clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.prepare_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+          clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.prepare_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, 0, NULL, &kernel_evt));
 
           // copies buffer to CPU
           if (GPU_ASYNC_COPY) {
-            clCheck(clFlush(mocl.command_queue));
             clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_b_send_accel_buffer_inner_core.ocl, CL_FALSE, 0,
                                           size_mpi_buffer * sizeof (realw),
-                                          mp->h_b_send_accel_buffer_ic, 0, NULL, NULL));
-          } else {
+                                          mp->h_b_send_accel_buffer_ic, 1, &kernel_evt, NULL));
             
+          } else {            
             clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_b_send_accel_buffer_inner_core.ocl, CL_TRUE, 0,
                                           size_mpi_buffer * sizeof (realw),
                                           send_accel_buffer, 0, NULL, NULL));
           }
         }
+        clReleaseEvent (kernel_evt);
       }
 #endif
 #ifdef USE_CUDA
@@ -357,7 +359,7 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
         if (*FORWARD_OR_ADJOINT == 1) {
           // copies vector buffer values to GPU
           if (GPU_ASYNC_COPY) {
-            clCheck(clFlush(mocl.copy_queue));
+            clCheck (clFinish (mocl.copy_queue));
           } else {
             clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_send_accel_buffer_crust_mantle.ocl, CL_FALSE, 0,
                                            NDIM * (mp->max_nibool_interfaces_cm) * (mp->num_interfaces_crust_mantle)*sizeof (realw),
@@ -382,7 +384,7 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
           DEBUG_BACKWARD_ASSEMBLY ();
           
           if (GPU_ASYNC_COPY) {
-            clCheck(clFlush(mocl.command_queue));
+            clCheck (clFinish (mocl.command_queue));
           } else {
             // copies vector buffer values to GPU
             clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_b_send_accel_buffer_crust_mantle.ocl, CL_FALSE, 0,
@@ -481,7 +483,7 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
         if (*FORWARD_OR_ADJOINT == 1) {
           // copies buffer values to GPU
           if (GPU_ASYNC_COPY) {
-            clCheck(clFlush(mocl.copy_queue));
+            clCheck (clFinish (mocl.copy_queue));
           } else {
             clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_send_accel_buffer_inner_core.ocl, CL_FALSE, 0,
                                            NDIM * (mp->max_nibool_interfaces_ic) * (mp->num_interfaces_inner_core)*sizeof (realw),
@@ -506,7 +508,7 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
           DEBUG_BACKWARD_ASSEMBLY ();
           
           if (GPU_ASYNC_COPY) {
-            clCheck(clFlush(mocl.copy_queue));
+            clCheck (clFinish (mocl.copy_queue));
           } else {
             // copies buffer values to GPU
             clCheck (clEnqueueWriteBuffer (mocl.command_queue, mp->d_b_send_accel_buffer_inner_core.ocl, CL_FALSE, 0,
@@ -640,7 +642,7 @@ void FC_FUNC_(transfer_buffer_to_device_async,
         DEBUG_BACKWARD_ASSEMBLY();
 
         // copy on host memory
-        memcpy(mp->h_b_recv_accel_buffer_cm, buffer, size_mpi_buffer * sizeof(realw));
+        memcpy (mp->h_b_recv_accel_buffer_cm, buffer, size_mpi_buffer * sizeof(realw));
 
         // asynchronous copy to GPU using copy_stream
 #ifdef USE_OPENCL
@@ -794,7 +796,7 @@ void FC_FUNC_(sync_copy_from_device,
       // waits for asynchronous copy to finish
 #ifdef USE_OPENCL
       if (run_opencl) {
-        clCheck(clFlush(mocl.copy_queue));
+        clCheck (clFinish (mocl.copy_queue));
       }
 #endif
 #ifdef USE_CUDA
@@ -820,7 +822,7 @@ void FC_FUNC_(sync_copy_from_device,
       // waits for asynchronous copy to finish
 #ifdef USE_OPENCL
       if (run_opencl) {
-        clCheck(clFlush(mocl.copy_queue));
+        clCheck (clFinish (mocl.copy_queue));
       }
 #endif
 #ifdef USE_CUDA
@@ -847,7 +849,7 @@ void FC_FUNC_(sync_copy_from_device,
       // waits for asynchronous copy to finish
 #ifdef USE_OPENCL
       if (run_opencl) {
-        clCheck(clFlush(mocl.copy_queue));
+        clCheck (clFinish (mocl.copy_queue));
       }
 #endif
 #ifdef USE_CUDA
diff --git a/src/gpu/compute_add_sources_elastic_gpu.c b/src/gpu/compute_add_sources_elastic_gpu.c
index 1b1c7e9..1ccc2f1 100644
--- a/src/gpu/compute_add_sources_elastic_gpu.c
+++ b/src/gpu/compute_add_sources_elastic_gpu.c
@@ -213,26 +213,6 @@ void FC_FUNC_ (compute_add_sources_adjoint_gpu,
   // the irec_local variable needs to be precomputed (as
   // h_pre_comp..), because normally it is in the loop updating accel,
   // and due to how it's incremented, it cannot be parallelized
-#ifdef USE_CUDA
-  if (run_cuda) {
-    // waits for previous transfer_** calls to be finished
-    if (GPU_ASYNC_COPY ){
-      // waits for asynchronous copy to finish
-      cudaStreamSynchronize(mp->copy_stream);
-    }
-
-    dim3 grid(num_blocks_x,num_blocks_y,1);
-    dim3 threads(NGLLX,NGLLX,NGLLX);
-
-    compute_add_sources_adjoint_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_crust_mantle.cuda,
-                                                                                   nrec,
-                                                                                   mp->d_adj_sourcearrays.cuda,
-                                                                                   mp->d_ibool_crust_mantle.cuda,
-                                                                                   mp->d_ispec_selected_rec.cuda,
-                                                                                   mp->d_pre_computed_irec.cuda,
-                                                                                   mp->nadj_rec_local);
-  }
-#endif
 #if USE_OPENCL
   if (run_opencl) {
     size_t global_work_size[3];
@@ -240,7 +220,7 @@ void FC_FUNC_ (compute_add_sources_adjoint_gpu,
     cl_uint idx = 0;
     
     if (GPU_ASYNC_COPY) {
-      clCheck(clFlush(mocl.copy_queue));
+      clCheck (clFinish (mocl.copy_queue));
     }
 
     clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_adjoint_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_accel_crust_mantle.ocl));
@@ -263,6 +243,26 @@ void FC_FUNC_ (compute_add_sources_adjoint_gpu,
                                      global_work_size, local_work_size, 0, NULL, NULL));
   }
 #endif
+#ifdef USE_CUDA
+  if (run_cuda) {
+    // waits for previous transfer_** calls to be finished
+    if (GPU_ASYNC_COPY ){
+      // waits for asynchronous copy to finish
+      cudaStreamSynchronize(mp->copy_stream);
+    }
+
+    dim3 grid(num_blocks_x,num_blocks_y,1);
+    dim3 threads(NGLLX,NGLLX,NGLLX);
+
+    compute_add_sources_adjoint_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_crust_mantle.cuda,
+                                                                                   nrec,
+                                                                                   mp->d_adj_sourcearrays.cuda,
+                                                                                   mp->d_ibool_crust_mantle.cuda,
+                                                                                   mp->d_ispec_selected_rec.cuda,
+                                                                                   mp->d_pre_computed_irec.cuda,
+                                                                                   mp->nadj_rec_local);
+  }
+#endif
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
   exit_on_gpu_error("compute_add_sources_adjoint_cuda");
 #endif
@@ -395,7 +395,7 @@ please check mesh_constants_cuda.h");
 
 #if USE_OPENCL
   if (run_opencl) {
-    clCheck(clFlush(mocl.copy_queue));
+    clCheck (clFinish (mocl.copy_queue));
   }
 #endif
 #if USE_CUDA
@@ -434,7 +434,7 @@ please check mesh_constants_cuda.h");
   }
 #if USE_OPENCL
   if (run_opencl) {
-    clCheck(clFlush(mocl.command_queue));
+    clCheck (clFinish (mocl.command_queue));
     clCheck (clEnqueueWriteBuffer (mocl.copy_queue, mp->d_adj_sourcearrays.ocl, CL_FALSE, 0,
                                    mp->nadj_rec_local * NDIM * NGLL3 * sizeof (realw),
                                    mp->h_adj_sourcearrays_slice, 0, NULL, NULL));
diff --git a/src/gpu/write_seismograms_gpu.c b/src/gpu/write_seismograms_gpu.c
index 240d8b1..12f0bb3 100644
--- a/src/gpu/write_seismograms_gpu.c
+++ b/src/gpu/write_seismograms_gpu.c
@@ -56,13 +56,14 @@ void write_seismograms_transfer_from_device (Mesh *mp,
 
 #ifdef USE_OPENCL
   if (run_opencl) {    
-    if (GPU_ASYNC_COPY) {
-      clCheck(clFlush(mocl.copy_queue));
-    }
-    
     size_t global_work_size[2];
     size_t local_work_size[2];
     cl_uint idx = 0;
+    cl_event kernel_evt;
+    
+    if (GPU_ASYNC_COPY) {
+      clCheck (clFinish (mocl.copy_queue));
+    }
     
     clCheck (clSetKernelArg (mocl.kernels.write_seismograms_transfer_from_device_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_number_receiver_global.ocl));
     clCheck (clSetKernelArg (mocl.kernels.write_seismograms_transfer_from_device_kernel, idx++, sizeof (cl_mem), (void *) &d_ispec_selected->ocl));
@@ -76,20 +77,20 @@ void write_seismograms_transfer_from_device (Mesh *mp,
     global_work_size[0] = num_blocks_x * blocksize;
     global_work_size[1] = num_blocks_y;
 
-
-    clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.write_seismograms_transfer_from_device_kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+    clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.write_seismograms_transfer_from_device_kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, &kernel_evt));
 
     //copies array to CPU
     if (GPU_ASYNC_COPY) {
-      clCheck(clFlush(mocl.command_queue));
       clCheck (clEnqueueReadBuffer (mocl.copy_queue, mp->d_station_seismo_field.ocl, CL_FALSE, 0,
                                     3 * NGLL3 * mp->nrec_local * sizeof (realw),
-                                    mp->h_station_seismo_field, 0, NULL, NULL));
+                                    mp->h_station_seismo_field, 1, &kernel_evt, NULL));
     } else {
       clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_station_seismo_field.ocl, CL_TRUE, 0,
                                     3 * NGLL3 * mp->nrec_local * sizeof (realw),
                                     mp->h_station_seismo_field, 0, NULL, NULL));
     }
+    
+    clReleaseEvent (kernel_evt);
   }
 #endif
 #if USE_CUDA
@@ -190,7 +191,6 @@ void write_seismograms_transfer_strain_from_device (Mesh *mp,
 
     clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.write_seismograms_transfer_strain_from_device_kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL));
 
-
     //copies array to CPU
     clCheck (clEnqueueReadBuffer (mocl.command_queue, mp->d_station_seismo_field.ocl, CL_TRUE, 0,
                                   NGLL3 * mp->nrec_local * sizeof (realw),
@@ -334,9 +334,7 @@ void FC_FUNC_ (write_seismograms_transfer_gpu,
 }
 
 /* ----------------------------------------------------------------------------------------------- */
-
 // data transfer to CPU host
-
 /* ----------------------------------------------------------------------------------------------- */
 
 extern EXTERN_LANG
@@ -368,6 +366,11 @@ void FC_FUNC_(transfer_seismo_from_device_async,
   }
 
   // waits for previous copy call to be finished
+#ifdef USE_OPENCL
+  if (run_opencl) {
+    clCheck (clFinish (mocl.copy_queue));
+  }
+#endif
 #ifdef USE_CUDA
   if (run_cuda) {
     cudaStreamSynchronize(mp->copy_stream);