[cig-commits] [commit] devel, master: bug fix for gpu output infos for displacement and strain when checking norm of arrays (9cb66c4)

Thu Nov 6 08:28:36 PST 2014

Repository : https://github.com/geodynamics/specfem3d_globe

On branches: devel,master
Link       : https://github.com/geodynamics/specfem3d_globe/compare/bc58e579b3b0838a0968725a076f5904845437ca...be63f20cbb6f462104e949894dbe205d2398cd7f

>---------------------------------------------------------------

commit 9cb66c48b146c822835de6c546833b825ffccd81
Author: daniel peter <peterda at ethz.ch>
Date:   Tue Aug 12 13:39:29 2014 +0200

    bug fix for gpu output infos for displacement and strain when checking norm of arrays


>---------------------------------------------------------------

9cb66c48b146c822835de6c546833b825ffccd81
 src/gpu/assemble_MPI_scalar_gpu.c         | 16 +++++---
 src/gpu/assemble_MPI_vector_gpu.c         | 28 ++++++++------
 src/gpu/check_fields_gpu.c                | 63 ++++++++++++++++++++-----------
 src/gpu/compute_add_sources_elastic_gpu.c | 18 +++++----
 src/gpu/mesh_constants_gpu.h              |  5 +--
 src/gpu/prepare_mesh_constants_gpu.c      | 44 ++++++++++++---------
 src/gpu/update_displacement_gpu.c         | 21 +++++------
 src/gpu/write_seismograms_gpu.c           | 10 +++--
 src/specfem3D/check_stability.f90         |  1 -
 9 files changed, 123 insertions(+), 83 deletions(-)

diff --git a/src/gpu/assemble_MPI_scalar_gpu.c b/src/gpu/assemble_MPI_scalar_gpu.c
index 5a89d0b..1d9d8e7 100644
--- a/src/gpu/assemble_MPI_scalar_gpu.c
+++ b/src/gpu/assemble_MPI_scalar_gpu.c
@@ -233,9 +233,11 @@ void FC_FUNC_ (transfer_asmbl_pot_to_device,
     cl_event *copy_evt = NULL;
     cl_uint num_evt = 0;
 
-    if (GPU_ASYNC_COPY && mp->has_last_copy_evt) {
-      copy_evt = &mp->last_copy_evt;
-      num_evt = 1;
+    if (GPU_ASYNC_COPY ){
+      if ( mp->has_last_copy_evt) {
+        copy_evt = &mp->last_copy_evt;
+        num_evt = 1;
+      }
     }
 
     if (*FORWARD_OR_ADJOINT == 1) {
@@ -288,9 +290,11 @@ void FC_FUNC_ (transfer_asmbl_pot_to_device,
       clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.assemble_boundary_potential_on_device, 2, NULL,
                                        global_work_size, local_work_size, num_evt, copy_evt, NULL));
     }
-    if (GPU_ASYNC_COPY && mp->has_last_copy_evt) {
-      clCheck (clReleaseEvent (mp->last_copy_evt));
-      mp->has_last_copy_evt = 0;
+    if (GPU_ASYNC_COPY ){
+      if ( mp->has_last_copy_evt) {
+        clCheck (clReleaseEvent (mp->last_copy_evt));
+        mp->has_last_copy_evt = 0;
+      }
     }
   }
 #endif
diff --git a/src/gpu/assemble_MPI_vector_gpu.c b/src/gpu/assemble_MPI_vector_gpu.c
index d28af3e..3ace0b7 100644
--- a/src/gpu/assemble_MPI_vector_gpu.c
+++ b/src/gpu/assemble_MPI_vector_gpu.c
@@ -379,9 +379,11 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
 
 #ifdef USE_OPENCL
       if (run_opencl) {
-        if (GPU_ASYNC_COPY && mp->has_last_copy_evt) {
-          copy_evt = &mp->last_copy_evt;
-          num_evt = 1;
+        if (GPU_ASYNC_COPY){
+          if ( mp->has_last_copy_evt) {
+            copy_evt = &mp->last_copy_evt;
+            num_evt = 1;
+          }
         }
 
         if (*FORWARD_OR_ADJOINT == 1) {
@@ -506,9 +508,11 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
       if (run_opencl) {
         idx = 0;
 
-        if (GPU_ASYNC_COPY && mp->has_last_copy_evt) {
-          copy_evt = &mp->last_copy_evt;
-          num_evt = 1;
+        if (GPU_ASYNC_COPY){
+          if ( mp->has_last_copy_evt) {
+            copy_evt = &mp->last_copy_evt;
+            num_evt = 1;
+          }
         }
 
         if (*FORWARD_OR_ADJOINT == 1) {
@@ -558,9 +562,11 @@ void FC_FUNC_ (transfer_asmbl_accel_to_device,
           clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.assemble_boundary_accel_on_device, 2, NULL, global_work_size, local_work_size, num_evt, copy_evt, NULL));
         }
 
-        if (GPU_ASYNC_COPY && mp->has_last_copy_evt) {
-          clCheck (clReleaseEvent (mp->last_copy_evt));
-          mp->has_last_copy_evt = 0;
+        if (GPU_ASYNC_COPY){
+          if( mp->has_last_copy_evt) {
+            clCheck (clReleaseEvent (mp->last_copy_evt));
+            mp->has_last_copy_evt = 0;
+          }
         }
       }
 #endif
@@ -639,7 +645,7 @@ void FC_FUNC_(transfer_buffer_to_device_async,
   Mesh *mp = (Mesh *)(*Mesh_pointer); // get Mesh from Fortran integer wrapper
 
   // checks async-memcpy
-  if (GPU_ASYNC_COPY == 0) {
+  if ( ! GPU_ASYNC_COPY ) {
     exit_on_error("transfer_buffer_to_device_async must be called with GPU_ASYNC_COPY == 1, please check mesh_constants_cuda.h");
   }
 
@@ -809,7 +815,7 @@ void FC_FUNC_(sync_copy_from_device,
   Mesh *mp = (Mesh *)(*Mesh_pointer); // get Mesh from Fortran integer wrapper
 
   // checks async-memcpy
-  if (GPU_ASYNC_COPY == 0) {
+  if ( ! GPU_ASYNC_COPY ) {
     exit_on_error("sync_copy_from_device must be called with GPU_ASYNC_COPY == 1, please check mesh_constants_gpu.h");
   }
 
diff --git a/src/gpu/check_fields_gpu.c b/src/gpu/check_fields_gpu.c
index 28c00fe..4b050d7 100644
--- a/src/gpu/check_fields_gpu.c
+++ b/src/gpu/check_fields_gpu.c
@@ -221,8 +221,6 @@ void exit_on_gpu_error (char *kernel_name) {
       fclose (fp);
     }
 
-    // releases previous contexts
-
     // stops program
 #ifdef WITH_MPI
     MPI_Abort (MPI_COMM_WORLD, 1);
@@ -278,7 +276,7 @@ void print_CUDA_error_if_any(cudaError_t err, int num) {
 #else
     myrank = 0;
 #endif
-    sprintf(filename,"../in_out_files/OUTPUT_FILES/error_message_%06d.txt",myrank);
+    sprintf(filename,"OUTPUT_FILES/error_message_%06d.txt",myrank);
     fp = fopen(filename,"a+");
     if (fp != NULL){
       fprintf(fp,"\nCUDA error !!!!! <%s> !!!!! \nat CUDA call error code: # %d\n",cudaGetErrorString(err),num);
@@ -467,14 +465,17 @@ void FC_FUNC_ (get_free_device_memory,
 // Auxiliary functions
 /*----------------------------------------------------------------------------------------------- */
 
-realw get_device_array_maximum_value (Mesh *mp, gpu_realw_mem *d_array, int size) {
+realw get_device_array_maximum_value (gpu_realw_mem *d_array, int size) {
+
+// gets maximum of array on GPU by copying over to CPU and handle it there
+
+  realw *h_array;
   realw max = 0.0f;
 
   // checks if anything to do
   if (size > 0) {
-    realw *h_array = (realw *) calloc(size ,sizeof (realw));
-
     h_array = (realw *) calloc (size, sizeof (realw));
+    
 #ifdef USE_OPENCL
     if (run_opencl) {
       clCheck (clEnqueueReadBuffer (mocl.command_queue, d_array->ocl, CL_TRUE, 0,
@@ -487,11 +488,11 @@ realw get_device_array_maximum_value (Mesh *mp, gpu_realw_mem *d_array, int size
       // explicitly wait for cuda kernels to finish
       // (cudaMemcpy implicitly synchronizes all other cuda operations)
       synchronize_cuda();
-      print_CUDA_error_if_any(cudaMemcpy(h_array,d_array,sizeof(realw)*size,cudaMemcpyDeviceToHost),33001);
+      print_CUDA_error_if_any(cudaMemcpy(h_array,d_array->cuda,sizeof(realw)*size,cudaMemcpyDeviceToHost),33001);
     }
 #endif
     // finds maximum value in array
-    max = h_array[0];
+    max = abs(h_array[0]);
     int i;
     for (i = 1; i < size; i++) {
       if (abs (h_array[i]) > max)
@@ -516,12 +517,11 @@ void FC_FUNC_ (check_norm_acoustic_from_device,
 
   Mesh *mp = (Mesh *) *Mesh_pointer_f;     //get mesh pointer out of Fortran integer container
   realw max;
-  gpu_realw_mem d_max;
 
   max = 0.0f;
 
-  // way 2 b: timing Elapsed time: 1.236916e-03
   // launch simple reduction kernel
+  gpu_realw_mem d_max;
   realw *h_max;
   int blocksize = BLOCKSIZE_TRANSFER;
 
@@ -560,6 +560,7 @@ void FC_FUNC_ (check_norm_acoustic_from_device,
 
     clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.get_maximum_scalar_kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL));
 
+    // copies to CPU
     clCheck (clEnqueueReadBuffer (mocl.command_queue, d_max.ocl, CL_TRUE, 0,
                                   num_blocks_x * num_blocks_y * sizeof (realw),
                                   h_max, 0, NULL, NULL));
@@ -577,7 +578,7 @@ void FC_FUNC_ (check_norm_acoustic_from_device,
     }else if(*FORWARD_OR_ADJOINT == 3 ){
       get_maximum_scalar_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ_outer_core.cuda,size,d_max.cuda);
     }
-
+    // copies to CPU
     print_CUDA_error_if_any(cudaMemcpy(h_max,d_max.cuda,num_blocks_x*num_blocks_y*sizeof(realw),
                                        cudaMemcpyDeviceToHost),222);
   }
@@ -626,12 +627,14 @@ void FC_FUNC_ (check_norm_elastic_from_device,
 
   int size, size_padded;
 
+  max = 0.0f;
+
   // launch simple reduction kernel
+  gpu_realw_mem d_max;
   realw *h_max;
   int blocksize = BLOCKSIZE_TRANSFER;
 
   // crust_mantle
-  max = 0.0f;
   size = mp->NGLOB_CRUST_MANTLE;
 
   size_padded = ((int) ceil (((double) size) / ((double) blocksize))) * blocksize;
@@ -641,7 +644,6 @@ void FC_FUNC_ (check_norm_elastic_from_device,
 
   h_max = (realw *) calloc (num_blocks_x * num_blocks_y, sizeof (realw));
 
-  gpu_realw_mem d_max;
 #ifdef USE_OPENCL
   cl_int errcode;
 
@@ -675,7 +677,6 @@ void FC_FUNC_ (check_norm_elastic_from_device,
 #endif
 #ifdef USE_CUDA
   dim3 grid,threads;
-
   if (run_cuda) {
     grid = dim3(num_blocks_x,num_blocks_y);
     threads = dim3(blocksize,1,1);
@@ -687,6 +688,18 @@ void FC_FUNC_ (check_norm_elastic_from_device,
     }else if(*FORWARD_OR_ADJOINT == 3 ){
       get_maximum_vector_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ_crust_mantle.cuda,size,d_max.cuda);
     }
+    // copies to CPU
+    print_CUDA_error_if_any(cudaMemcpy(h_max,d_max.cuda,num_blocks_x*num_blocks_y*sizeof(realw),
+                                       cudaMemcpyDeviceToHost),222);
+
+    //debug
+    //realw max_d, max_v, max_a;
+    //max_d = get_device_array_maximum_value(&mp->d_displ_crust_mantle, NDIM * mp->NGLOB_CRUST_MANTLE);
+    //max_v = get_device_array_maximum_value(&mp->d_veloc_crust_mantle, NDIM * mp->NGLOB_CRUST_MANTLE);
+    //max_a = get_device_array_maximum_value(&mp->d_accel_crust_mantle, NDIM * mp->NGLOB_CRUST_MANTLE);
+    //printf ("rank %d - max crust_mantle displ: %e veloc: %e accel: %e\n", mp->myrank, max_d, max_v, max_a);
+    //fflush (stdout);
+    //synchronize_mpi ();
   }
 #endif
 
@@ -694,14 +707,13 @@ void FC_FUNC_ (check_norm_elastic_from_device,
   max = h_max[0];
   int i;
   for (i = 1; i < num_blocks_x * num_blocks_y; i++) {
-    // debug
-    printf("rank %i: maximum cm = %i %f\n",mp->myrank,i,h_max[i]);
     // sets maximum
     if (max < h_max[i])
       max = h_max[i];
   }
   max_crust_mantle = max;
 
+  // frees arrays
 #ifdef USE_OPENCL
   if (run_opencl) {
     clReleaseMemObject (d_max.ocl);
@@ -767,7 +779,6 @@ void FC_FUNC_ (check_norm_elastic_from_device,
     }else if(*FORWARD_OR_ADJOINT == 3 ){
       get_maximum_vector_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ_inner_core.cuda,size,d_max.cuda);
     }
-
     // copies to CPU
     print_CUDA_error_if_any(cudaMemcpy(h_max,d_max.cuda,num_blocks_x*num_blocks_y*sizeof(realw),
                                        cudaMemcpyDeviceToHost),222);
@@ -781,6 +792,7 @@ void FC_FUNC_ (check_norm_elastic_from_device,
   }
   max_inner_core = max;
 
+  // frees arrays
 #ifdef USE_OPENCL
   if (run_opencl) {
     clReleaseMemObject (d_max.ocl);
@@ -791,9 +803,11 @@ void FC_FUNC_ (check_norm_elastic_from_device,
     cudaFree(d_max.cuda);
   }
 #endif
-
   free (h_max);
 
+  //debug
+  //printf ("rank %d - max norm elastic: crust_mantle = %e inner_core = %e\n",mp->myrank,max_crust_mantle,max_inner_core);
+  
   // return result
   max = MAX (max_inner_core, max_crust_mantle);
   *norm = max;
@@ -866,7 +880,7 @@ void FC_FUNC_ (check_norm_strain_from_device,
     global_work_size[0] = num_blocks_x * blocksize;
     global_work_size[1] = num_blocks_y;
     clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.get_maximum_scalar_kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL));
-
+    // copies to CPU
     clCheck (clEnqueueReadBuffer (mocl.command_queue, d_max.ocl, CL_TRUE, 0,
                                   num_blocks_x * num_blocks_y * sizeof (realw),
                                   h_max, 0, NULL, NULL));
@@ -881,7 +895,7 @@ void FC_FUNC_ (check_norm_strain_from_device,
 
     // determines max for: eps_trace_over_3_crust_mantle
     get_maximum_scalar_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_eps_trace_over_3_crust_mantle.cuda,size,d_max.cuda);
-
+    // copies to CPU
     print_CUDA_error_if_any(cudaMemcpy(h_max,d_max.cuda,num_blocks_x*num_blocks_y*sizeof(realw),
                                        cudaMemcpyDeviceToHost),221);
   }
@@ -906,7 +920,6 @@ void FC_FUNC_ (check_norm_strain_from_device,
     cudaFree(d_max.cuda);
   }
 #endif
-
   free (h_max);
 
   // initializes
@@ -918,7 +931,7 @@ void FC_FUNC_ (check_norm_strain_from_device,
   get_blocks_xy (size_padded / blocksize, &num_blocks_x, &num_blocks_y);
 
 
-  h_max = (realw *) calloc (num_blocks_x*num_blocks_y, sizeof (realw));
+  h_max = (realw *) calloc (num_blocks_x * num_blocks_y, sizeof (realw));
   max_eps = 0.0f;
 
 #ifdef USE_OPENCL
@@ -958,6 +971,9 @@ void FC_FUNC_ (check_norm_strain_from_device,
 #endif
 #ifdef USE_CUDA
   if (run_cuda) {
+    grid = dim3(num_blocks_x,num_blocks_y);
+    threads = dim3(blocksize,1,1);
+  
     cudaMalloc((void**)&d_max.cuda,num_blocks_x*num_blocks_y*sizeof(realw));
 
     // determines max for: epsilondev_xx_crust_mantle
@@ -967,6 +983,8 @@ void FC_FUNC_ (check_norm_strain_from_device,
                                        cudaMemcpyDeviceToHost),222);
     max = h_max[0];
     for(int i=1;i<num_blocks_x*num_blocks_y;i++) {
+      //debug
+      //if(mp->myrank == 0 ){printf ("rank %d - max %i %e %i %i\n",mp->myrank,i,h_max[i],num_blocks_x,num_blocks_y);}
       if( max < h_max[i]) max = h_max[i];
     }
     max_eps = MAX(max_eps,max);
@@ -1030,7 +1048,6 @@ void FC_FUNC_ (check_norm_strain_from_device,
     cudaFree(d_max.cuda);
   }
 #endif
-
   free (h_max);
 
 #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
diff --git a/src/gpu/compute_add_sources_elastic_gpu.c b/src/gpu/compute_add_sources_elastic_gpu.c
index 81403ec..7255ce1 100644
--- a/src/gpu/compute_add_sources_elastic_gpu.c
+++ b/src/gpu/compute_add_sources_elastic_gpu.c
@@ -221,9 +221,11 @@ void FC_FUNC_ (compute_add_sources_adjoint_gpu,
     cl_event *copy_evt = NULL;
     cl_uint num_evt = 0;
 
-    if (GPU_ASYNC_COPY && mp->has_last_copy_evt) {
-      copy_evt = &mp->last_copy_evt;
-      num_evt = 1;
+    if (GPU_ASYNC_COPY){
+      if (mp->has_last_copy_evt) {
+        copy_evt = &mp->last_copy_evt;
+        num_evt = 1;
+      }
     }
 
     clCheck (clSetKernelArg (mocl.kernels.compute_add_sources_adjoint_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_accel_crust_mantle.ocl));
@@ -245,9 +247,11 @@ void FC_FUNC_ (compute_add_sources_adjoint_gpu,
     clCheck (clEnqueueNDRangeKernel (mocl.command_queue, mocl.kernels.compute_add_sources_adjoint_kernel, 3, NULL,
                                      global_work_size, local_work_size, num_evt, copy_evt, NULL));
 
-    if (GPU_ASYNC_COPY && mp->has_last_copy_evt) {
-      clCheck (clReleaseEvent (mp->last_copy_evt));
-      mp->has_last_copy_evt = 0;
+    if (GPU_ASYNC_COPY){
+      if (mp->has_last_copy_evt) {
+        clCheck (clReleaseEvent (mp->last_copy_evt));
+        mp->has_last_copy_evt = 0;
+      }
     }
   }
 #endif
@@ -380,7 +384,7 @@ void FC_FUNC_(transfer_adj_to_device_async,
   }
 
   // checks async-memcpy
-  if (GPU_ASYNC_COPY == 0) {
+  if (! GPU_ASYNC_COPY ) {
     exit_on_error("transfer_adj_to_device_async must be called with GPU_ASYNC_COPY == 1, \
 please check mesh_constants_cuda.h");
   }
diff --git a/src/gpu/mesh_constants_gpu.h b/src/gpu/mesh_constants_gpu.h
index c4d4c42..38bbb40 100644
--- a/src/gpu/mesh_constants_gpu.h
+++ b/src/gpu/mesh_constants_gpu.h
@@ -150,9 +150,8 @@ typedef float realw;
 //#define R_EARTH_KM 6368.0f
 
 // Asynchronous memory copies between GPU and CPU
-#ifndef GPU_ASYNC_COPY
+// (set to 0 for synchronous/blocking copies, set to 1 for asynchronous copies)
 #define GPU_ASYNC_COPY 1
-#endif
 
 /*----------------------------------------------------------------------------------------------- */
 
@@ -942,7 +941,7 @@ void exit_on_gpu_error (char *kernel_name);
 void exit_on_error (char *info);
 void synchronize_mpi ();
 void get_blocks_xy (int num_blocks, int *num_blocks_x, int *num_blocks_y);
-realw get_device_array_maximum_value (Mesh *mp, gpu_realw_mem *d_array, int size);
+realw get_device_array_maximum_value (gpu_realw_mem *d_array, int size);
 
 /* ----------------------------------------------------------------------------------------------- */
 
diff --git a/src/gpu/prepare_mesh_constants_gpu.c b/src/gpu/prepare_mesh_constants_gpu.c
index fa13260..0b7a213 100644
--- a/src/gpu/prepare_mesh_constants_gpu.c
+++ b/src/gpu/prepare_mesh_constants_gpu.c
@@ -3375,34 +3375,44 @@ void FC_FUNC_ (prepare_cleanup_device,
 
   if (mp->nrec_local > 0) {
 #ifdef USE_OPENCL
-    if (run_opencl && GPU_ASYNC_COPY) {
-      RELEASE_PINNED_BUFFER_OCL (station_seismo_field);
-    } else
+    if (run_opencl ){
+      if (GPU_ASYNC_COPY) {
+        RELEASE_PINNED_BUFFER_OCL (station_seismo_field);
+      } else {
+        free (mp->h_station_seismo_field);
+      }
+    }
 #endif
 #ifdef USE_CUDA
-    if (run_cuda && GPU_ASYNC_COPY) {
-      cudaFreeHost(mp->h_station_seismo_field);
-    } else
-#endif
-    {
-      free (mp->h_station_seismo_field);
+    if (run_cuda ){
+      if (GPU_ASYNC_COPY) {
+        cudaFreeHost(mp->h_station_seismo_field);
+      } else {
+        free (mp->h_station_seismo_field);
+      }
     }
+#endif
   }
 
   if (mp->nadj_rec_local > 0) {
 #ifdef USE_OPENCL
-    if (run_opencl && GPU_ASYNC_COPY) {
-      RELEASE_PINNED_BUFFER_OCL (adj_sourcearrays_slice);
+    if (run_opencl){
+      if (GPU_ASYNC_COPY) {
+        RELEASE_PINNED_BUFFER_OCL (adj_sourcearrays_slice);
+      } else {
+        free (mp->h_adj_sourcearrays_slice);
+      }
     }
 #endif
 #ifdef USE_CUDA
-    if (run_cuda && GPU_ASYNC_COPY) {
-      cudaFreeHost(mp->h_adj_sourcearrays_slice);
-    } else
-#endif
-    {
-      free (mp->h_adj_sourcearrays_slice);
+    if (run_cuda){
+      if (GPU_ASYNC_COPY) {
+        cudaFreeHost(mp->h_adj_sourcearrays_slice);
+      } else {
+        free (mp->h_adj_sourcearrays_slice);
+      }
     }
+#endif
   }
 
 #ifdef USE_OPENCL
diff --git a/src/gpu/update_displacement_gpu.c b/src/gpu/update_displacement_gpu.c
index edd929d..09e38cb 100644
--- a/src/gpu/update_displacement_gpu.c
+++ b/src/gpu/update_displacement_gpu.c
@@ -44,13 +44,12 @@ void FC_FUNC_ (update_displacement_ic_gpu,
 
   int size = NDIM * mp->NGLOB_INNER_CORE;
 
-  //debug
-
 #if DEBUG_BACKWARD_SIMULATIONS == 1 && DEBUG == 1
+  //debug
   realw max_d, max_v, max_a;
-  max_d = get_device_array_maximum_value (mp, mp->d_b_displ_inner_core, size);
-  max_v = get_device_array_maximum_value (mp, mp->d_b_veloc_inner_core, size);
-  max_a = get_device_array_maximum_value (mp, mp->d_b_accel_inner_core, size);
+  max_d = get_device_array_maximum_value(&mp->d_b_displ_inner_core, size);
+  max_v = get_device_array_maximum_value(&mp->d_b_veloc_inner_core, size);
+  max_a = get_device_array_maximum_value(&mp->d_b_accel_inner_core, size);
   printf ("rank %d - max inner_core displ: %f veloc: %f accel: %f\n", mp->myrank, max_d, max_v, max_a);
   fflush (stdout);
   synchronize_mpi ();
@@ -163,9 +162,9 @@ void FC_FUNC_ (update_displacement_cm_gpu,
 
 #if DEBUG_BACKWARD_SIMULATIONS == 1 && DEBUG == 1
   realw max_d, max_v, max_a;
-  max_d = get_device_array_maximum_value (mp, mp->d_b_displ_crust_mantle, size);
-  max_v = get_device_array_maximum_value (mp, mp->d_b_veloc_crust_mantle, size);
-  max_a = get_device_array_maximum_value (mp, mp->d_b_accel_crust_mantle, size);
+  max_d = get_device_array_maximum_value(&mp->d_b_displ_crust_mantle, size);
+  max_v = get_device_array_maximum_value(&mp->d_b_veloc_crust_mantle, size);
+  max_a = get_device_array_maximum_value(&mp->d_b_accel_crust_mantle, size);
   printf ("rank %d - max crust_mantle displ: %f veloc: %f accel: %f\n", mp->myrank, max_d, max_v, max_a);
   fflush (stdout);
   synchronize_mpi ();
@@ -275,9 +274,9 @@ void FC_FUNC_ (update_displacement_oc_gpu,
 
 #if DEBUG_BACKWARD_SIMULATIONS == 1 && DEBUG == 1
   realw max_d, max_v, max_a;
-  max_d = get_device_array_maximum_value (mp, mp->d_b_displ_outer_core, size);
-  max_v = get_device_array_maximum_value (mp, mp->d_b_veloc_outer_core, size);
-  max_a = get_device_array_maximum_value (mp, mp->d_b_accel_outer_core, size);
+  max_d = get_device_array_maximum_value(&mp->d_b_displ_outer_core, size);
+  max_v = get_device_array_maximum_value(&mp->d_b_veloc_outer_core, size);
+  max_a = get_device_array_maximum_value(&mp->d_b_accel_outer_core, size);
   printf ("rank %d - max outer_core displ: %f veloc: %f accel: %f\n", mp->myrank, max_d, max_v, max_a);
   fflush (stdout);
   synchronize_mpi ();
diff --git a/src/gpu/write_seismograms_gpu.c b/src/gpu/write_seismograms_gpu.c
index 8919c19..d94b5b4 100644
--- a/src/gpu/write_seismograms_gpu.c
+++ b/src/gpu/write_seismograms_gpu.c
@@ -63,9 +63,11 @@ void write_seismograms_transfer_from_device (Mesh *mp,
     cl_event *copy_evt = NULL;
     cl_uint num_evt = 0;
 
-    if (GPU_ASYNC_COPY && mp->has_last_copy_evt) {
-      copy_evt = &mp->last_copy_evt;
-      num_evt = 1;
+    if (GPU_ASYNC_COPY ){
+      if (mp->has_last_copy_evt) {
+        copy_evt = &mp->last_copy_evt;
+        num_evt = 1;
+      }
     }
 
     clCheck (clSetKernelArg (mocl.kernels.write_seismograms_transfer_from_device_kernel, idx++, sizeof (cl_mem), (void *) &mp->d_number_receiver_global.ocl));
@@ -369,7 +371,7 @@ void FC_FUNC_(transfer_seismo_from_device_async,
   }
 
   // checks async-memcpy
-  if (GPU_ASYNC_COPY ==  0){
+  if (! GPU_ASYNC_COPY ){
     exit_on_error("transfer_seismo_from_device_async must be called with GPU_ASYNC_COPY == 1, please check mesh_constants_cuda.h");
   }
 
diff --git a/src/specfem3D/check_stability.f90 b/src/specfem3D/check_stability.f90
index 2a7ed00..09dd09a 100644
--- a/src/specfem3D/check_stability.f90
+++ b/src/specfem3D/check_stability.f90
@@ -96,7 +96,6 @@
                     displ_inner_core(3,:)**2)))
 
     Ufluidnorm = maxval(abs(displ_outer_core))
-
   else
     ! on GPU
     ! way 2: just get maximum of fields from GPU