[cig-commits] r22998 - in seismo/3D/SPECFEM3D_GLOBE/trunk/src: cuda meshfem3D specfem3D

danielpeter at geodynamics.org
Mon Feb 17 07:57:44 PST 2014


Author: danielpeter
Date: 2014-02-17 07:57:43 -0800 (Mon, 17 Feb 2014)
New Revision: 22998

Modified:
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/initialize_cuda.cu
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/setup_model.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/initialize_simulation.f90
   seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/prepare_timerun.f90
Log:
adds hybrid initialization for GPU mode (turned off by default); re-adds error check on GPU initialization

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/initialize_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/initialize_cuda.cu	2014-02-12 15:08:13 UTC (rev 22997)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/initialize_cuda.cu	2014-02-17 15:57:43 UTC (rev 22998)
@@ -88,13 +88,18 @@
   // Gets number of GPU devices
   device_count = 0;
   cudaGetDeviceCount(&device_count);
-  // Do not check if command failed: 
-  // `exit_on_cuda_error` call cudaDevice/ThreadSynchronize. If multiple 
-  // MPI tasks access multiple GPUs per node, they will try to synchronize
-  // GPU 0 and depending on the order of the calls error will be raised
-  // when setting the device number. If MPS is enabled, some GPUs will silently
-  // not be used.
-
+  // Do not check whether the command failed with `exit_on_cuda_error`, since that calls
+  // cudaDeviceSynchronize()/cudaThreadSynchronize(): if multiple MPI tasks access multiple
+  // GPUs per node, they will all try to synchronize GPU 0 and, depending on the order of
+  // the calls, an error will be raised when setting the device number. If MPS is enabled,
+  // some GPUs will silently not be used.
+  //
+  // Instead, be verbose and catch the error from this first call to a CUDA runtime
+  // function, without any synchronizing call.
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess){
+    fprintf(stderr,"Error after cudaGetDeviceCount: %s\n", cudaGetErrorString(err));
+    exit_on_error("CUDA runtime error: cudaGetDeviceCount failed\n\nplease check if driver and runtime libraries work together\nor on Titan, set the environment variable CRAY_CUDA_PROXY=1 to use a single GPU with multiple MPI processes\n\nexiting...\n");
+  }
+
   // returns device count to fortran
   if (device_count == 0) exit_on_error("CUDA runtime error: there is no device supporting CUDA\n");
   *ncuda_devices = device_count;
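
For context, the pattern added above checks the CUDA error state right after the first runtime call instead of going through a synchronizing helper. A minimal standalone sketch of the same pattern follows; exit_on_error here is a hypothetical stand-in for the solver's own abort routine, included only to keep the sketch self-contained:

    // minimal sketch: non-synchronizing error check after the first CUDA runtime call
    #include <stdio.h>
    #include <stdlib.h>
    #include <cuda_runtime.h>

    // hypothetical stand-in for the solver's abort routine
    static void exit_on_error(const char *msg) {
      fprintf(stderr, "%s", msg);
      exit(EXIT_FAILURE);
    }

    int main(void) {
      int device_count = 0;
      cudaGetDeviceCount(&device_count);

      // peek at the error state without forcing a device synchronization,
      // so MPI tasks sharing a node never serialize on GPU 0
      cudaError_t err = cudaGetLastError();
      if (err != cudaSuccess) {
        fprintf(stderr, "Error after cudaGetDeviceCount: %s\n", cudaGetErrorString(err));
        exit_on_error("CUDA runtime error: cudaGetDeviceCount failed\n");
      }

      printf("found %d CUDA device(s)\n", device_count);
      return 0;
    }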

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/setup_model.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/setup_model.f90	2014-02-12 15:08:13 UTC (rev 22997)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/setup_model.f90	2014-02-17 15:57:43 UTC (rev 22998)
@@ -41,27 +41,24 @@
 
   ! creates global slice addressing for solver
   call create_addressing(myrank,NCHUNKS,NPROC,NPROC_ETA,NPROC_XI,NPROCTOT, &
-                        addressing,ichunk_slice,iproc_xi_slice,iproc_eta_slice, &
-                        OUTPUT_FILES)
+                         addressing,ichunk_slice,iproc_xi_slice,iproc_eta_slice, &
+                         OUTPUT_FILES)
 
-
   ! this for the different counters (which are now different if the superbrick is cut in the outer core)
   call setup_counters(myrank, &
-                        NSPEC1D_RADIAL,NSPEC2D_XI,NSPEC2D_ETA,NGLOB1D_RADIAL, &
-                        DIFF_NSPEC1D_RADIAL,DIFF_NSPEC2D_XI,DIFF_NSPEC2D_ETA, &
-                        CUT_SUPERBRICK_XI,CUT_SUPERBRICK_ETA, &
-                        NPROCTOT,iproc_xi_slice,iproc_eta_slice, &
-                        NSPEC1D_RADIAL_CORNER,NSPEC2D_XI_FACE, &
-                        NSPEC2D_ETA_FACE,NGLOB1D_RADIAL_CORNER)
+                      NSPEC1D_RADIAL,NSPEC2D_XI,NSPEC2D_ETA,NGLOB1D_RADIAL, &
+                      DIFF_NSPEC1D_RADIAL,DIFF_NSPEC2D_XI,DIFF_NSPEC2D_ETA, &
+                      CUT_SUPERBRICK_XI,CUT_SUPERBRICK_ETA, &
+                      NPROCTOT,iproc_xi_slice,iproc_eta_slice, &
+                      NSPEC1D_RADIAL_CORNER,NSPEC2D_XI_FACE, &
+                      NSPEC2D_ETA_FACE,NGLOB1D_RADIAL_CORNER)
 
-
   ! distributes 3D models
   call meshfem3D_models_broadcast(myrank,NSPEC, &
-                                MIN_ATTENUATION_PERIOD,MAX_ATTENUATION_PERIOD,&
-                                R80,R220,R670,RCMB,RICB, &
-                                LOCAL_PATH)
+                                  MIN_ATTENUATION_PERIOD,MAX_ATTENUATION_PERIOD,&
+                                  R80,R220,R670,RCMB,RICB, &
+                                  LOCAL_PATH)
 
-
   ! user output
   if(myrank == 0 ) then
     write(IMAIN,*)

Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/initialize_simulation.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/initialize_simulation.f90	2014-02-12 15:08:13 UTC (rev 22997)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/initialize_simulation.f90	2014-02-17 15:57:43 UTC (rev 22998)
@@ -232,7 +232,7 @@
   if(nrec < 1) call exit_MPI(myrank,trim(STATIONS)//': need at least one receiver')
 
   ! initializes GPU cards
-  if( GPU_MODE ) call initialize_GPU()
+  call initialize_GPU()
 
   ! initializes VTK window
   if( VTK_MODE ) then
@@ -450,36 +450,86 @@
   implicit none
   ! local parameters
   integer :: ncuda_devices,ncuda_devices_min,ncuda_devices_max
+  integer :: iproc
 
-  ! GPU_MODE now defined in Par_file
-  if(myrank == 0 ) then
-    write(IMAIN,*)
-    write(IMAIN,*) "GPU_MODE Active."
+  !----------------------------------------------------------------
+  ! user test parameters
+  !
+  ! for hybrid computing: distributes MPI processes between CPU and GPU
+  ! note that a single MPI process on a GPU is more than ~30x faster than on a single CPU core
+  !
+  ! Cray XK7 node: 16-core CPU, 1 K20x GPU card
+  !                using 15 processes on a single GPU and 1 process on the CPU still slows down
+  !                the computation, as the GPU slices will be waiting for the CPU one...
+  !
+  ! turns on/off hybrid CPU-GPU computing
+  logical,parameter :: USE_HYBRID_CPU_GPU = .false.
+  ! total number of mpi processes run on a single node
+  integer, parameter :: TOTAL_PROCESSES_PER_NODE = 16
+  ! number of mpi processes run on CPU-cores
+  integer, parameter :: PROCESSES_PER_CPU = 1
+
+  !----------------------------------------------------------------
+
+  if( GPU_MODE .and. USE_HYBRID_CPU_GPU ) then
+    ! distributes processes on GPU and CPU
+    if( mod(myrank,TOTAL_PROCESSES_PER_NODE) < PROCESSES_PER_CPU ) then
+      ! turns off GPU mode for this process
+      GPU_MODE = .false.
+    else
+      ! leaves GPU_MODE == .true.
+      continue
+    endif
+
+    ! user output
+    if( myrank == 0 ) print*,'Hybrid CPU-GPU computation:'
+    do iproc = 0, NPROCTOT_VAL-1
+      if( myrank == iproc ) then
+        if( myrank < TOTAL_PROCESSES_PER_NODE ) then
+          print*,'rank ',myrank,' has GPU_MODE set to ',GPU_MODE
+        endif
+      endif
+      call synchronize_all()
+    enddo
   endif
 
-  ! check for GPU runs
-  if( NGLLX /= 5 .or. NGLLY /= 5 .or. NGLLZ /= 5 ) &
-    stop 'GPU mode can only be used if NGLLX == NGLLY == NGLLZ == 5'
-  if( CUSTOM_REAL /= 4 ) &
-    stop 'GPU mode runs only with CUSTOM_REAL == 4'
-  if( ATTENUATION_VAL ) then
-    if( N_SLS /= 3 ) &
-      stop 'GPU mode does not support N_SLS /= 3 yet'
+  ! initializes number of local cuda devices
+  ncuda_devices = 0
+
+  ! GPU_MODE now defined in Par_file
+  if( GPU_MODE ) then
+    if(myrank == 0 ) then
+      write(IMAIN,*)
+      write(IMAIN,*) "GPU_MODE Active."
+      call flush_IMAIN()
+    endif
+
+    ! check for GPU runs
+    if( NGLLX /= 5 .or. NGLLY /= 5 .or. NGLLZ /= 5 ) &
+      stop 'GPU mode can only be used if NGLLX == NGLLY == NGLLZ == 5'
+    if( CUSTOM_REAL /= 4 ) &
+      stop 'GPU mode runs only with CUSTOM_REAL == 4'
+    if( ATTENUATION_VAL ) then
+      if( N_SLS /= 3 ) &
+        stop 'GPU mode does not support N_SLS /= 3 yet'
+    endif
+
+    ! initializes GPU and outputs info to files for all processes
+    call initialize_cuda_device(myrank,ncuda_devices)
   endif
 
-  ! initializes GPU and outputs info to files for all processes
-  call initialize_cuda_device(myrank,ncuda_devices)
-
   ! collects min/max of local devices found for statistics
   call synchronize_all()
   call min_all_i(ncuda_devices,ncuda_devices_min)
   call max_all_i(ncuda_devices,ncuda_devices_max)
 
-  if( myrank == 0 ) then
-    write(IMAIN,*) "GPU number of devices per node: min =",ncuda_devices_min
-    write(IMAIN,*) "                                max =",ncuda_devices_max
-    write(IMAIN,*)
-    call flush_IMAIN()
+  if( GPU_MODE ) then
+    if( myrank == 0 ) then
+      write(IMAIN,*) "GPU number of devices per node: min =",ncuda_devices_min
+      write(IMAIN,*) "                                max =",ncuda_devices_max
+      write(IMAIN,*)
+      call flush_IMAIN()
+    endif
   endif
 
   end subroutine initialize_GPU
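
To see how the hybrid switch above partitions ranks, the test mod(myrank,TOTAL_PROCESSES_PER_NODE) < PROCESSES_PER_CPU can be traced in isolation. A minimal sketch in plain C (the constants mirror the Fortran parameters above; no MPI is needed just to trace the mapping):

    // standalone trace of the hybrid CPU/GPU rank assignment
    // (constants mirror the Fortran parameters: Cray XK7, 16 cores + 1 GPU per node)
    #include <stdio.h>
    #include <stdbool.h>

    #define TOTAL_PROCESSES_PER_NODE 16
    #define PROCESSES_PER_CPU         1

    int main(void) {
      // with these settings, ranks 0, 16, 32, ... stay on the CPU;
      // every other rank keeps GPU_MODE enabled
      for (int myrank = 0; myrank < 32; myrank++) {
        bool gpu_mode = (myrank % TOTAL_PROCESSES_PER_NODE) >= PROCESSES_PER_CPU;
        printf("rank %2d has GPU_MODE set to %s\n", myrank, gpu_mode ? "T" : "F");
      }
      return 0;
    }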

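The min_all_i()/max_all_i() calls above collect the per-rank device counts for the statistics printed by rank 0. Assuming they wrap the usual MPI reductions (a sketch under that assumption, not the solver's actual wrappers), the equivalent in plain MPI C is:

    // plain-MPI sketch of the min/max device-count statistics
    #include <stdio.h>
    #include <mpi.h>

    int main(int argc, char **argv) {
      MPI_Init(&argc, &argv);
      int myrank;
      MPI_Comm_rank(MPI_COMM_WORLD, &myrank);

      int ncuda_devices = 0;  // per-rank count; stays 0 when GPU_MODE is off
      // ... on GPU ranks, set ncuda_devices from the CUDA runtime ...

      int ncuda_devices_min = 0, ncuda_devices_max = 0;
      MPI_Reduce(&ncuda_devices, &ncuda_devices_min, 1, MPI_INT, MPI_MIN, 0, MPI_COMM_WORLD);
      MPI_Reduce(&ncuda_devices, &ncuda_devices_max, 1, MPI_INT, MPI_MAX, 0, MPI_COMM_WORLD);

      if (myrank == 0)
        printf("GPU number of devices per node: min = %d, max = %d\n",
               ncuda_devices_min, ncuda_devices_max);

      MPI_Finalize();
      return 0;
    }
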
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/prepare_timerun.f90	2014-02-12 15:08:13 UTC (rev 22997)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/prepare_timerun.f90	2014-02-17 15:57:43 UTC (rev 22998)
@@ -1876,11 +1876,11 @@
   if(myrank == 0 ) write(IMAIN,*) "  loading non-gravity/gravity arrays"
 
   allocate(cr_d_ln_density_dr_table(NRAD_GRAVITY), &
-          cr_minus_rho_g_over_kappa_fluid(NRAD_GRAVITY), &
-          cr_minus_gravity_table(NRAD_GRAVITY), &
-          cr_minus_deriv_gravity_table(NRAD_GRAVITY), &
-          cr_density_table(NRAD_GRAVITY), &
-          stat=ier)
+           cr_minus_rho_g_over_kappa_fluid(NRAD_GRAVITY), &
+           cr_minus_gravity_table(NRAD_GRAVITY), &
+           cr_minus_deriv_gravity_table(NRAD_GRAVITY), &
+           cr_density_table(NRAD_GRAVITY), &
+           stat=ier)
   if( ier /= 0 ) stop 'error allocating cr_minus_rho_g_over_kappa_fluid, etc...'
 
   allocate(cr_wgll_cube(NGLLX,NGLLY,NGLLZ),stat=ier)
@@ -2126,6 +2126,7 @@
 
   ! outer core region
   if(myrank == 0 ) write(IMAIN,*) "  loading outer core region"
+
   call prepare_outer_core_device(Mesh_pointer, &
                                 xix_outer_core,xiy_outer_core,xiz_outer_core, &
                                 etax_outer_core,etay_outer_core,etaz_outer_core, &
@@ -2150,6 +2151,7 @@
 
   ! inner core region
   if(myrank == 0 ) write(IMAIN,*) "  loading inner core region"
+
   call prepare_inner_core_device(Mesh_pointer, &
                                  xix_inner_core,xiy_inner_core,xiz_inner_core, &
                                  etax_inner_core,etay_inner_core,etaz_inner_core, &
@@ -2171,6 +2173,7 @@
 
   ! transfer forward and backward fields to device with initial values
   if(myrank == 0 ) write(IMAIN,*) "  transfering initial wavefield"
+
   call transfer_fields_cm_to_device(NDIM*NGLOB_CRUST_MANTLE,displ_crust_mantle,veloc_crust_mantle,accel_crust_mantle, &
                                    Mesh_pointer)
 
@@ -2194,9 +2197,6 @@
                                     Mesh_pointer)
   endif
 
-  ! synchronizes processes
-  call synchronize_all()
-
   ! outputs GPU usage to files for all processes
   call output_free_device_memory(myrank)
 


