[cig-commits] r22998 - in seismo/3D/SPECFEM3D_GLOBE/trunk/src: cuda meshfem3D specfem3D
danielpeter at geodynamics.org
Mon Feb 17 07:57:44 PST 2014
Author: danielpeter
Date: 2014-02-17 07:57:43 -0800 (Mon, 17 Feb 2014)
New Revision: 22998
Modified:
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/initialize_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/setup_model.f90
seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/initialize_simulation.f90
seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/prepare_timerun.f90
Log:
adds hybrid initialization for GPU mode (turned off by default); re-adds error check on GPU initialization
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/initialize_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/initialize_cuda.cu 2014-02-12 15:08:13 UTC (rev 22997)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/initialize_cuda.cu 2014-02-17 15:57:43 UTC (rev 22998)
@@ -88,13 +88,18 @@
// Gets number of GPU devices
device_count = 0;
cudaGetDeviceCount(&device_count);
- // Do not check if command failed:
- // `exit_on_cuda_error` call cudaDevice/ThreadSynchronize. If multiple
- // MPI tasks access multiple GPUs per node, they will try to synchronize
- // GPU 0 and depending on the order of the calls error will be raised
- // when setting the device number. If MPS is enabled, some GPUs will silently
- // not be used.
-
+ // Do not check if this command failed with `exit_on_cuda_error`, since that routine calls cudaDeviceSynchronize()/cudaThreadSynchronize():
+ // If multiple MPI tasks access multiple GPUs per node, they will try to synchronize
+ // GPU 0 and depending on the order of the calls, an error will be raised
+ // when setting the device number. If MPS is enabled, some GPUs will silently not be used.
+ //
+ // be verbose and catch the error from this first call to a CUDA runtime function, without issuing a synchronize call
+ cudaError_t err = cudaGetLastError();
+ if (err != cudaSuccess){
+ fprintf(stderr,"Error after cudaGetDeviceCount: %s\n", cudaGetErrorString(err));
+ exit_on_error("CUDA runtime error: cudaGetDeviceCount failed\n\nplease check if driver and runtime libraries work together\nor on titan enable environment: CRAY_CUDA_PROXY=1 to use single GPU with multiple MPI processes\n\nexiting...\n");
+ }
+
// returns device count to fortran
if (device_count == 0) exit_on_error("CUDA runtime error: there is no device supporting CUDA\n");
*ncuda_devices = device_count;
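For reference, a minimal standalone sketch (not the SPECFEM3D_GLOBE code itself) of the error-check pattern added above: the status of cudaGetDeviceCount() is read back with cudaGetLastError(), so no synchronizing call is issued on a device the rank may not own:

  #include <stdio.h>
  #include <stdlib.h>
  #include <cuda_runtime.h>

  int main(void) {
    int device_count = 0;
    cudaGetDeviceCount(&device_count);

    /* query only the error flag of the previous runtime call;
       no cudaDeviceSynchronize() is issued here */
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
      fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
      return EXIT_FAILURE;
    }
    if (device_count == 0) {
      fprintf(stderr, "no CUDA-capable device found\n");
      return EXIT_FAILURE;
    }
    printf("found %d CUDA device(s)\n", device_count);
    return EXIT_SUCCESS;
  }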
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/setup_model.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/setup_model.f90 2014-02-12 15:08:13 UTC (rev 22997)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/setup_model.f90 2014-02-17 15:57:43 UTC (rev 22998)
@@ -41,27 +41,24 @@
! creates global slice addressing for solver
call create_addressing(myrank,NCHUNKS,NPROC,NPROC_ETA,NPROC_XI,NPROCTOT, &
- addressing,ichunk_slice,iproc_xi_slice,iproc_eta_slice, &
- OUTPUT_FILES)
+ addressing,ichunk_slice,iproc_xi_slice,iproc_eta_slice, &
+ OUTPUT_FILES)
-
! this for the different counters (which are now different if the superbrick is cut in the outer core)
call setup_counters(myrank, &
- NSPEC1D_RADIAL,NSPEC2D_XI,NSPEC2D_ETA,NGLOB1D_RADIAL, &
- DIFF_NSPEC1D_RADIAL,DIFF_NSPEC2D_XI,DIFF_NSPEC2D_ETA, &
- CUT_SUPERBRICK_XI,CUT_SUPERBRICK_ETA, &
- NPROCTOT,iproc_xi_slice,iproc_eta_slice, &
- NSPEC1D_RADIAL_CORNER,NSPEC2D_XI_FACE, &
- NSPEC2D_ETA_FACE,NGLOB1D_RADIAL_CORNER)
+ NSPEC1D_RADIAL,NSPEC2D_XI,NSPEC2D_ETA,NGLOB1D_RADIAL, &
+ DIFF_NSPEC1D_RADIAL,DIFF_NSPEC2D_XI,DIFF_NSPEC2D_ETA, &
+ CUT_SUPERBRICK_XI,CUT_SUPERBRICK_ETA, &
+ NPROCTOT,iproc_xi_slice,iproc_eta_slice, &
+ NSPEC1D_RADIAL_CORNER,NSPEC2D_XI_FACE, &
+ NSPEC2D_ETA_FACE,NGLOB1D_RADIAL_CORNER)
-
! distributes 3D models
call meshfem3D_models_broadcast(myrank,NSPEC, &
- MIN_ATTENUATION_PERIOD,MAX_ATTENUATION_PERIOD,&
- R80,R220,R670,RCMB,RICB, &
- LOCAL_PATH)
+ MIN_ATTENUATION_PERIOD,MAX_ATTENUATION_PERIOD,&
+ R80,R220,R670,RCMB,RICB, &
+ LOCAL_PATH)
-
! user output
if(myrank == 0 ) then
write(IMAIN,*)
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/initialize_simulation.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/initialize_simulation.f90 2014-02-12 15:08:13 UTC (rev 22997)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/initialize_simulation.f90 2014-02-17 15:57:43 UTC (rev 22998)
@@ -232,7 +232,7 @@
if(nrec < 1) call exit_MPI(myrank,trim(STATIONS)//': need at least one receiver')
! initializes GPU cards
- if( GPU_MODE ) call initialize_GPU()
+ call initialize_GPU()
! initializes VTK window
if( VTK_MODE ) then
@@ -450,36 +450,86 @@
implicit none
! local parameters
integer :: ncuda_devices,ncuda_devices_min,ncuda_devices_max
+ integer :: iproc
- ! GPU_MODE now defined in Par_file
- if(myrank == 0 ) then
- write(IMAIN,*)
- write(IMAIN,*) "GPU_MODE Active."
+ !----------------------------------------------------------------
+ ! user test parameters
+ !
+ ! for hybrid computing: distributes mpi processes to use CPU and GPU
+ ! note that a single mpi process on a GPU is typically more than 30x faster than on a single CPU core
+ !
+ ! cray xk7 node: 16-core CPU, 1 K20x GPU card
+ ! using 15 processes on a single GPU and 1 process on the CPU still slows down the computation
+ ! as the GPU slices will be waiting for the CPU one...
+ !
+ ! turns on/off hybrid CPU-GPU computing
+ logical,parameter :: USE_HYBRID_CPU_GPU = .false.
+ ! total number of mpi processes run on a single node
+ integer, parameter :: TOTAL_PROCESSES_PER_NODE = 16
+ ! number of mpi processes run on CPU-cores
+ integer, parameter :: PROCESSES_PER_CPU = 1
+
+ !----------------------------------------------------------------
+
+ if( GPU_MODE .and. USE_HYBRID_CPU_GPU ) then
+ ! distributes processes on GPU and CPU
+ if( mod(myrank,TOTAL_PROCESSES_PER_NODE) < PROCESSES_PER_CPU ) then
+ ! turns off GPU mode for this process
+ GPU_MODE = .false.
+ else
+ !leaves GPU_MODE == .true.
+ continue
+ endif
+
+ ! user output
+ if( myrank == 0 ) print*,'Hybrid CPU-GPU computation:'
+ do iproc = 0, NPROCTOT_VAL-1
+ if( myrank == iproc ) then
+ if( myrank < TOTAL_PROCESSES_PER_NODE ) then
+ print*,'rank ',myrank,' has GPU_MODE set to ',GPU_MODE
+ endif
+ endif
+ call synchronize_all()
+ enddo
endif
- ! check for GPU runs
- if( NGLLX /= 5 .or. NGLLY /= 5 .or. NGLLZ /= 5 ) &
- stop 'GPU mode can only be used if NGLLX == NGLLY == NGLLZ == 5'
- if( CUSTOM_REAL /= 4 ) &
- stop 'GPU mode runs only with CUSTOM_REAL == 4'
- if( ATTENUATION_VAL ) then
- if( N_SLS /= 3 ) &
- stop 'GPU mode does not support N_SLS /= 3 yet'
+ ! initializes number of local cuda devices
+ ncuda_devices = 0
+
+ ! GPU_MODE now defined in Par_file
+ if( GPU_MODE ) then
+ if(myrank == 0 ) then
+ write(IMAIN,*)
+ write(IMAIN,*) "GPU_MODE Active."
+ call flush_IMAIN()
+ endif
+
+ ! check for GPU runs
+ if( NGLLX /= 5 .or. NGLLY /= 5 .or. NGLLZ /= 5 ) &
+ stop 'GPU mode can only be used if NGLLX == NGLLY == NGLLZ == 5'
+ if( CUSTOM_REAL /= 4 ) &
+ stop 'GPU mode runs only with CUSTOM_REAL == 4'
+ if( ATTENUATION_VAL ) then
+ if( N_SLS /= 3 ) &
+ stop 'GPU mode does not support N_SLS /= 3 yet'
+ endif
+
+ ! initializes GPU and outputs info to files for all processes
+ call initialize_cuda_device(myrank,ncuda_devices)
endif
- ! initializes GPU and outputs info to files for all processes
- call initialize_cuda_device(myrank,ncuda_devices)
-
! collects min/max of local devices found for statistics
call synchronize_all()
call min_all_i(ncuda_devices,ncuda_devices_min)
call max_all_i(ncuda_devices,ncuda_devices_max)
- if( myrank == 0 ) then
- write(IMAIN,*) "GPU number of devices per node: min =",ncuda_devices_min
- write(IMAIN,*) " max =",ncuda_devices_max
- write(IMAIN,*)
- call flush_IMAIN()
+ if( GPU_MODE ) then
+ if( myrank == 0 ) then
+ write(IMAIN,*) "GPU number of devices per node: min =",ncuda_devices_min
+ write(IMAIN,*) " max =",ncuda_devices_max
+ write(IMAIN,*)
+ call flush_IMAIN()
+ endif
endif
end subroutine initialize_GPU
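To illustrate the hybrid distribution rule introduced above, here is a small standalone sketch (assumed names, not a SPECFEM3D_GLOBE routine) of how the modulo test maps MPI ranks on a node to CPU or GPU; with TOTAL_PROCESSES_PER_NODE = 16 and PROCESSES_PER_CPU = 1, rank 0 of each node-sized block stays on the CPU and the remaining 15 ranks use the GPU:

  #include <stdio.h>

  #define TOTAL_PROCESSES_PER_NODE 16
  #define PROCESSES_PER_CPU 1

  /* same test as the Fortran code above: the first PROCESSES_PER_CPU ranks
     of each node-sized block are kept on the CPU, the rest run on the GPU */
  static int runs_on_gpu(int myrank) {
    return (myrank % TOTAL_PROCESSES_PER_NODE) >= PROCESSES_PER_CPU;
  }

  int main(void) {
    for (int myrank = 0; myrank < 2 * TOTAL_PROCESSES_PER_NODE; myrank++)
      printf("rank %2d -> %s\n", myrank, runs_on_gpu(myrank) ? "GPU" : "CPU");
    return 0;
  }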
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/prepare_timerun.f90 2014-02-12 15:08:13 UTC (rev 22997)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/prepare_timerun.f90 2014-02-17 15:57:43 UTC (rev 22998)
@@ -1876,11 +1876,11 @@
if(myrank == 0 ) write(IMAIN,*) " loading non-gravity/gravity arrays"
allocate(cr_d_ln_density_dr_table(NRAD_GRAVITY), &
- cr_minus_rho_g_over_kappa_fluid(NRAD_GRAVITY), &
- cr_minus_gravity_table(NRAD_GRAVITY), &
- cr_minus_deriv_gravity_table(NRAD_GRAVITY), &
- cr_density_table(NRAD_GRAVITY), &
- stat=ier)
+ cr_minus_rho_g_over_kappa_fluid(NRAD_GRAVITY), &
+ cr_minus_gravity_table(NRAD_GRAVITY), &
+ cr_minus_deriv_gravity_table(NRAD_GRAVITY), &
+ cr_density_table(NRAD_GRAVITY), &
+ stat=ier)
if( ier /= 0 ) stop 'error allocating cr_minus_rho_g_over_kappa_fluid, etc...'
allocate(cr_wgll_cube(NGLLX,NGLLY,NGLLZ),stat=ier)
@@ -2126,6 +2126,7 @@
! outer core region
if(myrank == 0 ) write(IMAIN,*) " loading outer core region"
+
call prepare_outer_core_device(Mesh_pointer, &
xix_outer_core,xiy_outer_core,xiz_outer_core, &
etax_outer_core,etay_outer_core,etaz_outer_core, &
@@ -2150,6 +2151,7 @@
! inner core region
if(myrank == 0 ) write(IMAIN,*) " loading inner core region"
+
call prepare_inner_core_device(Mesh_pointer, &
xix_inner_core,xiy_inner_core,xiz_inner_core, &
etax_inner_core,etay_inner_core,etaz_inner_core, &
@@ -2171,6 +2173,7 @@
! transfer forward and backward fields to device with initial values
if(myrank == 0 ) write(IMAIN,*) " transfering initial wavefield"
+
call transfer_fields_cm_to_device(NDIM*NGLOB_CRUST_MANTLE,displ_crust_mantle,veloc_crust_mantle,accel_crust_mantle, &
Mesh_pointer)
@@ -2194,9 +2197,6 @@
Mesh_pointer)
endif
- ! synchronizes processes
- call synchronize_all()
-
! outputs GPU usage to files for all processes
call output_free_device_memory(myrank)
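As an aside, output_free_device_memory(myrank) reports GPU memory usage per process; a minimal sketch of how such numbers can be obtained with the CUDA runtime (an illustration only, not the project's implementation):

  #include <stdio.h>
  #include <cuda_runtime.h>

  int main(void) {
    size_t free_bytes = 0, total_bytes = 0;
    /* cudaMemGetInfo() returns the free and total device memory in bytes */
    if (cudaMemGetInfo(&free_bytes, &total_bytes) != cudaSuccess) {
      fprintf(stderr, "cudaMemGetInfo failed\n");
      return 1;
    }
    printf("GPU memory: %.1f MB free of %.1f MB total\n",
           free_bytes / (1024.0 * 1024.0),
           total_bytes / (1024.0 * 1024.0));
    return 0;
  }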