[cig-commits] r22991 - in seismo/3D/SPECFEM3D_GLOBE/trunk: . setup src/auxiliaries src/cuda src/meshfem3D src/shared src/specfem3D utils/Visualization/VTK_ParaView
danielpeter at geodynamics.org
danielpeter at geodynamics.org
Mon Jan 13 08:06:21 PST 2014
Author: danielpeter
Date: 2014-01-13 08:06:21 -0800 (Mon, 13 Jan 2014)
New Revision: 22991
Modified:
seismo/3D/SPECFEM3D_GLOBE/trunk/Makefile.in
seismo/3D/SPECFEM3D_GLOBE/trunk/setup/constants.h.in
seismo/3D/SPECFEM3D_GLOBE/trunk/src/auxiliaries/combine_vol_data.F90
seismo/3D/SPECFEM3D_GLOBE/trunk/src/auxiliaries/combine_vol_data_adios_impl.f90
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/assemble_MPI_scalar_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/assemble_MPI_vector_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/check_fields_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_add_sources_elastic_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_coupling_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_crust_mantle_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_inner_core_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_outer_core_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_kernels_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_stacey_acoustic_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_stacey_elastic_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/initialize_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/mesh_constants_cuda.h
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/prepare_mesh_constants_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/update_displacement_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/write_seismograms_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/model_crust.f90
seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/rules.mk
seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/write_VTK_file.f90
seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/assemble_MPI_scalar.f90
seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/assemble_MPI_vector.f90
seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_add_sources.f90
seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_acoustic_calling_routine.F90
seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90
seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_stacey_outer_core.f90
seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/get_attenuation.f90
seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/locate_receivers.f90
seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/locate_sources.f90
seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/setup_sources_receivers.f90
seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/write_seismograms.f90
seismo/3D/SPECFEM3D_GLOBE/trunk/utils/Visualization/VTK_ParaView/create_slice_VTK.f90
Log:
adds asynchronuous memory copies between CPU and GPU; bug fix in adjoint sources on GPU; additional tweaks in getting values for crust2.0 arrays; adds explicit output format for VTK cell types
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/Makefile.in
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/Makefile.in 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/Makefile.in 2014-01-13 16:06:21 UTC (rev 22991)
@@ -55,7 +55,7 @@
#######################################
####
#### CUDA
-#### with configure: ./configure --with-cuda CUDA_LIB=.. CUDA_INC=.. MPI_INC=.. ..
+#### with configure: ./configure --with-cuda=cuda5 CUDA_FLAGS=.. CUDA_LIB=.. CUDA_INC=.. MPI_INC=.. ..
####
#######################################
@@ -66,7 +66,8 @@
@COND_CUDA5_FALSE at CUDA5 = no
MPI_INCLUDES = @MPI_INCLUDES@
-CUDA_INC = @CUDA_FLAGS@ @CUDA_CPPFLAGS@ -I${SETUP}
+CUDA_FLAGS = @CUDA_FLAGS@
+CUDA_INC = @CUDA_CPPFLAGS@ -I${SETUP}
CUDA_LINK = @CUDA_LDFLAGS@ @CUDA_LIBS@
@COND_CUDA_TRUE at NVCC = nvcc
@@ -89,7 +90,7 @@
@COND_CUDA_TRUE@@COND_CUDA5_FALSE at GENCODE = $(GENCODE_20)
# CUDA flags and linking
- at COND_CUDA_TRUE@NVCC_FLAGS_BASE = $(CUDA_INC) $(MPI_INCLUDES)
+ at COND_CUDA_TRUE@NVCC_FLAGS_BASE = $(CUDA_FLAGS) $(CUDA_INC) $(MPI_INCLUDES)
@COND_CUDA_TRUE@@COND_CUDA5_TRUE at NVCC_FLAGS = $(NVCC_FLAGS_BASE) -dc $(GENCODE)
@COND_CUDA_TRUE@@COND_CUDA5_FALSE at NVCC_FLAGS = $(NVCC_FLAGS_BASE) -DUSE_OLDER_CUDA4_GPU $(GENCODE)
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/setup/constants.h.in
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/setup/constants.h.in 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/setup/constants.h.in 2014-01-13 16:06:21 UTC (rev 22991)
@@ -236,19 +236,25 @@
! the mesher) and use them for the computation of boundary kernel (in the solver)
logical, parameter :: SAVE_BOUNDARY_MESH = .false.
-!************************************************************
-! added these parameters for the GPU version of the solver with mesh coloring
+!!-----------------------------------------------------------
+!!
+!! GPU optimization
+!!
+!!-----------------------------------------------------------
+! added these parameters for the GPU version of the solver
+! asynchronuous memcopy between CPU and GPU
+ logical, parameter :: GPU_ASYNC_COPY = .true.
+
+! mesh coloring
! add mesh coloring for the GPU + MPI implementation
logical, parameter :: USE_MESH_COLORING_GPU = .false.
integer, parameter :: MAX_NUMBER_OF_COLORS = 1000
-
! enhanced coloring:
! using Droux algorithm
! try several times with one more color before giving up
logical, parameter :: USE_DROUX_OPTIMIZATION = .false.
integer, parameter :: MAX_NB_TRIES_OF_DROUX_1993 = 15
-
! using balancing algorithm
! postprocess the colors to balance them if Droux (1993) algorithm is not used
logical, parameter :: BALANCE_COLORS_SIMPLE_ALGO = .true.
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/auxiliaries/combine_vol_data.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/auxiliaries/combine_vol_data.F90 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/auxiliaries/combine_vol_data.F90 2014-01-13 16:06:21 UTC (rev 22991)
@@ -114,7 +114,7 @@
print *, sizeprocs, "procs"
if (sizeprocs .ne. 1) then
print *, "sequential program. Only mpirun -np 1 ..."
- call MPI_Abort(MPI_COMM_WORLD, mpier, ierr)
+ call MPI_Abort(MPI_COMM_WORLD, mpier, ierr)
endif
#endif
@@ -357,7 +357,7 @@
print *
! topology file
-#ifndef ADIOS_INPUT
+#ifndef ADIOS_INPUT
topo_file = trim(prname_topo) // 'solver_data.bin'
open(unit = 28,file = trim(topo_file),status='old',action='read', iostat = ios, form='unformatted')
if (ios /= 0) then
@@ -461,7 +461,7 @@
total_dat_xyz(1,np+numpoin) = x
total_dat_xyz(2,np+numpoin) = y
total_dat_xyz(3,np+numpoin) = z
-#endif
+#endif
mask_ibool(iglob) = .true.
num_ibool(iglob) = numpoin
endif
@@ -521,7 +521,7 @@
if( idoubling_inner_core(ispec) == IFLAG_IN_FICTITIOUS_CUBE) then
! connectivity must be given, otherwise element count would be wrong
! maps "fictitious" connectivity, element is all with iglob = 1
-#ifndef USE_VTK_INSTEAD_OF_MESH
+#ifndef USE_VTK_INSTEAD_OF_MESH
do k = 1, NGLLZ-1, dk
do j = 1, NGLLY-1, dj
do i = 1, NGLLX-1, di
@@ -635,7 +635,7 @@
! VTK
! type: hexahedrons
write(IOVTK,'(a,i12)') "CELL_TYPES ",ne
- write(IOVTK,*) (12,it=1,ne)
+ write(IOVTK,'(6i12)') (12,it=1,ne)
write(IOVTK,*) ""
write(IOVTK,'(a,i12)') "POINT_DATA ",np
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/auxiliaries/combine_vol_data_adios_impl.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/auxiliaries/combine_vol_data_adios_impl.f90 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/auxiliaries/combine_vol_data_adios_impl.f90 2014-01-13 16:06:21 UTC (rev 22991)
@@ -6,7 +6,7 @@
contains
!=============================================================================
-!> Print help message.
+!> Print help message.
subroutine print_usage_adios()
print *, 'Usage: '
print *, ' xcombine_data slice_list varname var_file mesh_file ' // &
@@ -73,7 +73,7 @@
endif
iregion = 0
- if (command_argument_count() == 7) then
+ if (command_argument_count() == 7) then
read(arg(7),*) iregion
endif
if (iregion > 3 .or. iregion < 0) stop 'Iregion = 0,1,2,3'
@@ -89,11 +89,11 @@
!=============================================================================
-!> Open ADIOS value and mesh files, read mode
+!> Open ADIOS value and mesh files, read mode
subroutine init_adios(value_file_name, mesh_file_name, &
value_handle, mesh_handle)
implicit none
- ! Parameters
+ ! Parameters
character(len=*), intent(in) :: value_file_name, mesh_file_name
integer(kind=8), intent(out) :: value_handle, mesh_handle
! Variables
@@ -109,10 +109,10 @@
!=============================================================================
-!> Open ADIOS value and mesh files, read mode
+!> Open ADIOS value and mesh files, read mode
subroutine clean_adios(value_handle, mesh_handle)
implicit none
- ! Parameters
+ ! Parameters
integer(kind=8), intent(in) :: value_handle, mesh_handle
! Variables
integer :: ier
@@ -142,18 +142,18 @@
0, 1, nglob, ier)
call adios_schedule_read(mesh_handle, sel, trim(reg_name) // "/nspec", &
0, 1, nspec, ier)
- call adios_perform_reads(mesh_handle, ier)
+ call adios_perform_reads(mesh_handle, ier)
end subroutine read_scalars_adios_mesh
!=============================================================================
-subroutine read_coordinates_adios_mesh(mesh_handle, iproc, ir, nglob, nspec, &
+subroutine read_coordinates_adios_mesh(mesh_handle, iproc, ir, nglob, nspec, &
xstore, ystore, zstore, ibool)
implicit none
include 'constants.h'
! Parameters
integer(kind=8), intent(in) :: mesh_handle
- integer, intent(in) :: iproc, ir, nglob, nspec
+ integer, intent(in) :: iproc, ir, nglob, nspec
real(kind=CUSTOM_REAL),dimension(:), intent(inout) :: xstore, ystore, zstore
integer, dimension(:,:,:,:), intent(inout) :: ibool
! Variables
@@ -171,7 +171,7 @@
call adios_schedule_read(mesh_handle, sel_scalar, &
trim(reg_name) // "x_global/offset", &
0, 1, offset_coord, ier)
- call adios_perform_reads(mesh_handle, ier)
+ call adios_perform_reads(mesh_handle, ier)
start(1) = offset_ibool
count_ad(1) = NGLLX * NGLLY * NGLLZ * nspec
@@ -180,7 +180,7 @@
trim(reg_name) // "ibool/array", 0, 1, &
ibool, ier)
- start(1) = offset_coord
+ start(1) = offset_coord
count_ad(1) = nglob
call adios_selection_boundingbox (sel_coord , 1, start, count_ad)
call adios_schedule_read(mesh_handle, sel_coord, &
@@ -192,7 +192,7 @@
call adios_schedule_read(mesh_handle, sel_coord, &
trim(reg_name) // "z_global/array", 0, 1, &
zstore, ier)
- call adios_perform_reads(mesh_handle, ier)
+ call adios_perform_reads(mesh_handle, ier)
end subroutine read_coordinates_adios_mesh
@@ -202,7 +202,7 @@
implicit none
include 'constants.h'
! Parameters
- integer(kind=8), intent(in) :: value_handle
+ integer(kind=8), intent(in) :: value_handle
character(len=*), intent(in) :: var_name
integer, intent(in) :: iproc, ir, nspec
real(kind=CUSTOM_REAL), dimension(:,:,:,:), intent(inout) :: data
@@ -218,7 +218,7 @@
call adios_schedule_read(value_handle, sel, &
trim(reg_name) // trim(var_name) // "/offset", &
0, 1, offset, ier)
- call adios_perform_reads(value_handle, ier)
+ call adios_perform_reads(value_handle, ier)
start(1) = offset
count_ad(1) = NGLLX * NGLLY * NGLLZ * nspec
@@ -226,7 +226,7 @@
call adios_schedule_read(value_handle, sel, &
trim(reg_name) // trim(var_name) // "/array", 0, 1,&
data, ier)
- call adios_perform_reads(value_handle, ier)
+ call adios_perform_reads(value_handle, ier)
end subroutine read_values_adios
end module combine_vol_data_adios_mod
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/assemble_MPI_scalar_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/assemble_MPI_scalar_cuda.cu 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/assemble_MPI_scalar_cuda.cu 2014-01-13 16:06:21 UTC (rev 22991)
@@ -63,22 +63,31 @@
}
+/* ----------------------------------------------------------------------------------------------- */
+// MPI transfer
+
/* ----------------------------------------------------------------------------------------------- */
// prepares and transfers the inter-element edge-nodes to the host to be MPI'd
+// (elements on boundary)
+
extern "C"
void FC_FUNC_(transfer_boun_pot_from_device,
TRANSFER_BOUN_POT_FROM_DEVICE)(long* Mesh_pointer_f,
- realw* send_potential_dot_dot_buffer,
+ realw* send_buffer,
int* FORWARD_OR_ADJOINT){
TRACE("transfer_boun_pot_from_device");
+ int size_mpi_buffer;
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ // mpi buffer size
+ size_mpi_buffer = (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core);
+
// checks if anything to do
- if( mp->num_interfaces_outer_core == 0 ) return;
+ if( size_mpi_buffer <= 0 ) return;
int blocksize = BLOCKSIZE_TRANSFER;
int size_padded = ((int)ceil(((double)(mp->max_nibool_interfaces_oc))/((double)blocksize)))*blocksize;
@@ -90,41 +99,65 @@
dim3 threads(blocksize,1,1);
if(*FORWARD_OR_ADJOINT == 1) {
- prepare_boundary_potential_on_device<<<grid,threads>>>(mp->d_accel_outer_core,
- mp->d_send_accel_buffer_outer_core,
- mp->num_interfaces_outer_core,
- mp->max_nibool_interfaces_oc,
- mp->d_nibool_interfaces_outer_core,
- mp->d_ibool_interfaces_outer_core);
+ prepare_boundary_potential_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_outer_core,
+ mp->d_send_accel_buffer_outer_core,
+ mp->num_interfaces_outer_core,
+ mp->max_nibool_interfaces_oc,
+ mp->d_nibool_interfaces_outer_core,
+ mp->d_ibool_interfaces_outer_core);
- print_CUDA_error_if_any(cudaMemcpy(send_potential_dot_dot_buffer,mp->d_send_accel_buffer_outer_core,
- (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw),
+ // copies buffer to CPU
+ if( GPU_ASYNC_COPY ){
+ // waits until kernel is finished before starting async memcpy
+ cudaStreamSynchronize(mp->compute_stream);
+ // copies buffer to CPU
+ cudaMemcpyAsync(mp->h_send_accel_buffer_oc,mp->d_send_accel_buffer_outer_core,size_mpi_buffer*sizeof(realw),
+ cudaMemcpyDeviceToHost,mp->copy_stream);
+ }else{
+ // synchronuous copy
+ print_CUDA_error_if_any(cudaMemcpy(send_buffer,mp->d_send_accel_buffer_outer_core,
+ size_mpi_buffer*sizeof(realw),
cudaMemcpyDeviceToHost),98000);
+ }
}
else if(*FORWARD_OR_ADJOINT == 3) {
// debug
DEBUG_BACKWARD_ASSEMBLY();
- prepare_boundary_potential_on_device<<<grid,threads>>>(mp->d_b_accel_outer_core,
- mp->d_b_send_accel_buffer_outer_core,
- mp->num_interfaces_outer_core,
- mp->max_nibool_interfaces_oc,
- mp->d_nibool_interfaces_outer_core,
- mp->d_ibool_interfaces_outer_core);
+ prepare_boundary_potential_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_outer_core,
+ mp->d_b_send_accel_buffer_outer_core,
+ mp->num_interfaces_outer_core,
+ mp->max_nibool_interfaces_oc,
+ mp->d_nibool_interfaces_outer_core,
+ mp->d_ibool_interfaces_outer_core);
- print_CUDA_error_if_any(cudaMemcpy(send_potential_dot_dot_buffer,mp->d_b_send_accel_buffer_outer_core,
- (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw),
+
+ // copies buffer to CPU
+ if( GPU_ASYNC_COPY ){
+ // waits until kernel is finished before starting async memcpy
+ cudaStreamSynchronize(mp->compute_stream);
+ // copies buffer to CPU
+ cudaMemcpyAsync(mp->h_b_send_accel_buffer_oc,mp->d_b_send_accel_buffer_outer_core,size_mpi_buffer*sizeof(realw),
+ cudaMemcpyDeviceToHost,mp->copy_stream);
+ }else{
+ // synchronuous copy
+ print_CUDA_error_if_any(cudaMemcpy(send_buffer,mp->d_b_send_accel_buffer_outer_core,
+ size_mpi_buffer*sizeof(realw),
cudaMemcpyDeviceToHost),98001);
+ }
}
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("prepare_boundary_kernel");
+ exit_on_cuda_error("transfer_boun_pot_from_device");
#endif
}
+/* ----------------------------------------------------------------------------------------------- */
+// ASSEMBLY
+
/* ----------------------------------------------------------------------------------------------- */
@@ -161,12 +194,15 @@
int* FORWARD_OR_ADJOINT) {
TRACE("transfer_asmbl_pot_to_device");
+ int size_mpi_buffer;
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
- //double start_time = get_time();
+ // buffer size
+ size_mpi_buffer = (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core);
+
// checks if anything to do
- if( mp->num_interfaces_outer_core == 0 ) return;
+ if( size_mpi_buffer <= 0 ) return;
// assembles on GPU
int blocksize = BLOCKSIZE_TRANSFER;
@@ -179,35 +215,46 @@
dim3 threads(blocksize,1,1);
if(*FORWARD_OR_ADJOINT == 1) {
- // copies scalar buffer onto GPU
- print_CUDA_error_if_any(cudaMemcpy(mp->d_send_accel_buffer_outer_core, buffer_recv_scalar,
- (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw),
- cudaMemcpyHostToDevice),99000);
- //assemble forward field
- assemble_boundary_potential_on_device<<<grid,threads>>>(mp->d_accel_outer_core,
- mp->d_send_accel_buffer_outer_core,
- mp->num_interfaces_outer_core,
- mp->max_nibool_interfaces_oc,
- mp->d_nibool_interfaces_outer_core,
- mp->d_ibool_interfaces_outer_core);
- }
- else if(*FORWARD_OR_ADJOINT == 3) {
+ // asynchronuous copy
+ if( GPU_ASYNC_COPY ){
+ // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
+ cudaStreamSynchronize(mp->copy_stream);
+ }else{
+ // copies scalar buffer onto GPU
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_send_accel_buffer_outer_core, buffer_recv_scalar,size_mpi_buffer*sizeof(realw),
+ cudaMemcpyHostToDevice),99000);
+ }
+
+ //assembles forward field
+ assemble_boundary_potential_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_outer_core,
+ mp->d_send_accel_buffer_outer_core,
+ mp->num_interfaces_outer_core,
+ mp->max_nibool_interfaces_oc,
+ mp->d_nibool_interfaces_outer_core,
+ mp->d_ibool_interfaces_outer_core);
+ }else if(*FORWARD_OR_ADJOINT == 3) {
// debug
DEBUG_BACKWARD_ASSEMBLY();
- // copies scalar buffer onto GPU
- print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_accel_buffer_outer_core, buffer_recv_scalar,
- (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw),
- cudaMemcpyHostToDevice),99001);
+ // asynchronuous copy
+ if( GPU_ASYNC_COPY ){
+ // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
+ cudaStreamSynchronize(mp->copy_stream);
+ }else{
+ // (cudaMemcpy implicitly synchronizes all other cuda operations)
+ // copies scalar buffer onto GPU
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_accel_buffer_outer_core, buffer_recv_scalar,size_mpi_buffer*sizeof(realw),
+ cudaMemcpyHostToDevice),99001);
+ }
- //assemble reconstructed/backward field
- assemble_boundary_potential_on_device<<<grid,threads>>>(mp->d_b_accel_outer_core,
- mp->d_b_send_accel_buffer_outer_core,
- mp->num_interfaces_outer_core,
- mp->max_nibool_interfaces_oc,
- mp->d_nibool_interfaces_outer_core,
- mp->d_ibool_interfaces_outer_core);
+ //assembles reconstructed/backward field
+ assemble_boundary_potential_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_outer_core,
+ mp->d_b_send_accel_buffer_outer_core,
+ mp->num_interfaces_outer_core,
+ mp->max_nibool_interfaces_oc,
+ mp->d_nibool_interfaces_outer_core,
+ mp->d_ibool_interfaces_outer_core);
}
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/assemble_MPI_vector_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/assemble_MPI_vector_cuda.cu 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/assemble_MPI_vector_cuda.cu 2014-01-13 16:06:21 UTC (rev 22991)
@@ -66,26 +66,35 @@
/* ----------------------------------------------------------------------------------------------- */
+// MPI transfer
+
+/* ----------------------------------------------------------------------------------------------- */
+
// prepares and transfers the inter-element edge-nodes to the host to be MPI'd
// (elements on boundary)
+
extern "C"
-void FC_FUNC_(transfer_boun_accel_from_device,
- TRANSFER_BOUN_ACCEL_FROM_DEVICE)(long* Mesh_pointer_f,
- realw* send_accel_buffer,
- int* IREGION,
- int* FORWARD_OR_ADJOINT){
- TRACE("transfer_boun_accel_from_device");
+void FC_FUNC_(transfer_boun_from_device,
+ TRANSFER_BOUN_FROM_DEVICE)(long* Mesh_pointer_f,
+ realw* send_accel_buffer,
+ int* IREGION,
+ int* FORWARD_OR_ADJOINT){
+ TRACE("transfer_boun_from_device");
int blocksize,size_padded;
int num_blocks_x,num_blocks_y;
dim3 grid,threads;
+ int size_mpi_buffer;
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
// crust/mantle region
if( *IREGION == IREGION_CRUST_MANTLE ){
- if( mp->num_interfaces_crust_mantle > 0 ){
+ size_mpi_buffer = NDIM*mp->max_nibool_interfaces_cm*mp->num_interfaces_crust_mantle;
+
+ if( size_mpi_buffer > 0 ){
+
blocksize = BLOCKSIZE_TRANSFER;
size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_cm)/((double)blocksize)))*blocksize;
@@ -95,44 +104,60 @@
threads = dim3(blocksize,1,1);
if(*FORWARD_OR_ADJOINT == 1) {
- prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel_crust_mantle,
- mp->d_send_accel_buffer_crust_mantle,
- mp->num_interfaces_crust_mantle,
- mp->max_nibool_interfaces_cm,
- mp->d_nibool_interfaces_crust_mantle,
- mp->d_ibool_interfaces_crust_mantle);
+ prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_crust_mantle,
+ mp->d_send_accel_buffer_crust_mantle,
+ mp->num_interfaces_crust_mantle,
+ mp->max_nibool_interfaces_cm,
+ mp->d_nibool_interfaces_crust_mantle,
+ mp->d_ibool_interfaces_crust_mantle);
// copies buffer to CPU
- print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer_crust_mantle,
- NDIM*mp->max_nibool_interfaces_cm*mp->num_interfaces_crust_mantle*sizeof(realw),
- cudaMemcpyDeviceToHost),41000);
-
+ if( GPU_ASYNC_COPY ){
+ // waits until kernel is finished before starting async memcpy
+ cudaStreamSynchronize(mp->compute_stream);
+ // copies buffer to CPU
+ cudaMemcpyAsync(mp->h_send_accel_buffer_cm,mp->d_send_accel_buffer_crust_mantle,size_mpi_buffer*sizeof(realw),
+ cudaMemcpyDeviceToHost,mp->copy_stream);
+ }else{
+ // synchronuous copy
+ print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer_crust_mantle,size_mpi_buffer*sizeof(realw),
+ cudaMemcpyDeviceToHost),41000);
+ }
}
else if(*FORWARD_OR_ADJOINT == 3) {
// debug
DEBUG_BACKWARD_ASSEMBLY();
- prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
- mp->d_b_send_accel_buffer_crust_mantle,
- mp->num_interfaces_crust_mantle,
- mp->max_nibool_interfaces_cm,
- mp->d_nibool_interfaces_crust_mantle,
- mp->d_ibool_interfaces_crust_mantle);
+ prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_crust_mantle,
+ mp->d_b_send_accel_buffer_crust_mantle,
+ mp->num_interfaces_crust_mantle,
+ mp->max_nibool_interfaces_cm,
+ mp->d_nibool_interfaces_crust_mantle,
+ mp->d_ibool_interfaces_crust_mantle);
+
// copies buffer to CPU
- print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_b_send_accel_buffer_crust_mantle,
- NDIM*mp->max_nibool_interfaces_cm*mp->num_interfaces_crust_mantle*sizeof(realw),
- cudaMemcpyDeviceToHost),41001);
-
+ if( GPU_ASYNC_COPY ){
+ // waits until kernel is finished before starting async memcpy
+ cudaStreamSynchronize(mp->compute_stream);
+ // copies buffer to CPU
+ cudaMemcpyAsync(mp->h_b_send_accel_buffer_cm,mp->d_b_send_accel_buffer_crust_mantle,size_mpi_buffer*sizeof(realw),
+ cudaMemcpyDeviceToHost,mp->copy_stream);
+ }else{
+ // synchronuous copy
+ print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_b_send_accel_buffer_crust_mantle,size_mpi_buffer*sizeof(realw),
+ cudaMemcpyDeviceToHost),41001);
+ }
}
-
-
}
}
// inner core region
if( *IREGION == IREGION_INNER_CORE ){
- if( mp->num_interfaces_inner_core > 0 ){
+ size_mpi_buffer = NDIM*mp->max_nibool_interfaces_ic*mp->num_interfaces_inner_core;
+
+ if( size_mpi_buffer > 0 ){
+
blocksize = BLOCKSIZE_TRANSFER;
size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ic)/((double)blocksize)))*blocksize;
@@ -142,45 +167,65 @@
threads = dim3(blocksize,1,1);
if(*FORWARD_OR_ADJOINT == 1) {
- prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel_inner_core,
- mp->d_send_accel_buffer_inner_core,
- mp->num_interfaces_inner_core,
- mp->max_nibool_interfaces_ic,
- mp->d_nibool_interfaces_inner_core,
- mp->d_ibool_interfaces_inner_core);
+ prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_inner_core,
+ mp->d_send_accel_buffer_inner_core,
+ mp->num_interfaces_inner_core,
+ mp->max_nibool_interfaces_ic,
+ mp->d_nibool_interfaces_inner_core,
+ mp->d_ibool_interfaces_inner_core);
// copies buffer to CPU
- print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer_inner_core,
- NDIM*mp->max_nibool_interfaces_ic*mp->num_interfaces_inner_core*sizeof(realw),
+ if( GPU_ASYNC_COPY ){
+ // waits until kernel is finished before starting async memcpy
+ cudaStreamSynchronize(mp->compute_stream);
+ // copies buffer to CPU
+ cudaMemcpyAsync(mp->h_send_accel_buffer_ic,mp->d_send_accel_buffer_inner_core,size_mpi_buffer*sizeof(realw),
+ cudaMemcpyDeviceToHost,mp->copy_stream);
+ }else{
+ // synchronuous copy
+ print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer_inner_core,size_mpi_buffer*sizeof(realw),
cudaMemcpyDeviceToHost),41000);
-
+ }
}
else if(*FORWARD_OR_ADJOINT == 3) {
// debug
DEBUG_BACKWARD_ASSEMBLY();
- prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel_inner_core,
- mp->d_b_send_accel_buffer_inner_core,
- mp->num_interfaces_inner_core,
- mp->max_nibool_interfaces_ic,
- mp->d_nibool_interfaces_inner_core,
- mp->d_ibool_interfaces_inner_core);
+ prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_inner_core,
+ mp->d_b_send_accel_buffer_inner_core,
+ mp->num_interfaces_inner_core,
+ mp->max_nibool_interfaces_ic,
+ mp->d_nibool_interfaces_inner_core,
+ mp->d_ibool_interfaces_inner_core);
// copies buffer to CPU
- print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_b_send_accel_buffer_inner_core,
- NDIM*mp->max_nibool_interfaces_ic*mp->num_interfaces_inner_core*sizeof(realw),
+ if( GPU_ASYNC_COPY ){
+ // waits until kernel is finished before starting async memcpy
+ cudaStreamSynchronize(mp->compute_stream);
+ // copies buffer to CPU
+ cudaMemcpyAsync(mp->h_b_send_accel_buffer_ic,mp->d_b_send_accel_buffer_inner_core,size_mpi_buffer*sizeof(realw),
+ cudaMemcpyDeviceToHost,mp->copy_stream);
+ }else{
+ // synchronuous copy
+ print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_b_send_accel_buffer_inner_core,size_mpi_buffer*sizeof(realw),
cudaMemcpyDeviceToHost),41001);
+ }
}
}
}
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("transfer_boun_accel_from_device");
+ exit_on_cuda_error("transfer_boun_from_device");
#endif
}
+
/* ----------------------------------------------------------------------------------------------- */
+// ASSEMBLY
+
+/* ----------------------------------------------------------------------------------------------- */
+
__global__ void assemble_boundary_accel_on_device(realw* d_accel, realw* d_send_accel_buffer,
int num_interfaces,
int max_nibool_interfaces,
@@ -218,12 +263,16 @@
int blocksize,size_padded;
int num_blocks_x,num_blocks_y;
dim3 grid,threads;
+ int size_mpi_buffer;
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
// crust/mantle region
if( *IREGION == IREGION_CRUST_MANTLE ){
- if( mp->num_interfaces_crust_mantle > 0 ){
+
+ size_mpi_buffer = NDIM*(mp->max_nibool_interfaces_cm)*(mp->num_interfaces_crust_mantle);
+
+ if( size_mpi_buffer > 0 ){
// assembles values
blocksize = BLOCKSIZE_TRANSFER;
size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_cm)/((double)blocksize)))*blocksize;
@@ -234,30 +283,42 @@
threads = dim3(blocksize,1,1);
if(*FORWARD_OR_ADJOINT == 1) {
- // copies vector buffer values to GPU
- print_CUDA_error_if_any(cudaMemcpy(mp->d_send_accel_buffer_crust_mantle, buffer_recv_vector,
- NDIM*(mp->max_nibool_interfaces_cm)*(mp->num_interfaces_crust_mantle)*sizeof(realw),
- cudaMemcpyHostToDevice),41000);
+ // asynchronuous copy
+ if( GPU_ASYNC_COPY ){
+ // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
+ cudaStreamSynchronize(mp->copy_stream);
+ }else{
+ // (cudaMemcpy implicitly synchronizes all other cuda operations)
+ // copies vector buffer values to GPU
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_send_accel_buffer_crust_mantle,buffer_recv_vector,size_mpi_buffer*sizeof(realw),
+ cudaMemcpyHostToDevice),41000);
+ }
+
//assemble forward accel
- assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel_crust_mantle,
+ assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_crust_mantle,
mp->d_send_accel_buffer_crust_mantle,
mp->num_interfaces_crust_mantle,
mp->max_nibool_interfaces_cm,
mp->d_nibool_interfaces_crust_mantle,
mp->d_ibool_interfaces_crust_mantle);
- }
- else if(*FORWARD_OR_ADJOINT == 3) {
+ }else if(*FORWARD_OR_ADJOINT == 3) {
// debug
DEBUG_BACKWARD_ASSEMBLY();
- // copies vector buffer values to GPU
- print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_accel_buffer_crust_mantle, buffer_recv_vector,
- NDIM*(mp->max_nibool_interfaces_cm)*(mp->num_interfaces_crust_mantle)*sizeof(realw),
- cudaMemcpyHostToDevice),41000);
+ // asynchronuous copy
+ if( GPU_ASYNC_COPY ){
+ // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
+ cudaStreamSynchronize(mp->copy_stream);
+ }else{
+ // (cudaMemcpy implicitly synchronizes all other cuda operations)
+ // copies vector buffer values to GPU
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_accel_buffer_crust_mantle,buffer_recv_vector,size_mpi_buffer*sizeof(realw),
+ cudaMemcpyHostToDevice),41000);
+ }
//assemble adjoint accel
- assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
+ assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_crust_mantle,
mp->d_b_send_accel_buffer_crust_mantle,
mp->num_interfaces_crust_mantle,
mp->max_nibool_interfaces_cm,
@@ -273,7 +334,10 @@
// inner core region
if( *IREGION == IREGION_INNER_CORE ){
- if( mp->num_interfaces_inner_core > 0 ){
+
+ size_mpi_buffer = NDIM*(mp->max_nibool_interfaces_ic)*(mp->num_interfaces_inner_core);
+
+ if( size_mpi_buffer > 0 ){
// assembles values
blocksize = BLOCKSIZE_TRANSFER;
size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ic)/((double)blocksize)))*blocksize;
@@ -284,13 +348,19 @@
threads = dim3(blocksize,1,1);
if(*FORWARD_OR_ADJOINT == 1) {
- // copies buffer values to GPU
- print_CUDA_error_if_any(cudaMemcpy(mp->d_send_accel_buffer_inner_core, buffer_recv_vector,
- NDIM*(mp->max_nibool_interfaces_ic)*(mp->num_interfaces_inner_core)*sizeof(realw),
- cudaMemcpyHostToDevice),41001);
+ if( GPU_ASYNC_COPY ){
+ // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
+ cudaStreamSynchronize(mp->copy_stream);
+ }else{
+ // (cudaMemcpy implicitly synchronizes all other cuda operations)
+ // copies buffer values to GPU
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_send_accel_buffer_inner_core,buffer_recv_vector,size_mpi_buffer*sizeof(realw),
+ cudaMemcpyHostToDevice),41001);
+ }
+
//assemble forward accel
- assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel_inner_core,
+ assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_inner_core,
mp->d_send_accel_buffer_inner_core,
mp->num_interfaces_inner_core,
mp->max_nibool_interfaces_ic,
@@ -301,13 +371,18 @@
// debug
DEBUG_BACKWARD_ASSEMBLY();
- // copies buffer values to GPU
- print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_accel_buffer_inner_core, buffer_recv_vector,
- NDIM*(mp->max_nibool_interfaces_ic)*(mp->num_interfaces_inner_core)*sizeof(realw),
- cudaMemcpyHostToDevice),41001);
+ if( GPU_ASYNC_COPY ){
+ // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
+ cudaStreamSynchronize(mp->copy_stream);
+ }else{
+ // (cudaMemcpy implicitly synchronizes all other cuda operations)
+ // copies buffer values to GPU
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_accel_buffer_inner_core,buffer_recv_vector,size_mpi_buffer*sizeof(realw),
+ cudaMemcpyHostToDevice),41001);
+ }
//assemble adjoint accel
- assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel_inner_core,
+ assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_inner_core,
mp->d_b_send_accel_buffer_inner_core,
mp->num_interfaces_inner_core,
mp->max_nibool_interfaces_ic,
@@ -321,3 +396,201 @@
exit_on_cuda_error("transfer_asmbl_accel_to_device in inner_core");
#endif
}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Asynchronuous memory copy for mpi buffers
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_buffer_to_device_async,
+ TRANSFER_BUFFER_TO_DEVICE_ASYNC)(long* Mesh_pointer,
+ realw* buffer,
+ int* IREGION,
+ int* FORWARD_OR_ADJOINT) {
+
+// asynchronous transfer from host to device
+
+ TRACE("transfer_buffer_to_device_async");
+
+ int size_mpi_buffer;
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+
+ // checks async-memcpy
+ if( GPU_ASYNC_COPY == 0 ){
+ exit_on_error("transfer_buffer_to_device_async must be called with GPU_ASYNC_COPY == 1, please check mesh_constants_cuda.h");
+ }
+
+ // regions
+ if( *IREGION == IREGION_CRUST_MANTLE ){
+ // crust/mantle region
+ size_mpi_buffer = NDIM*mp->max_nibool_interfaces_cm*mp->num_interfaces_crust_mantle;
+
+ if( size_mpi_buffer > 0 ){
+
+ if(*FORWARD_OR_ADJOINT == 1) {
+ // copy on host memory
+ memcpy(mp->h_recv_accel_buffer_cm,buffer,size_mpi_buffer*sizeof(realw));
+
+ // asynchronous copy to GPU using copy_stream
+ cudaMemcpyAsync(mp->d_send_accel_buffer_crust_mantle,mp->h_recv_accel_buffer_cm,size_mpi_buffer*sizeof(realw),
+ cudaMemcpyHostToDevice,mp->copy_stream);
+
+ }else if(*FORWARD_OR_ADJOINT == 3) {
+ // debug
+ DEBUG_BACKWARD_ASSEMBLY();
+
+ // copy on host memory
+ memcpy(mp->h_b_recv_accel_buffer_cm,buffer,size_mpi_buffer*sizeof(realw));
+
+ // asynchronous copy to GPU using copy_stream
+ cudaMemcpyAsync(mp->d_b_send_accel_buffer_crust_mantle,mp->h_b_recv_accel_buffer_cm,size_mpi_buffer*sizeof(realw),
+ cudaMemcpyHostToDevice,mp->copy_stream);
+ }
+ }
+ }else if( *IREGION == IREGION_INNER_CORE ){
+ // inner core region
+ size_mpi_buffer = NDIM*mp->max_nibool_interfaces_ic*mp->num_interfaces_inner_core;
+
+ if( size_mpi_buffer > 0 ){
+
+ if(*FORWARD_OR_ADJOINT == 1) {
+ // copy on host memory
+ memcpy(mp->h_recv_accel_buffer_ic,buffer,size_mpi_buffer*sizeof(realw));
+
+ // asynchronous copy to GPU using copy_stream
+ cudaMemcpyAsync(mp->d_send_accel_buffer_inner_core,mp->h_recv_accel_buffer_ic,size_mpi_buffer*sizeof(realw),
+ cudaMemcpyHostToDevice,mp->copy_stream);
+
+ }else if(*FORWARD_OR_ADJOINT == 3) {
+ // debug
+ DEBUG_BACKWARD_ASSEMBLY();
+
+ // copy on host memory
+ memcpy(mp->h_b_recv_accel_buffer_ic,buffer,size_mpi_buffer*sizeof(realw));
+
+ // asynchronous copy to GPU using copy_stream
+ cudaMemcpyAsync(mp->d_b_send_accel_buffer_inner_core,mp->h_b_recv_accel_buffer_ic,size_mpi_buffer*sizeof(realw),
+ cudaMemcpyHostToDevice,mp->copy_stream);
+ }
+ }
+ }else if( *IREGION == IREGION_OUTER_CORE ){
+ // outer core region
+ size_mpi_buffer = mp->max_nibool_interfaces_oc*mp->num_interfaces_outer_core;
+
+ if( size_mpi_buffer > 0 ){
+
+ if(*FORWARD_OR_ADJOINT == 1) {
+ // copy on host memory
+ memcpy(mp->h_recv_accel_buffer_oc,buffer,size_mpi_buffer*sizeof(realw));
+
+ // asynchronous copy to GPU using copy_stream
+ cudaMemcpyAsync(mp->d_send_accel_buffer_outer_core,mp->h_recv_accel_buffer_oc,size_mpi_buffer*sizeof(realw),
+ cudaMemcpyHostToDevice,mp->copy_stream);
+
+ }else if(*FORWARD_OR_ADJOINT == 3) {
+ // debug
+ DEBUG_BACKWARD_ASSEMBLY();
+
+ // copy on host memory
+ memcpy(mp->h_b_recv_accel_buffer_oc,buffer,size_mpi_buffer*sizeof(realw));
+
+ // asynchronous copy to GPU using copy_stream
+ cudaMemcpyAsync(mp->d_b_send_accel_buffer_outer_core,mp->h_b_recv_accel_buffer_oc,size_mpi_buffer*sizeof(realw),
+ cudaMemcpyHostToDevice,mp->copy_stream);
+ }
+ }
+ }
+
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(sync_copy_from_device,
+ SYNC_copy_FROM_DEVICE)(long* Mesh_pointer,
+ int* iphase,
+ realw* send_buffer,
+ int* IREGION,
+ int* FORWARD_OR_ADJOINT) {
+
+// synchronizes copy stream before copying buffers from pinned memory to CPU host
+
+ TRACE("sync_copy_from_device");
+
+ int size_mpi_buffer;
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+
+ // checks async-memcpy
+ if( GPU_ASYNC_COPY == 0 ){
+ exit_on_error("sync_copy_from_device must be called with GPU_ASYNC_COPY == 1, please check mesh_constants_cuda.h");
+ }
+
+ // Wait until async-memcpy of outer elements is finished and start MPI.
+ if( *iphase != 2 ){ exit_on_error("sync_copy_from_device must be called for iphase == 2"); }
+
+ // regions
+ if( *IREGION == IREGION_CRUST_MANTLE ){
+ // crust/mantle
+ size_mpi_buffer = NDIM*mp->max_nibool_interfaces_cm*mp->num_interfaces_crust_mantle;
+
+ if( size_mpi_buffer > 0 ){
+ // waits for asynchronous copy to finish
+ cudaStreamSynchronize(mp->copy_stream);
+
+ if(*FORWARD_OR_ADJOINT == 1) {
+ // There have been problems using the pinned-memory with MPI, so
+ // we copy the buffer into a non-pinned region.
+ memcpy(send_buffer,mp->h_send_accel_buffer_cm,size_mpi_buffer*sizeof(realw));
+
+ }else if(*FORWARD_OR_ADJOINT == 3){
+ // we copy the buffer into a non-pinned region.
+ memcpy(send_buffer,mp->h_b_send_accel_buffer_cm,size_mpi_buffer*sizeof(realw));
+ }
+ }
+ }else if( *IREGION == IREGION_INNER_CORE ){
+ // inner core
+ size_mpi_buffer = NDIM*mp->max_nibool_interfaces_ic*mp->num_interfaces_inner_core;
+
+ if( size_mpi_buffer > 0 ){
+ // waits for asynchronous copy to finish
+ cudaStreamSynchronize(mp->copy_stream);
+
+ if(*FORWARD_OR_ADJOINT == 1) {
+ // There have been problems using the pinned-memory with MPI, so
+ // we copy the buffer into a non-pinned region.
+ memcpy(send_buffer,mp->h_send_accel_buffer_ic,size_mpi_buffer*sizeof(realw));
+
+ }else if(*FORWARD_OR_ADJOINT == 3){
+ // we copy the buffer into a non-pinned region.
+ memcpy(send_buffer,mp->h_b_send_accel_buffer_ic,size_mpi_buffer*sizeof(realw));
+ }
+ }
+ }else if( *IREGION == IREGION_OUTER_CORE ){
+ // outer core
+ size_mpi_buffer = mp->max_nibool_interfaces_oc*mp->num_interfaces_outer_core;
+
+ if( size_mpi_buffer > 0 ){
+ // waits for asynchronous copy to finish
+ cudaStreamSynchronize(mp->copy_stream);
+
+ if(*FORWARD_OR_ADJOINT == 1) {
+ // There have been problems using the pinned-memory with MPI, so
+ // we copy the buffer into a non-pinned region.
+ memcpy(send_buffer,mp->h_send_accel_buffer_oc,size_mpi_buffer*sizeof(realw));
+
+ }else if(*FORWARD_OR_ADJOINT == 3){
+ // we copy the buffer into a non-pinned region.
+ memcpy(send_buffer,mp->h_b_send_accel_buffer_oc,size_mpi_buffer*sizeof(realw));
+ }
+ }
+ }
+
+ // memory copy is now finished, so non-blocking MPI send can proceed
+}
+
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/check_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/check_fields_cuda.cu 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/check_fields_cuda.cu 2014-01-13 16:06:21 UTC (rev 22991)
@@ -111,7 +111,7 @@
#else
myrank = 0;
#endif
- sprintf(filename,"../in_out_files/OUTPUT_FILES/error_message_%06d.txt",myrank);
+ sprintf(filename,"OUTPUT_FILES/error_message_%06d.txt",myrank);
fp = fopen(filename,"a+");
if (fp != NULL){
fprintf(fp,"Error after %s: %s\n", kernel_name, cudaGetErrorString(err));
@@ -152,7 +152,7 @@
#else
myrank = 0;
#endif
- sprintf(filename,"../in_out_files/OUTPUT_FILES/error_message_%06d.txt",myrank);
+ sprintf(filename,"OUTPUT_FILES/error_message_%06d.txt",myrank);
fp = fopen(filename,"a+");
if (fp != NULL){
fprintf(fp,"ERROR: %s\n",info);
@@ -185,7 +185,7 @@
#else
myrank = 0;
#endif
- sprintf(filename,"../in_out_files/OUTPUT_FILES/error_message_%06d.txt",myrank);
+ sprintf(filename,"OUTPUT_FILES/error_message_%06d.txt",myrank);
fp = fopen(filename,"a+");
if (fp != NULL){
fprintf(fp,"\nCUDA error !!!!! <%s> !!!!! \nat CUDA call error code: # %d\n",cudaGetErrorString(err),num);
@@ -509,11 +509,15 @@
cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
if(*FORWARD_OR_ADJOINT == 1 ){
- get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_displ_outer_core,size,d_max);
+ get_maximum_scalar_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ_outer_core,size,d_max);
}else if(*FORWARD_OR_ADJOINT == 3 ){
- get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_b_displ_outer_core,size,d_max);
+ get_maximum_scalar_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ_outer_core,size,d_max);
}
+ // explicitly waits until previous compute stream finishes
+ // (cudaMemcpy implicitly synchronizes all other cuda operations)
+ cudaStreamSynchronize(mp->compute_stream);
+
print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
cudaMemcpyDeviceToHost),222);
@@ -650,11 +654,15 @@
cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
if(*FORWARD_OR_ADJOINT == 1 ){
- get_maximum_vector_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,size,d_max);
+ get_maximum_vector_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ_crust_mantle,size,d_max);
}else if(*FORWARD_OR_ADJOINT == 3 ){
- get_maximum_vector_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,size,d_max);
+ get_maximum_vector_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ_crust_mantle,size,d_max);
}
+ // explicitly waits until previous compute stream finishes
+ // (cudaMemcpy implicitly synchronizes all other cuda operations)
+ cudaStreamSynchronize(mp->compute_stream);
+
// copies to CPU
print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
cudaMemcpyDeviceToHost),222);
@@ -753,7 +761,7 @@
max = 0.0f;
// determines max for: eps_trace_over_3_crust_mantle
- get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_eps_trace_over_3_crust_mantle,size,d_max);
+ get_maximum_scalar_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_eps_trace_over_3_crust_mantle,size,d_max);
print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
cudaMemcpyDeviceToHost),221);
@@ -785,7 +793,7 @@
max_eps = 0.0f;
// determines max for: epsilondev_xx_crust_mantle
- get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_epsilondev_xx_crust_mantle,size,d_max);
+ get_maximum_scalar_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_epsilondev_xx_crust_mantle,size,d_max);
print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
cudaMemcpyDeviceToHost),222);
@@ -796,7 +804,7 @@
max_eps = MAX(max_eps,max);
// determines max for: epsilondev_yy_crust_mantle
- get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_epsilondev_yy_crust_mantle,size,d_max);
+ get_maximum_scalar_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_epsilondev_yy_crust_mantle,size,d_max);
print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
cudaMemcpyDeviceToHost),223);
@@ -807,7 +815,7 @@
max_eps = MAX(max_eps,max);
// determines max for: epsilondev_xy_crust_mantle
- get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_epsilondev_xy_crust_mantle,size,d_max);
+ get_maximum_scalar_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_epsilondev_xy_crust_mantle,size,d_max);
print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
cudaMemcpyDeviceToHost),224);
@@ -818,7 +826,7 @@
max_eps = MAX(max_eps,max);
// determines max for: epsilondev_xz_crust_mantle
- get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_epsilondev_xz_crust_mantle,size,d_max);
+ get_maximum_scalar_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_epsilondev_xz_crust_mantle,size,d_max);
print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
cudaMemcpyDeviceToHost),225);
@@ -829,7 +837,7 @@
max_eps = MAX(max_eps,max);
// determines max for: epsilondev_yz_crust_mantle
- get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_epsilondev_yz_crust_mantle,size,d_max);
+ get_maximum_scalar_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_epsilondev_yz_crust_mantle,size,d_max);
print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
cudaMemcpyDeviceToHost),226);
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_add_sources_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_add_sources_elastic_cuda.cu 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_add_sources_elastic_cuda.cu 2014-01-13 16:06:21 UTC (rev 22991)
@@ -72,12 +72,9 @@
// note: for global version, sourcearrays has dimensions
// sourcearrays(NDIM,NGLLX,NGLLY,NGLLZ,NSOURCES)
- atomicAdd(&accel[3*iglob], sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
- 0,i,j,k,isource)]*stf);
- atomicAdd(&accel[3*iglob+1], sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
- 1,i,j,k,isource)]*stf);
- atomicAdd(&accel[3*iglob+2], sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
- 2,i,j,k,isource)]*stf);
+ atomicAdd(&accel[3*iglob], sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,0,i,j,k,isource)]*stf);
+ atomicAdd(&accel[3*iglob+1], sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,1,i,j,k,isource)]*stf);
+ atomicAdd(&accel[3*iglob+2], sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,2,i,j,k,isource)]*stf);
}
}
}
@@ -111,7 +108,7 @@
NSOURCES*sizeof(double),cudaMemcpyHostToDevice),71018);
// adds source contributions
- compute_add_sources_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
+ compute_add_sources_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_crust_mantle,
mp->d_ibool_crust_mantle,
mp->d_sourcearrays,
mp->d_stf_pre_compute,
@@ -157,7 +154,7 @@
print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
NSOURCES*sizeof(double),cudaMemcpyHostToDevice),71019);
- compute_add_sources_kernel<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
+ compute_add_sources_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_crust_mantle,
mp->d_ibool_crust_mantle,
mp->d_sourcearrays,
mp->d_stf_pre_compute,
@@ -202,14 +199,9 @@
iglob = ibool[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)]-1;
// atomic operations are absolutely necessary for correctness!
- atomicAdd(&accel[3*iglob], adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
- 0,i,j,k,irec_local)]);
-
- atomicAdd(&accel[3*iglob+1], adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
- 1,i,j,k,irec_local)]);
-
- atomicAdd(&accel[3*iglob+2], adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
- 2,i,j,k,irec_local)]);
+ atomicAdd(&accel[3*iglob], adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,0,i,j,k,irec_local)]);
+ atomicAdd(&accel[3*iglob+1], adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,1,i,j,k,irec_local)]);
+ atomicAdd(&accel[3*iglob+2], adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,2,i,j,k,irec_local)]);
}
}
@@ -218,12 +210,11 @@
extern "C"
void FC_FUNC_(compute_add_sources_adjoint_cuda,
COMPUTE_ADD_SOURCES_ADJOINT_CUDA)(long* Mesh_pointer,
- int* nrec,
- realw* h_adj_sourcearrays,
- int* h_islice_selected_rec,
- int* h_ispec_selected_rec,
- int* time_index) {
+ int* h_nrec) {
+// adds adjoint sources
+// note: call this routine after transfer_adj_to_device**() to have correct adjoint sourcearrays in array d_adj_sourcearrays
+
TRACE("compute_add_sources_adjoint_cuda");
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
@@ -231,47 +222,96 @@
// check if anything to do
if( mp->nadj_rec_local == 0 ) return;
+ // total number of receivers/adjoint sources
+ int nrec = *h_nrec;
+
+ // waits for previous transfer_** calls to be finished
+ if( GPU_ASYNC_COPY ){
+ // waits for asynchronous copy to finish
+ cudaStreamSynchronize(mp->copy_stream);
+ }
+
int num_blocks_x, num_blocks_y;
get_blocks_xy(mp->nadj_rec_local,&num_blocks_x,&num_blocks_y);
dim3 grid(num_blocks_x,num_blocks_y,1);
dim3 threads(NGLLX,NGLLX,NGLLX);
+ // the irec_local variable needs to be precomputed (as
+ // h_pre_comp..), because normally it is in the loop updating accel,
+ // and due to how it's incremented, it cannot be parallelized
+ compute_add_sources_adjoint_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_crust_mantle,
+ nrec,
+ mp->d_adj_sourcearrays,
+ mp->d_ibool_crust_mantle,
+ mp->d_ispec_selected_rec,
+ mp->d_pre_computed_irec,
+ mp->nadj_rec_local);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("compute_add_sources_adjoint_cuda");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// adjoint memory transfers
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_adj_to_device,
+ TRANSFER_ADJ_TO_DEVICE)(long* Mesh_pointer,
+ int* h_nrec,
+ realw* h_adj_sourcearrays,
+ int* h_islice_selected_rec) {
+
+// transfers adjoint source arrays synchronuously to GPU
+
+ TRACE("transfer_adj_to_device");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+ // check if anything to do
+ if( mp->nadj_rec_local == 0 ) return;
+
+ // total number of receivers/adjoint sources
+ int nrec = *h_nrec;
+
// build slice of adj_sourcearrays because full array is *very* large.
- // note: this extracts array values for local adjoint sources at given time step "time_index"
+ //
+ // note: this copies array values for local adjoint sources at given time step "iadj_vec(it)"
// from large adj_sourcearrays array into h_adj_sourcearrays_slice
- int i,j,k;
- int irec_local = 0;
- for(int irec = 0; irec < *nrec; irec++) {
+ //
+ // dimension of global array version
+ // adj_sourcearrays is (NDIM,NGLLX,NGLLY,NGLLZ,nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC)
+ // passed as function argument here is pointer to slice at time iadj_vec(it)
+ // which has dimension (NDIM,NGLLX,NGLLY,NGLLZ,nadj_rec_local)
+ int i,j,k,irec_local;
+
+ irec_local = 0;
+ for(int irec = 0; irec < nrec; irec++) {
if(mp->myrank == h_islice_selected_rec[irec]) {
- irec_local++;
-
// takes only local sources
for(k=0;k<NGLLX;k++) {
for(j=0;j<NGLLX;j++) {
for(i=0;i<NGLLX;i++) {
+ mp->h_adj_sourcearrays_slice[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,0,i,j,k,irec_local)]
+ = h_adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,0,i,j,k,irec_local)];
- // note: global version uses dimensions
- // h_adj_sourcearrays(NDIM,NGLLX,NGLLY,NGLLZ,nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC)
- mp->h_adj_sourcearrays_slice[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
- 0,i,j,k,irec_local-1)]
- = h_adj_sourcearrays[INDEX6(NDIM,NGLLX,NGLLX,NGLLX,mp->nadj_rec_local,
- 0,i,j,k,irec_local-1,*time_index-1)];
+ mp->h_adj_sourcearrays_slice[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,1,i,j,k,irec_local)]
+ = h_adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,1,i,j,k,irec_local)];
- mp->h_adj_sourcearrays_slice[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
- 1,i,j,k,irec_local-1)]
- = h_adj_sourcearrays[INDEX6(NDIM,NGLLX,NGLLX,NGLLX,mp->nadj_rec_local,
- 1,i,j,k,irec_local-1,*time_index-1)];
-
- mp->h_adj_sourcearrays_slice[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,
- 2,i,j,k,irec_local-1)]
- = h_adj_sourcearrays[INDEX6(NDIM,NGLLX,NGLLX,NGLLX,mp->nadj_rec_local,
- 2,i,j,k,irec_local-1,*time_index-1)];
+ mp->h_adj_sourcearrays_slice[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,2,i,j,k,irec_local)]
+ = h_adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,2,i,j,k,irec_local)];
}
}
}
+ // increases local receivers counter
+ irec_local++;
}
}
+
// check all local sources were added
if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
@@ -279,20 +319,85 @@
print_CUDA_error_if_any(cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
(mp->nadj_rec_local)*NDIM*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),71000);
-
- // the irec_local variable needs to be precomputed (as
- // h_pre_comp..), because normally it is in the loop updating accel,
- // and due to how it's incremented, it cannot be parallelized
- compute_add_sources_adjoint_cuda_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
- *nrec,
- mp->d_adj_sourcearrays,
- mp->d_ibool_crust_mantle,
- mp->d_ispec_selected_rec,
- mp->d_pre_computed_irec,
- mp->nadj_rec_local);
-
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("compute_add_sources_adjoint_cuda");
+ exit_on_cuda_error("transfer_adj_to_device");
#endif
}
+/* ----------------------------------------------------------------------------------------------- */
+
+
+extern "C"
+void FC_FUNC_(transfer_adj_to_device_async,
+ TRANSFER_ADJ_TO_DEVICE_ASYNC)(long* Mesh_pointer,
+ int* h_nrec,
+ realw* h_adj_sourcearrays,
+ int* h_islice_selected_rec) {
+
+// asynchronous transfer for next adjoint source arrays from host to device
+
+ TRACE("transfer_adj_to_device_async");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+ // check if anything to do
+ if( mp->nadj_rec_local == 0 ) return;
+
+ // checks async-memcpy
+ if( GPU_ASYNC_COPY == 0 ){
+ exit_on_error("transfer_adj_to_device_async must be called with GPU_ASYNC_COPY == 1, please check mesh_constants_cuda.h");
+ }
+
+ // total number of receivers/adjoint sources
+ int nrec = *h_nrec;
+
+ // build slice of adj_sourcearrays because full array is *very* large.
+ //
+ // note: this copies array values for local adjoint sources at given time step "iadj_vec(it)"
+ // from large adj_sourcearrays array into h_adj_sourcearrays_slice
+ //
+ // dimension of global array version
+ // adj_sourcearrays is (NDIM,NGLLX,NGLLY,NGLLZ,nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC)
+ // passed as function argument here is pointer to slice at time iadj_vec(it)
+ // which has dimension (NDIM,NGLLX,NGLLY,NGLLZ,nadj_rec_local)
+ int i,j,k,irec_local;
+
+ // waits for previous copy_stream call to be finished
+ cudaStreamSynchronize(mp->copy_stream);
+
+ irec_local = 0;
+ for(int irec = 0; irec < nrec; irec++) {
+ if(mp->myrank == h_islice_selected_rec[irec]) {
+ // takes only local sources
+ for(k=0;k<NGLLX;k++) {
+ for(j=0;j<NGLLX;j++) {
+ for(i=0;i<NGLLX;i++) {
+ mp->h_adj_sourcearrays_slice[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,0,i,j,k,irec_local)]
+ = h_adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,0,i,j,k,irec_local)];
+
+ mp->h_adj_sourcearrays_slice[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,1,i,j,k,irec_local)]
+ = h_adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,1,i,j,k,irec_local)];
+
+ mp->h_adj_sourcearrays_slice[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,2,i,j,k,irec_local)]
+ = h_adj_sourcearrays[INDEX5(NDIM,NGLLX,NGLLX,NGLLX,2,i,j,k,irec_local)];
+ }
+ }
+ }
+ // increases local receivers counter
+ irec_local++;
+ }
+ }
+
+ // check all local sources were added
+ if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
+
+ // waits for previous compute_add_sources_adjoint_cuda_kernel() call to be finished
+ cudaStreamSynchronize(mp->compute_stream);
+
+ // copies extracted array values onto GPU
+ // (asynchronous copy to GPU using copy_stream)
+ cudaMemcpyAsync(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,(mp->nadj_rec_local)*NDIM*NGLL3*sizeof(realw),
+ cudaMemcpyHostToDevice,mp->copy_stream);
+
+}
+
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_coupling_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_coupling_cuda.cu 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_coupling_cuda.cu 2014-01-13 16:06:21 UTC (rev 22991)
@@ -125,7 +125,7 @@
// launches GPU kernel
if( *FORWARD_OR_ADJOINT == 1 ){
- compute_coupling_fluid_CMB_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
+ compute_coupling_fluid_CMB_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ_crust_mantle,
mp->d_accel_outer_core,
mp->d_ibool_crust_mantle,
mp->d_ibelm_bottom_crust_mantle,
@@ -140,7 +140,7 @@
DEBUG_BACKWARD_COUPLING();
// adjoint simulations
- compute_coupling_fluid_CMB_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,
+ compute_coupling_fluid_CMB_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ_crust_mantle,
mp->d_b_accel_outer_core,
mp->d_ibool_crust_mantle,
mp->d_ibelm_bottom_crust_mantle,
@@ -245,7 +245,7 @@
// launches GPU kernel
if( *FORWARD_OR_ADJOINT == 1 ){
- compute_coupling_fluid_ICB_kernel<<<grid,threads>>>(mp->d_displ_inner_core,
+ compute_coupling_fluid_ICB_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ_inner_core,
mp->d_accel_outer_core,
mp->d_ibool_inner_core,
mp->d_ibelm_top_inner_core,
@@ -260,7 +260,7 @@
DEBUG_BACKWARD_COUPLING();
// adjoint simulations
- compute_coupling_fluid_ICB_kernel<<<grid,threads>>>(mp->d_b_displ_inner_core,
+ compute_coupling_fluid_ICB_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ_inner_core,
mp->d_b_accel_outer_core,
mp->d_ibool_inner_core,
mp->d_ibelm_top_inner_core,
@@ -371,7 +371,7 @@
// launches GPU kernel
if( *FORWARD_OR_ADJOINT == 1 ){
- compute_coupling_CMB_fluid_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
+ compute_coupling_CMB_fluid_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ_crust_mantle,
mp->d_accel_crust_mantle,
mp->d_accel_outer_core,
mp->d_ibool_crust_mantle,
@@ -390,7 +390,7 @@
DEBUG_BACKWARD_COUPLING();
// adjoint simulations
- compute_coupling_CMB_fluid_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,
+ compute_coupling_CMB_fluid_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ_crust_mantle,
mp->d_b_accel_crust_mantle,
mp->d_b_accel_outer_core,
mp->d_ibool_crust_mantle,
@@ -504,7 +504,7 @@
// launches GPU kernel
if( *FORWARD_OR_ADJOINT == 1 ){
- compute_coupling_ICB_fluid_kernel<<<grid,threads>>>(mp->d_displ_inner_core,
+ compute_coupling_ICB_fluid_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ_inner_core,
mp->d_accel_inner_core,
mp->d_accel_outer_core,
mp->d_ibool_inner_core,
@@ -523,7 +523,7 @@
DEBUG_BACKWARD_COUPLING();
// adjoint simulations
- compute_coupling_ICB_fluid_kernel<<<grid,threads>>>(mp->d_b_displ_inner_core,
+ compute_coupling_ICB_fluid_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ_inner_core,
mp->d_b_accel_inner_core,
mp->d_b_accel_outer_core,
mp->d_ibool_inner_core,
@@ -624,7 +624,7 @@
// uses corrected mass matrices
if( *FORWARD_OR_ADJOINT == 1 ){
- compute_coupling_ocean_cuda_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
+ compute_coupling_ocean_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_crust_mantle,
mp->d_rmassx_crust_mantle,
mp->d_rmassy_crust_mantle,
mp->d_rmassz_crust_mantle,
@@ -637,7 +637,7 @@
DEBUG_BACKWARD_COUPLING();
// for backward/reconstructed potentials
- compute_coupling_ocean_cuda_kernel<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
+ compute_coupling_ocean_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_crust_mantle,
mp->d_b_rmassx_crust_mantle,
mp->d_b_rmassy_crust_mantle,
mp->d_b_rmassz_crust_mantle,
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_crust_mantle_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_crust_mantle_cuda.cu 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_crust_mantle_cuda.cu 2014-01-13 16:06:21 UTC (rev 22991)
@@ -59,11 +59,11 @@
// templates for texture fetching
// FORWARD_OR_ADJOINT == 1 <- forward arrays
template<> __device__ float texfetch_displ_cm<1>(int x) { return tex1Dfetch(d_displ_cm_tex, x); }
-template<> __device__ float texfetch_veloc_cm<1>(int x) { return tex1Dfetch(d_veloc_cm_tex, x); }
+//template<> __device__ float texfetch_veloc_cm<1>(int x) { return tex1Dfetch(d_veloc_cm_tex, x); }
template<> __device__ float texfetch_accel_cm<1>(int x) { return tex1Dfetch(d_accel_cm_tex, x); }
// FORWARD_OR_ADJOINT == 3 <- backward/reconstructed arrays
template<> __device__ float texfetch_displ_cm<3>(int x) { return tex1Dfetch(d_b_displ_cm_tex, x); }
-template<> __device__ float texfetch_veloc_cm<3>(int x) { return tex1Dfetch(d_b_veloc_cm_tex, x); }
+//template<> __device__ float texfetch_veloc_cm<3>(int x) { return tex1Dfetch(d_b_veloc_cm_tex, x); }
template<> __device__ float texfetch_accel_cm<3>(int x) { return tex1Dfetch(d_b_accel_cm_tex, x); }
#endif
@@ -469,8 +469,8 @@
realw costwothetasq,costwophisq,sintwophisq;
realw etaminone,twoetaminone;
realw two_eta_aniso,four_eta_aniso,six_eta_aniso;
- realw two_rhovsvsq,two_rhovshsq; // two_rhovpvsq,two_rhovphsq
- realw four_rhovsvsq,four_rhovshsq; // four_rhovpvsq,four_rhovphsq
+ realw two_rhovsvsq,two_rhovshsq;
+ realw four_rhovsvsq,four_rhovshsq;
realw c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66;
// cosine and sine function in CUDA only supported for float
@@ -567,23 +567,17 @@
twoetaminone = 2.0f * eta_aniso - 1.0f;
// precompute some products to reduce the CPU time
-
two_eta_aniso = 2.0f * eta_aniso;
four_eta_aniso = 4.0f * eta_aniso;
six_eta_aniso = 6.0f * eta_aniso;
- //two_rhovpvsq = 2.0f * rhovpvsq;
- //two_rhovphsq = 2.0f * rhovphsq;
two_rhovsvsq = 2.0f * rhovsvsq;
two_rhovshsq = 2.0f * rhovshsq;
- //four_rhovpvsq = 4.0f * rhovpvsq;
- //four_rhovphsq = 4.0f * rhovphsq;
four_rhovsvsq = 4.0f * rhovsvsq;
four_rhovshsq = 4.0f * rhovshsq;
// the 21 anisotropic coefficients computed using Mathematica
-
c11 = rhovphsq*sinphifour + 2.0f*cosphisq*sinphisq*
(rhovphsq*costhetasq + (eta_aniso*rhovphsq + two_rhovsvsq - two_eta_aniso*rhovsvsq)*
sinthetasq) + cosphifour*
@@ -1285,7 +1279,6 @@
// update memory variables based upon the Runge-Kutta scheme
if( ATTENUATION && ( ! PARTIAL_PHYS_DISPERSION_ONLY ) ){
-
compute_element_cm_att_memory(tx,working_element,
d_muvstore,
factor_common,alphaval,betaval,gammaval,
@@ -1378,7 +1371,7 @@
// cudaEventRecord( start, 0 );
if( FORWARD_OR_ADJOINT == 1 ){
- Kernel_2_crust_mantle_impl<<<grid,threads>>>(nb_blocks_to_compute,
+ Kernel_2_crust_mantle_impl<<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
mp->NGLOB_CRUST_MANTLE,
d_ibool,
d_ispec_is_tiso,
@@ -1428,7 +1421,7 @@
// debug
DEBUG_BACKWARD_FORCES();
- Kernel_2_crust_mantle_impl<<< grid,threads>>>(nb_blocks_to_compute,
+ Kernel_2_crust_mantle_impl<<< grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
mp->NGLOB_CRUST_MANTLE,
d_ibool,
d_ispec_is_tiso,
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_inner_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_inner_core_cuda.cu 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_inner_core_cuda.cu 2014-01-13 16:06:21 UTC (rev 22991)
@@ -978,7 +978,7 @@
// cudaEventCreate(&stop);
// cudaEventRecord( start, 0 );
if( FORWARD_OR_ADJOINT == 1 ){
- Kernel_2_inner_core_impl<<<grid,threads>>>(nb_blocks_to_compute,
+ Kernel_2_inner_core_impl<<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
mp->NGLOB_INNER_CORE,
d_ibool,
d_idoubling,
@@ -1026,7 +1026,7 @@
// debug
DEBUG_BACKWARD_FORCES();
- Kernel_2_inner_core_impl<<< grid,threads>>>(nb_blocks_to_compute,
+ Kernel_2_inner_core_impl<<< grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
mp->NGLOB_INNER_CORE,
d_ibool,
d_idoubling,
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_outer_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_outer_core_cuda.cu 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_forces_outer_core_cuda.cu 2014-01-13 16:06:21 UTC (rev 22991)
@@ -494,7 +494,7 @@
// cudaEventRecord( start, 0 );
if( FORWARD_OR_ADJOINT == 1 ){
- Kernel_2_outer_core_impl<<<grid,threads>>>(nb_blocks_to_compute,
+ Kernel_2_outer_core_impl<<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
mp->NGLOB_OUTER_CORE,
d_ibool,
mp->d_phase_ispec_inner_outer_core,
@@ -525,7 +525,7 @@
// debug
DEBUG_BACKWARD_FORCES();
- Kernel_2_outer_core_impl<<<grid,threads>>>(nb_blocks_to_compute,
+ Kernel_2_outer_core_impl<<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
mp->NGLOB_OUTER_CORE,
d_ibool,
mp->d_phase_ispec_inner_outer_core,
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_kernels_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_kernels_cuda.cu 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_kernels_cuda.cu 2014-01-13 16:06:21 UTC (rev 22991)
@@ -515,7 +515,7 @@
dim3 threads(blocksize,1,1);
// density kernel
- compute_kernels_rho_cudakernel<<<grid,threads>>>(mp->d_ibool_crust_mantle,
+ compute_kernels_rho_cudakernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_ibool_crust_mantle,
mp->d_accel_crust_mantle,
mp->d_b_displ_crust_mantle,
mp->d_rho_kl_crust_mantle,
@@ -533,7 +533,7 @@
// computes strain locally based on current backward/reconstructed (b_displ) wavefield
if(! mp->anisotropic_kl){
// isotropic kernels
- compute_kernels_iso_undo_att_cudakernel<<<grid,threads>>>(mp->d_epsilondev_xx_crust_mantle,
+ compute_kernels_iso_undo_att_cudakernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_epsilondev_xx_crust_mantle,
mp->d_epsilondev_yy_crust_mantle,
mp->d_epsilondev_xy_crust_mantle,
mp->d_epsilondev_xz_crust_mantle,
@@ -551,7 +551,7 @@
mp->d_hprime_xx);
}else{
// anisotropic kernels
- compute_kernels_ani_undo_att_cudakernel<<<grid,threads>>>(mp->d_epsilondev_xx_crust_mantle,
+ compute_kernels_ani_undo_att_cudakernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_epsilondev_xx_crust_mantle,
mp->d_epsilondev_yy_crust_mantle,
mp->d_epsilondev_xy_crust_mantle,
mp->d_epsilondev_xz_crust_mantle,
@@ -572,7 +572,7 @@
// takes strain arrays computed from previous compute_forces call
if(! mp->anisotropic_kl){
// isotropic kernels
- compute_kernels_iso_cudakernel<<<grid,threads>>>(mp->d_epsilondev_xx_crust_mantle,
+ compute_kernels_iso_cudakernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_epsilondev_xx_crust_mantle,
mp->d_epsilondev_yy_crust_mantle,
mp->d_epsilondev_xy_crust_mantle,
mp->d_epsilondev_xz_crust_mantle,
@@ -590,7 +590,7 @@
deltat);
}else{
// anisotropic kernels
- compute_kernels_ani_cudakernel<<<grid,threads>>>(mp->d_epsilondev_xx_crust_mantle,
+ compute_kernels_ani_cudakernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_epsilondev_xx_crust_mantle,
mp->d_epsilondev_yy_crust_mantle,
mp->d_epsilondev_xy_crust_mantle,
mp->d_epsilondev_xz_crust_mantle,
@@ -640,7 +640,7 @@
// only isotropic kernels in inner core so far implemented
// density kernel
- compute_kernels_rho_cudakernel<<<grid,threads>>>(mp->d_ibool_inner_core,
+ compute_kernels_rho_cudakernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_ibool_inner_core,
mp->d_accel_inner_core,
mp->d_b_displ_inner_core,
mp->d_rho_kl_inner_core,
@@ -657,7 +657,7 @@
// computes strain locally based on current backward/reconstructed (b_displ) wavefield
// isotropic kernels (shear, bulk)
- compute_kernels_iso_undo_att_cudakernel<<<grid,threads>>>(mp->d_epsilondev_xx_inner_core,
+ compute_kernels_iso_undo_att_cudakernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_epsilondev_xx_inner_core,
mp->d_epsilondev_yy_inner_core,
mp->d_epsilondev_xy_inner_core,
mp->d_epsilondev_xz_inner_core,
@@ -678,7 +678,7 @@
// takes strain arrays computed from previous compute_forces call
// isotropic kernels (shear, bulk)
- compute_kernels_iso_cudakernel<<<grid,threads>>>(mp->d_epsilondev_xx_inner_core,
+ compute_kernels_iso_cudakernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_epsilondev_xx_inner_core,
mp->d_epsilondev_yy_inner_core,
mp->d_epsilondev_xy_inner_core,
mp->d_epsilondev_xz_inner_core,
@@ -881,7 +881,7 @@
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
- compute_kernels_acoustic_kernel<<<grid,threads>>>(mp->d_ibool_outer_core,
+ compute_kernels_acoustic_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_ibool_outer_core,
mp->d_rhostore_outer_core,
mp->d_kappavstore_outer_core,
mp->d_hprime_xx,
@@ -975,7 +975,7 @@
cudaMemcpyHostToDevice),90900);
// calculates noise strength kernel
- compute_kernels_strength_noise_cuda_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
+ compute_kernels_strength_noise_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ_crust_mantle,
mp->d_ibelm_top_crust_mantle,
mp->d_ibool_crust_mantle,
mp->d_noise_surface_movie,
@@ -1032,7 +1032,7 @@
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
// checks
- if( ! mp->approximate_hess_kl ){exit_on_cuda_error("approximate_hess_kl flag not properly initialized");}
+ if( ! mp->approximate_hess_kl ){exit_on_error("approximate_hess_kl flag not properly initialized");}
int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ
realw deltat = *deltat_f;
@@ -1043,7 +1043,7 @@
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
- compute_kernels_hess_cudakernel<<<grid,threads>>>(mp->d_ibool_crust_mantle,
+ compute_kernels_hess_cudakernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_ibool_crust_mantle,
mp->d_accel_crust_mantle,
mp->d_b_accel_crust_mantle,
mp->d_hess_kl_crust_mantle,
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_stacey_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_stacey_acoustic_cuda.cu 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_stacey_acoustic_cuda.cu 2014-01-13 16:06:21 UTC (rev 22991)
@@ -230,7 +230,7 @@
break;
default:
- exit_on_cuda_error("compute_stacey_acoustic_cuda: unknown interface type");
+ exit_on_error("compute_stacey_acoustic_cuda: unknown interface type");
break;
}
@@ -251,7 +251,7 @@
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
- compute_stacey_acoustic_kernel<<<grid,threads>>>(mp->d_veloc_outer_core,
+ compute_stacey_acoustic_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_veloc_outer_core,
mp->d_accel_outer_core,
interface_type,
num_abs_boundary_faces,
@@ -270,7 +270,11 @@
d_b_absorb_potential);
// adjoint simulations: stores absorbed wavefield part
- if( mp->save_forward && num_abs_boundary_faces > 0 ){
+ if( mp->save_forward ){
+ // explicitly waits until previous compute stream finishes
+ // (cudaMemcpy implicitly synchronizes all other cuda operations)
+ cudaStreamSynchronize(mp->compute_stream);
+
// copies array to CPU
print_CUDA_error_if_any(cudaMemcpy(absorb_potential,d_b_absorb_potential,
NGLL2*num_abs_boundary_faces*sizeof(realw),
@@ -437,7 +441,7 @@
break;
default:
- exit_on_cuda_error("compute_stacey_acoustic_cuda: unknown interface type");
+ exit_on_error("compute_stacey_acoustic_cuda: unknown interface type");
break;
}
@@ -466,7 +470,7 @@
cudaMemcpyHostToDevice),7700);
}
- compute_stacey_acoustic_backward_kernel<<<grid,threads>>>(mp->d_b_accel_outer_core,
+ compute_stacey_acoustic_backward_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_outer_core,
d_b_absorb_potential,
interface_type,
num_abs_boundary_faces,
@@ -552,7 +556,7 @@
break;
default:
- exit_on_cuda_error("compute_stacey_acoustic_undoatt_cuda: unknown interface type");
+ exit_on_error("compute_stacey_acoustic_undoatt_cuda: unknown interface type");
break;
}
@@ -573,7 +577,7 @@
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
- compute_stacey_acoustic_kernel<<<grid,threads>>>(mp->d_b_veloc_outer_core,
+ compute_stacey_acoustic_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_veloc_outer_core,
mp->d_b_accel_outer_core,
interface_type,
num_abs_boundary_faces,
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_stacey_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_stacey_elastic_cuda.cu 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/compute_stacey_elastic_cuda.cu 2014-01-13 16:06:21 UTC (rev 22991)
@@ -237,7 +237,7 @@
break;
default:
- exit_on_cuda_error("compute_stacey_elastic_cuda: unknown interface type");
+ exit_on_error("compute_stacey_elastic_cuda: unknown interface type");
break;
}
@@ -259,7 +259,7 @@
dim3 threads(blocksize,1,1);
// absorbing boundary contributions
- compute_stacey_elastic_kernel<<<grid,threads>>>(mp->d_veloc_crust_mantle,
+ compute_stacey_elastic_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_veloc_crust_mantle,
mp->d_accel_crust_mantle,
interface_type,
num_abs_boundary_faces,
@@ -281,7 +281,11 @@
// adjoint simulations: stores absorbed wavefield part
- if(mp->save_forward && num_abs_boundary_faces > 0 ) {
+ if( mp->save_forward ) {
+ // explicitly waits until previous compute stream finishes
+ // (cudaMemcpy implicitly synchronizes all other cuda operations)
+ cudaStreamSynchronize(mp->compute_stream);
+
// copies array to CPU
print_CUDA_error_if_any(cudaMemcpy(absorb_field,d_b_absorb_field,
NDIM*NGLL2*num_abs_boundary_faces*sizeof(realw),cudaMemcpyDeviceToHost),7701);
@@ -438,7 +442,7 @@
break;
default:
- exit_on_cuda_error("compute_stacey_elastic_cuda: unknown interface type");
+ exit_on_error("compute_stacey_elastic_cuda: unknown interface type");
break;
}
@@ -465,7 +469,7 @@
NDIM*NGLL2*num_abs_boundary_faces*sizeof(realw),cudaMemcpyHostToDevice),7700);
// absorbing boundary contributions
- compute_stacey_elastic_backward_kernel<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
+ compute_stacey_elastic_backward_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_crust_mantle,
d_b_absorb_field,
interface_type,
num_abs_boundary_faces,
@@ -548,7 +552,7 @@
break;
default:
- exit_on_cuda_error("compute_stacey_elastic_undoatt_cuda: unknown interface type");
+ exit_on_error("compute_stacey_elastic_undoatt_cuda: unknown interface type");
break;
}
@@ -570,7 +574,7 @@
dim3 threads(blocksize,1,1);
// absorbing boundary contributions
- compute_stacey_elastic_kernel<<<grid,threads>>>(mp->d_b_veloc_crust_mantle,
+ compute_stacey_elastic_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_veloc_crust_mantle,
mp->d_b_accel_crust_mantle,
interface_type,
num_abs_boundary_faces,
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/initialize_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/initialize_cuda.cu 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/initialize_cuda.cu 2014-01-13 16:06:21 UTC (rev 22991)
@@ -88,6 +88,8 @@
// Gets number of GPU devices
device_count = 0;
cudaGetDeviceCount(&device_count);
+
+ // checks if command failed
exit_on_cuda_error("CUDA runtime error: cudaGetDeviceCount failed\n\nplease check if driver and runtime libraries work together\nor on titan enable environment: CRAY_CUDA_PROXY=1 to use single GPU with multiple MPI processes\n\nexiting...\n");
// returns device count to fortran
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/mesh_constants_cuda.h 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/mesh_constants_cuda.h 2014-01-13 16:06:21 UTC (rev 22991)
@@ -52,6 +52,7 @@
/* ----------------------------------------------------------------------------------------------- */
+// debug: outputs traces
#define DEBUG 0
#if DEBUG == 1
#define TRACE(x) printf("%s\n",x);
@@ -59,6 +60,7 @@
#define TRACE(x) // printf("%s\n",x);
#endif
+// more outputs
#define MAXDEBUG 0
#if MAXDEBUG == 1
#define LOG(x) printf("%s\n",x)
@@ -90,13 +92,14 @@
#define DEBUG_BACKWARD_UPDATE()
#endif
-
// error checking after cuda function calls
-#define ENABLE_VERY_SLOW_ERROR_CHECKING
+// (note: this synchronizes many calls, thus e.g. no asynchronuous memcpy possible)
+//#define ENABLE_VERY_SLOW_ERROR_CHECKING
// maximum function
#define MAX(x,y) (((x) < (y)) ? (y) : (x))
+
/* ----------------------------------------------------------------------------------------------- */
// cuda constant arrays
@@ -122,7 +125,8 @@
// region ids
#define IREGION_CRUST_MANTLE 1
-#define IREGION_INNER_CORE 3
+#define IREGION_OUTER_CORE 2
+#define IREGION_INNER_CORE 3
// inner core : fictitious elements id (from constants.h)
#define IFLAG_IN_FICTITIOUS_CUBE 11
@@ -132,6 +136,9 @@
// uncomment line below for PREM with oceans
//#define R_EARTH_KM 6368.0f
+// Asynchronuous memory copies between GPU and CPU
+#define GPU_ASYNC_COPY 1
+
/* ----------------------------------------------------------------------------------------------- */
// (optional) pre-processing directive used in kernels: if defined check that it is also set in src/shared/constants.h:
@@ -199,22 +206,14 @@
// indexing
-#define INDEX2(xsize,x,y) x + (y)*xsize
+#define INDEX2(isize,i,j) i + isize*j
+#define INDEX3(isize,jsize,i,j,k) i + isize*(j + jsize*k)
+#define INDEX4(isize,jsize,ksize,i,j,k,x) i + isize*(j + jsize*(k + ksize*x))
+#define INDEX5(isize,jsize,ksize,xsize,i,j,k,x,y) i + isize*(j + jsize*(k + ksize*(x + xsize*y)))
-#define INDEX3(xsize,ysize,x,y,z) x + xsize*(y + ysize*z)
-//#define INDEX3(xsize,ysize,x,y,z) x + (y)*xsize + (z)*xsize*ysize
+//#define INDEX6(isize,jsize,ksize,xsize,ysize,i,j,k,x,y,z) i + isize*(j + jsize*(k + ksize*(x + xsize*(y + ysize*z))))
+//#define INDEX4_PADDED(isize,jsize,ksize,i,j,k,x) i + isize*(j + jsize*k) + x*NGLL3_PADDED
-#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*(z + zsize*i))
-//#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize
-
-#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + xsize*(y + ysize*(z + zsize*(i + isize*(j))))
-//#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize + (j)*xsize*ysize*zsize*isize
-
-#define INDEX6(xsize,ysize,zsize,isize,jsize,x,y,z,i,j,k) x + xsize*(y + ysize*(z + zsize*(i + isize*(j + jsize*k))))
-
-#define INDEX4_PADDED(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*z) + (i)*NGLL3_PADDED
-//#define INDEX4_PADDED(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*NGLL3_PADDED
-
/* ----------------------------------------------------------------------------------------------- */
// type of "working" variables: see also CUSTOM_REAL
@@ -625,10 +624,15 @@
int* d_number_receiver_global;
int* d_ispec_selected_rec;
int* d_islice_selected_rec;
+
int nrec_local;
+
realw* d_station_seismo_field;
realw* h_station_seismo_field;
+ realw* d_station_strain_field;
+ realw* h_station_strain_field;
+
// adjoint receivers/sources
int nadj_rec_local;
realw* d_adj_sourcearrays;
@@ -731,6 +735,33 @@
// noise sensitivity kernel
realw* d_Sigma_kl;
+ // ------------------------------------------------------------------ //
+ // optimizations
+ // ------------------------------------------------------------------ //
+
+ // overlapped memcpy streams
+ cudaStream_t compute_stream;
+ cudaStream_t copy_stream;
+
+ // A buffer for mpi-send/recv, which is duplicated in fortran but is
+ // allocated with pinned memory to facilitate asynchronus device <->
+ // host memory transfers
+ // crust/mantle
+ float* h_send_accel_buffer_cm;
+ float* h_recv_accel_buffer_cm;
+ float* h_b_send_accel_buffer_cm;
+ float* h_b_recv_accel_buffer_cm;
+ // inner core
+ float* h_send_accel_buffer_ic;
+ float* h_recv_accel_buffer_ic;
+ float* h_b_send_accel_buffer_ic;
+ float* h_b_recv_accel_buffer_ic;
+ // outer core
+ float* h_send_accel_buffer_oc;
+ float* h_recv_accel_buffer_oc;
+ float* h_b_send_accel_buffer_oc;
+ float* h_b_recv_accel_buffer_oc;
+
} Mesh;
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/prepare_mesh_constants_cuda.cu 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/prepare_mesh_constants_cuda.cu 2014-01-13 16:06:21 UTC (rev 22991)
@@ -195,7 +195,7 @@
print_CUDA_error_if_any(cudaBindTexture(0, d_hprime_xx_ic_tex_ptr, mp->d_hprime_xx,
&channelDesc, sizeof(realw)*(NGLL2)), 1106);
#else
- cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<realw>();
print_CUDA_error_if_any(cudaBindTexture(0, &d_hprime_xx_cm_tex, mp->d_hprime_xx,
&channelDesc, sizeof(realw)*(NGLL2)), 1102);
print_CUDA_error_if_any(cudaBindTexture(0, &d_hprime_xx_oc_tex, mp->d_hprime_xx,
@@ -282,10 +282,24 @@
// for seismograms
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_field),
NDIM*NGLL3*(mp->nrec_local)*sizeof(realw)),4015);
+ // for transfering values from GPU to CPU
+ if( GPU_ASYNC_COPY ){
+ // todo
+ // only pinned memory can handle memcpy calls asynchronuously
+ print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_station_seismo_field),
+ NDIM*NGLL3*(mp->nrec_local)*sizeof(realw)),4015);
+ }else{
+ mp->h_station_seismo_field = (realw*) malloc( NDIM*NGLL3*(mp->nrec_local)*sizeof(realw) );
+ if( mp->h_station_seismo_field == NULL) exit_on_error("h_station_seismo_field not allocated \n");
+ }
- mp->h_station_seismo_field = (realw*) malloc( NDIM*NGLL3*(mp->nrec_local)*sizeof(realw) );
- if( mp->h_station_seismo_field == NULL) exit_on_error("h_station_seismo_field not allocated \n");
-
+ // for adjoint strain
+ if( mp->simulation_type == 2 ){
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_strain_field),
+ NGLL3*(mp->nrec_local)*sizeof(realw)),4015);
+ mp->h_station_strain_field = (realw*) malloc( NGLL3*(mp->nrec_local)*sizeof(realw) );
+ if( mp->h_station_strain_field == NULL) exit_on_error("h_station_strain_field not allocated \n");
+ }
}
copy_todevice_int((void**)&mp->d_ispec_selected_rec,h_ispec_selected_rec,(*nrec));
@@ -318,12 +332,17 @@
free(h_pre_computed_irec);
// temporary array to prepare extracted source array values
- mp->h_adj_sourcearrays_slice = (realw*) malloc( (mp->nadj_rec_local)*NDIM*NGLL3*sizeof(realw) );
- if( mp->h_adj_sourcearrays_slice == NULL ) exit_on_error("h_adj_sourcearrays_slice not allocated\n");
+ if( GPU_ASYNC_COPY ){
+ // note: Allocate pinned buffers otherwise cudaMemcpyAsync() will behave like cudaMemcpy(), i.e. synchronuously.
+ print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_adj_sourcearrays_slice),
+ (mp->nadj_rec_local)*NDIM*NGLL3*sizeof(realw)),6011);
+ }else{
+ mp->h_adj_sourcearrays_slice = (realw*) malloc( (mp->nadj_rec_local)*NDIM*NGLL3*sizeof(realw) );
+ if( mp->h_adj_sourcearrays_slice == NULL ) exit_on_error("h_adj_sourcearrays_slice not allocated\n");
+ }
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_adj_sourcearrays,
(mp->nadj_rec_local)*NDIM*NGLL3*sizeof(realw)),6003);
-
}
// for rotation and new attenuation
@@ -335,6 +354,13 @@
mp->two_omega_earth = 0.f;
mp->b_two_omega_earth = 0.f;
+ // setup two streams, one for compute and one for host<->device memory copies
+ // uses pinned memory for asynchronuous data transfers
+ // compute stream
+ cudaStreamCreate(&mp->compute_stream);
+ // copy stream (needed to transfer mpi buffers)
+ cudaStreamCreate(&mp->copy_stream);
+
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("prepare_constants_device");
#endif
@@ -362,14 +388,13 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f);
// arrays only needed when rotation is required
- if( ! mp->rotation ){
- exit_on_cuda_error("prepare_fields_rotation_device: rotation flag not properly initialized");
- }
+ if( ! mp->rotation ){exit_on_error("prepare_fields_rotation_device: rotation flag not properly initialized");}
+
// checks array size
if( *NSPEC_OUTER_CORE_ROTATION != mp->NSPEC_OUTER_CORE){
printf("error prepare_fields_rotation_device: rotation array has wrong size: %d instead of %d\n",
*NSPEC_OUTER_CORE_ROTATION,mp->NSPEC_OUTER_CORE);
- exit_on_cuda_error("prepare_fields_rotation_device: rotation array has wrong size");
+ exit_on_error("prepare_fields_rotation_device: rotation array has wrong size");
}
// rotation arrays (needed only for outer core region)
@@ -482,7 +507,7 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f);
// checks flag
- if( ! mp->attenuation ){ exit_on_cuda_error("prepare_fields_attenuat_device attenuation not properly initialized"); }
+ if( ! mp->attenuation ){ exit_on_error("prepare_fields_attenuat_device attenuation not properly initialized"); }
// crust_mantle
R_size1 = N_SLS*NGLL3*mp->NSPEC_CRUST_MANTLE;
@@ -605,7 +630,7 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f);
// checks flag
- if( ! mp->compute_and_store_strain ){ exit_on_cuda_error("prepare_fields_strain_device strain not properly initialized"); }
+ if( ! mp->compute_and_store_strain ){ exit_on_error("prepare_fields_strain_device strain not properly initialized"); }
// crust_mantle
R_size = NGLL3*mp->NSPEC_CRUST_MANTLE;
@@ -718,7 +743,7 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f);
// checks flag
- if( ! mp->absorbing_conditions ){ exit_on_cuda_error("prepare_fields_absorb_device absorbing_conditions not properly initialized"); }
+ if( ! mp->absorbing_conditions ){ exit_on_error("prepare_fields_absorb_device absorbing_conditions not properly initialized"); }
// crust_mantle
mp->nspec2D_xmin_crust_mantle = *nspec2D_xmin_crust_mantle;
@@ -901,6 +926,7 @@
TRACE("prepare_mpi_buffers_device");
Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+ int size_mpi_buffer;
// prepares interprocess-edge exchange information
@@ -914,14 +940,29 @@
// ibool entries (iglob indices) values on interface
copy_todevice_int((void**)&mp->d_ibool_interfaces_crust_mantle,ibool_interfaces_crust_mantle,
(mp->num_interfaces_crust_mantle)*(mp->max_nibool_interfaces_cm));
+
+ size_mpi_buffer = NDIM*(mp->max_nibool_interfaces_cm)*(mp->num_interfaces_crust_mantle);
+
// allocates mpi buffer for exchange with cpu
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_crust_mantle),
- NDIM*(mp->max_nibool_interfaces_cm)*(mp->num_interfaces_crust_mantle)*sizeof(realw)),4004);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_crust_mantle),size_mpi_buffer*sizeof(realw)),4004);
if( mp->simulation_type == 3){
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_send_accel_buffer_crust_mantle),
- NDIM*(mp->max_nibool_interfaces_cm)*(mp->num_interfaces_crust_mantle)*sizeof(realw)),4004);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_send_accel_buffer_crust_mantle),size_mpi_buffer*sizeof(realw)),4004);
}
+ // asynchronuous MPI buffer
+ if( GPU_ASYNC_COPY ){
+ // note: Allocate pinned mpi-buffers.
+ // MPI buffers use pinned memory allocated by cudaMallocHost, which
+ // enables the use of asynchronous memory copies from host <-> device
+ // send buffer
+ print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_accel_buffer_cm),sizeof(realw)* size_mpi_buffer ),8004);
+ // receive buffer
+ print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_recv_accel_buffer_cm),sizeof(realw)* size_mpi_buffer ),8004);
+ if( mp->simulation_type == 3){
+ print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_b_send_accel_buffer_cm),sizeof(realw)* size_mpi_buffer ),8004);
+ print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_b_recv_accel_buffer_cm),sizeof(realw)* size_mpi_buffer ),8004);
+ }
+ }
}
// inner core mesh
@@ -934,14 +975,30 @@
// ibool entries (iglob indices) values on interface
copy_todevice_int((void**)&mp->d_ibool_interfaces_inner_core,ibool_interfaces_inner_core,
(mp->num_interfaces_inner_core)*(mp->max_nibool_interfaces_ic));
+
+ size_mpi_buffer = NDIM*(mp->max_nibool_interfaces_ic)*(mp->num_interfaces_inner_core);
+
// allocates mpi buffer for exchange with cpu
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_inner_core),
- NDIM*(mp->max_nibool_interfaces_ic)*(mp->num_interfaces_inner_core)*sizeof(realw)),4004);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_inner_core),size_mpi_buffer*sizeof(realw)),4004);
if( mp->simulation_type == 3){
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_send_accel_buffer_inner_core),
- NDIM*(mp->max_nibool_interfaces_ic)*(mp->num_interfaces_inner_core)*sizeof(realw)),4004);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_send_accel_buffer_inner_core),size_mpi_buffer*sizeof(realw)),4004);
}
+ // asynchronuous MPI buffer
+ if( GPU_ASYNC_COPY ){
+ // note: Allocate pinned mpi-buffers.
+ // MPI buffers use pinned memory allocated by cudaMallocHost, which
+ // enables the use of asynchronous memory copies from host <-> device
+ // send buffer
+ print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_accel_buffer_ic),sizeof(realw)*size_mpi_buffer ),8004);
+ // receive buffer
+ print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_recv_accel_buffer_ic),sizeof(realw)*size_mpi_buffer ),8004);
+ // adjoint
+ if( mp->simulation_type == 3){
+ print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_b_send_accel_buffer_ic),sizeof(realw)*size_mpi_buffer ),8004);
+ print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_b_recv_accel_buffer_ic),sizeof(realw)*size_mpi_buffer ),8004);
+ }
+ }
}
// outer core mesh
@@ -955,14 +1012,31 @@
// ibool entries (iglob indices) values on interface
copy_todevice_int((void**)&mp->d_ibool_interfaces_outer_core,ibool_interfaces_outer_core,
(mp->num_interfaces_outer_core)*(mp->max_nibool_interfaces_oc));
+
+ size_mpi_buffer = (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core);
+
// allocates mpi buffer for exchange with cpu
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_outer_core),
- (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw)),4004);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_outer_core),size_mpi_buffer*sizeof(realw)),4004);
if( mp->simulation_type == 3){
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_send_accel_buffer_outer_core),
- (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw)),4004);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_send_accel_buffer_outer_core),size_mpi_buffer*sizeof(realw)),4004);
}
+
+ // asynchronuous MPI buffer
+ if( GPU_ASYNC_COPY ){
+ // note: Allocate pinned mpi-buffers.
+ // MPI buffers use pinned memory allocated by cudaMallocHost, which
+ // enables the use of asynchronous memory copies from host <-> device
+ // send buffer
+ print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_accel_buffer_oc),sizeof(realw)*size_mpi_buffer ),8004);
+ // receive buffer
+ print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_recv_accel_buffer_oc),sizeof(realw)*size_mpi_buffer ),8004);
+ if( mp->simulation_type == 3){
+ print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_b_send_accel_buffer_oc),sizeof(realw)*size_mpi_buffer ),8004);
+ print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_b_recv_accel_buffer_oc),sizeof(realw)*size_mpi_buffer ),8004);
+ }
+ }
}
+
}
/* ----------------------------------------------------------------------------------------------- */
@@ -999,7 +1073,7 @@
NDIM*NGLL2*(mp->nspec2D_top_crust_mantle)*sizeof(realw)),7005);
}else{
// for global mesh: each crust/mantle slice should have at top a free surface
- exit_on_cuda_error("prepare_fields_noise_device NSPEC_TOP not properly initialized");
+ exit_on_error("prepare_fields_noise_device NSPEC_TOP not properly initialized");
}
@@ -1057,7 +1131,7 @@
mp->npoin_oceans = *npoin_oceans;
// checks for global partitions, each slice must have a top surface with points on it
- if( mp->npoin_oceans == 0 ){ exit_on_cuda_error("prepare_oceans_device has zero npoin_oceans"); }
+ if( mp->npoin_oceans == 0 ){ exit_on_error("prepare_oceans_device has zero npoin_oceans"); }
// global point indices
copy_todevice_int((void**)&mp->d_ibool_ocean_load,h_iglob_ocean_load,mp->npoin_oceans);
@@ -1373,7 +1447,7 @@
&channelDesc, sizeof(realw)*size), 4023);
}
#else
- cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<realw>();
print_CUDA_error_if_any(cudaBindTexture(0, &d_displ_cm_tex, mp->d_displ_crust_mantle,
&channelDesc, sizeof(realw)*size), 4021);
print_CUDA_error_if_any(cudaBindTexture(0, &d_accel_cm_tex, mp->d_accel_crust_mantle,
@@ -1619,7 +1693,7 @@
&channelDesc, sizeof(realw)*size_glob), 5023);
}
#else
- cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<realw>();
print_CUDA_error_if_any(cudaBindTexture(0, &d_displ_oc_tex, mp->d_displ_outer_core,
&channelDesc, sizeof(realw)*size_glob), 5021);
print_CUDA_error_if_any(cudaBindTexture(0, &d_accel_oc_tex, mp->d_accel_outer_core,
@@ -1866,7 +1940,7 @@
}
#else
- cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<realw>();
print_CUDA_error_if_any(cudaBindTexture(0, &d_displ_ic_tex, mp->d_displ_inner_core,
&channelDesc, sizeof(realw)*size), 6021);
print_CUDA_error_if_any(cudaBindTexture(0, &d_accel_ic_tex, mp->d_accel_inner_core,
@@ -1949,6 +2023,9 @@
// frees allocated memory arrays
Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+ // synchronizes device
+ synchronize_cuda();
+
// frees memory on GPU
//------------------------------------------
@@ -1968,14 +2045,26 @@
if( mp->nrec_local > 0 ) {
cudaFree(mp->d_number_receiver_global);
cudaFree(mp->d_station_seismo_field);
- free(mp->h_station_seismo_field);
+ if( GPU_ASYNC_COPY){
+ cudaFreeHost(mp->h_station_seismo_field);
+ }else{
+ free(mp->h_station_seismo_field);
+ }
+ if( mp->simulation_type == 2 ){
+ cudaFree(mp->d_station_strain_field);
+ free(mp->h_station_strain_field);
+ }
}
cudaFree(mp->d_ispec_selected_rec);
if( mp->nadj_rec_local > 0 ){
cudaFree(mp->d_adj_sourcearrays);
cudaFree(mp->d_pre_computed_irec);
- free(mp->h_adj_sourcearrays_slice);
+ if( GPU_ASYNC_COPY){
+ cudaFreeHost(mp->h_adj_sourcearrays_slice);
+ }else{
+ free(mp->h_adj_sourcearrays_slice);
+ }
}
//------------------------------------------
@@ -2164,18 +2253,42 @@
cudaFree(mp->d_ibool_interfaces_crust_mantle);
cudaFree(mp->d_send_accel_buffer_crust_mantle);
if( mp->simulation_type == 3 ) cudaFree(mp->d_b_send_accel_buffer_crust_mantle);
+ if( GPU_ASYNC_COPY){
+ cudaFreeHost(mp->h_send_accel_buffer_cm);
+ cudaFreeHost(mp->h_recv_accel_buffer_cm);
+ if( mp->simulation_type == 3 ){
+ cudaFreeHost(mp->h_b_send_accel_buffer_cm);
+ cudaFreeHost(mp->h_b_recv_accel_buffer_cm);
+ }
+ }
}
if( mp->num_interfaces_inner_core > 0 ){
cudaFree(mp->d_nibool_interfaces_inner_core);
cudaFree(mp->d_ibool_interfaces_inner_core);
cudaFree(mp->d_send_accel_buffer_inner_core);
if( mp->simulation_type == 3 ) cudaFree(mp->d_b_send_accel_buffer_inner_core);
+ if( GPU_ASYNC_COPY){
+ cudaFreeHost(mp->h_send_accel_buffer_ic);
+ cudaFreeHost(mp->h_recv_accel_buffer_ic);
+ if( mp->simulation_type == 3 ){
+ cudaFreeHost(mp->h_b_send_accel_buffer_ic);
+ cudaFreeHost(mp->h_b_recv_accel_buffer_ic);
+ }
+ }
}
if( mp->num_interfaces_outer_core > 0 ){
cudaFree(mp->d_nibool_interfaces_outer_core);
cudaFree(mp->d_ibool_interfaces_outer_core);
cudaFree(mp->d_send_accel_buffer_outer_core);
if( mp->simulation_type == 3 ) cudaFree(mp->d_b_send_accel_buffer_outer_core);
+ if( GPU_ASYNC_COPY){
+ cudaFreeHost(mp->h_send_accel_buffer_oc);
+ cudaFreeHost(mp->h_recv_accel_buffer_oc);
+ if( mp->simulation_type == 3 ){
+ cudaFreeHost(mp->h_b_send_accel_buffer_oc);
+ cudaFreeHost(mp->h_b_recv_accel_buffer_oc);
+ }
+ }
}
//------------------------------------------
@@ -2393,12 +2506,17 @@
cudaFree(mp->d_normal_ocean_load);
}
+ // cleans up asychronuous streams
+ cudaStreamDestroy(mp->compute_stream);
+ cudaStreamDestroy(mp->copy_stream);
+
+ // synchronizes device
+ synchronize_cuda();
+
// releases previous contexts
#if CUDA_VERSION < 4000
- cudaThreadSynchronize();
cudaThreadExit();
#else
- cudaDeviceSynchronize();
cudaDeviceReset();
#endif
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c 2014-01-13 16:06:21 UTC (rev 22991)
@@ -1,4 +1,4 @@
-/*
+/*
!=====================================================================
!
! S p e c f e m 3 D G l o b e V e r s i o n 6 . 0
@@ -34,97 +34,110 @@
typedef float realw;
+
-
//
// src/cuda/assemble_MPI_scalar_cuda.cu
//
void FC_FUNC_(transfer_boun_pot_from_device,
TRANSFER_BOUN_POT_FROM_DEVICE)(long* Mesh_pointer_f,
- realw* send_potential_dot_dot_buffer,
- int* FORWARD_OR_ADJOINT){}
+ realw* send_buffer,
+ int* FORWARD_OR_ADJOINT){}
void FC_FUNC_(transfer_asmbl_pot_to_device,
TRANSFER_ASMBL_POT_TO_DEVICE)(long* Mesh_pointer,
realw* buffer_recv_scalar,
- int* FORWARD_OR_ADJOINT) {}
+ int* FORWARD_OR_ADJOINT) {}
//
// src/cuda/assemble_MPI_vector_cuda.cu
//
-void FC_FUNC_(transfer_boun_accel_from_device,
- TRANSFER_BOUN_ACCEL_FROM_DEVICE)(long* Mesh_pointer_f,
- realw* send_accel_buffer,
- int* IREGION,
- int* FORWARD_OR_ADJOINT){}
+void FC_FUNC_(transfer_boun_from_device,
+ TRANSFER_BOUN_FROM_DEVICE)(long* Mesh_pointer_f,
+ realw* send_accel_buffer,
+ int* IREGION,
+ int* FORWARD_OR_ADJOINT){}
void FC_FUNC_(transfer_asmbl_accel_to_device,
TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer,
realw* buffer_recv_vector,
int* IREGION,
- int* FORWARD_OR_ADJOINT) {}
+ int* FORWARD_OR_ADJOINT) {}
+void FC_FUNC_(transfer_buffer_to_device_async,
+ TRANSFER_BUFFER_TO_DEVICE_ASYNC)(long* Mesh_pointer,
+ realw* buffer,
+ int* IREGION,
+ int* FORWARD_OR_ADJOINT) {}
+void FC_FUNC_(sync_copy_from_device,
+ SYNC_copy_FROM_DEVICE)(long* Mesh_pointer,
+ int* iphase,
+ realw* send_buffer,
+ int* IREGION,
+ int* FORWARD_OR_ADJOINT) {}
+
+
//
// src/cuda/check_fields_cuda.cu
//
void FC_FUNC_(pause_for_debug,
- PAUSE_FOR_DEBUG)() {}
+ PAUSE_FOR_DEBUG)() {}
void FC_FUNC_(output_free_device_memory,
- OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {}
+ OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {}
void FC_FUNC_(get_free_device_memory,
- get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {}
+ get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {}
void FC_FUNC_(check_norm_acoustic_from_device,
CHECK_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
long* Mesh_pointer_f,
- int* FORWARD_OR_ADJOINT) {}
+ int* FORWARD_OR_ADJOINT) {}
void FC_FUNC_(check_norm_elastic_from_device,
CHECK_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
long* Mesh_pointer_f,
- int* FORWARD_OR_ADJOINT) {}
+ int* FORWARD_OR_ADJOINT) {}
void FC_FUNC_(check_norm_strain_from_device,
CHECK_NORM_STRAIN_FROM_DEVICE)(realw* strain_norm,
realw* strain_norm2,
- long* Mesh_pointer_f) {}
+ long* Mesh_pointer_f) {}
void FC_FUNC_(check_max_norm_displ_gpu,
- CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {}
+ CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {}
void FC_FUNC_(check_max_norm_vector,
- CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {}
+ CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {}
void FC_FUNC_(check_max_norm_displ,
- CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {}
+ CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {}
void FC_FUNC_(check_max_norm_b_displ_gpu,
- CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {}
+ CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {}
void FC_FUNC_(check_max_norm_b_accel_gpu,
- CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {}
+ CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {}
void FC_FUNC_(check_max_norm_b_veloc_gpu,
- CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {}
+ CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {}
void FC_FUNC_(check_max_norm_b_displ,
- CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {}
+ CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {}
void FC_FUNC_(check_max_norm_b_accel,
- CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {}
+ CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {}
void FC_FUNC_(check_error_vectors,
- CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {}
+ CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {}
void FC_FUNC_(get_max_accel,
- GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {}
+ GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {}
//
@@ -134,45 +147,53 @@
void FC_FUNC_(compute_add_sources_cuda,
COMPUTE_ADD_SOURCES_CUDA)(long* Mesh_pointer_f,
int* NSOURCESf,
- double* h_stf_pre_compute) {}
+ double* h_stf_pre_compute) {}
void FC_FUNC_(compute_add_sources_backward_cuda,
COMPUTE_ADD_SOURCES_BACKWARD_CUDA)(long* Mesh_pointer_f,
int* NSOURCESf,
- double* h_stf_pre_compute) {}
+ double* h_stf_pre_compute) {}
void FC_FUNC_(compute_add_sources_adjoint_cuda,
COMPUTE_ADD_SOURCES_ADJOINT_CUDA)(long* Mesh_pointer,
- int* nrec,
- realw* h_adj_sourcearrays,
- int* h_islice_selected_rec,
- int* h_ispec_selected_rec,
- int* time_index) {}
+ int* h_nrec) {}
+void FC_FUNC_(transfer_adj_to_device,
+ TRANSFER_ADJ_TO_DEVICE)(long* Mesh_pointer,
+ int* h_nrec,
+ realw* h_adj_sourcearrays,
+ int* h_islice_selected_rec) {}
+void FC_FUNC_(transfer_adj_to_device_async,
+ TRANSFER_ADJ_TO_DEVICE_ASYNC)(long* Mesh_pointer,
+ int* h_nrec,
+ realw* h_adj_sourcearrays,
+ int* h_islice_selected_rec) {}
+
+
//
// src/cuda/compute_coupling_cuda.cu
//
void FC_FUNC_(compute_coupling_fluid_cmb_cuda,
COMPUTE_COUPLING_FLUID_CMB_CUDA)(long* Mesh_pointer_f,
- int* FORWARD_OR_ADJOINT) {}
+ int* FORWARD_OR_ADJOINT) {}
void FC_FUNC_(compute_coupling_fluid_icb_cuda,
COMPUTE_COUPLING_FLUID_ICB_CUDA)(long* Mesh_pointer_f,
- int* FORWARD_OR_ADJOINT) {}
+ int* FORWARD_OR_ADJOINT) {}
void FC_FUNC_(compute_coupling_cmb_fluid_cuda,
COMPUTE_COUPLING_CMB_FLUID_CUDA)(long* Mesh_pointer_f,
- int* FORWARD_OR_ADJOINT) {}
+ int* FORWARD_OR_ADJOINT) {}
void FC_FUNC_(compute_coupling_icb_fluid_cuda,
COMPUTE_COUPLING_ICB_FLUID_CUDA)(long* Mesh_pointer_f,
- int* FORWARD_OR_ADJOINT) {}
+ int* FORWARD_OR_ADJOINT) {}
void FC_FUNC_(compute_coupling_ocean_cuda,
COMPUTE_COUPLING_OCEAN_CUDA)(long* Mesh_pointer_f,
- int* FORWARD_OR_ADJOINT) {}
+ int* FORWARD_OR_ADJOINT) {}
//
@@ -182,7 +203,7 @@
void FC_FUNC_(compute_forces_crust_mantle_cuda,
COMPUTE_FORCES_CRUST_MANTLE_CUDA)(long* Mesh_pointer_f,
int* iphase,
- int* FORWARD_OR_ADJOINT_f) {}
+ int* FORWARD_OR_ADJOINT_f) {}
//
@@ -192,7 +213,7 @@
void FC_FUNC_(compute_forces_inner_core_cuda,
COMPUTE_FORCES_INNER_CORE_CUDA)(long* Mesh_pointer_f,
int* iphase,
- int* FORWARD_OR_ADJOINT_f) {}
+ int* FORWARD_OR_ADJOINT_f) {}
//
@@ -203,7 +224,7 @@
COMPUTE_FORCES_OUTER_CORE_CUDA)(long* Mesh_pointer_f,
int* iphase,
realw* time_f,
- int* FORWARD_OR_ADJOINT_f) {}
+ int* FORWARD_OR_ADJOINT_f) {}
//
@@ -211,22 +232,22 @@
//
void FC_FUNC_(compute_kernels_cm_cuda,
- COMPUTE_KERNELS_CM_CUDA)(long* Mesh_pointer,realw* deltat_f) {}
+ COMPUTE_KERNELS_CM_CUDA)(long* Mesh_pointer,realw* deltat_f) {}
void FC_FUNC_(compute_kernels_ic_cuda,
- COMPUTE_KERNELS_IC_CUDA)(long* Mesh_pointer,realw* deltat_f) {}
+ COMPUTE_KERNELS_IC_CUDA)(long* Mesh_pointer,realw* deltat_f) {}
void FC_FUNC_(compute_kernels_oc_cuda,
- COMPUTE_KERNELS_OC_CUDA)(long* Mesh_pointer,realw* deltat_f) {}
+ COMPUTE_KERNELS_OC_CUDA)(long* Mesh_pointer,realw* deltat_f) {}
void FC_FUNC_(compute_kernels_strgth_noise_cu,
COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer,
realw* h_noise_surface_movie,
- realw* deltat_f) {}
+ realw* deltat_f) {}
void FC_FUNC_(compute_kernels_hess_cuda,
COMPUTE_KERNELS_HESS_CUDA)(long* Mesh_pointer,
- realw* deltat_f) {}
+ realw* deltat_f) {}
//
@@ -236,16 +257,16 @@
void FC_FUNC_(compute_stacey_acoustic_cuda,
COMPUTE_STACEY_ACOUSTIC_CUDA)(long* Mesh_pointer_f,
realw* absorb_potential,
- int* itype) {}
+ int* itype) {}
void FC_FUNC_(compute_stacey_acoustic_backward_cuda,
COMPUTE_STACEY_ACOUSTIC_BACKWARD_CUDA)(long* Mesh_pointer_f,
realw* absorb_potential,
- int* itype) {}
+ int* itype) {}
void FC_FUNC_(compute_stacey_acoustic_undoatt_cuda,
COMPUTE_STACEY_ACOUSTIC_UNDOATT_CUDA)(long* Mesh_pointer_f,
- int* itype) {}
+ int* itype) {}
//
@@ -255,16 +276,16 @@
void FC_FUNC_(compute_stacey_elastic_cuda,
COMPUTE_STACEY_ELASTIC_CUDA)(long* Mesh_pointer_f,
realw* absorb_field,
- int* itype) {}
+ int* itype) {}
void FC_FUNC_(compute_stacey_elastic_backward_cuda,
COMPUTE_STACEY_ELASTIC_BACKWARD_CUDA)(long* Mesh_pointer_f,
realw* absorb_field,
- int* itype) {}
+ int* itype) {}
void FC_FUNC_(compute_stacey_elastic_undoatt_cuda,
COMPUTE_STACEY_ELASTIC_UNDOATT_CUDA)(long* Mesh_pointer_f,
- int* itype) {}
+ int* itype) {}
//
@@ -272,39 +293,39 @@
//
void FC_FUNC_(initialize_cuda_device,
- INITIALIZE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) {
+ INITIALIZE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) {
fprintf(stderr,"ERROR: GPU_MODE enabled without GPU/CUDA Support. To enable GPU support, reconfigure with --with-cuda flag.\n");
exit(1);
-}
+}
//
// src/cuda/noise_tomography_cuda.cu
//
-void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){}
+void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){}
-void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id) {}
+void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id) {}
-void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) {}
+void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) {}
-void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val) {}
+void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val) {}
-void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) {}
+void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) {}
void FC_FUNC_(noise_transfer_surface_to_host,
NOISE_TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer_f,
- realw* h_noise_surface_movie) {}
+ realw* h_noise_surface_movie) {}
void FC_FUNC_(noise_add_source_master_rec_cu,
NOISE_ADD_SOURCE_MASTER_REC_CU)(long* Mesh_pointer_f,
int* it_f,
int* irec_master_noise_f,
- int* islice_selected_rec) {}
+ int* islice_selected_rec) {}
void FC_FUNC_(noise_add_surface_movie_cuda,
NOISE_ADD_SURFACE_MOVIE_CUDA)(long* Mesh_pointer_f,
- realw* h_noise_surface_movie) {}
+ realw* h_noise_surface_movie) {}
//
@@ -339,7 +360,7 @@
int* SAVE_BOUNDARY_MESH_f,
int* USE_MESH_COLORING_GPU_f,
int* ANISOTROPIC_KL_f,int* APPROXIMATE_HESS_KL_f,
- realw* deltat_f,realw* b_deltat_f) {}
+ realw* deltat_f,realw* b_deltat_f) {}
void FC_FUNC_(prepare_fields_rotation_device,
PREPARE_FIELDS_ROTATION_DEVICE)(long* Mesh_pointer_f,
@@ -349,7 +370,7 @@
realw* b_two_omega_earth_f,
realw* b_A_array_rotation,
realw* b_B_array_rotation,
- int* NSPEC_OUTER_CORE_ROTATION) {}
+ int* NSPEC_OUTER_CORE_ROTATION) {}
void FC_FUNC_(prepare_fields_gravity_device,
PREPARE_FIELDS_gravity_DEVICE)(long* Mesh_pointer_f,
@@ -363,7 +384,7 @@
realw* minus_g_icb,
realw* minus_g_cmb,
double* RHO_BOTTOM_OC,
- double* RHO_TOP_OC) {}
+ double* RHO_TOP_OC) {}
void FC_FUNC_(prepare_fields_attenuat_device,
PREPARE_FIELDS_ATTENUAT_DEVICE)(long* Mesh_pointer_f,
@@ -392,7 +413,7 @@
realw* factor_common_inner_core,
realw* one_minus_sum_beta_inner_core,
realw* alphaval,realw* betaval,realw* gammaval,
- realw* b_alphaval,realw* b_betaval,realw* b_gammaval) {}
+ realw* b_alphaval,realw* b_betaval,realw* b_gammaval) {}
void FC_FUNC_(prepare_fields_strain_device,
PREPARE_FIELDS_STRAIN_DEVICE)(long* Mesh_pointer_f,
@@ -419,7 +440,7 @@
realw* b_epsilondev_xz_inner_core,
realw* b_epsilondev_yz_inner_core,
realw* eps_trace_over_3_inner_core,
- realw* b_eps_trace_over_3_inner_core) {}
+ realw* b_eps_trace_over_3_inner_core) {}
void FC_FUNC_(prepare_fields_absorb_device,
PREPARE_FIELDS_ABSORB_DEVICE)(long* Mesh_pointer_f,
@@ -448,7 +469,7 @@
int* ibelm_ymin_outer_core,int* ibelm_ymax_outer_core,
realw* jacobian2D_xmin_outer_core, realw* jacobian2D_xmax_outer_core,
realw* jacobian2D_ymin_outer_core, realw* jacobian2D_ymax_outer_core,
- realw* vp_outer_core) {}
+ realw* vp_outer_core) {}
void FC_FUNC_(prepare_mpi_buffers_device,
PREPARE_MPI_BUFFERS_DEVICE)(long* Mesh_pointer_f,
@@ -463,7 +484,7 @@
int* num_interfaces_outer_core,
int* max_nibool_interfaces_oc,
int* nibool_interfaces_outer_core,
- int* ibool_interfaces_outer_core){}
+ int* ibool_interfaces_outer_core){}
void FC_FUNC_(prepare_fields_noise_device,
PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f,
@@ -475,14 +496,14 @@
realw* normal_y_noise,
realw* normal_z_noise,
realw* mask_noise,
- realw* jacobian2D_top_crust_mantle) {}
+ realw* jacobian2D_top_crust_mantle) {}
void FC_FUNC_(prepare_oceans_device,
PREPARE_OCEANS_DEVICE)(long* Mesh_pointer_f,
int* npoin_oceans,
int* h_iglob_ocean_load,
realw* h_rmass_ocean_load_selected,
- realw* h_normal_ocean_load) {}
+ realw* h_normal_ocean_load) {}
void FC_FUNC_(prepare_crust_mantle_device,
PREPARE_CRUST_MANTLE_DEVICE)(long* Mesh_pointer_f,
@@ -514,7 +535,7 @@
int* NCHUNKS_VAL,
int* num_colors_outer,
int* num_colors_inner,
- int* num_elem_colors) {}
+ int* num_elem_colors) {}
void FC_FUNC_(prepare_outer_core_device,
PREPARE_OUTER_CORE_DEVICE)(long* Mesh_pointer_f,
@@ -539,7 +560,7 @@
int* h_ibelm_bottom_outer_core,
int* num_colors_outer,
int* num_colors_inner,
- int* num_elem_colors) {}
+ int* num_elem_colors) {}
void FC_FUNC_(prepare_inner_core_device,
PREPARE_INNER_CORE_DEVICE)(long* Mesh_pointer_f,
@@ -562,11 +583,11 @@
int* h_ibelm_top_inner_core,
int* num_colors_outer,
int* num_colors_inner,
- int* num_elem_colors) {}
+ int* num_elem_colors) {}
void FC_FUNC_(prepare_cleanup_device,
PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f,
- int* NCHUNKS_VAL) {}
+ int* NCHUNKS_VAL) {}
//
@@ -574,88 +595,88 @@
//
void FC_FUNC_(transfer_fields_cm_to_device,
- TRANSFER_FIELDS_CM_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+ TRANSFER_FIELDS_CM_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_fields_ic_to_device,
- TRANSFER_FIELDS_IC_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+ TRANSFER_FIELDS_IC_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_fields_oc_to_device,
- TRANSFER_FIELDS_OC_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+ TRANSFER_FIELDS_OC_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_b_fields_cm_to_device,
TRANSFER_FIELDS_B_CM_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
- long* Mesh_pointer_f) {}
+ long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_b_fields_ic_to_device,
TRANSFER_FIELDS_B_IC_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
- long* Mesh_pointer_f) {}
+ long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_b_fields_oc_to_device,
TRANSFER_FIELDS_B_OC_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
- long* Mesh_pointer_f) {}
+ long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_fields_cm_from_device,
- TRANSFER_FIELDS_CM_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+ TRANSFER_FIELDS_CM_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_fields_ic_from_device,
- TRANSFER_FIELDS_IC_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+ TRANSFER_FIELDS_IC_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_fields_oc_from_device,
- TRANSFER_FIELDS_OC_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+ TRANSFER_FIELDS_OC_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_b_fields_cm_from_device,
TRANSFER_B_FIELDS_CM_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
- long* Mesh_pointer_f) {}
+ long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_b_fields_ic_from_device,
TRANSFER_B_FIELDS_IC_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
- long* Mesh_pointer_f) {}
+ long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_b_fields_oc_from_device,
TRANSFER_B_FIELDS_OC_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
- long* Mesh_pointer_f) {}
+ long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_displ_cm_from_device,
- TRANSFER_DISPL_CM_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
+ TRANSFER_DISPL_CM_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_b_displ_cm_from_device,
- TRANSFER_B_DISPL_CM_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
+ TRANSFER_B_DISPL_CM_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_displ_ic_from_device,
- TRANSFER_DISPL_IC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
+ TRANSFER_DISPL_IC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_b_displ_ic_from_device,
- TRANSFER_B_DISPL_IC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
+ TRANSFER_B_DISPL_IC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_displ_oc_from_device,
- TRANSFER_DISPL_OC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
+ TRANSFER_DISPL_OC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_b_displ_oc_from_device,
- TRANSFER_B_DISPL_OC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
+ TRANSFER_B_DISPL_OC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_veloc_cm_from_device,
- TRANSFER_VELOC_CM_FROM_DEVICE)(int* size, realw* veloc, long* Mesh_pointer_f) {}
+ TRANSFER_VELOC_CM_FROM_DEVICE)(int* size, realw* veloc, long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_veloc_ic_from_device,
- TRANSFER_VELOC_IC_FROM_DEVICE)(int* size, realw* veloc, long* Mesh_pointer_f) {}
+ TRANSFER_VELOC_IC_FROM_DEVICE)(int* size, realw* veloc, long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_veloc_oc_from_device,
- TRANSFER_VELOC_OC_FROM_DEVICE)(int* size, realw* veloc, long* Mesh_pointer_f) {}
+ TRANSFER_VELOC_OC_FROM_DEVICE)(int* size, realw* veloc, long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_accel_cm_to_device,
- TRANSFER_ACCEL_CM_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+ TRANSFER_ACCEL_CM_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_accel_cm_from_device,
- TRANSFER_ACCEL_CM_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+ TRANSFER_ACCEL_CM_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_b_accel_cm_from_device,
- TRANSFER_B_ACCEL_CM_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {}
+ TRANSFER_B_ACCEL_CM_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_accel_ic_from_device,
- TRANSFER_ACCEL_IC_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+ TRANSFER_ACCEL_IC_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_accel_oc_from_device,
- TRANSFER_ACCEL_OC_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+ TRANSFER_ACCEL_OC_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_strain_cm_from_device,
TRANSFER_STRAIN_CM_FROM_DEVICE)(long* Mesh_pointer,
@@ -664,7 +685,7 @@
realw* epsilondev_yy,
realw* epsilondev_xy,
realw* epsilondev_xz,
- realw* epsilondev_yz) {}
+ realw* epsilondev_yz) {}
void FC_FUNC_(transfer_b_strain_cm_to_device,
TRANSFER_B_STRAIN_CM_TO_DEVICE)(long* Mesh_pointer,
@@ -672,7 +693,7 @@
realw* epsilondev_yy,
realw* epsilondev_xy,
realw* epsilondev_xz,
- realw* epsilondev_yz) {}
+ realw* epsilondev_yz) {}
void FC_FUNC_(transfer_strain_ic_from_device,
TRANSFER_STRAIN_IC_FROM_DEVICE)(long* Mesh_pointer,
@@ -681,7 +702,7 @@
realw* epsilondev_yy,
realw* epsilondev_xy,
realw* epsilondev_xz,
- realw* epsilondev_yz) {}
+ realw* epsilondev_yz) {}
void FC_FUNC_(transfer_b_strain_ic_to_device,
TRANSFER_B_STRAIN_IC_TO_DEVICE)(long* Mesh_pointer,
@@ -689,7 +710,7 @@
realw* epsilondev_yy,
realw* epsilondev_xy,
realw* epsilondev_xz,
- realw* epsilondev_yz) {}
+ realw* epsilondev_yz) {}
void FC_FUNC_(transfer_rmemory_cm_from_device,
TRANSFER_RMEMORY_CM_FROM_DEVICE)(long* Mesh_pointer,
@@ -697,7 +718,7 @@
realw* R_yy,
realw* R_xy,
realw* R_xz,
- realw* R_yz) {}
+ realw* R_yz) {}
void FC_FUNC_(transfer_b_rmemory_cm_to_device,
TRANSFER_B_RMEMORY_CM_TO_DEVICE)(long* Mesh_pointer,
@@ -705,7 +726,7 @@
realw* b_R_yy,
realw* b_R_xy,
realw* b_R_xz,
- realw* b_R_yz) {}
+ realw* b_R_yz) {}
void FC_FUNC_(transfer_rmemory_ic_from_device,
TRANSFER_RMEMORY_IC_FROM_DEVICE)(long* Mesh_pointer,
@@ -713,7 +734,7 @@
realw* R_yy,
realw* R_xy,
realw* R_xz,
- realw* R_yz) {}
+ realw* R_yz) {}
void FC_FUNC_(transfer_b_rmemory_ic_to_device,
TRANSFER_B_RMEMORY_IC_TO_DEVICE)(long* Mesh_pointer,
@@ -721,17 +742,17 @@
realw* b_R_yy,
realw* b_R_xy,
realw* b_R_xz,
- realw* b_R_yz) {}
+ realw* b_R_yz) {}
void FC_FUNC_(transfer_rotation_from_device,
TRANSFER_ROTATION_FROM_DEVICE)(long* Mesh_pointer,
realw* A_array_rotation,
- realw* B_array_rotation) {}
+ realw* B_array_rotation) {}
void FC_FUNC_(transfer_b_rotation_to_device,
TRANSFER_B_ROTATION_TO_DEVICE)(long* Mesh_pointer,
realw* A_array_rotation,
- realw* B_array_rotation) {}
+ realw* B_array_rotation) {}
void FC_FUNC_(transfer_kernels_cm_to_host,
TRANSFER_KERNELS_CM_TO_HOST)(long* Mesh_pointer,
@@ -739,30 +760,30 @@
realw* h_alpha_kl,
realw* h_beta_kl,
realw* h_cijkl_kl,
- int* NSPEC) {}
+ int* NSPEC) {}
void FC_FUNC_(transfer_kernels_ic_to_host,
TRANSFER_KERNELS_IC_TO_HOST)(long* Mesh_pointer,
realw* h_rho_kl,
realw* h_alpha_kl,
realw* h_beta_kl,
- int* NSPEC) {}
+ int* NSPEC) {}
void FC_FUNC_(transfer_kernels_oc_to_host,
TRANSFER_KERNELS_OC_TO_HOST)(long* Mesh_pointer,
realw* h_rho_kl,
realw* h_alpha_kl,
- int* NSPEC) {}
+ int* NSPEC) {}
void FC_FUNC_(transfer_kernels_noise_to_host,
TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer,
realw* h_Sigma_kl,
- int* NSPEC) {}
+ int* NSPEC) {}
void FC_FUNC_(transfer_kernels_hess_cm_tohost,
TRANSFER_KERNELS_HESS_CM_TOHOST)(long* Mesh_pointer,
realw* h_hess_kl,
- int* NSPEC) {}
+ int* NSPEC) {}
//
@@ -774,39 +795,39 @@
realw* deltat_f,
realw* deltatsqover2_f,
realw* deltatover2_f,
- int* FORWARD_OR_ADJOINT) {}
+ int* FORWARD_OR_ADJOINT) {}
void FC_FUNC_(update_displacement_cm_cuda,
UPDATE_DISPLACMENT_CM_CUDA)(long* Mesh_pointer_f,
realw* deltat_f,
realw* deltatsqover2_f,
realw* deltatover2_f,
- int* FORWARD_OR_ADJOINT) {}
+ int* FORWARD_OR_ADJOINT) {}
void FC_FUNC_(update_displacement_oc_cuda,
UPDATE_DISPLACEMENT_OC_cuda)(long* Mesh_pointer_f,
realw* deltat_f,
realw* deltatsqover2_f,
realw* deltatover2_f,
- int* FORWARD_OR_ADJOINT) {}
+ int* FORWARD_OR_ADJOINT) {}
void FC_FUNC_(multiply_accel_elastic_cuda,
MULTIPLY_ACCEL_ELASTIC_CUDA)(long* Mesh_pointer,
- int* FORWARD_OR_ADJOINT) {}
+ int* FORWARD_OR_ADJOINT) {}
void FC_FUNC_(update_veloc_elastic_cuda,
UPDATE_VELOC_ELASTIC_CUDA)(long* Mesh_pointer,
realw* deltatover2_f,
- int* FORWARD_OR_ADJOINT) {}
+ int* FORWARD_OR_ADJOINT) {}
void FC_FUNC_(multiply_accel_acoustic_cuda,
MULTIPLY_ACCEL_ACOUSTIC_CUDA)(long* Mesh_pointer,
- int* FORWARD_OR_ADJOINT) {}
+ int* FORWARD_OR_ADJOINT) {}
void FC_FUNC_(update_veloc_acoustic_cuda,
UPDATE_VELOC_ACOUSTIC_CUDA)(long* Mesh_pointer,
realw* deltatover2_f,
- int* FORWARD_OR_ADJOINT) {}
+ int* FORWARD_OR_ADJOINT) {}
//
@@ -826,5 +847,14 @@
int* number_receiver_global,
int* ispec_selected_rec,
int* ispec_selected_source,
- int* ibool) {}
+ int* ibool) {}
+void FC_FUNC_(transfer_seismo_from_device_async,
+ TRANSFER_SEISMO_FROM_DEVICE_ASYNC)(long* Mesh_pointer_f,
+ realw* displ,
+ realw* b_displ,
+ int* number_receiver_global,
+ int* ispec_selected_rec,
+ int* ispec_selected_source,
+ int* ibool) {}
+
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/update_displacement_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/update_displacement_cuda.cu 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/update_displacement_cuda.cu 2014-01-13 16:06:21 UTC (rev 22991)
@@ -82,7 +82,7 @@
int size = NDIM * mp->NGLOB_INNER_CORE;
// debug
-#if DEBUG_BACKWARD_SIMULATIONS == 1
+#if DEBUG_BACKWARD_SIMULATIONS == 1 && DEBUG == 1
realw max_d,max_v,max_a;
max_d = get_device_array_maximum_value(mp->d_b_displ_inner_core, size);
max_v = get_device_array_maximum_value(mp->d_b_veloc_inner_core, size);
@@ -107,7 +107,7 @@
if( *FORWARD_OR_ADJOINT == 1 ){
//launch kernel
- UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ_inner_core,
+ UpdateDispVeloc_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ_inner_core,
mp->d_veloc_inner_core,
mp->d_accel_inner_core,
size,deltat,deltatsqover2,deltatover2);
@@ -116,7 +116,7 @@
DEBUG_BACKWARD_UPDATE();
// kernel for backward fields
- UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_b_displ_inner_core,
+ UpdateDispVeloc_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ_inner_core,
mp->d_b_veloc_inner_core,
mp->d_b_accel_inner_core,
size,deltat,deltatsqover2,deltatover2);
@@ -149,7 +149,7 @@
int size = NDIM * mp->NGLOB_CRUST_MANTLE;
// debug
-#if DEBUG_BACKWARD_SIMULATIONS == 1
+#if DEBUG_BACKWARD_SIMULATIONS == 1 && DEBUG == 1
realw max_d,max_v,max_a;
max_d = get_device_array_maximum_value(mp->d_b_displ_crust_mantle, size);
max_v = get_device_array_maximum_value(mp->d_b_veloc_crust_mantle, size);
@@ -174,7 +174,7 @@
if( *FORWARD_OR_ADJOINT == 1 ){
//launch kernel
- UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
+ UpdateDispVeloc_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ_crust_mantle,
mp->d_veloc_crust_mantle,
mp->d_accel_crust_mantle,
size,deltat,deltatsqover2,deltatover2);
@@ -183,7 +183,7 @@
DEBUG_BACKWARD_UPDATE();
// kernel for backward fields
- UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,
+ UpdateDispVeloc_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ_crust_mantle,
mp->d_b_veloc_crust_mantle,
mp->d_b_accel_crust_mantle,
size,deltat,deltatsqover2,deltatover2);
@@ -247,7 +247,7 @@
int size = mp->NGLOB_OUTER_CORE;
// debug
-#if DEBUG_BACKWARD_SIMULATIONS == 1
+#if DEBUG_BACKWARD_SIMULATIONS == 1 && DEBUG == 1
realw max_d,max_v,max_a;
max_d = get_device_array_maximum_value(mp->d_b_displ_outer_core, size);
max_v = get_device_array_maximum_value(mp->d_b_veloc_outer_core, size);
@@ -272,7 +272,7 @@
if( *FORWARD_OR_ADJOINT == 1 ){
//launch kernel
- UpdatePotential_kernel<<<grid,threads>>>(mp->d_displ_outer_core,
+ UpdatePotential_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ_outer_core,
mp->d_veloc_outer_core,
mp->d_accel_outer_core,
size,deltat,deltatsqover2,deltatover2);
@@ -280,7 +280,7 @@
// debug
DEBUG_BACKWARD_UPDATE();
- UpdatePotential_kernel<<<grid,threads>>>(mp->d_b_displ_outer_core,
+ UpdatePotential_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ_outer_core,
mp->d_b_veloc_outer_core,
mp->d_b_accel_outer_core,
size,deltat,deltatsqover2,deltatover2);
@@ -375,7 +375,7 @@
threads = dim3(blocksize,1,1);
if( *FORWARD_OR_ADJOINT == 1 ){
- multiply_accel_elastic_cuda_device<<< grid, threads>>>(mp->d_accel_crust_mantle,
+ multiply_accel_elastic_cuda_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_crust_mantle,
mp->d_veloc_crust_mantle,
mp->NGLOB_CRUST_MANTLE,
mp->two_omega_earth,
@@ -386,7 +386,7 @@
// debug
DEBUG_BACKWARD_UPDATE();
- multiply_accel_elastic_cuda_device<<< grid, threads>>>(mp->d_b_accel_crust_mantle,
+ multiply_accel_elastic_cuda_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_crust_mantle,
mp->d_b_veloc_crust_mantle,
mp->NGLOB_CRUST_MANTLE,
mp->b_two_omega_earth,
@@ -404,7 +404,7 @@
threads = dim3(blocksize,1,1);
if( *FORWARD_OR_ADJOINT == 1 ){
- multiply_accel_elastic_cuda_device<<< grid, threads>>>(mp->d_accel_inner_core,
+ multiply_accel_elastic_cuda_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_inner_core,
mp->d_veloc_inner_core,
mp->NGLOB_INNER_CORE,
mp->two_omega_earth,
@@ -415,7 +415,7 @@
// debug
DEBUG_BACKWARD_UPDATE();
- multiply_accel_elastic_cuda_device<<< grid, threads>>>(mp->d_b_accel_inner_core,
+ multiply_accel_elastic_cuda_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_inner_core,
mp->d_b_veloc_inner_core,
mp->NGLOB_INNER_CORE,
mp->b_two_omega_earth,
@@ -480,12 +480,12 @@
threads = dim3(blocksize,1,1);
if( *FORWARD_OR_ADJOINT == 1 ){
- update_veloc_elastic_cuda_device<<< grid, threads>>>(mp->d_veloc_crust_mantle,
+ update_veloc_elastic_cuda_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_veloc_crust_mantle,
mp->d_accel_crust_mantle,
mp->NGLOB_CRUST_MANTLE,
deltatover2);
}else if( *FORWARD_OR_ADJOINT == 3 ){
- update_veloc_elastic_cuda_device<<< grid, threads>>>(mp->d_b_veloc_crust_mantle,
+ update_veloc_elastic_cuda_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_veloc_crust_mantle,
mp->d_b_accel_crust_mantle,
mp->NGLOB_CRUST_MANTLE,
deltatover2);
@@ -500,12 +500,12 @@
threads = dim3(blocksize,1,1);
if( *FORWARD_OR_ADJOINT == 1 ){
- update_veloc_elastic_cuda_device<<< grid, threads>>>(mp->d_veloc_inner_core,
+ update_veloc_elastic_cuda_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_veloc_inner_core,
mp->d_accel_inner_core,
mp->NGLOB_INNER_CORE,
deltatover2);
}else if( *FORWARD_OR_ADJOINT == 3 ){
- update_veloc_elastic_cuda_device<<< grid, threads>>>(mp->d_b_veloc_inner_core,
+ update_veloc_elastic_cuda_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_veloc_inner_core,
mp->d_b_accel_inner_core,
mp->NGLOB_INNER_CORE,
deltatover2);
@@ -562,14 +562,14 @@
// multiplies accel with inverse of mass matrix
if( *FORWARD_OR_ADJOINT == 1 ){
- multiply_accel_acoustic_cuda_device<<< grid, threads>>>(mp->d_accel_outer_core,
+ multiply_accel_acoustic_cuda_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel_outer_core,
mp->NGLOB_OUTER_CORE,
mp->d_rmass_outer_core);
}else if( *FORWARD_OR_ADJOINT == 3 ){
// debug
DEBUG_BACKWARD_UPDATE();
- multiply_accel_acoustic_cuda_device<<< grid, threads>>>(mp->d_b_accel_outer_core,
+ multiply_accel_acoustic_cuda_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel_outer_core,
mp->NGLOB_OUTER_CORE,
mp->d_b_rmass_outer_core);
}
@@ -624,7 +624,7 @@
// updates velocity
if( *FORWARD_OR_ADJOINT == 1 ){
- update_veloc_acoustic_cuda_device<<< grid, threads>>>(mp->d_veloc_outer_core,
+ update_veloc_acoustic_cuda_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_veloc_outer_core,
mp->d_accel_outer_core,
mp->NGLOB_OUTER_CORE,
deltatover2);
@@ -632,7 +632,7 @@
// debug
DEBUG_BACKWARD_UPDATE();
- update_veloc_acoustic_cuda_device<<< grid, threads>>>(mp->d_b_veloc_outer_core,
+ update_veloc_acoustic_cuda_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_veloc_outer_core,
mp->d_b_accel_outer_core,
mp->NGLOB_OUTER_CORE,
deltatover2);
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/write_seismograms_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/write_seismograms_cuda.cu 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/cuda/write_seismograms_cuda.cu 2014-01-13 16:06:21 UTC (rev 22991)
@@ -47,53 +47,61 @@
int* ispec_selected_rec,
int* ibool,
realw* station_seismo_field,
- realw* desired_field,
+ realw* d_field,
int nrec_local) {
// vector fields
int blockID = blockIdx.x + blockIdx.y*gridDim.x;
+ int tx = threadIdx.x;
+ int irec,ispec,iglob;
+
if(blockID < nrec_local) {
- int irec = number_receiver_global[blockID]-1;
- int ispec = ispec_selected_rec[irec]-1;
- int iglob = ibool[threadIdx.x + NGLL3*ispec]-1;
+ irec = number_receiver_global[blockID]-1;
+ ispec = ispec_selected_rec[irec]-1;
+ iglob = ibool[tx + NGLL3*ispec]-1;
- station_seismo_field[3*NGLL3*blockID + 3*threadIdx.x+0] = desired_field[3*iglob];
- station_seismo_field[3*NGLL3*blockID + 3*threadIdx.x+1] = desired_field[3*iglob+1];
- station_seismo_field[3*NGLL3*blockID + 3*threadIdx.x+2] = desired_field[3*iglob+2];
+ station_seismo_field[3*NGLL3*blockID + 3*tx+0] = d_field[3*iglob];
+ station_seismo_field[3*NGLL3*blockID + 3*tx+1] = d_field[3*iglob+1];
+ station_seismo_field[3*NGLL3*blockID + 3*tx+2] = d_field[3*iglob+2];
}
}
/* ----------------------------------------------------------------------------------------------- */
-__global__ void write_seismograms_transfer_scalar_from_device_kernel(int* number_receiver_global,
+__global__ void write_seismograms_transfer_strain_from_device_kernel(int* number_receiver_global,
int* ispec_selected_rec,
int* ibool,
- realw* station_seismo_field,
- realw* desired_field,
+ realw* station_strain_field,
+ realw* d_field,
int nrec_local) {
// scalar fields
int blockID = blockIdx.x + blockIdx.y*gridDim.x;
+ int tx = threadIdx.x;
+ int irec,ispec,iglob;
+
if(blockID < nrec_local) {
- int irec = number_receiver_global[blockID]-1;
- int ispec = ispec_selected_rec[irec]-1;
- int iglob = ibool[threadIdx.x + NGLL3*ispec]-1;
+ irec = number_receiver_global[blockID]-1;
+ ispec = ispec_selected_rec[irec]-1;
+ iglob = ibool[tx + NGLL3*ispec]-1;
- station_seismo_field[NGLL3*blockID + threadIdx.x] = desired_field[iglob];
+ station_strain_field[NGLL3*blockID + tx] = d_field[iglob];
}
}
/* ----------------------------------------------------------------------------------------------- */
-void write_seismograms_transfer_from_device(Mesh* mp, realw* d_field,realw* h_field,
- int* number_receiver_global,
- int* d_ispec_selected,
- int* h_ispec_selected,
- int* ibool) {
+void write_seismograms_transfer_from_device(Mesh* mp,
+ realw* d_field,
+ realw* h_field,
+ int* number_receiver_global,
+ int* d_ispec_selected,
+ int* h_ispec_selected,
+ int* ibool) {
TRACE("write_seismograms_transfer_from_device");
@@ -111,8 +119,13 @@
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
+ // waits for previous copy call to be finished
+ if( GPU_ASYNC_COPY ){
+ cudaStreamSynchronize(mp->copy_stream);
+ }
+
// prepare field transfer array on device
- write_seismograms_transfer_from_device_kernel<<<grid,threads>>>(mp->d_number_receiver_global,
+ write_seismograms_transfer_from_device_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_number_receiver_global,
d_ispec_selected,
mp->d_ibool_crust_mantle,
mp->d_station_seismo_field,
@@ -120,18 +133,31 @@
mp->nrec_local);
// copies array to CPU
- cudaMemcpy(mp->h_station_seismo_field,mp->d_station_seismo_field,
- 3*NGLL3*(mp->nrec_local)*sizeof(realw),cudaMemcpyDeviceToHost);
+ if( GPU_ASYNC_COPY ){
+ // waits for previous compute call to be finished
+ cudaStreamSynchronize(mp->compute_stream);
- for(irec_local = 0 ; irec_local < mp->nrec_local; irec_local++) {
- irec = number_receiver_global[irec_local] - 1;
- ispec = h_ispec_selected[irec] - 1;
+ // asynchronuous copy
+ // note: we need to update the host array in a subsequent call to transfer_seismo_from_device_async() routine
+ cudaMemcpyAsync(mp->h_station_seismo_field,mp->d_station_seismo_field,
+ 3*NGLL3*(mp->nrec_local)*sizeof(realw),
+ cudaMemcpyDeviceToHost,mp->copy_stream);
+ }else{
+ // synchronuous copy
+ print_CUDA_error_if_any(cudaMemcpy(mp->h_station_seismo_field,mp->d_station_seismo_field,
+ 3*NGLL3*(mp->nrec_local)*sizeof(realw),cudaMemcpyDeviceToHost),77000);
- for(i = 0; i < NGLL3; i++) {
- iglob = ibool[i+NGLL3*ispec] - 1;
- h_field[0+3*iglob] = mp->h_station_seismo_field[0+3*i+irec_local*NGLL3*3];
- h_field[1+3*iglob] = mp->h_station_seismo_field[1+3*i+irec_local*NGLL3*3];
- h_field[2+3*iglob] = mp->h_station_seismo_field[2+3*i+irec_local*NGLL3*3];
+ // updates array on CPU
+ for(irec_local = 0 ; irec_local < mp->nrec_local; irec_local++) {
+ irec = number_receiver_global[irec_local] - 1;
+ ispec = h_ispec_selected[irec] - 1;
+
+ for(i = 0; i < NGLL3; i++) {
+ iglob = ibool[i+NGLL3*ispec] - 1;
+ h_field[0+3*iglob] = mp->h_station_seismo_field[0+3*i+irec_local*NGLL3*3];
+ h_field[1+3*iglob] = mp->h_station_seismo_field[1+3*i+irec_local*NGLL3*3];
+ h_field[2+3*iglob] = mp->h_station_seismo_field[2+3*i+irec_local*NGLL3*3];
+ }
}
}
@@ -142,14 +168,15 @@
/* ----------------------------------------------------------------------------------------------- */
-void write_seismograms_transfer_scalar_from_device(Mesh* mp,
- realw* d_field,realw* h_field,
+void write_seismograms_transfer_strain_from_device(Mesh* mp,
+ realw* d_field,
+ realw* h_field,
int* number_receiver_global,
int* d_ispec_selected,
int* h_ispec_selected,
int* ibool) {
- TRACE("write_seismograms_transfer_scalar_from_device");
+ TRACE("write_seismograms_transfer_strain_from_device");
int irec_local,irec;
int ispec,iglob,i;
@@ -166,28 +193,30 @@
dim3 threads(blocksize,1,1);
// prepare field transfer array on device
- write_seismograms_transfer_scalar_from_device_kernel<<<grid,threads>>>(mp->d_number_receiver_global,
+ write_seismograms_transfer_strain_from_device_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_number_receiver_global,
d_ispec_selected,
mp->d_ibool_crust_mantle,
- mp->d_station_seismo_field,
+ mp->d_station_strain_field,
d_field,
mp->nrec_local);
// copies array to CPU
- cudaMemcpy(mp->h_station_seismo_field,mp->d_station_seismo_field,
- NGLL3*(mp->nrec_local)*sizeof(realw),cudaMemcpyDeviceToHost);
+ // synchronuous copy
+ print_CUDA_error_if_any(cudaMemcpy(mp->h_station_strain_field,mp->d_station_strain_field,
+ NGLL3*(mp->nrec_local)*sizeof(realw),cudaMemcpyDeviceToHost),77001);
+ // updates host array
for(irec_local = 0 ; irec_local < mp->nrec_local; irec_local++) {
irec = number_receiver_global[irec_local] - 1;
ispec = h_ispec_selected[irec] - 1;
for(i = 0; i < NGLL3; i++) {
iglob = ibool[i+NGLL3*ispec] - 1;
- h_field[iglob] = mp->h_station_seismo_field[i+irec_local*NGLL3];
+ h_field[iglob] = mp->h_station_strain_field[i+irec_local*NGLL3];
}
}
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("write_seismograms_transfer_scalar_from_device");
+ exit_on_cuda_error("write_seismograms_transfer_strain_from_device");
#endif
}
@@ -208,15 +237,17 @@
int* ispec_selected_rec,
int* ispec_selected_source,
int* ibool) {
-TRACE("write_seismograms_transfer_cuda");
+ TRACE("write_seismograms_transfer_cuda");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
// checks if anything to do
if( mp->nrec_local == 0 ) return;
+ // transfers displacement values in receiver elements from GPU to CPU
switch( mp->simulation_type ){
case 1:
+ // forward simulation
write_seismograms_transfer_from_device(mp,mp->d_displ_crust_mantle,
displ,
number_receiver_global,
@@ -225,50 +256,130 @@
break;
case 2:
+ // adjoint simulation
write_seismograms_transfer_from_device(mp,mp->d_displ_crust_mantle,
displ,
number_receiver_global,
mp->d_ispec_selected_source,
ispec_selected_source, ibool);
- write_seismograms_transfer_scalar_from_device(mp,mp->d_eps_trace_over_3_crust_mantle,
+ // strain
+ write_seismograms_transfer_strain_from_device(mp,mp->d_eps_trace_over_3_crust_mantle,
eps_trace_over_3,
number_receiver_global,
mp->d_ispec_selected_source,
ispec_selected_source, ibool);
- write_seismograms_transfer_scalar_from_device(mp,mp->d_epsilondev_xx_crust_mantle,
+ write_seismograms_transfer_strain_from_device(mp,mp->d_epsilondev_xx_crust_mantle,
epsilondev_xx,
number_receiver_global,
mp->d_ispec_selected_source,
ispec_selected_source, ibool);
- write_seismograms_transfer_scalar_from_device(mp,mp->d_epsilondev_yy_crust_mantle,
+ write_seismograms_transfer_strain_from_device(mp,mp->d_epsilondev_yy_crust_mantle,
epsilondev_yy,
number_receiver_global,
mp->d_ispec_selected_source,
ispec_selected_source, ibool);
- write_seismograms_transfer_scalar_from_device(mp,mp->d_epsilondev_xy_crust_mantle,
+ write_seismograms_transfer_strain_from_device(mp,mp->d_epsilondev_xy_crust_mantle,
epsilondev_xy,
number_receiver_global,
mp->d_ispec_selected_source,
ispec_selected_source, ibool);
- write_seismograms_transfer_scalar_from_device(mp,mp->d_epsilondev_xz_crust_mantle,
+ write_seismograms_transfer_strain_from_device(mp,mp->d_epsilondev_xz_crust_mantle,
epsilondev_xz,
number_receiver_global,
mp->d_ispec_selected_source,
ispec_selected_source, ibool);
- write_seismograms_transfer_scalar_from_device(mp,mp->d_epsilondev_yz_crust_mantle,
+ write_seismograms_transfer_strain_from_device(mp,mp->d_epsilondev_yz_crust_mantle,
epsilondev_yz,
number_receiver_global,
mp->d_ispec_selected_source,
ispec_selected_source, ibool);
+
break;
case 3:
- write_seismograms_transfer_from_device(mp,mp->d_b_displ_crust_mantle,b_displ,
- number_receiver_global,
- mp->d_ispec_selected_rec,
- ispec_selected_rec, ibool);
+ // kernel simulation
+ write_seismograms_transfer_from_device(mp,mp->d_b_displ_crust_mantle,
+ b_displ,
+ number_receiver_global,
+ mp->d_ispec_selected_rec,
+ ispec_selected_rec, ibool);
break;
}
+
}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// data transfer to CPU host
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_seismo_from_device_async,
+ TRANSFER_SEISMO_FROM_DEVICE_ASYNC)(long* Mesh_pointer_f,
+ realw* displ,
+ realw* b_displ,
+ int* number_receiver_global,
+ int* ispec_selected_rec,
+ int* ispec_selected_source,
+ int* ibool) {
+
+ TRACE("transfer_seismo_from_device_async");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
+ int irec,ispec,iglob,i;
+ realw* h_field;
+ int* h_ispec_selected;
+
+ // checks if anything to do
+ if( mp->nrec_local == 0 ) return;
+
+ // checks async-memcpy
+ if( GPU_ASYNC_COPY == 0 ){
+ exit_on_error("transfer_seismo_from_device_async must be called with GPU_ASYNC_COPY == 1, please check mesh_constants_cuda.h");
+ }
+
+ // waits for previous copy call to be finished
+ cudaStreamSynchronize(mp->copy_stream);
+
+ // transfers displacements
+ // select target array on host
+ switch( mp->simulation_type ){
+ case 1:
+ // forward simulation
+ h_field = displ;
+ h_ispec_selected = ispec_selected_rec;
+ break;
+
+ case 2:
+ // adjoint simulation
+ h_field = displ;
+ h_ispec_selected = ispec_selected_source;
+ break;
+
+ case 3:
+ // kernel simulation
+ h_field = b_displ;
+ h_ispec_selected = ispec_selected_rec;
+ break;
+ }
+
+ // updates corresponding array on CPU
+ for(int irec_local = 0 ; irec_local < mp->nrec_local; irec_local++) {
+ irec = number_receiver_global[irec_local] - 1;
+ ispec = h_ispec_selected[irec] - 1;
+
+ for(i = 0; i < NGLL3; i++) {
+ iglob = ibool[i+NGLL3*ispec] - 1;
+ h_field[0+3*iglob] = mp->h_station_seismo_field[0+3*i+irec_local*NGLL3*3];
+ h_field[1+3*iglob] = mp->h_station_seismo_field[1+3*i+irec_local*NGLL3*3];
+ h_field[2+3*iglob] = mp->h_station_seismo_field[2+3*i+irec_local*NGLL3*3];
+ }
+ }
+
+}
+
+
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/model_crust.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/model_crust.f90 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/model_crust.f90 2014-01-13 16:06:21 UTC (rev 22991)
@@ -67,10 +67,10 @@
integer :: ier
! allocate crustal arrays
- allocate( thlr(NKEYS_CRUST,NLAYERS_CRUST), &
- velocp(NKEYS_CRUST,NLAYERS_CRUST), &
- velocs(NKEYS_CRUST,NLAYERS_CRUST), &
- dens(NKEYS_CRUST,NLAYERS_CRUST), &
+ allocate(thlr(NLAYERS_CRUST,NKEYS_CRUST), &
+ velocp(NLAYERS_CRUST,NKEYS_CRUST), &
+ velocs(NLAYERS_CRUST,NKEYS_CRUST), &
+ dens(NLAYERS_CRUST,NKEYS_CRUST), &
stat=ier)
if( ier /= 0 ) call exit_MPI(myrank,'error allocating crustal arrays')
@@ -184,7 +184,9 @@
end subroutine model_crust
-!---------------------------
+!
+!-------------------------------------------------------------------------------------------------
+!
subroutine read_crust_model()
@@ -226,12 +228,14 @@
do ikey=1,NKEYS_CRUST
read (1,"(a2)") code(ikey)
- read (1,*) (velocp(ikey,i),i=1,NLAYERS_CRUST)
- read (1,*) (velocs(ikey,i),i=1,NLAYERS_CRUST)
- read (1,*) (dens(ikey,i),i=1,NLAYERS_CRUST)
- read (1,*) (thlr(ikey,i),i=1,NLAYERS_CRUST-1),thlr(ikey,NLAYERS_CRUST)
- if(thlr(ikey,NLAYERS_CRUST) > h_moho_max) h_moho_max = thlr(ikey,NLAYERS_CRUST)
- if(thlr(ikey,NLAYERS_CRUST) < h_moho_min) h_moho_min = thlr(ikey,NLAYERS_CRUST)
+ read (1,*) (velocp(i,ikey),i=1,NLAYERS_CRUST)
+ read (1,*) (velocs(i,ikey),i=1,NLAYERS_CRUST)
+ read (1,*) (dens(i,ikey),i=1,NLAYERS_CRUST)
+ read (1,*) (thlr(i,ikey),i=1,NLAYERS_CRUST-1),thlr(NLAYERS_CRUST,ikey)
+
+ ! limit moho thickness
+ if(thlr(NLAYERS_CRUST,ikey) > h_moho_max) h_moho_max = thlr(NLAYERS_CRUST,ikey)
+ if(thlr(NLAYERS_CRUST,ikey) < h_moho_min) h_moho_min = thlr(NLAYERS_CRUST,ikey)
enddo
close(1)
@@ -240,7 +244,9 @@
end subroutine read_crust_model
-!---------------------------
+!
+!-------------------------------------------------------------------------------------------------
+!
subroutine crust_CAPsmoothed(lat,lon,velp,vels,rho,thick,abbreviation,&
code,thlr,velocp,velocs,dens)
@@ -263,7 +269,7 @@
! argument variables
double precision :: lat,lon
double precision,dimension(NLAYERS_CRUST) :: rho,thick,velp,vels
- double precision,dimension(NKEYS_CRUST,NLAYERS_CRUST) :: thlr,velocp,velocs,dens
+ double precision,dimension(NLAYERS_CRUST,NKEYS_CRUST) :: thlr,velocp,velocs,dens
character(len=2) :: code(NKEYS_CRUST)
character(len=2) :: abbreviation(NCAP_CRUST/2,NCAP_CRUST)
@@ -280,13 +286,15 @@
!-------------------------------
! local variables
- double precision xlon(NTHETA*NPHI),xlat(NTHETA*NPHI),weight(NTHETA*NPHI)
- double precision rhol(NLAYERS_CRUST),thickl(NLAYERS_CRUST),velpl(NLAYERS_CRUST),velsl(NLAYERS_CRUST)
- double precision weightl,cap_degree,dist
- double precision h_sed
- integer i,icolat,ilon,ierr
- character(len=2) crustaltype
+ double precision :: xlon(NTHETA*NPHI),xlat(NTHETA*NPHI),weight(NTHETA*NPHI)
+ double precision :: rhol(NLAYERS_CRUST),thickl(NLAYERS_CRUST),velpl(NLAYERS_CRUST),velsl(NLAYERS_CRUST)
+ double precision :: weightl,cap_degree,dist
+ double precision :: h_sed
+ integer :: i,icolat,ilon
+ character(len=2) :: crustaltype
+
+
! checks latitude/longitude
if(lat > 90.0d0 .or. lat < -90.0d0 .or. lon > 180.0d0 .or. lon < -180.0d0) &
stop 'error in latitude/longitude range in crust'
@@ -328,10 +336,11 @@
do i=1,NTHETA*NPHI
! gets crust values
call icolat_ilon(xlat(i),xlon(i),icolat,ilon)
+
crustaltype = abbreviation(icolat,ilon)
+
call get_crust_structure(crustaltype,velpl,velsl,rhol,thickl, &
- code,thlr,velocp,velocs,dens,ierr)
- if(ierr /= 0) stop 'error in routine get_crust_structure'
+ code,thlr,velocp,velocs,dens)
! sediment thickness
h_sed = thickl(3) + thickl(4)
@@ -359,7 +368,9 @@
end subroutine crust_CAPsmoothed
-!------------------------------------------------------
+!
+!-------------------------------------------------------------------------------------------------
+!
subroutine icolat_ilon(xlat,xlon,icolat,ilon)
@@ -372,57 +383,65 @@
if(xlat > 90.0d0 .or. xlat < -90.0d0 .or. xlon > 180.0d0 .or. xlon < -180.0d0) &
stop 'error in latitude/longitude range in icolat_ilon'
- icolat=int(1+( (90.d0-xlat)*0.5d0 ))
- if(icolat == 91) icolat=90
- ilon=int(1+( (180.d0+xlon)*0.5d0 ))
- if(ilon == 181) ilon=1
+ icolat = int(1+( (90.d0-xlat)*0.5d0 ))
+ if(icolat == 91) icolat = 90
+
+ ilon = int(1+( (180.d0+xlon)*0.5d0 ))
+ if(ilon == 181) ilon = 1
+
if(icolat>90 .or. icolat<1) stop 'error in routine icolat_ilon'
if(ilon<1 .or. ilon>180) stop 'error in routine icolat_ilon'
end subroutine icolat_ilon
-!---------------------------------------------------------------------
+!
+!-------------------------------------------------------------------------------------------------
+!
- subroutine get_crust_structure(type,vptyp,vstyp,rhtyp,thtp, &
- code,thlr,velocp,velocs,dens,ierr)
+ subroutine get_crust_structure(crustaltype,vptyp,vstyp,rhtyp,thtp, &
+ code,thlr,velocp,velocs,dens)
use model_crust_par,only: NLAYERS_CRUST,NKEYS_CRUST
implicit none
! argument variables
- integer ierr
- double precision rhtyp(NLAYERS_CRUST),thtp(NLAYERS_CRUST)
- double precision vptyp(NLAYERS_CRUST),vstyp(NLAYERS_CRUST)
- character(len=2) type,code(NKEYS_CRUST)
- double precision thlr(NKEYS_CRUST,NLAYERS_CRUST),velocp(NKEYS_CRUST,NLAYERS_CRUST)
- double precision velocs(NKEYS_CRUST,NLAYERS_CRUST),dens(NKEYS_CRUST,NLAYERS_CRUST)
+ character(len=2) :: crustaltype
+ double precision :: rhtyp(NLAYERS_CRUST),thtp(NLAYERS_CRUST)
+ double precision :: vptyp(NLAYERS_CRUST),vstyp(NLAYERS_CRUST)
+ character(len=2) :: code(NKEYS_CRUST)
+ double precision :: thlr(NLAYERS_CRUST,NKEYS_CRUST),velocp(NLAYERS_CRUST,NKEYS_CRUST)
+ double precision :: velocs(NLAYERS_CRUST,NKEYS_CRUST),dens(NLAYERS_CRUST,NKEYS_CRUST)
! local variables
- integer i,ikey
+ integer :: ikey,ikey_selected
- ierr=1
+ ! determines layer index
+ ikey_selected = 0
do ikey=1,NKEYS_CRUST
- if (code(ikey) == type) then
- do i=1,NLAYERS_CRUST
- vptyp(i)=velocp(ikey,i)
- vstyp(i)=velocs(ikey,i)
- rhtyp(i)=dens(ikey,i)
- enddo
- do i=1,NLAYERS_CRUST-1
- thtp(i)=thlr(ikey,i)
- enddo
- ! get distance to Moho from the bottom of the ocean or the ice
- thtp(NLAYERS_CRUST)=thlr(ikey,NLAYERS_CRUST)-thtp(1)-thtp(2)
- ierr=0
+ if (code(ikey) == crustaltype) then
+ ikey_selected = ikey
+ exit
endif
enddo
+ if( ikey_selected == 0 ) stop 'error in get_crust_structure: could not find ikey for selected crustal type'
+ ! sets vp,vs and rho for all layers
+ vptyp(:) = velocp(:,ikey_selected)
+ vstyp(:) = velocs(:,ikey_selected)
+ rhtyp(:) = dens(:,ikey_selected)
+ thtp(:) = thlr(:,ikey_selected)
+
+ ! get distance to Moho from the bottom of the ocean or the ice
+ thtp(NLAYERS_CRUST) = thtp(NLAYERS_CRUST) - thtp(1) - thtp(2)
+
end subroutine get_crust_structure
-!---------------------------
+!
+!-------------------------------------------------------------------------------------------------
+!
subroutine CAP_vardegree(lon,lat,xlon,xlat,weight,CAP_DEGREE,NTHETA,NPHI)
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/rules.mk
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/rules.mk 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/meshfem3D/rules.mk 2014-01-13 16:06:21 UTC (rev 22991)
@@ -269,9 +269,9 @@
$O/%.check_adios_module.o: $S/%.F90 $O/shared_par.shared_module.o $O/meshfem3D_par.check_module.o $O/adios_helpers.shared_adios.o
${FCCOMPILE_CHECK} ${FCFLAGS_f90} -c -o $@ $<
-$O/%.check_adios.o: $S/%.f90 $O/shared_par.shared_module.o $O/meshfem3D_par.check_module.o $O/adios_helpers.shared_adios.o $O/write_AVS_DX_global_data_adios.check_adios_module.o
+$O/%.check_adios.o: $S/%.f90 $O/shared_par.shared_module.o $O/meshfem3D_par.check_module.o $O/adios_helpers.shared_adios.o $O/write_AVS_DX_global_data_adios.check_adios_module.o $O/write_AVS_DX_surface_data_adios.check_adios_module.o
${FCCOMPILE_CHECK} ${FCFLAGS_f90} -c -o $@ $<
-$O/%.check_adios.o: $S/%.F90 $O/shared_par.shared_module.o $O/meshfem3D_par.check_module.o $O/adios_helpers.shared_adios.o $O/write_AVS_DX_global_data_adios.check_adios_module.o
+$O/%.check_adios.o: $S/%.F90 $O/shared_par.shared_module.o $O/meshfem3D_par.check_module.o $O/adios_helpers.shared_adios.o $O/write_AVS_DX_global_data_adios.check_adios_module.o $O/write_AVS_DX_surface_data_adios.check_adios_module.o
${FCCOMPILE_CHECK} ${FCFLAGS_f90} -c -o $@ $<
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/write_VTK_file.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/write_VTK_file.f90 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/shared/write_VTK_file.f90 2014-01-13 16:06:21 UTC (rev 22991)
@@ -193,7 +193,7 @@
! type: hexahedrons
write(IOVTK,'(a,i12)') "CELL_TYPES ",nspec
- write(IOVTK,*) (12,ispec=1,nspec)
+ write(IOVTK,'(6i12)') (12,ispec=1,nspec)
write(IOVTK,*) ""
write(IOVTK,'(a,i12)') "CELL_DATA ",nspec
@@ -268,7 +268,7 @@
! type: hexahedrons
write(IOVTK,'(a,i12)') "CELL_TYPES ",nspec
- write(IOVTK,*) (12,ispec=1,nspec)
+ write(IOVTK,'(6i12)') (12,ispec=1,nspec)
write(IOVTK,*) ""
write(IOVTK,'(a,i12)') "CELL_DATA ",nspec
@@ -371,7 +371,7 @@
! type: hexahedrons
write(IOVTK,'(a,i12)') "CELL_TYPES ",nspec
- write(IOVTK,*) (12,ispec=1,nspec)
+ write(IOVTK,'(6i12)') (12,ispec=1,nspec)
write(IOVTK,*) ""
! x components
@@ -561,7 +561,7 @@
! type: hexahedrons
write(IOVTK,'(a,i12)') "CELL_TYPES ",nspec*NPROCTOT
- write(IOVTK,*) (12,ispec=1,nspec*NPROCTOT)
+ write(IOVTK,'(6i12)') (12,ispec=1,nspec*NPROCTOT)
write(IOVTK,*) ""
! x components
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/assemble_MPI_scalar.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/assemble_MPI_scalar.f90 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/assemble_MPI_scalar.f90 2014-01-13 16:06:21 UTC (rev 22991)
@@ -238,17 +238,20 @@
end subroutine assemble_MPI_scalar_w
+
+!-------------------------------------------------------------------------------------------------
!
+! CUDA routines
+!
!-------------------------------------------------------------------------------------------------
-!
+
subroutine assemble_MPI_scalar_send_cuda(Mesh_pointer,NPROC, &
buffer_send_scalar,buffer_recv_scalar, &
num_interfaces,max_nibool_interfaces, &
nibool_interfaces, &
my_neighbours, &
- request_send_scalar,request_recv_scalar, &
- FORWARD_OR_ADJOINT)
+ request_send_scalar,request_recv_scalar)
! non-blocking MPI send
@@ -270,20 +273,15 @@
integer, dimension(num_interfaces) :: nibool_interfaces,my_neighbours
integer, dimension(num_interfaces) :: request_send_scalar,request_recv_scalar
- integer :: FORWARD_OR_ADJOINT
-
! local parameters
integer iinterface
-! sends only if more than one partition
+ ! note: preparation of the contribution between partitions using MPI
+ ! transfers mpi buffers to CPU in already done in transfer_boun_pot_from_device() call
+
+ ! sends only if more than one partition
if(NPROC > 1) then
- ! preparation of the contribution between partitions using MPI
- ! transfers mpi buffers to CPU
- call transfer_boun_pot_from_device(Mesh_pointer, &
- buffer_send_scalar, &
- FORWARD_OR_ADJOINT)
-
! send messages
do iinterface = 1, num_interfaces
! non-blocking synchronous send request
@@ -323,8 +321,7 @@
integer :: num_interfaces,max_nibool_interfaces
- real(kind=CUSTOM_REAL), dimension(max_nibool_interfaces,num_interfaces) :: &
- buffer_recv_scalar
+ real(kind=CUSTOM_REAL), dimension(max_nibool_interfaces,num_interfaces) :: buffer_recv_scalar
integer, dimension(num_interfaces) :: request_send_scalar,request_recv_scalar
integer :: FORWARD_OR_ADJOINT
@@ -360,3 +357,64 @@
endif
end subroutine assemble_MPI_scalar_write_cuda
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+! with cuda functions...
+
+ subroutine transfer_boundarypot_to_device(Mesh_pointer, NPROC, &
+ buffer_recv_scalar, &
+ num_interfaces,max_nibool_interfaces,&
+ request_recv_scalar, &
+ IREGION,FORWARD_OR_ADJOINT)
+
+ use constants
+
+ implicit none
+
+ integer(kind=8) :: Mesh_pointer
+
+ integer :: NPROC
+
+ ! array to assemble
+ integer :: num_interfaces,max_nibool_interfaces
+
+ real(kind=CUSTOM_REAL), dimension(max_nibool_interfaces,num_interfaces) :: buffer_recv_scalar
+
+ integer, dimension(num_interfaces) :: request_recv_scalar
+
+ integer :: IREGION
+ integer :: FORWARD_OR_ADJOINT
+
+ ! local parameters
+ integer :: iinterface
+
+ ! here we have to assemble all the contributions between partitions using MPI
+
+ ! assemble only if more than one partition
+ if(NPROC > 1) then
+
+ ! waits for communications completion (recv)
+ do iinterface = 1, num_interfaces
+ call wait_req(request_recv_scalar(iinterface))
+ enddo
+
+ ! sends contributions to GPU
+ call transfer_buffer_to_device_async(Mesh_pointer, &
+ buffer_recv_scalar, &
+ IREGION,FORWARD_OR_ADJOINT)
+ endif
+
+ ! This step is done via previous function transfer_and_assemble...
+ ! do iinterface = 1, num_interfaces
+ ! do ipoin = 1, nibool_interfaces(iinterface)
+ ! array_val(ibool_interfaces(ipoin,iinterface)) = &
+ ! array_val(ibool_interfaces(ipoin,iinterface)) + buffer_recv_scalar(ipoin,iinterface)
+ ! enddo
+ ! enddo
+
+ end subroutine transfer_boundarypot_to_device
+
+
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/assemble_MPI_vector.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/assemble_MPI_vector.f90 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/assemble_MPI_vector.f90 2014-01-13 16:06:21 UTC (rev 22991)
@@ -98,7 +98,72 @@
!-------------------------------------------------------------------------------------------------
!
+ subroutine assemble_MPI_vector_w(NPROC,nglob, &
+ array_val, &
+ buffer_recv_vector, &
+ num_interfaces,max_nibool_interfaces, &
+ nibool_interfaces,ibool_interfaces, &
+ request_send_vector,request_recv_vector)
+! waits for data to receive and assembles
+
+ use constants
+
+ implicit none
+
+ integer :: NPROC
+ integer :: nglob
+
+ ! array to assemble
+ real(kind=CUSTOM_REAL), dimension(NDIM,nglob) :: array_val
+
+ integer :: num_interfaces,max_nibool_interfaces
+
+ real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces,num_interfaces) :: &
+ buffer_recv_vector
+
+ integer, dimension(num_interfaces) :: nibool_interfaces
+ integer, dimension(max_nibool_interfaces,num_interfaces) :: ibool_interfaces
+ integer, dimension(num_interfaces) :: request_send_vector,request_recv_vector
+
+ integer ipoin,iinterface
+
+! here we have to assemble all the contributions between partitions using MPI
+
+ ! assemble only if more than one partition
+ if(NPROC > 1) then
+
+ ! wait for communications completion (recv)
+ do iinterface = 1, num_interfaces
+ call wait_req(request_recv_vector(iinterface))
+ enddo
+
+ ! adding contributions of neighbours
+ do iinterface = 1, num_interfaces
+ do ipoin = 1, nibool_interfaces(iinterface)
+ array_val(:,ibool_interfaces(ipoin,iinterface)) = &
+ array_val(:,ibool_interfaces(ipoin,iinterface)) &
+ + buffer_recv_vector(:,ipoin,iinterface)
+ enddo
+ enddo
+
+ ! wait for communications completion (send)
+ do iinterface = 1, num_interfaces
+ call wait_req(request_send_vector(iinterface))
+ enddo
+
+ endif
+
+ end subroutine assemble_MPI_vector_w
+
+
+!-------------------------------------------------------------------------------------------------
+!
+! CUDA routines
+!
+!-------------------------------------------------------------------------------------------------
+
+
! interrupt might improve MPI performance
! see: https://computing.llnl.gov/tutorials/mpi_performance/#Sender-ReceiverSync
!
@@ -110,8 +175,7 @@
num_interfaces,max_nibool_interfaces, &
nibool_interfaces, &
my_neighbours, &
- request_send_vector,request_recv_vector,&
- IREGION,FORWARD_OR_ADJOINT)
+ request_send_vector,request_recv_vector)
! sends data
! note: array to assemble already filled into buffer_send_vector array
@@ -131,21 +195,15 @@
integer, dimension(num_interfaces) :: nibool_interfaces,my_neighbours
integer, dimension(num_interfaces) :: request_send_vector,request_recv_vector
- integer :: IREGION
- integer :: FORWARD_OR_ADJOINT
-
! local parameters
integer :: iinterface
+ ! note: preparation of the contribution between partitions using MPI
+ ! already done in transfer_boun_from_device() routines
+
! send only if more than one partition
if(NPROC > 1) then
- ! preparation of the contribution between partitions using MPI
- ! transfers mpi buffers to CPU
- call transfer_boun_accel_from_device(Mesh_pointer, &
- buffer_send_vector,&
- IREGION,FORWARD_OR_ADJOINT)
-
! send messages
do iinterface = 1, num_interfaces
call isend_cr(buffer_send_vector(1,1,iinterface), &
@@ -169,12 +227,11 @@
!-------------------------------------------------------------------------------------------------
!
- subroutine assemble_MPI_vector_w(NPROC,nglob, &
- array_val, &
- buffer_recv_vector, &
- num_interfaces,max_nibool_interfaces, &
- nibool_interfaces,ibool_interfaces, &
- request_send_vector,request_recv_vector)
+ subroutine assemble_MPI_vector_write_cuda(Mesh_pointer,NPROC, &
+ buffer_recv_vector, &
+ num_interfaces,max_nibool_interfaces, &
+ request_send_vector,request_recv_vector, &
+ IREGION,FORWARD_OR_ADJOINT )
! waits for data to receive and assembles
@@ -182,23 +239,22 @@
implicit none
+ integer(kind=8) :: Mesh_pointer
+
integer :: NPROC
- integer :: nglob
- ! array to assemble
- real(kind=CUSTOM_REAL), dimension(NDIM,nglob) :: array_val
-
integer :: num_interfaces,max_nibool_interfaces
- real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces,num_interfaces) :: &
- buffer_recv_vector
-
- integer, dimension(num_interfaces) :: nibool_interfaces
- integer, dimension(max_nibool_interfaces,num_interfaces) :: ibool_interfaces
+ real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces,num_interfaces) :: buffer_recv_vector
integer, dimension(num_interfaces) :: request_send_vector,request_recv_vector
- integer ipoin,iinterface
+ integer :: IREGION
+ integer :: FORWARD_OR_ADJOINT
+ ! local parameters
+
+ integer :: iinterface
+
! here we have to assemble all the contributions between partitions using MPI
! assemble only if more than one partition
@@ -210,14 +266,18 @@
enddo
! adding contributions of neighbours
- do iinterface = 1, num_interfaces
- do ipoin = 1, nibool_interfaces(iinterface)
- array_val(:,ibool_interfaces(ipoin,iinterface)) = &
- array_val(:,ibool_interfaces(ipoin,iinterface)) &
- + buffer_recv_vector(:,ipoin,iinterface)
- enddo
- enddo
+ call transfer_asmbl_accel_to_device(Mesh_pointer, &
+ buffer_recv_vector, &
+ IREGION,FORWARD_OR_ADJOINT)
+ ! This step is done via previous function transfer_and_assemble...
+ ! do iinterface = 1, num_interfaces
+ ! do ipoin = 1, nibool_interfaces(iinterface)
+ ! array_val(:,ibool_interfaces(ipoin,iinterface)) = &
+ ! array_val(:,ibool_interfaces(ipoin,iinterface)) + buffer_recv_vector(:,ipoin,iinterface)
+ ! enddo
+ ! enddo
+
! wait for communications completion (send)
do iinterface = 1, num_interfaces
call wait_req(request_send_vector(iinterface))
@@ -225,21 +285,21 @@
endif
- end subroutine assemble_MPI_vector_w
+ end subroutine assemble_MPI_vector_write_cuda
!
!-------------------------------------------------------------------------------------------------
!
- subroutine assemble_MPI_vector_write_cuda(Mesh_pointer,NPROC, &
+! with cuda functions...
+
+ subroutine transfer_boundary_to_device(Mesh_pointer, NPROC, &
buffer_recv_vector, &
- num_interfaces,max_nibool_interfaces, &
- request_send_vector,request_recv_vector, &
- IREGION,FORWARD_OR_ADJOINT )
+ num_interfaces,max_nibool_interfaces,&
+ request_recv_vector, &
+ IREGION,FORWARD_OR_ADJOINT)
-! waits for data to receive and assembles
-
use constants
implicit none
@@ -248,47 +308,43 @@
integer :: NPROC
+ ! array to assemble
integer :: num_interfaces,max_nibool_interfaces
- real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces,num_interfaces) :: buffer_recv_vector
- integer, dimension(num_interfaces) :: request_send_vector,request_recv_vector
+ real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces,num_interfaces) :: &
+ buffer_recv_vector
+ integer, dimension(num_interfaces) :: request_recv_vector
+
integer :: IREGION
integer :: FORWARD_OR_ADJOINT
! local parameters
-
integer :: iinterface
-! here we have to assemble all the contributions between partitions using MPI
+ ! here we have to assemble all the contributions between partitions using MPI
! assemble only if more than one partition
if(NPROC > 1) then
- ! wait for communications completion (recv)
+ ! waits for communications completion (recv)
do iinterface = 1, num_interfaces
call wait_req(request_recv_vector(iinterface))
enddo
- ! adding contributions of neighbours
- call transfer_asmbl_accel_to_device(Mesh_pointer, &
- buffer_recv_vector, &
- IREGION,FORWARD_OR_ADJOINT)
+ ! sends contributions to GPU
+ call transfer_buffer_to_device_async(Mesh_pointer, &
+ buffer_recv_vector, &
+ IREGION,FORWARD_OR_ADJOINT)
+ endif
- ! This step is done via previous function transfer_and_assemble...
- ! do iinterface = 1, num_interfaces
- ! do ipoin = 1, nibool_interfaces(iinterface)
- ! array_val(:,ibool_interfaces(ipoin,iinterface)) = &
- ! array_val(:,ibool_interfaces(ipoin,iinterface)) + buffer_recv_vector(:,ipoin,iinterface)
- ! enddo
- ! enddo
+ ! This step is done via previous function transfer_and_assemble...
+ ! do iinterface = 1, num_interfaces
+ ! do ipoin = 1, nibool_interfaces(iinterface)
+ ! array_val(:,ibool_interfaces(ipoin,iinterface)) = &
+ ! array_val(:,ibool_interfaces(ipoin,iinterface)) + buffer_recv_vector(:,ipoin,iinterface)
+ ! enddo
+ ! enddo
- ! wait for communications completion (send)
- do iinterface = 1, num_interfaces
- call wait_req(request_send_vector(iinterface))
- enddo
+ end subroutine transfer_boundary_to_device
- endif
-
- end subroutine assemble_MPI_vector_write_cuda
-
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_add_sources.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_add_sources.f90 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_add_sources.f90 2014-01-13 16:06:21 UTC (rev 22991)
@@ -151,11 +151,14 @@
implicit none
! local parameters
- real(kind=CUSTOM_REAL), dimension(:,:,:,:,:), allocatable :: adj_sourcearray
- integer :: irec,irec_local,i,j,k,iglob,it_sub_adj,itime
+ real(kind=CUSTOM_REAL), dimension(:,:,:,:,:), allocatable :: tmp_sourcearray
+ integer :: irec,irec_local,i,j,k,iglob,it_sub_adj,itime,ier
+ integer :: ivec_index
character(len=150) :: adj_source_file
logical :: ibool_read_adj_arrays
+ ! note: we check if nadj_rec_local > 0 before calling this routine
+
! figure out if we need to read in a chunk of the adjoint source at this timestep
it_sub_adj = ceiling( dble(it)/dble(NTSTEP_BETWEEN_READ_ADJSRC) ) !chunk_number
ibool_read_adj_arrays = (((it == it_begin) .or. (mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC) == 0)) &
@@ -163,12 +166,13 @@
! needs to read in a new chunk/block of the adjoint source
if(ibool_read_adj_arrays) then
+ ! allocates temporary source array
+ allocate(tmp_sourcearray(NDIM,NGLLX,NGLLY,NGLLZ,NTSTEP_BETWEEN_READ_ADJSRC),stat=ier)
+ if( ier /= 0 ) call exit_MPI(myrank,'error allocating array tmp_sourcearray')
- ! temporary source array
- allocate(adj_sourcearray(NDIM,NGLLX,NGLLY,NGLLZ,NTSTEP_BETWEEN_READ_ADJSRC))
- adj_sourcearray = 0._CUSTOM_REAL
+ tmp_sourcearray(:,:,:,:,:) = 0._CUSTOM_REAL
+ irec_local = 0
- irec_local = 0
do irec = 1, nrec
! check that the source slice number is okay
if(islice_selected_rec(irec) < 0 .or. islice_selected_rec(irec) > NPROCTOT_VAL-1) then
@@ -184,23 +188,25 @@
adj_source_file = trim(station_name(irec))//'.'//trim(network_name(irec))
call compute_arrays_source_adjoint(myrank,adj_source_file, &
xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
- nu(:,:,irec),adj_sourcearray, xigll,yigll,zigll,iadjsrc_len(it_sub_adj), &
+ nu(:,:,irec),tmp_sourcearray, xigll,yigll,zigll,iadjsrc_len(it_sub_adj), &
iadjsrc,it_sub_adj,NSTEP_SUB_ADJ,NTSTEP_BETWEEN_READ_ADJSRC,DT)
! stores source array
! note: the adj_sourcearrays has a time stepping from 1 to NTSTEP_BETWEEN_READ_ADJSRC
! this gets overwritten every time a new block/chunk is read in
do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC
- adj_sourcearrays(:,:,:,:,irec_local,itime) = adj_sourcearray(:,:,:,:,itime)
+ adj_sourcearrays(:,:,:,:,irec_local,itime) = tmp_sourcearray(:,:,:,:,itime)
enddo
endif
enddo
+
+ ! checks that number of read sources is valid
if(irec_local /= nadj_rec_local) &
call exit_MPI(myrank,'irec_local /= nadj_rec_local in adjoint simulation')
- deallocate(adj_sourcearray)
-
+ ! frees temporary array
+ deallocate(tmp_sourcearray)
endif
! adds adjoint sources
@@ -213,6 +219,9 @@
if(myrank == islice_selected_rec(irec)) then
irec_local = irec_local + 1
+ ! adjoint source array index
+ ivec_index = iadj_vec(it)
+
! adds source contributions
do k=1,NGLLZ
do j=1,NGLLY
@@ -277,7 +286,7 @@
! assuming that until that end the backward/reconstructed wavefield and adjoint fields
! have a zero contribution to adjoint kernels.
accel_crust_mantle(:,iglob) = accel_crust_mantle(:,iglob) &
- + adj_sourcearrays(:,i,j,k,irec_local,iadj_vec(it))
+ + adj_sourcearrays(:,i,j,k,irec_local,ivec_index)
enddo
enddo
@@ -287,10 +296,54 @@
enddo
else
+
! on GPU
- call compute_add_sources_adjoint_cuda(Mesh_pointer,nrec,adj_sourcearrays, &
- islice_selected_rec,ispec_selected_rec, &
- iadj_vec(it))
+ ! note: adjoint sourcearrays can become very big when used with many receiver stations
+ ! we overlap here the memory transfer to GPUs
+
+ ! current time index
+ ivec_index = iadj_vec(it)
+
+ if( GPU_ASYNC_COPY ) then
+ ! only synchronuously transfers array at beginning or whenever new arrays were read in
+ if( ibool_read_adj_arrays ) then
+ ! transfers adjoint arrays to GPU device memory
+ ! note: function call passes pointer to array adj_sourcearrays at corresponding time slice
+ call transfer_adj_to_device(Mesh_pointer,nrec,adj_sourcearrays(1,1,1,1,1,ivec_index), &
+ islice_selected_rec)
+ endif
+ else
+ ! synchronuously transfers adjoint arrays to GPU device memory before adding adjoint sources on GPU
+ call transfer_adj_to_device(Mesh_pointer,nrec,adj_sourcearrays(1,1,1,1,1,ivec_index), &
+ islice_selected_rec)
+ endif
+
+ ! adds adjoint source contributions
+ call compute_add_sources_adjoint_cuda(Mesh_pointer,nrec)
+
+ if( GPU_ASYNC_COPY ) then
+ ! starts asynchronuously transfer of next adjoint arrays to GPU device memory
+ ! (making sure the next adj_sourcearrays values were already read in)
+ if( (.not. ibool_read_adj_arrays) .and. &
+ (.not. mod(it,NTSTEP_BETWEEN_READ_ADJSRC) == 0) .and. &
+ (.not. it == it_end) ) then
+ ! next time index
+ ivec_index = iadj_vec(it+1)
+
+ ! checks next index
+ if( ivec_index < 1 .or. ivec_index > NTSTEP_BETWEEN_READ_ADJSRC ) then
+ print*,'error iadj_vec bounds: rank',myrank,' it = ',it,' index = ',ivec_index, &
+ 'out of bounds ',1,'to',NTSTEP_BETWEEN_READ_ADJSRC
+ call exit_MPI(myrank,'error iadj_vec index bounds')
+ endif
+
+ ! asynchronuously transfers next time slice
+ call transfer_adj_to_device_async(Mesh_pointer,nrec,adj_sourcearrays(1,1,1,1,1,ivec_index), &
+ islice_selected_rec)
+ endif
+ endif
+
+
endif
end subroutine compute_add_sources_adjoint
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_acoustic_calling_routine.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_acoustic_calling_routine.F90 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_acoustic_calling_routine.F90 2014-01-13 16:06:21 UTC (rev 22991)
@@ -104,6 +104,20 @@
! on GPU
! includes FORWARD_OR_ADJOINT == 1
call compute_forces_outer_core_cuda(Mesh_pointer,iphase,time,1)
+
+ ! initiates asynchronuous mpi transfer
+ if( GPU_ASYNC_COPY .and. iphase == 2 ) then
+ ! crust/mantle region
+ ! wait for asynchronous copy to finish
+ call sync_copy_from_device(Mesh_pointer,iphase,buffer_send_scalar_outer_core,IREGION_OUTER_CORE,1)
+ ! sends mpi buffers
+ call assemble_MPI_scalar_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+ buffer_send_scalar_outer_core,buffer_recv_scalar_outer_core, &
+ num_interfaces_outer_core,max_nibool_interfaces_oc, &
+ nibool_interfaces_outer_core,&
+ my_neighbours_outer_core, &
+ request_send_scalar_oc,request_recv_scalar_oc)
+ endif
endif
@@ -172,14 +186,25 @@
request_send_scalar_oc,request_recv_scalar_oc)
else
! on GPU
- ! outer core
- call assemble_MPI_scalar_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
- buffer_send_scalar_outer_core,buffer_recv_scalar_outer_core, &
- num_interfaces_outer_core,max_nibool_interfaces_oc, &
- nibool_interfaces_outer_core,&
- my_neighbours_outer_core, &
- request_send_scalar_oc,request_recv_scalar_oc, &
- 1) ! <-- 1 == fwd accel
+ ! sends accel values to corresponding MPI interface neighbors
+
+ ! preparation of the contribution between partitions using MPI
+ ! transfers mpi buffers to CPU
+ ! note: for asynchronuous transfers, this transfers boundary region to host asynchronously. The
+ ! MPI-send is done after compute_forces_outer_core_cuda,
+ ! once the inner element kernels are launched, and the memcpy has finished.
+ call transfer_boun_pot_from_device(Mesh_pointer,buffer_send_scalar_outer_core,1)
+
+ if( .not. GPU_ASYNC_COPY ) then
+ ! for synchronuous transfers, sending over mpi can directly proceed
+ ! outer core
+ call assemble_MPI_scalar_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+ buffer_send_scalar_outer_core,buffer_recv_scalar_outer_core, &
+ num_interfaces_outer_core,max_nibool_interfaces_oc, &
+ nibool_interfaces_outer_core,&
+ my_neighbours_outer_core, &
+ request_send_scalar_oc,request_recv_scalar_oc)
+ endif
endif
else
! make sure the last communications are finished and processed
@@ -194,6 +219,18 @@
request_send_scalar_oc,request_recv_scalar_oc)
else
! on GPU
+ if( GPU_ASYNC_COPY ) then
+ ! while inner elements compute "Kernel_2", we wait for MPI to
+ ! finish and transfer the boundary terms to the device asynchronously
+ !
+ ! transfers mpi buffers onto GPU
+ call transfer_boundarypot_to_device(Mesh_pointer,NPROCTOT_VAL,buffer_recv_scalar_outer_core, &
+ num_interfaces_outer_core,max_nibool_interfaces_oc, &
+ request_recv_scalar_oc, &
+ IREGION_OUTER_CORE,1)
+ endif
+
+ ! waits for mpi send/receive requests to be completed and assembles values
call assemble_MPI_scalar_write_cuda(Mesh_pointer,NPROCTOT_VAL, &
buffer_recv_scalar_outer_core, &
num_interfaces_outer_core,max_nibool_interfaces_oc, &
@@ -328,6 +365,21 @@
! on GPU
! includes FORWARD_OR_ADJOINT == 3
call compute_forces_outer_core_cuda(Mesh_pointer,iphase,b_time,3)
+
+ ! initiates asynchronuous mpi transfer
+ if( GPU_ASYNC_COPY .and. iphase == 2 ) then
+ ! crust/mantle region
+ ! wait for asynchronous copy to finish
+ call sync_copy_from_device(Mesh_pointer,iphase,b_buffer_send_scalar_outer_core,IREGION_OUTER_CORE,3)
+ ! sends mpi buffers
+ call assemble_MPI_scalar_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+ b_buffer_send_scalar_outer_core,b_buffer_recv_scalar_outer_core, &
+ num_interfaces_outer_core,max_nibool_interfaces_oc, &
+ nibool_interfaces_outer_core,&
+ my_neighbours_outer_core, &
+ b_request_send_scalar_oc,b_request_recv_scalar_oc)
+ endif
+
endif
@@ -402,14 +454,25 @@
b_request_send_scalar_oc,b_request_recv_scalar_oc)
else
! on GPU
- ! outer core
- call assemble_MPI_scalar_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
- b_buffer_send_scalar_outer_core,b_buffer_recv_scalar_outer_core, &
- num_interfaces_outer_core,max_nibool_interfaces_oc, &
- nibool_interfaces_outer_core,&
- my_neighbours_outer_core, &
- b_request_send_scalar_oc,b_request_recv_scalar_oc, &
- 3) ! <-- 3 == adjoint b_accel
+ ! sends accel values to corresponding MPI interface neighbors
+
+ ! preparation of the contribution between partitions using MPI
+ ! transfers mpi buffers to CPU
+ ! note: for asynchronuous transfers, this transfers boundary region to host asynchronously. The
+ ! MPI-send is done after compute_forces_outer_core_cuda,
+ ! once the inner element kernels are launched, and the memcpy has finished.
+ call transfer_boun_pot_from_device(Mesh_pointer,b_buffer_send_scalar_outer_core,3)
+
+ if( .not. GPU_ASYNC_COPY ) then
+ ! for synchronuous transfers, sending over mpi can directly proceed
+ ! outer core
+ call assemble_MPI_scalar_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+ b_buffer_send_scalar_outer_core,b_buffer_recv_scalar_outer_core, &
+ num_interfaces_outer_core,max_nibool_interfaces_oc, &
+ nibool_interfaces_outer_core,&
+ my_neighbours_outer_core, &
+ b_request_send_scalar_oc,b_request_recv_scalar_oc)
+ endif
endif ! GPU
else
! make sure the last communications are finished and processed
@@ -425,6 +488,18 @@
b_request_send_scalar_oc,b_request_recv_scalar_oc)
else
! on GPU
+ if( GPU_ASYNC_COPY ) then
+ ! while inner elements compute "Kernel_2", we wait for MPI to
+ ! finish and transfer the boundary terms to the device asynchronously
+ !
+ ! transfers mpi buffers onto GPU
+ call transfer_boundarypot_to_device(Mesh_pointer,NPROCTOT_VAL,b_buffer_recv_scalar_outer_core, &
+ num_interfaces_outer_core,max_nibool_interfaces_oc, &
+ b_request_recv_scalar_oc, &
+ IREGION_OUTER_CORE,3)
+ endif
+
+ ! waits for mpi send/receive requests to be completed and assembles values
call assemble_MPI_scalar_write_cuda(Mesh_pointer,NPROCTOT_VAL, &
b_buffer_recv_scalar_outer_core, &
num_interfaces_outer_core,max_nibool_interfaces_oc, &
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90 2014-01-13 16:06:21 UTC (rev 22991)
@@ -152,12 +152,44 @@
! contains forward FORWARD_OR_ADJOINT == 1
! for crust/mantle
call compute_forces_crust_mantle_cuda(Mesh_pointer,iphase,1)
+
+ ! initiates asynchronuous mpi transfer
+ if( GPU_ASYNC_COPY .and. iphase == 2 ) then
+ ! crust/mantle region
+ ! wait for asynchronous copy to finish
+ call sync_copy_from_device(Mesh_pointer,iphase,buffer_send_vector_crust_mantle,IREGION_CRUST_MANTLE,1)
+ ! sends mpi buffers
+ call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+ buffer_send_vector_crust_mantle,buffer_recv_vector_crust_mantle, &
+ num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
+ nibool_interfaces_crust_mantle,&
+ my_neighbours_crust_mantle, &
+ request_send_vector_cm,request_recv_vector_cm)
+ endif
+
! for inner core
call compute_forces_inner_core_cuda(Mesh_pointer,iphase,1)
+
+ ! initiates asynchronuous mpi transfer
+ if( GPU_ASYNC_COPY .and. iphase == 2 ) then
+ ! inner core region
+ ! wait for asynchronous copy to finish
+ call sync_copy_from_device(Mesh_pointer,iphase,buffer_send_vector_inner_core,IREGION_INNER_CORE,1)
+ ! sends mpi buffers
+ call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+ buffer_send_vector_inner_core,buffer_recv_vector_inner_core, &
+ num_interfaces_inner_core,max_nibool_interfaces_ic, &
+ nibool_interfaces_inner_core,&
+ my_neighbours_inner_core, &
+ request_send_vector_ic,request_recv_vector_ic)
+ endif
endif ! GPU_MODE
+
+
! computes additional contributions to acceleration field
if( iphase == 1 ) then
+ ! during phase for outer elements
! absorbing boundaries
! Stacey
@@ -272,22 +304,37 @@
request_send_vector_ic,request_recv_vector_ic)
else
! on GPU
- ! crust mantle
- call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
- buffer_send_vector_crust_mantle,buffer_recv_vector_crust_mantle, &
- num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
- nibool_interfaces_crust_mantle,&
- my_neighbours_crust_mantle, &
- request_send_vector_cm,request_recv_vector_cm, &
- IREGION_CRUST_MANTLE,1) ! <-- 1 == fwd accel
- ! inner core
- call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
- buffer_send_vector_inner_core,buffer_recv_vector_inner_core, &
- num_interfaces_inner_core,max_nibool_interfaces_ic, &
- nibool_interfaces_inner_core,&
- my_neighbours_inner_core, &
- request_send_vector_ic,request_recv_vector_ic, &
- IREGION_INNER_CORE,1)
+ ! sends accel values to corresponding MPI interface neighbors
+
+ ! preparation of the contribution between partitions using MPI
+ ! transfers mpi buffers to CPU
+ ! note: in case of asynchronuous copy, this transfers boundary region to host asynchronously. The
+ ! MPI-send is done after compute_forces_viscoelastic_cuda,
+ ! once the inner element kernels are launched, and the memcpy has finished.
+ call transfer_boun_from_device(Mesh_pointer, &
+ buffer_send_vector_crust_mantle,&
+ IREGION_CRUST_MANTLE,1)
+ call transfer_boun_from_device(Mesh_pointer, &
+ buffer_send_vector_inner_core,&
+ IREGION_INNER_CORE,1)
+
+ if( .not. GPU_ASYNC_COPY ) then
+ ! for synchronuous transfers, sending over mpi can directly proceed
+ ! crust mantle
+ call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+ buffer_send_vector_crust_mantle,buffer_recv_vector_crust_mantle, &
+ num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
+ nibool_interfaces_crust_mantle,&
+ my_neighbours_crust_mantle, &
+ request_send_vector_cm,request_recv_vector_cm)
+ ! inner core
+ call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+ buffer_send_vector_inner_core,buffer_recv_vector_inner_core, &
+ num_interfaces_inner_core,max_nibool_interfaces_ic, &
+ nibool_interfaces_inner_core,&
+ my_neighbours_inner_core, &
+ request_send_vector_ic,request_recv_vector_ic)
+ endif
endif ! GPU_MODE
else
! waits for send/receive requests to be completed and assembles values
@@ -309,6 +356,24 @@
request_send_vector_ic,request_recv_vector_ic)
else
! on GPU
+ if( GPU_ASYNC_COPY ) then
+ ! while inner elements compute "Kernel_2", we wait for MPI to
+ ! finish and transfer the boundary terms to the device asynchronously
+ !
+ ! transfers mpi buffers onto GPU
+ ! crust/mantle region
+ call transfer_boundary_to_device(Mesh_pointer,NPROCTOT_VAL,buffer_recv_vector_crust_mantle, &
+ num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
+ request_recv_vector_cm, &
+ IREGION_CRUST_MANTLE,1)
+ ! inner core region
+ call transfer_boundary_to_device(Mesh_pointer,NPROCTOT_VAL,buffer_recv_vector_inner_core, &
+ num_interfaces_inner_core,max_nibool_interfaces_ic, &
+ request_recv_vector_ic, &
+ IREGION_INNER_CORE,1)
+ endif
+
+ ! waits for mpi send/receive requests to be completed and assembles values
! crust mantle
call assemble_MPI_vector_write_cuda(Mesh_pointer,NPROCTOT_VAL, &
buffer_recv_vector_crust_mantle, &
@@ -537,8 +602,39 @@
! contains forward FORWARD_OR_ADJOINT == 3
! for crust/mantle
call compute_forces_crust_mantle_cuda(Mesh_pointer,iphase,3)
+
+ ! initiates asynchronuous mpi transfer
+ if( GPU_ASYNC_COPY .and. iphase == 2 ) then
+ ! crust/mantle region
+ ! wait for asynchronous copy to finish
+ call sync_copy_from_device(Mesh_pointer,iphase,b_buffer_send_vector_cm,IREGION_CRUST_MANTLE,3)
+ ! sends mpi buffers
+ call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+ b_buffer_send_vector_cm,b_buffer_recv_vector_cm, &
+ num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
+ nibool_interfaces_crust_mantle,&
+ my_neighbours_crust_mantle, &
+ b_request_send_vector_cm,b_request_recv_vector_cm)
+ endif
+
! for inner core
call compute_forces_inner_core_cuda(Mesh_pointer,iphase,3)
+
+ ! initiates asynchronuous mpi transfer
+ if( GPU_ASYNC_COPY .and. iphase == 2 ) then
+ ! inner core region
+ ! wait for asynchronous copy to finish
+ call sync_copy_from_device(Mesh_pointer,iphase,b_buffer_send_vector_inner_core,IREGION_INNER_CORE,3)
+
+ ! sends mpi buffers
+ call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+ b_buffer_send_vector_inner_core,b_buffer_recv_vector_inner_core, &
+ num_interfaces_inner_core,max_nibool_interfaces_ic, &
+ nibool_interfaces_inner_core,&
+ my_neighbours_inner_core, &
+ b_request_send_vector_ic,b_request_recv_vector_ic)
+ endif
+
endif ! GPU_MODE
! computes additional contributions to acceleration field
@@ -644,25 +740,39 @@
b_request_send_vector_ic,b_request_recv_vector_ic)
else
! on GPU
- ! crust mantle
- call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
- b_buffer_send_vector_cm,b_buffer_recv_vector_cm, &
- num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
- nibool_interfaces_crust_mantle,&
- my_neighbours_crust_mantle, &
- b_request_send_vector_cm,b_request_recv_vector_cm, &
- IREGION_CRUST_MANTLE,3) ! <-- 3 == adjoint b_accel
- ! inner core
- call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
- b_buffer_send_vector_inner_core,b_buffer_recv_vector_inner_core, &
- num_interfaces_inner_core,max_nibool_interfaces_ic, &
- nibool_interfaces_inner_core,&
- my_neighbours_inner_core, &
- b_request_send_vector_ic,b_request_recv_vector_ic, &
- IREGION_INNER_CORE,3)
+ ! sends accel values to corresponding MPI interface neighbors
+
+ ! preparation of the contribution between partitions using MPI
+ ! transfers mpi buffers to CPU
+ ! note: in case of asynchronuous copy, this transfers boundary region to host asynchronously. The
+ ! MPI-send is done after compute_forces_viscoelastic_cuda,
+ ! once the inner element kernels are launched, and the memcpy has finished.
+ call transfer_boun_from_device(Mesh_pointer, &
+ b_buffer_send_vector_cm,&
+ IREGION_CRUST_MANTLE,3)
+ call transfer_boun_from_device(Mesh_pointer, &
+ b_buffer_send_vector_inner_core,&
+ IREGION_INNER_CORE,3)
+
+ if( .not. GPU_ASYNC_COPY ) then
+ ! for synchronuous transfers, sending over mpi can directly proceed
+ ! crust mantle
+ call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+ b_buffer_send_vector_cm,b_buffer_recv_vector_cm, &
+ num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
+ nibool_interfaces_crust_mantle,&
+ my_neighbours_crust_mantle, &
+ b_request_send_vector_cm,b_request_recv_vector_cm)
+ ! inner core
+ call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
+ b_buffer_send_vector_inner_core,b_buffer_recv_vector_inner_core, &
+ num_interfaces_inner_core,max_nibool_interfaces_ic, &
+ nibool_interfaces_inner_core,&
+ my_neighbours_inner_core, &
+ b_request_send_vector_ic,b_request_recv_vector_ic)
+ endif
endif ! GPU
else
- ! waits for send/receive requests to be completed and assembles values
! adjoint / kernel runs
! waits for send/receive requests to be completed and assembles values
if(.NOT. GPU_MODE) then
@@ -684,6 +794,25 @@
else
! on GPU
+ if( GPU_ASYNC_COPY ) then
+ ! while inner elements compute "Kernel_2", we wait for MPI to
+ ! finish and transfer the boundary terms to the device asynchronously
+ ! wait for asynchronous copy to finish
+ !
+ ! transfers mpi buffers onto GPU
+ ! crust/mantle region
+ call transfer_boundary_to_device(Mesh_pointer,NPROCTOT_VAL,b_buffer_recv_vector_cm, &
+ num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
+ b_request_recv_vector_cm, &
+ IREGION_CRUST_MANTLE,3)
+ ! inner core region
+ call transfer_boundary_to_device(Mesh_pointer,NPROCTOT_VAL,b_buffer_recv_vector_inner_core, &
+ num_interfaces_inner_core,max_nibool_interfaces_ic, &
+ b_request_recv_vector_ic, &
+ IREGION_INNER_CORE,3)
+ endif
+
+ ! waits for mpi send/receive requests to be completed and assembles values
! crust mantle
call assemble_MPI_vector_write_cuda(Mesh_pointer,NPROCTOT_VAL, &
b_buffer_recv_vector_cm, &
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_stacey_outer_core.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_stacey_outer_core.f90 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/compute_stacey_outer_core.f90 2014-01-13 16:06:21 UTC (rev 22991)
@@ -76,216 +76,212 @@
! xmin
! if two chunks exclude this face for one of them
- if(NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC) then
+ if( NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC ) then
+ if( nspec2D_xmin_outer_core > 0 ) then
+ ! adds boundary contributions
+ if( .NOT. GPU_MODE) then
+ ! on CPU
+ do ispec2D=1,nspec2D_xmin_outer_core
- if( .NOT. GPU_MODE) then
- ! on CPU
- do ispec2D=1,nspec2D_xmin_outer_core
+ ispec=ibelm_xmin_outer_core(ispec2D)
- ispec=ibelm_xmin_outer_core(ispec2D)
+ ! exclude elements that are not on absorbing edges
+ if(nkmin_xi_outer_core(1,ispec2D) == 0 .or. njmin_outer_core(1,ispec2D) == 0) cycle
- ! exclude elements that are not on absorbing edges
- if(nkmin_xi_outer_core(1,ispec2D) == 0 .or. njmin_outer_core(1,ispec2D) == 0) cycle
+ i=1
+ do k=nkmin_xi_outer_core(1,ispec2D),NGLLZ
+ do j=njmin_outer_core(1,ispec2D),njmax_outer_core(1,ispec2D)
+ iglob=ibool_outer_core(i,j,k,ispec)
- i=1
- do k=nkmin_xi_outer_core(1,ispec2D),NGLLZ
- do j=njmin_outer_core(1,ispec2D),njmax_outer_core(1,ispec2D)
- iglob=ibool_outer_core(i,j,k,ispec)
+ sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
- sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
+ weight = jacobian2D_xmin_outer_core(j,k,ispec2D)*wgllwgll_yz(j,k)
- weight = jacobian2D_xmin_outer_core(j,k,ispec2D)*wgllwgll_yz(j,k)
+ accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
- accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
-
- if( SAVE_FORWARD ) then
- absorb_xmin_outer_core(j,k,ispec2D) = weight*sn
- endif
+ if( SAVE_FORWARD ) then
+ absorb_xmin_outer_core(j,k,ispec2D) = weight*sn
+ endif
+ enddo
enddo
enddo
- enddo
- else
- ! on GPU
- if( nspec2D_xmin_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
- absorb_xmin_outer_core, &
- 4) ! <= xmin
- endif
+ else
+ ! on GPU
+ call compute_stacey_acoustic_cuda(Mesh_pointer,absorb_xmin_outer_core,4) ! <= xmin
+ endif
- ! writes absorbing boundary values
- if( SAVE_FORWARD .and. nspec2D_xmin_outer_core > 0 ) then
- call write_abs(4,absorb_xmin_outer_core,reclen_xmin_outer_core,it)
+ ! writes absorbing boundary values to file
+ if( SAVE_FORWARD ) then
+ call write_abs(4,absorb_xmin_outer_core,reclen_xmin_outer_core,it)
+ endif
endif
-
endif
! xmax
! if two chunks exclude this face for one of them
- if(NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AB) then
+ if( NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AB ) then
+ if( nspec2D_xmax_outer_core > 0 ) then
+ ! adds boundary contributions
+ if( .NOT. GPU_MODE ) then
+ ! on CPU
+ do ispec2D=1,nspec2D_xmax_outer_core
- if( .NOT. GPU_MODE ) then
- ! on CPU
- do ispec2D=1,nspec2D_xmax_outer_core
+ ispec=ibelm_xmax_outer_core(ispec2D)
- ispec=ibelm_xmax_outer_core(ispec2D)
+ ! exclude elements that are not on absorbing edges
+ if(nkmin_xi_outer_core(2,ispec2D) == 0 .or. njmin_outer_core(2,ispec2D) == 0) cycle
- ! exclude elements that are not on absorbing edges
- if(nkmin_xi_outer_core(2,ispec2D) == 0 .or. njmin_outer_core(2,ispec2D) == 0) cycle
+ i=NGLLX
+ do k=nkmin_xi_outer_core(2,ispec2D),NGLLZ
+ do j=njmin_outer_core(2,ispec2D),njmax_outer_core(2,ispec2D)
+ iglob=ibool_outer_core(i,j,k,ispec)
- i=NGLLX
- do k=nkmin_xi_outer_core(2,ispec2D),NGLLZ
- do j=njmin_outer_core(2,ispec2D),njmax_outer_core(2,ispec2D)
- iglob=ibool_outer_core(i,j,k,ispec)
+ sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
- sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
+ weight = jacobian2D_xmax_outer_core(j,k,ispec2D)*wgllwgll_yz(j,k)
- weight = jacobian2D_xmax_outer_core(j,k,ispec2D)*wgllwgll_yz(j,k)
+ accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
- accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
+ if( SAVE_FORWARD ) then
+ absorb_xmax_outer_core(j,k,ispec2D) = weight*sn
+ endif
- if( SAVE_FORWARD ) then
- absorb_xmax_outer_core(j,k,ispec2D) = weight*sn
- endif
-
+ enddo
enddo
enddo
- enddo
- else
- ! on GPU
- if( nspec2D_xmax_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
- absorb_xmax_outer_core, &
- 5) ! <= xmax
- endif
+ else
+ ! on GPU
+ call compute_stacey_acoustic_cuda(Mesh_pointer,absorb_xmax_outer_core,5) ! <= xmax
+ endif
- if( SAVE_FORWARD .and. nspec2D_xmax_outer_core > 0 ) then
- call write_abs(5,absorb_xmax_outer_core,reclen_xmax_outer_core,it)
+ if( SAVE_FORWARD ) then
+ call write_abs(5,absorb_xmax_outer_core,reclen_xmax_outer_core,it)
+ endif
endif
endif
! ymin
+ if( nspec2D_ymin_outer_core > 0 ) then
+ ! adds boundary contributions
+ if( .NOT. GPU_MODE ) then
+ ! on CPU
+ do ispec2D=1,nspec2D_ymin_outer_core
- if( .NOT. GPU_MODE ) then
- ! on CPU
- do ispec2D=1,nspec2D_ymin_outer_core
+ ispec=ibelm_ymin_outer_core(ispec2D)
- ispec=ibelm_ymin_outer_core(ispec2D)
+ ! exclude elements that are not on absorbing edges
+ if(nkmin_eta_outer_core(1,ispec2D) == 0 .or. nimin_outer_core(1,ispec2D) == 0) cycle
- ! exclude elements that are not on absorbing edges
- if(nkmin_eta_outer_core(1,ispec2D) == 0 .or. nimin_outer_core(1,ispec2D) == 0) cycle
+ j=1
+ do k=nkmin_eta_outer_core(1,ispec2D),NGLLZ
+ do i=nimin_outer_core(1,ispec2D),nimax_outer_core(1,ispec2D)
+ iglob=ibool_outer_core(i,j,k,ispec)
- j=1
- do k=nkmin_eta_outer_core(1,ispec2D),NGLLZ
- do i=nimin_outer_core(1,ispec2D),nimax_outer_core(1,ispec2D)
- iglob=ibool_outer_core(i,j,k,ispec)
+ sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
- sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
+ weight=jacobian2D_ymin_outer_core(i,k,ispec2D)*wgllwgll_xz(i,k)
- weight=jacobian2D_ymin_outer_core(i,k,ispec2D)*wgllwgll_xz(i,k)
+ accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
- accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
+ if( SAVE_FORWARD ) then
+ absorb_ymin_outer_core(i,k,ispec2D) = weight*sn
+ endif
- if( SAVE_FORWARD ) then
- absorb_ymin_outer_core(i,k,ispec2D) = weight*sn
- endif
-
+ enddo
enddo
enddo
- enddo
+ else
+ ! on GPU
+ call compute_stacey_acoustic_cuda(Mesh_pointer,absorb_ymin_outer_core,6) ! <= ymin
+ endif
- else
- ! on GPU
- if( nspec2D_ymin_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
- absorb_ymin_outer_core, &
- 6) ! <= ymin
+ if( SAVE_FORWARD ) then
+ call write_abs(6,absorb_ymin_outer_core,reclen_ymin_outer_core,it)
+ endif
endif
- if( SAVE_FORWARD .and. nspec2D_ymin_outer_core > 0 ) then
- call write_abs(6,absorb_ymin_outer_core,reclen_ymin_outer_core,it)
- endif
-
! ymax
+ if( nspec2D_ymax_outer_core > 0 ) then
+ ! adds boundary contributions
+ if( .NOT. GPU_MODE ) then
+ ! on CPU
+ do ispec2D=1,nspec2D_ymax_outer_core
- if( .NOT. GPU_MODE ) then
- ! on CPU
- do ispec2D=1,nspec2D_ymax_outer_core
+ ispec=ibelm_ymax_outer_core(ispec2D)
- ispec=ibelm_ymax_outer_core(ispec2D)
+ ! exclude elements that are not on absorbing edges
+ if(nkmin_eta_outer_core(2,ispec2D) == 0 .or. nimin_outer_core(2,ispec2D) == 0) cycle
- ! exclude elements that are not on absorbing edges
- if(nkmin_eta_outer_core(2,ispec2D) == 0 .or. nimin_outer_core(2,ispec2D) == 0) cycle
+ j=NGLLY
+ do k=nkmin_eta_outer_core(2,ispec2D),NGLLZ
+ do i=nimin_outer_core(2,ispec2D),nimax_outer_core(2,ispec2D)
+ iglob=ibool_outer_core(i,j,k,ispec)
- j=NGLLY
- do k=nkmin_eta_outer_core(2,ispec2D),NGLLZ
- do i=nimin_outer_core(2,ispec2D),nimax_outer_core(2,ispec2D)
- iglob=ibool_outer_core(i,j,k,ispec)
+ sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
- sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
+ weight=jacobian2D_ymax_outer_core(i,k,ispec2D)*wgllwgll_xz(i,k)
- weight=jacobian2D_ymax_outer_core(i,k,ispec2D)*wgllwgll_xz(i,k)
+ accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
- accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
+ if( SAVE_FORWARD ) then
+ absorb_ymax_outer_core(i,k,ispec2D) = weight*sn
+ endif
- if( SAVE_FORWARD ) then
- absorb_ymax_outer_core(i,k,ispec2D) = weight*sn
- endif
-
+ enddo
enddo
enddo
- enddo
+ else
+ ! on GPU
+ call compute_stacey_acoustic_cuda(Mesh_pointer,absorb_ymax_outer_core,7) ! <= ymax
+ endif
- else
- ! on GPU
- if( nspec2D_ymax_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
- absorb_ymax_outer_core, &
- 7) ! <= ymax
+ if( SAVE_FORWARD ) then
+ call write_abs(7,absorb_ymax_outer_core,reclen_ymax_outer_core,it)
+ endif
endif
- if( SAVE_FORWARD .and. nspec2D_ymax_outer_core > 0 ) then
- call write_abs(7,absorb_ymax_outer_core,reclen_ymax_outer_core,it)
- endif
-
! zmin
! for surface elements exactly on the ICB
+ if( nspec2D_zmin_outer_core > 0 ) then
+ ! adds boundary contributions
+ if( .NOT. GPU_MODE ) then
+ ! on CPU
+ do ispec2D = 1,nspec2D_zmin_outer_core
- if( .NOT. GPU_MODE ) then
- ! on CPU
- do ispec2D = 1,nspec2D_zmin_outer_core
+ ispec = ibelm_bottom_outer_core(ispec2D)
- ispec = ibelm_bottom_outer_core(ispec2D)
+ k = 1
+ do j = 1,NGLLY
+ do i = 1,NGLLX
+ iglob = ibool_outer_core(i,j,k,ispec)
- k = 1
- do j = 1,NGLLY
- do i = 1,NGLLX
- iglob = ibool_outer_core(i,j,k,ispec)
+ sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
- sn = veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
+ weight = jacobian2D_bottom_outer_core(i,j,ispec2D)*wgllwgll_xy(i,j)
- weight = jacobian2D_bottom_outer_core(i,j,ispec2D)*wgllwgll_xy(i,j)
+ accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
- accel_outer_core(iglob) = accel_outer_core(iglob) - weight*sn
+ if( SAVE_FORWARD ) then
+ absorb_zmin_outer_core(i,j,ispec2D) = weight*sn
+ endif
- if( SAVE_FORWARD ) then
- absorb_zmin_outer_core(i,j,ispec2D) = weight*sn
- endif
-
+ enddo
enddo
enddo
- enddo
+ else
+ ! on GPU
+ call compute_stacey_acoustic_cuda(Mesh_pointer,absorb_zmin_outer_core,8) ! <= zmin
+ endif
- else
- ! on GPU
- if( nspec2D_zmin_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
- absorb_zmin_outer_core, &
- 8) ! <= zmin
+ if( SAVE_FORWARD ) then
+ call write_abs(8,absorb_zmin_outer_core,reclen_zmin,it)
+ endif
endif
- if( SAVE_FORWARD .and. nspec2D_zmin_outer_core > 0 ) then
- call write_abs(8,absorb_zmin_outer_core,reclen_zmin,it)
- endif
-
end subroutine compute_stacey_outer_core_forward
@@ -333,176 +329,165 @@
! simple approach is still fastest. (assuming that files are accessed on a local scratch disk)
! checks
- if (SIMULATION_TYPE /= 3 ) return
+ if( SIMULATION_TYPE /= 3 ) return
! outer core
! xmin
! if two chunks exclude this face for one of them
- if(NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC) then
-
+ if( NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC ) then
! reads absorbing boundary values
if( nspec2D_xmin_outer_core > 0 ) then
! note: backward/reconstructed wavefields are read in after the Newmark time scheme in the first time loop
! this leads to a corresponding boundary condition at time index NSTEP - (it-1) = NSTEP - it + 1
call read_abs(4,absorb_xmin_outer_core,reclen_xmin_outer_core,NSTEP-it+1)
- endif
- if( .NOT. GPU_MODE) then
- ! on CPU
- do ispec2D=1,nspec2D_xmin_outer_core
+ ! adds boundary contributions
+ if( .NOT. GPU_MODE) then
+ ! on CPU
+ do ispec2D=1,nspec2D_xmin_outer_core
- ispec=ibelm_xmin_outer_core(ispec2D)
+ ispec=ibelm_xmin_outer_core(ispec2D)
- ! exclude elements that are not on absorbing edges
- if(nkmin_xi_outer_core(1,ispec2D) == 0 .or. njmin_outer_core(1,ispec2D) == 0) cycle
+ ! exclude elements that are not on absorbing edges
+ if(nkmin_xi_outer_core(1,ispec2D) == 0 .or. njmin_outer_core(1,ispec2D) == 0) cycle
- i=1
- do k=nkmin_xi_outer_core(1,ispec2D),NGLLZ
- do j=njmin_outer_core(1,ispec2D),njmax_outer_core(1,ispec2D)
- iglob=ibool_outer_core(i,j,k,ispec)
+ i=1
+ do k=nkmin_xi_outer_core(1,ispec2D),NGLLZ
+ do j=njmin_outer_core(1,ispec2D),njmax_outer_core(1,ispec2D)
+ iglob=ibool_outer_core(i,j,k,ispec)
- b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_xmin_outer_core(j,k,ispec2D)
+ b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_xmin_outer_core(j,k,ispec2D)
+ enddo
enddo
enddo
- enddo
-
- else
- ! on GPU
- if( nspec2D_xmin_outer_core > 0 ) call compute_stacey_acoustic_backward_cuda(Mesh_pointer, &
- absorb_xmin_outer_core, &
- 4) ! <= xmin
+ else
+ ! on GPU
+ call compute_stacey_acoustic_backward_cuda(Mesh_pointer,absorb_xmin_outer_core,4) ! <= xmin
+ endif
endif
-
endif
! xmax
! if two chunks exclude this face for one of them
- if(NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AB) then
-
+ if( NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AB ) then
if( nspec2D_xmax_outer_core > 0 ) then
+ ! reads in boundary contributions from file
call read_abs(5,absorb_xmax_outer_core,reclen_xmax_outer_core,NSTEP-it+1)
- endif
- if( .NOT. GPU_MODE ) then
- ! on CPU
- do ispec2D=1,nspec2D_xmax_outer_core
+ ! adds boundary contributions
+ if( .NOT. GPU_MODE ) then
+ ! on CPU
+ do ispec2D=1,nspec2D_xmax_outer_core
- ispec=ibelm_xmax_outer_core(ispec2D)
+ ispec=ibelm_xmax_outer_core(ispec2D)
- ! exclude elements that are not on absorbing edges
- if(nkmin_xi_outer_core(2,ispec2D) == 0 .or. njmin_outer_core(2,ispec2D) == 0) cycle
+ ! exclude elements that are not on absorbing edges
+ if(nkmin_xi_outer_core(2,ispec2D) == 0 .or. njmin_outer_core(2,ispec2D) == 0) cycle
- i=NGLLX
- do k=nkmin_xi_outer_core(2,ispec2D),NGLLZ
- do j=njmin_outer_core(2,ispec2D),njmax_outer_core(2,ispec2D)
- iglob=ibool_outer_core(i,j,k,ispec)
+ i=NGLLX
+ do k=nkmin_xi_outer_core(2,ispec2D),NGLLZ
+ do j=njmin_outer_core(2,ispec2D),njmax_outer_core(2,ispec2D)
+ iglob=ibool_outer_core(i,j,k,ispec)
- b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_xmax_outer_core(j,k,ispec2D)
+ b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_xmax_outer_core(j,k,ispec2D)
+ enddo
enddo
enddo
- enddo
-
- else
- ! on GPU
- if( nspec2D_xmax_outer_core > 0 ) call compute_stacey_acoustic_backward_cuda(Mesh_pointer, &
- absorb_xmax_outer_core, &
- 5) ! <= xmax
+ else
+ ! on GPU
+ call compute_stacey_acoustic_backward_cuda(Mesh_pointer,absorb_xmax_outer_core,5) ! <= xmax
+ endif
endif
-
endif
! ymin
if( nspec2D_ymin_outer_core > 0 ) then
+ ! reads in boundary contributions from file
call read_abs(6,absorb_ymin_outer_core,reclen_ymin_outer_core,NSTEP-it+1)
- endif
- if( .NOT. GPU_MODE ) then
- ! on CPU
- do ispec2D=1,nspec2D_ymin_outer_core
+ ! adds boundary contributions
+ if( .NOT. GPU_MODE ) then
+ ! on CPU
+ do ispec2D=1,nspec2D_ymin_outer_core
- ispec=ibelm_ymin_outer_core(ispec2D)
+ ispec=ibelm_ymin_outer_core(ispec2D)
- ! exclude elements that are not on absorbing edges
- if(nkmin_eta_outer_core(1,ispec2D) == 0 .or. nimin_outer_core(1,ispec2D) == 0) cycle
+ ! exclude elements that are not on absorbing edges
+ if(nkmin_eta_outer_core(1,ispec2D) == 0 .or. nimin_outer_core(1,ispec2D) == 0) cycle
- j=1
- do k=nkmin_eta_outer_core(1,ispec2D),NGLLZ
- do i=nimin_outer_core(1,ispec2D),nimax_outer_core(1,ispec2D)
- iglob=ibool_outer_core(i,j,k,ispec)
+ j=1
+ do k=nkmin_eta_outer_core(1,ispec2D),NGLLZ
+ do i=nimin_outer_core(1,ispec2D),nimax_outer_core(1,ispec2D)
+ iglob=ibool_outer_core(i,j,k,ispec)
- b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_ymin_outer_core(i,k,ispec2D)
+ b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_ymin_outer_core(i,k,ispec2D)
+ enddo
enddo
enddo
- enddo
-
- else
- ! on GPU
- if( nspec2D_ymin_outer_core > 0 ) call compute_stacey_acoustic_backward_cuda(Mesh_pointer, &
- absorb_ymin_outer_core, &
- 6) ! <= ymin
+ else
+ ! on GPU
+ call compute_stacey_acoustic_backward_cuda(Mesh_pointer,absorb_ymin_outer_core,6) ! <= ymin
+ endif
endif
! ymax
-
if( nspec2D_ymax_outer_core > 0 ) then
+ ! reads in boundary contributions from file
call read_abs(7,absorb_ymax_outer_core,reclen_ymax_outer_core,NSTEP-it+1)
- endif
- if( .NOT. GPU_MODE ) then
- ! on CPU
- do ispec2D=1,nspec2D_ymax_outer_core
+ ! adds boundary contributions
+ if( .NOT. GPU_MODE ) then
+ ! on CPU
+ do ispec2D=1,nspec2D_ymax_outer_core
- ispec=ibelm_ymax_outer_core(ispec2D)
+ ispec=ibelm_ymax_outer_core(ispec2D)
- ! exclude elements that are not on absorbing edges
- if(nkmin_eta_outer_core(2,ispec2D) == 0 .or. nimin_outer_core(2,ispec2D) == 0) cycle
+ ! exclude elements that are not on absorbing edges
+ if(nkmin_eta_outer_core(2,ispec2D) == 0 .or. nimin_outer_core(2,ispec2D) == 0) cycle
- j=NGLLY
- do k=nkmin_eta_outer_core(2,ispec2D),NGLLZ
- do i=nimin_outer_core(2,ispec2D),nimax_outer_core(2,ispec2D)
- iglob=ibool_outer_core(i,j,k,ispec)
+ j=NGLLY
+ do k=nkmin_eta_outer_core(2,ispec2D),NGLLZ
+ do i=nimin_outer_core(2,ispec2D),nimax_outer_core(2,ispec2D)
+ iglob=ibool_outer_core(i,j,k,ispec)
- b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_ymax_outer_core(i,k,ispec2D)
+ b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_ymax_outer_core(i,k,ispec2D)
+ enddo
enddo
enddo
- enddo
-
- else
- ! on GPU
- if( nspec2D_ymax_outer_core > 0 ) call compute_stacey_acoustic_backward_cuda(Mesh_pointer, &
- absorb_ymax_outer_core, &
- 7) ! <= ymax
+ else
+ ! on GPU
+ call compute_stacey_acoustic_backward_cuda(Mesh_pointer,absorb_ymax_outer_core,7) ! <= ymax
+ endif
endif
! zmin
! for surface elements exactly on the ICB
if( nspec2D_zmin_outer_core > 0 ) then
+ ! reads in boundary contributions from file
call read_abs(8,absorb_zmin_outer_core,reclen_zmin,NSTEP-it+1)
- endif
- if( .NOT. GPU_MODE ) then
- ! on CPU
- do ispec2D = 1,nspec2D_zmin_outer_core
+ ! adds boundary contributions
+ if( .NOT. GPU_MODE ) then
+ ! on CPU
+ do ispec2D = 1,nspec2D_zmin_outer_core
- ispec = ibelm_bottom_outer_core(ispec2D)
+ ispec = ibelm_bottom_outer_core(ispec2D)
- k = 1
- do j = 1,NGLLY
- do i = 1,NGLLX
- iglob = ibool_outer_core(i,j,k,ispec)
+ k = 1
+ do j = 1,NGLLY
+ do i = 1,NGLLX
+ iglob = ibool_outer_core(i,j,k,ispec)
- b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_zmin_outer_core(i,j,ispec2D)
+ b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - absorb_zmin_outer_core(i,j,ispec2D)
+ enddo
enddo
enddo
- enddo
-
- else
- ! on GPU
- if( nspec2D_zmin_outer_core > 0 ) call compute_stacey_acoustic_backward_cuda(Mesh_pointer, &
- absorb_zmin_outer_core, &
- 8) ! <= zmin
+ else
+ ! on GPU
+ call compute_stacey_acoustic_backward_cuda(Mesh_pointer,absorb_zmin_outer_core,8) ! <= zmin
+ endif
endif
end subroutine compute_stacey_outer_core_backward
@@ -549,164 +534,167 @@
! xmin
! if two chunks exclude this face for one of them
- if(NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC) then
+ if( NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC ) then
+ if( nspec2D_xmin_outer_core > 0 ) then
+ ! adds boundary contribution
+ if( .NOT. GPU_MODE) then
+ ! on CPU
+ do ispec2D=1,nspec2D_xmin_outer_core
- if( .NOT. GPU_MODE) then
- ! on CPU
- do ispec2D=1,nspec2D_xmin_outer_core
+ ispec=ibelm_xmin_outer_core(ispec2D)
- ispec=ibelm_xmin_outer_core(ispec2D)
+ ! exclude elements that are not on absorbing edges
+ if(nkmin_xi_outer_core(1,ispec2D) == 0 .or. njmin_outer_core(1,ispec2D) == 0) cycle
- ! exclude elements that are not on absorbing edges
- if(nkmin_xi_outer_core(1,ispec2D) == 0 .or. njmin_outer_core(1,ispec2D) == 0) cycle
+ i=1
+ do k=nkmin_xi_outer_core(1,ispec2D),NGLLZ
+ do j=njmin_outer_core(1,ispec2D),njmax_outer_core(1,ispec2D)
+ iglob=ibool_outer_core(i,j,k,ispec)
- i=1
- do k=nkmin_xi_outer_core(1,ispec2D),NGLLZ
- do j=njmin_outer_core(1,ispec2D),njmax_outer_core(1,ispec2D)
- iglob=ibool_outer_core(i,j,k,ispec)
+ sn = b_veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
- sn = b_veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
+ weight = jacobian2D_xmin_outer_core(j,k,ispec2D)*wgllwgll_yz(j,k)
- weight = jacobian2D_xmin_outer_core(j,k,ispec2D)*wgllwgll_yz(j,k)
+ b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - weight*sn
- b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - weight*sn
-
+ enddo
enddo
enddo
- enddo
-
- else
- ! on GPU
- if( nspec2D_xmin_outer_core > 0 ) call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,4) ! <= xmin
+ else
+ ! on GPU
+ call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,4) ! <= xmin
+ endif
endif
-
endif
! xmax
! if two chunks exclude this face for one of them
- if(NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AB) then
+ if( NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AB ) then
+ if( nspec2D_xmax_outer_core > 0 ) then
+ ! adds boundary contribution
+ if( .NOT. GPU_MODE ) then
+ ! on CPU
+ do ispec2D=1,nspec2D_xmax_outer_core
- if( .NOT. GPU_MODE ) then
- ! on CPU
- do ispec2D=1,nspec2D_xmax_outer_core
+ ispec=ibelm_xmax_outer_core(ispec2D)
- ispec=ibelm_xmax_outer_core(ispec2D)
+ ! exclude elements that are not on absorbing edges
+ if(nkmin_xi_outer_core(2,ispec2D) == 0 .or. njmin_outer_core(2,ispec2D) == 0) cycle
- ! exclude elements that are not on absorbing edges
- if(nkmin_xi_outer_core(2,ispec2D) == 0 .or. njmin_outer_core(2,ispec2D) == 0) cycle
+ i=NGLLX
+ do k=nkmin_xi_outer_core(2,ispec2D),NGLLZ
+ do j=njmin_outer_core(2,ispec2D),njmax_outer_core(2,ispec2D)
+ iglob=ibool_outer_core(i,j,k,ispec)
- i=NGLLX
- do k=nkmin_xi_outer_core(2,ispec2D),NGLLZ
- do j=njmin_outer_core(2,ispec2D),njmax_outer_core(2,ispec2D)
- iglob=ibool_outer_core(i,j,k,ispec)
+ sn = b_veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
- sn = b_veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
+ weight = jacobian2D_xmax_outer_core(j,k,ispec2D)*wgllwgll_yz(j,k)
- weight = jacobian2D_xmax_outer_core(j,k,ispec2D)*wgllwgll_yz(j,k)
+ b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - weight*sn
- b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - weight*sn
-
+ enddo
enddo
enddo
- enddo
-
- else
- ! on GPU
- if( nspec2D_xmax_outer_core > 0 ) call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,5) ! <= xmax
+ else
+ ! on GPU
+ call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,5) ! <= xmax
+ endif
endif
-
endif
! ymin
+ if( nspec2D_ymin_outer_core > 0 ) then
+ ! adds boundary contribution
+ if( .NOT. GPU_MODE ) then
+ ! on CPU
+ do ispec2D=1,nspec2D_ymin_outer_core
- if( .NOT. GPU_MODE ) then
- ! on CPU
- do ispec2D=1,nspec2D_ymin_outer_core
+ ispec=ibelm_ymin_outer_core(ispec2D)
- ispec=ibelm_ymin_outer_core(ispec2D)
+ ! exclude elements that are not on absorbing edges
+ if(nkmin_eta_outer_core(1,ispec2D) == 0 .or. nimin_outer_core(1,ispec2D) == 0) cycle
- ! exclude elements that are not on absorbing edges
- if(nkmin_eta_outer_core(1,ispec2D) == 0 .or. nimin_outer_core(1,ispec2D) == 0) cycle
+ j=1
+ do k=nkmin_eta_outer_core(1,ispec2D),NGLLZ
+ do i=nimin_outer_core(1,ispec2D),nimax_outer_core(1,ispec2D)
+ iglob=ibool_outer_core(i,j,k,ispec)
- j=1
- do k=nkmin_eta_outer_core(1,ispec2D),NGLLZ
- do i=nimin_outer_core(1,ispec2D),nimax_outer_core(1,ispec2D)
- iglob=ibool_outer_core(i,j,k,ispec)
+ sn = b_veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
- sn = b_veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
+ weight=jacobian2D_ymin_outer_core(i,k,ispec2D)*wgllwgll_xz(i,k)
- weight=jacobian2D_ymin_outer_core(i,k,ispec2D)*wgllwgll_xz(i,k)
+ b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - weight*sn
- b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - weight*sn
-
+ enddo
enddo
enddo
- enddo
-
- else
- ! on GPU
- if( nspec2D_ymin_outer_core > 0 ) call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,6) ! <= ymin
+ else
+ ! on GPU
+ call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,6) ! <= ymin
+ endif
endif
! ymax
+ if( nspec2D_ymax_outer_core > 0 ) then
+ ! adds boundary contribution
+ if( .NOT. GPU_MODE ) then
+ ! on CPU
+ do ispec2D=1,nspec2D_ymax_outer_core
- if( .NOT. GPU_MODE ) then
- ! on CPU
- do ispec2D=1,nspec2D_ymax_outer_core
+ ispec=ibelm_ymax_outer_core(ispec2D)
- ispec=ibelm_ymax_outer_core(ispec2D)
+ ! exclude elements that are not on absorbing edges
+ if(nkmin_eta_outer_core(2,ispec2D) == 0 .or. nimin_outer_core(2,ispec2D) == 0) cycle
- ! exclude elements that are not on absorbing edges
- if(nkmin_eta_outer_core(2,ispec2D) == 0 .or. nimin_outer_core(2,ispec2D) == 0) cycle
+ j=NGLLY
+ do k=nkmin_eta_outer_core(2,ispec2D),NGLLZ
+ do i=nimin_outer_core(2,ispec2D),nimax_outer_core(2,ispec2D)
+ iglob=ibool_outer_core(i,j,k,ispec)
- j=NGLLY
- do k=nkmin_eta_outer_core(2,ispec2D),NGLLZ
- do i=nimin_outer_core(2,ispec2D),nimax_outer_core(2,ispec2D)
- iglob=ibool_outer_core(i,j,k,ispec)
+ sn = b_veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
- sn = b_veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
+ weight=jacobian2D_ymax_outer_core(i,k,ispec2D)*wgllwgll_xz(i,k)
- weight=jacobian2D_ymax_outer_core(i,k,ispec2D)*wgllwgll_xz(i,k)
+ b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - weight*sn
- b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - weight*sn
-
+ enddo
enddo
enddo
- enddo
-
- else
- ! on GPU
- if( nspec2D_ymax_outer_core > 0 ) call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,7) ! <= ymax
+ else
+ ! on GPU
+ call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,7) ! <= ymax
+ endif
endif
! zmin
! for surface elements exactly on the ICB
+ if( nspec2D_zmin_outer_core > 0 ) then
+ ! adds boundary contribution
+ if( .NOT. GPU_MODE ) then
+ ! on CPU
+ do ispec2D = 1,nspec2D_zmin_outer_core
- if( .NOT. GPU_MODE ) then
- ! on CPU
- do ispec2D = 1,nspec2D_zmin_outer_core
+ ispec = ibelm_bottom_outer_core(ispec2D)
- ispec = ibelm_bottom_outer_core(ispec2D)
+ k = 1
+ do j = 1,NGLLY
+ do i = 1,NGLLX
+ iglob = ibool_outer_core(i,j,k,ispec)
- k = 1
- do j = 1,NGLLY
- do i = 1,NGLLX
- iglob = ibool_outer_core(i,j,k,ispec)
+ sn = b_veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
- sn = b_veloc_outer_core(iglob)/vp_outer_core(i,j,k,ispec)
+ weight = jacobian2D_bottom_outer_core(i,j,ispec2D)*wgllwgll_xy(i,j)
- weight = jacobian2D_bottom_outer_core(i,j,ispec2D)*wgllwgll_xy(i,j)
+ b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - weight*sn
- b_accel_outer_core(iglob) = b_accel_outer_core(iglob) - weight*sn
-
+ enddo
enddo
enddo
- enddo
-
- else
- ! on GPU
- if( nspec2D_zmin_outer_core > 0 ) call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,8) ! <= zmin
+ else
+ ! on GPU
+ call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,8) ! <= zmin
+ endif
endif
end subroutine compute_stacey_outer_core_backward_undoatt
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/get_attenuation.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/get_attenuation.f90 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/get_attenuation.f90 2014-01-13 16:06:21 UTC (rev 22991)
@@ -171,7 +171,7 @@
subroutine get_attenuation_scale_factor(myrank, T_c_source, tau_mu, tau_sigma, Q_mu, scale_factor)
- use constants
+ use constants,only: ZERO,ONE,TWO,PI,GRAV,RHOAV,TWO_PI,N_SLS
implicit none
@@ -227,8 +227,8 @@
scale_factor = factor_scale_mu * factor_scale_mu0
!--- check that the correction factor is close to one
- if(scale_factor < 0.8 .or. scale_factor > 1.2) then
- write(*,*)'scale factor: ', scale_factor
+ if(scale_factor < 0.8d0 .or. scale_factor > 1.2d0) then
+ print*,'error: incorrect scale factor: ', scale_factor
call exit_MPI(myrank,'incorrect correction factor in attenuation model')
endif
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/locate_receivers.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/locate_receivers.f90 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/locate_receivers.f90 2014-01-13 16:06:21 UTC (rev 22991)
@@ -762,7 +762,7 @@
epidist_found(nrec_found) = epidist(irec)
! writes out actual receiver location to vtk file
- write(IOVTK,*) sngl(x_found(irec)), sngl(y_found(irec)), sngl(z_found(irec))
+ write(IOVTK,'(3e18.6)') sngl(x_found(irec)), sngl(y_found(irec)), sngl(z_found(irec))
endif
enddo
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/locate_sources.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/locate_sources.f90 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/locate_sources.f90 2014-01-13 16:06:21 UTC (rev 22991)
@@ -669,9 +669,9 @@
write(IMAIN,*) ' time shift: ',tshift_cmt(isource),' seconds'
! writes out actual source position to vtk file
- write(IOVTK,*) sngl(x_found_source(isource_in_this_subset)), &
- sngl(y_found_source(isource_in_this_subset)), &
- sngl(z_found_source(isource_in_this_subset))
+ write(IOVTK,'(3e18.6)') sngl(x_found_source(isource_in_this_subset)), &
+ sngl(y_found_source(isource_in_this_subset)), &
+ sngl(z_found_source(isource_in_this_subset))
! get latitude, longitude and depth of the source that will be used
call xyz_2_rthetaphi_dble(x_found_source(isource_in_this_subset), &
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/setup_sources_receivers.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/setup_sources_receivers.f90 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/setup_sources_receivers.f90 2014-01-13 16:06:21 UTC (rev 22991)
@@ -311,6 +311,12 @@
!if (OUTPUT_SEISMOS_SAC_ALPHANUM .and. (mod(NTSTEP_BETWEEN_OUTPUT_SEISMOS,5)/=0)) &
! stop 'if OUTPUT_SEISMOS_SAC_ALPHANUM = .true. then NTSTEP_BETWEEN_OUTPUT_SEISMOS must be a multiple of 5, check the Par_file'
+ ! subsets used to save adjoint sources must not be larger than the whole time series,
+ ! otherwise we waste memory
+ if( SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3 ) then
+ if(NTSTEP_BETWEEN_READ_ADJSRC > NSTEP) NTSTEP_BETWEEN_READ_ADJSRC = NSTEP
+ endif
+
end subroutine setup_timesteps
!
@@ -332,6 +338,9 @@
character(len=256) :: filename,adj_source_file
character(len=2) :: bic
integer :: ier
+ integer,dimension(:),allocatable :: tmp_rec_local_all
+ integer :: maxrec,maxproc(1)
+ double precision :: size
! user output
if( myrank == 0 ) then
@@ -430,9 +439,12 @@
read(IIN,*,iostat=ier) junk,junk
if( ier == 0 ) itime = itime + 1
enddo
- if( itime /= NSTEP) &
+ ! checks length
+ if( itime /= NSTEP) then
+ print*,'adjoint source error: ',trim(filename),' has length',itime,' but should be',NSTEP
call exit_MPI(myrank,&
'file '//trim(filename)//' has wrong length, please check with your simulation duration')
+ endif
! updates counter for found files
nadj_files_found = nadj_files_found + 1
@@ -478,6 +490,86 @@
call exit_MPI(myrank,'total number of receivers is incorrect')
endif
+ ! statistics about allocation memory for seismograms & adj_sourcearrays
+ ! gathers info about receivers on master
+ if( myrank == 0 ) then
+ ! only master process needs full arrays allocated
+ allocate(tmp_rec_local_all(NPROCTOT_VAL),stat=ier)
+ if( ier /= 0 ) call exit_MPI(myrank,'error allocating temporary array tmp_rec_local_all')
+ else
+ ! dummy arrays
+ allocate(tmp_rec_local_all(1),stat=ier)
+ if( ier /= 0 ) call exit_MPI(myrank,'error allocating temporary array tmp_rec_local_all')
+ endif
+
+ ! seismograms
+ ! gather from slaves on master
+ tmp_rec_local_all(:) = 0
+ tmp_rec_local_all(1) = nrec_local
+ if( NPROCTOT_VAL > 1 ) then
+ call gather_all_singlei(nrec_local,tmp_rec_local_all,NPROCTOT_VAL)
+ endif
+ ! user output
+ if( myrank == 0 ) then
+ ! determines maximum number of local receivers and corresponding rank
+ maxrec = maxval(tmp_rec_local_all(:))
+ maxproc = maxloc(tmp_rec_local_all(:))
+ ! seismograms array size in MB
+ if( SIMULATION_TYPE == 1 .or. SIMULATION_TYPE == 3 ) then
+ ! seismograms need seismograms(NDIM,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS)
+ size = dble(maxrec) * dble(NDIM * NTSTEP_BETWEEN_OUTPUT_SEISMOS * CUSTOM_REAL / 1024. / 1024. )
+ else
+ ! adjoint seismograms need seismograms(NDIM*NDIM,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS)
+ size = dble(maxrec) * dble(NDIM * NDIM * NTSTEP_BETWEEN_OUTPUT_SEISMOS * CUSTOM_REAL / 1024. / 1024. )
+ endif
+ ! outputs info
+ write(IMAIN,*)
+ write(IMAIN,*) 'seismograms:'
+ write(IMAIN,*) ' writing out seismograms at every NTSTEP_BETWEEN_OUTPUT_SEISMOS = ',NTSTEP_BETWEEN_OUTPUT_SEISMOS
+ write(IMAIN,*) ' maximum number of local receivers is ',maxrec,' in slice ',maxproc(1)
+ write(IMAIN,*) ' size of maximum seismogram array = ', sngl(size),'MB'
+ write(IMAIN,*) ' = ', sngl(size/1024.d0),'GB'
+ write(IMAIN,*)
+ call flush_IMAIN()
+ endif
+
+ ! adjoint sources
+ if( SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3 ) then
+ ! gather from slaves on master
+ tmp_rec_local_all(:) = 0
+ tmp_rec_local_all(1) = nadj_rec_local
+ if( NPROCTOT_VAL > 1 ) then
+ call gather_all_singlei(nadj_rec_local,tmp_rec_local_all,NPROCTOT_VAL)
+ endif
+ ! user output
+ if( myrank == 0 ) then
+ ! determines maximum number of local receivers and corresponding rank
+ maxrec = maxval(tmp_rec_local_all(:))
+ maxproc = maxloc(tmp_rec_local_all(:))
+ !do i = 1, NPROCTOT_VAL
+ ! if( tmp_rec_local_all(i) > maxrec ) then
+ ! maxrec = tmp_rec_local_all(i)
+ ! maxproc = i-1
+ ! endif
+ !enddo
+ ! adj_sourcearrays size in MB
+ ! adj_sourcearrays(NDIM,NGLLX,NGLLY,NGLLZ,nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC)
+ size = dble(maxrec) * dble(NDIM * NGLLX * NGLLY * NGLLZ * NTSTEP_BETWEEN_READ_ADJSRC * CUSTOM_REAL / 1024. / 1024. )
+
+ ! outputs info
+ write(IMAIN,*) 'adjoint source arrays:'
+ write(IMAIN,*) ' reading adjoint sources at every NTSTEP_BETWEEN_READ_ADJSRC = ',NTSTEP_BETWEEN_READ_ADJSRC
+ write(IMAIN,*) ' maximum number of local adjoint sources is ',maxrec,' in slice ',maxproc(1)
+ write(IMAIN,*) ' size of maximum adjoint source array = ', sngl(size),'MB'
+ write(IMAIN,*) ' = ', sngl(size/1024.d0),'GB'
+ write(IMAIN,*)
+ call flush_IMAIN()
+ endif
+ endif
+
+ deallocate(tmp_rec_local_all)
+
+
end subroutine setup_receivers
!
@@ -545,36 +637,38 @@
! stores source arrays
call setup_sources_receivers_srcarr()
-
endif
! adjoint source arrays
if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then
- ! adjoint source buffer length
- NSTEP_SUB_ADJ = ceiling( dble(NSTEP)/dble(NTSTEP_BETWEEN_READ_ADJSRC) )
+ ! initializes adjoint source buffer
+ ! reverse indexing
allocate(iadj_vec(NSTEP),stat=ier)
if( ier /= 0 ) call exit_MPI(myrank,'error allocating iadj_vec')
-
! initializes iadj_vec
do it=1,NSTEP
- iadj_vec(it) = NSTEP-it+1 ! default is for reversing entire record
+ iadj_vec(it) = NSTEP-it+1 ! default is for reversing entire record, e.g. 3000,2999,..,1
enddo
+ ! number of adjoint source blocks to read in
+ NSTEP_SUB_ADJ = ceiling( dble(NSTEP)/dble(NTSTEP_BETWEEN_READ_ADJSRC) )
+
if(nadj_rec_local > 0) then
! allocate adjoint source arrays
allocate(adj_sourcearrays(NDIM,NGLLX,NGLLY,NGLLZ,nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC), &
- stat=ier)
+ stat=ier)
if( ier /= 0 ) call exit_MPI(myrank,'error allocating adjoint sourcearrays')
adj_sourcearrays(:,:,:,:,:,:) = 0._CUSTOM_REAL
! allocate indexing arrays
allocate(iadjsrc(NSTEP_SUB_ADJ,2), &
- iadjsrc_len(NSTEP_SUB_ADJ),stat=ier)
+ iadjsrc_len(NSTEP_SUB_ADJ),stat=ier)
if( ier /= 0 ) call exit_MPI(myrank,'error allocating adjoint indexing arrays')
+
! initializes iadjsrc, iadjsrc_len and iadj_vec
call setup_sources_receivers_adjindx(NSTEP,NSTEP_SUB_ADJ, &
- NTSTEP_BETWEEN_READ_ADJSRC, &
- iadjsrc,iadjsrc_len,iadj_vec)
+ NTSTEP_BETWEEN_READ_ADJSRC, &
+ iadjsrc,iadjsrc_len,iadj_vec)
endif
endif
@@ -659,8 +753,8 @@
!
subroutine setup_sources_receivers_adjindx(NSTEP,NSTEP_SUB_ADJ, &
- NTSTEP_BETWEEN_READ_ADJSRC, &
- iadjsrc,iadjsrc_len,iadj_vec)
+ NTSTEP_BETWEEN_READ_ADJSRC, &
+ iadjsrc,iadjsrc_len,iadj_vec)
use constants
@@ -674,9 +768,9 @@
! local parameters
integer :: iadj_block,it,it_sub_adj
+ integer :: istart,iend
- iadj_block = 1 !counts blocks
-
+ ! initializes
iadjsrc(:,:) = 0
iadjsrc_len(:) = 0
@@ -690,31 +784,51 @@
!
! see routine: compute_add_sources_adjoint()
! how the adjoint source is added to the (adjoint) acceleration field
+ !counts blocks
+ ! block number
+ ! e.g. increases from 1 (case it=1-1000), 2 (case it=1001-2000) to 3 (case it=2001-3000)
+ it_sub_adj = 0
+ iadj_block = 1
do it=1,NSTEP
! block number
! e.g. increases from 1 (case it=1-1000), 2 (case it=1001-2000) to 3 (case it=2001-3000)
- it_sub_adj = ceiling( dble(it)/dble(NTSTEP_BETWEEN_READ_ADJSRC) )
+ ! beware: the call below might return a wrong integer number due to machine precision, i.e. 1000./1000. -> 2
+ !it_sub_adj = ceiling( dble(it)/dble(NTSTEP_BETWEEN_READ_ADJSRC) )
! we are at the edge of a block
if(mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC) == 0) then
- ! block start time ( e.g. 2001)
- iadjsrc(iadj_block,1) = NSTEP-it_sub_adj*NTSTEP_BETWEEN_READ_ADJSRC+1
- ! block end time (e.g. 3000)
- iadjsrc(iadj_block,2) = NSTEP-(it_sub_adj-1)*NTSTEP_BETWEEN_READ_ADJSRC
+ ! sets it_sub_adj subset number
+ it_sub_adj = iadj_block
- ! final adj src array
- ! e.g. will be from 1000 to 1, but doesn't go below 1 in cases where NSTEP isn't
- ! a multiple of NTSTEP_BETWEEN_READ_ADJSRC
- if(iadjsrc(iadj_block,1) < 0) iadjsrc(iadj_block,1) = 1
+ ! block start time ( e.g. 2001)
+ istart = NSTEP-it_sub_adj*NTSTEP_BETWEEN_READ_ADJSRC+1
+ ! final adj src array
+ ! e.g. will be from 1000 to 1, but doesn't go below 1 in cases where NSTEP isn't
+ ! a multiple of NTSTEP_BETWEEN_READ_ADJSRC
+ if( istart < 1 ) istart = 1
- ! actual block length
- iadjsrc_len(iadj_block) = iadjsrc(iadj_block,2)-iadjsrc(iadj_block,1)+1
+ ! block end time (e.g. 3000)
+ iend = NSTEP-(it_sub_adj-1)*NTSTEP_BETWEEN_READ_ADJSRC
- ! increases block number
- iadj_block = iadj_block+1
+ iadjsrc(iadj_block,1) = istart
+ iadjsrc(iadj_block,2) = iend
+
+ ! actual block length
+ iadjsrc_len(iadj_block) = iend - istart + 1
+
+ ! increases block number
+ iadj_block = iadj_block + 1
endif
+ ! checks that ceiling function above returns correct integer values
+ if( it_sub_adj /= iadj_block-1 ) then
+ print*,'error: confusing block number for reverse adjoint source indexing'
+ print*,' it_sub_adj = ',it_sub_adj,'should be equal to ',iadj_block-1
+ print*,' it = ',it,' istart/iend = ',istart,iend,' NTSTEP_BETWEEN_READ_ADJSRC = ',NTSTEP_BETWEEN_READ_ADJSRC
+ stop 'error reverse adjoint source indexing'
+ endif
+
! time stepping for adjoint sources:
! adjoint time step that corresponds to time step in simulation (it).
! note, that adjoint source has to be time-reversed with respect to the forward wavefield
@@ -724,6 +838,9 @@
! so iadj_vec(1001) = 1000 - 0, iadj_vec(1002) = 1000 - 1, .. and so on again down to 1
! then block 3 and your guess is right now... iadj_vec(2001) to iadj_vec(3000) is 1000 down to 1. :)
iadj_vec(it) = iadjsrc_len(it_sub_adj) - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC)
+
+ ! checks that index is non-negative
+ if( iadj_vec(it) < 1 ) iadj_vec(it) = 1
enddo
end subroutine setup_sources_receivers_adjindx
@@ -800,7 +917,7 @@
else
! allocates dummy array since we need it to pass as argument e.g. in write_seismograms() routine
! note: nrec_local is zero, fortran 90/95 should allow zero-sized array allocation...
- allocate(seismograms(NDIM,nrec_local,NTSTEP_BETWEEN_OUTPUT_SEISMOS),stat=ier)
+ allocate(seismograms(NDIM,0,NTSTEP_BETWEEN_OUTPUT_SEISMOS),stat=ier)
if( ier /= 0) stop 'error while allocating zero seismograms'
endif
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/write_seismograms.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/write_seismograms.f90 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/src/specfem3D/write_seismograms.f90 2014-01-13 16:06:21 UTC (rev 22991)
@@ -40,6 +40,7 @@
! gets resulting array values onto CPU
if( GPU_MODE ) then
+ ! gets field values from GPU
! this transfers fields only in elements with stations for efficiency
call write_seismograms_transfer_cuda(Mesh_pointer, &
displ_crust_mantle,b_displ_crust_mantle, &
@@ -49,6 +50,15 @@
number_receiver_global, &
ispec_selected_rec,ispec_selected_source, &
ibool_crust_mantle)
+
+ ! synchronizes field values from GPU
+ if( GPU_ASYNC_COPY) then
+ call transfer_seismo_from_device_async(Mesh_pointer, &
+ displ_crust_mantle,b_displ_crust_mantle, &
+ number_receiver_global,ispec_selected_rec,ispec_selected_source, &
+ ibool_crust_mantle)
+ endif
+
endif
! computes traces at interpolated receiver locations
Modified: seismo/3D/SPECFEM3D_GLOBE/trunk/utils/Visualization/VTK_ParaView/create_slice_VTK.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/trunk/utils/Visualization/VTK_ParaView/create_slice_VTK.f90 2014-01-06 21:22:48 UTC (rev 22990)
+++ seismo/3D/SPECFEM3D_GLOBE/trunk/utils/Visualization/VTK_ParaView/create_slice_VTK.f90 2014-01-13 16:06:21 UTC (rev 22991)
@@ -313,7 +313,7 @@
! type: hexahedrons
write(IOVTK,'(a,i12)') "CELL_TYPES ",nspec
- write(IOVTK,*) (12,ispec=1,nspec)
+ write(IOVTK,'(6i12)') (12,ispec=1,nspec)
write(IOVTK,*) ""
! writes out gll-data (velocity) for each element point
More information about the CIG-COMMITS
mailing list