[cig-commits] r22782 - in seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src: cuda specfem3D
danielpeter at geodynamics.org
Fri Sep 13 04:23:46 PDT 2013
Author: danielpeter
Date: 2013-09-13 04:23:45 -0700 (Fri, 13 Sep 2013)
New Revision: 22782
Modified:
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_scalar_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_vector_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/update_displacement_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_scalar.f90
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_add_sources.f90
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_coupling.f90
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_acoustic_calling_routine.F90
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_crust_mantle.f90
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_forward_arrays.f90
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.F90
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/update_displacement_Newmark.f90
Log:
updates cuda routines for forward&backward wavefields
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_scalar_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_scalar_cuda.cu 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_scalar_cuda.cu 2013-09-13 11:23:45 UTC (rev 22782)
@@ -48,11 +48,16 @@
int* d_ibool_interfaces) {
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+ int iglob,iloc;
for( int iinterface=0; iinterface < num_interfaces; iinterface++) {
if(id<d_nibool_interfaces[iinterface]) {
- d_send_potential_dot_dot_buffer[(id + max_nibool_interfaces*iinterface)] =
- d_potential_dot_dot_acoustic[(d_ibool_interfaces[id+max_nibool_interfaces*iinterface]-1)];
+
+ iloc = id + max_nibool_interfaces*iinterface;
+ iglob = d_ibool_interfaces[iloc] - 1;
+
+ // fills buffer
+ d_send_potential_dot_dot_buffer[iloc] = d_potential_dot_dot_acoustic[iglob];
}
}
@@ -77,13 +82,10 @@
int blocksize = BLOCKSIZE_TRANSFER;
int size_padded = ((int)ceil(((double)(mp->max_nibool_interfaces_oc))/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
@@ -94,20 +96,29 @@
mp->max_nibool_interfaces_oc,
mp->d_nibool_interfaces_outer_core,
mp->d_ibool_interfaces_outer_core);
+
+ print_CUDA_error_if_any(cudaMemcpy(send_potential_dot_dot_buffer,mp->d_send_accel_buffer_outer_core,
+ (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw),
+ cudaMemcpyDeviceToHost),98000);
+
}
else if(*FORWARD_OR_ADJOINT == 3) {
+ // debug
+ DEBUG_EMPTY_BACKWARD();
+
prepare_boundary_potential_on_device<<<grid,threads>>>(mp->d_b_accel_outer_core,
- mp->d_send_accel_buffer_outer_core,
+ mp->d_b_send_accel_buffer_outer_core,
mp->num_interfaces_outer_core,
mp->max_nibool_interfaces_oc,
mp->d_nibool_interfaces_outer_core,
mp->d_ibool_interfaces_outer_core);
+
+ print_CUDA_error_if_any(cudaMemcpy(send_potential_dot_dot_buffer,mp->d_b_send_accel_buffer_outer_core,
+ (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw),
+ cudaMemcpyDeviceToHost),98001);
+
}
- print_CUDA_error_if_any(cudaMemcpy(send_potential_dot_dot_buffer,mp->d_send_accel_buffer_outer_core,
- (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw),
- cudaMemcpyDeviceToHost),98000);
-
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("prepare_boundary_kernel");
#endif
@@ -125,11 +136,17 @@
int* d_ibool_interfaces) {
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+ int iglob,iloc;
for( int iinterface=0; iinterface < num_interfaces; iinterface++) {
if(id<d_nibool_interfaces[iinterface]) {
- atomicAdd(&d_potential_dot_dot_acoustic[(d_ibool_interfaces[id+max_nibool_interfaces*iinterface]-1)],
- d_send_potential_dot_dot_buffer[(id + max_nibool_interfaces*iinterface)]);
+
+ iloc = id + max_nibool_interfaces*iinterface;
+
+ iglob = d_ibool_interfaces[iloc]-1;
+
+ // assembles values
+ atomicAdd(&d_potential_dot_dot_acoustic[iglob],d_send_potential_dot_dot_buffer[iloc]);
}
}
}
@@ -151,25 +168,22 @@
// checks if anything to do
if( mp->num_interfaces_outer_core == 0 ) return;
- // copies scalar buffer onto GPU
- cudaMemcpy(mp->d_send_accel_buffer_outer_core, buffer_recv_scalar,
- (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw),
- cudaMemcpyHostToDevice);
-
// assembles on GPU
int blocksize = BLOCKSIZE_TRANSFER;
int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_oc)/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
if(*FORWARD_OR_ADJOINT == 1) {
+ // copies scalar buffer onto GPU
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_send_accel_buffer_outer_core, buffer_recv_scalar,
+ (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw),
+ cudaMemcpyHostToDevice),99000);
+
//assemble forward field
assemble_boundary_potential_on_device<<<grid,threads>>>(mp->d_accel_outer_core,
mp->d_send_accel_buffer_outer_core,
@@ -179,9 +193,17 @@
mp->d_ibool_interfaces_outer_core);
}
else if(*FORWARD_OR_ADJOINT == 3) {
+ // debug
+ DEBUG_EMPTY_BACKWARD();
+
+ // copies scalar buffer onto GPU
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_accel_buffer_outer_core, buffer_recv_scalar,
+ (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw),
+ cudaMemcpyHostToDevice),99001);
+
//assemble reconstructed/backward field
assemble_boundary_potential_on_device<<<grid,threads>>>(mp->d_b_accel_outer_core,
- mp->d_send_accel_buffer_outer_core,
+ mp->d_b_send_accel_buffer_outer_core,
mp->num_interfaces_outer_core,
mp->max_nibool_interfaces_oc,
mp->d_nibool_interfaces_outer_core,
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_vector_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_vector_cuda.cu 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/assemble_MPI_vector_cuda.cu 2013-09-13 11:23:45 UTC (rev 22782)
@@ -47,15 +47,18 @@
int* d_ibool_interfaces) {
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
- int iglob;
+ int iglob,iloc;
for( int iinterface=0; iinterface < num_interfaces; iinterface++) {
if(id<d_nibool_interfaces[iinterface]) {
- iglob = d_ibool_interfaces[id+max_nibool_interfaces*iinterface]-1;
+
+ iloc = id + max_nibool_interfaces*iinterface;
+ iglob = d_ibool_interfaces[iloc]-1;
+
// fills buffer
- d_send_accel_buffer[3*(id + max_nibool_interfaces*iinterface)] = d_accel[3*iglob];
- d_send_accel_buffer[3*(id + max_nibool_interfaces*iinterface)+1] = d_accel[3*iglob + 1];
- d_send_accel_buffer[3*(id + max_nibool_interfaces*iinterface)+2] = d_accel[3*iglob + 2];
+ d_send_accel_buffer[3*iloc] = d_accel[3*iglob];
+ d_send_accel_buffer[3*iloc + 1] = d_accel[3*iglob + 1];
+ d_send_accel_buffer[3*iloc + 2] = d_accel[3*iglob + 2];
}
}
@@ -68,12 +71,15 @@
extern "C"
void FC_FUNC_(transfer_boun_accel_from_device,
TRANSFER_BOUN_ACCEL_FROM_DEVICE)(long* Mesh_pointer_f,
- realw* send_accel_buffer,
- int* IREGION,
- int* FORWARD_OR_ADJOINT){
+ realw* send_accel_buffer,
+ int* IREGION,
+ int* FORWARD_OR_ADJOINT){
TRACE("transfer_boun_accel_from_device");
- int blocksize,size_padded,num_blocks_x,num_blocks_y;
+ int blocksize,size_padded;
+ int num_blocks_x,num_blocks_y;
+ dim3 grid,threads;
+
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
// crust/mantle region
@@ -82,15 +88,12 @@
blocksize = BLOCKSIZE_TRANSFER;
size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_cm)/((double)blocksize)))*blocksize;
- num_blocks_x = size_padded/blocksize;
- num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(blocksize,1,1);
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
+ grid = dim3(num_blocks_x,num_blocks_y);
+ threads = dim3(blocksize,1,1);
+
if(*FORWARD_OR_ADJOINT == 1) {
prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel_crust_mantle,
mp->d_send_accel_buffer_crust_mantle,
@@ -98,20 +101,30 @@
mp->max_nibool_interfaces_cm,
mp->d_nibool_interfaces_crust_mantle,
mp->d_ibool_interfaces_crust_mantle);
+
+ // copies buffer to CPU
+ print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer_crust_mantle,
+ NDIM*mp->max_nibool_interfaces_cm*mp->num_interfaces_crust_mantle*sizeof(realw),
+ cudaMemcpyDeviceToHost),41000);
+
}
else if(*FORWARD_OR_ADJOINT == 3) {
+ // debug
+ DEBUG_EMPTY_BACKWARD();
+
prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
- mp->d_send_accel_buffer_crust_mantle,
+ mp->d_b_send_accel_buffer_crust_mantle,
mp->num_interfaces_crust_mantle,
mp->max_nibool_interfaces_cm,
mp->d_nibool_interfaces_crust_mantle,
mp->d_ibool_interfaces_crust_mantle);
+ // copies buffer to CPU
+ print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_b_send_accel_buffer_crust_mantle,
+ NDIM*mp->max_nibool_interfaces_cm*mp->num_interfaces_crust_mantle*sizeof(realw),
+ cudaMemcpyDeviceToHost),41001);
+
}
- // copies buffer to CPU
- cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer_crust_mantle,
- 3*mp->max_nibool_interfaces_cm*mp->num_interfaces_crust_mantle*sizeof(realw),
- cudaMemcpyDeviceToHost);
}
}
@@ -122,15 +135,12 @@
blocksize = BLOCKSIZE_TRANSFER;
size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ic)/((double)blocksize)))*blocksize;
- num_blocks_x = size_padded/blocksize;
- num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(blocksize,1,1);
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
+ grid = dim3(num_blocks_x,num_blocks_y);
+ threads = dim3(blocksize,1,1);
+
if(*FORWARD_OR_ADJOINT == 1) {
prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel_inner_core,
mp->d_send_accel_buffer_inner_core,
@@ -138,21 +148,29 @@
mp->max_nibool_interfaces_ic,
mp->d_nibool_interfaces_inner_core,
mp->d_ibool_interfaces_inner_core);
+
+ // copies buffer to CPU
+ print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer_inner_core,
+ NDIM*mp->max_nibool_interfaces_ic*mp->num_interfaces_inner_core*sizeof(realw),
+ cudaMemcpyDeviceToHost),41000);
+
}
else if(*FORWARD_OR_ADJOINT == 3) {
+ // debug
+ DEBUG_EMPTY_BACKWARD();
+
prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel_inner_core,
- mp->d_send_accel_buffer_inner_core,
+ mp->d_b_send_accel_buffer_inner_core,
mp->num_interfaces_inner_core,
mp->max_nibool_interfaces_ic,
mp->d_nibool_interfaces_inner_core,
mp->d_ibool_interfaces_inner_core);
+ // copies buffer to CPU
+ print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_b_send_accel_buffer_inner_core,
+ NDIM*mp->max_nibool_interfaces_ic*mp->num_interfaces_inner_core*sizeof(realw),
+ cudaMemcpyDeviceToHost),41001);
}
- // copies buffer to CPU
- cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer_inner_core,
- 3*mp->max_nibool_interfaces_ic*mp->num_interfaces_inner_core*sizeof(realw),
- cudaMemcpyDeviceToHost);
-
}
}
@@ -170,15 +188,18 @@
int* d_ibool_interfaces) {
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
- int iglob;
+ int iglob,iloc;
for( int iinterface=0; iinterface < num_interfaces; iinterface++) {
if(id < d_nibool_interfaces[iinterface]) {
- iglob = d_ibool_interfaces[id + max_nibool_interfaces*iinterface]-1;
+
+ iloc = id + max_nibool_interfaces*iinterface;
+ iglob = d_ibool_interfaces[iloc]-1;
+
// assembles acceleration: adds contributions from buffer array
- atomicAdd(&d_accel[3*iglob],d_send_accel_buffer[3*(id + max_nibool_interfaces*iinterface)]);
- atomicAdd(&d_accel[3*iglob + 1],d_send_accel_buffer[3*(id + max_nibool_interfaces*iinterface)+1]);
- atomicAdd(&d_accel[3*iglob + 2],d_send_accel_buffer[3*(id + max_nibool_interfaces*iinterface)+2]);
+ atomicAdd(&d_accel[3*iglob],d_send_accel_buffer[3*iloc]);
+ atomicAdd(&d_accel[3*iglob + 1],d_send_accel_buffer[3*iloc + 1]);
+ atomicAdd(&d_accel[3*iglob + 2],d_send_accel_buffer[3*iloc + 2]);
}
}
}
@@ -193,32 +214,31 @@
int* IREGION,
int* FORWARD_OR_ADJOINT) {
TRACE("transfer_asmbl_accel_to_device");
- int blocksize,size_padded,num_blocks_x,num_blocks_y;
+ int blocksize,size_padded;
+ int num_blocks_x,num_blocks_y;
+ dim3 grid,threads;
+
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
// crust/mantle region
if( *IREGION == IREGION_CRUST_MANTLE ){
if( mp->num_interfaces_crust_mantle > 0 ){
- // copies vector buffer values to GPU
- print_CUDA_error_if_any(cudaMemcpy(mp->d_send_accel_buffer_crust_mantle, buffer_recv_vector,
- NDIM*(mp->max_nibool_interfaces_cm)*(mp->num_interfaces_crust_mantle)*sizeof(realw),
- cudaMemcpyHostToDevice),41000);
-
// assembles values
blocksize = BLOCKSIZE_TRANSFER;
size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_cm)/((double)blocksize)))*blocksize;
- num_blocks_x = size_padded/blocksize;
- num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(blocksize,1,1);
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+ grid = dim3(num_blocks_x,num_blocks_y);
+ threads = dim3(blocksize,1,1);
+
if(*FORWARD_OR_ADJOINT == 1) {
+ // copies vector buffer values to GPU
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_send_accel_buffer_crust_mantle, buffer_recv_vector,
+ NDIM*(mp->max_nibool_interfaces_cm)*(mp->num_interfaces_crust_mantle)*sizeof(realw),
+ cudaMemcpyHostToDevice),41000);
+
//assemble forward accel
assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel_crust_mantle,
mp->d_send_accel_buffer_crust_mantle,
@@ -228,9 +248,17 @@
mp->d_ibool_interfaces_crust_mantle);
}
else if(*FORWARD_OR_ADJOINT == 3) {
+ // debug
+ DEBUG_EMPTY_BACKWARD();
+
+ // copies vector buffer values to GPU
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_accel_buffer_crust_mantle, buffer_recv_vector,
+ NDIM*(mp->max_nibool_interfaces_cm)*(mp->num_interfaces_crust_mantle)*sizeof(realw),
+ cudaMemcpyHostToDevice),41000);
+
//assemble adjoint accel
assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
- mp->d_send_accel_buffer_crust_mantle,
+ mp->d_b_send_accel_buffer_crust_mantle,
mp->num_interfaces_crust_mantle,
mp->max_nibool_interfaces_cm,
mp->d_nibool_interfaces_crust_mantle,
@@ -246,25 +274,21 @@
// inner core region
if( *IREGION == IREGION_INNER_CORE ){
if( mp->num_interfaces_inner_core > 0 ){
- // copies buffer values to GPU
- print_CUDA_error_if_any(cudaMemcpy(mp->d_send_accel_buffer_inner_core, buffer_recv_vector,
- NDIM*(mp->max_nibool_interfaces_ic)*(mp->num_interfaces_inner_core)*sizeof(realw),
- cudaMemcpyHostToDevice),41001);
-
// assembles values
blocksize = BLOCKSIZE_TRANSFER;
size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ic)/((double)blocksize)))*blocksize;
- num_blocks_x = size_padded/blocksize;
- num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(blocksize,1,1);
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+ grid = dim3(num_blocks_x,num_blocks_y);
+ threads = dim3(blocksize,1,1);
+
if(*FORWARD_OR_ADJOINT == 1) {
+ // copies buffer values to GPU
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_send_accel_buffer_inner_core, buffer_recv_vector,
+ NDIM*(mp->max_nibool_interfaces_ic)*(mp->num_interfaces_inner_core)*sizeof(realw),
+ cudaMemcpyHostToDevice),41001);
+
//assemble forward accel
assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel_inner_core,
mp->d_send_accel_buffer_inner_core,
@@ -274,9 +298,17 @@
mp->d_ibool_interfaces_inner_core);
}
else if(*FORWARD_OR_ADJOINT == 3) {
+ // debug
+ DEBUG_EMPTY_BACKWARD();
+
+ // copies buffer values to GPU
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_accel_buffer_inner_core, buffer_recv_vector,
+ NDIM*(mp->max_nibool_interfaces_ic)*(mp->num_interfaces_inner_core)*sizeof(realw),
+ cudaMemcpyHostToDevice),41001);
+
//assemble adjoint accel
assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel_inner_core,
- mp->d_send_accel_buffer_inner_core,
+ mp->d_b_send_accel_buffer_inner_core,
mp->num_interfaces_inner_core,
mp->max_nibool_interfaces_ic,
mp->d_nibool_interfaces_inner_core,
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu 2013-09-13 11:23:45 UTC (rev 22782)
@@ -212,6 +212,45 @@
/* ----------------------------------------------------------------------------------------------- */
+void synchronize_cuda(){
+#if CUDA_VERSION >= 4000
+ cudaDeviceSynchronize();
+#else
+ cudaThreadSynchronize();
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void synchronize_mpi(){
+#ifdef WITH_MPI
+ MPI_Barrier(MPI_COMM_WORLD);
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+void get_blocks_xy(int num_blocks,int* num_blocks_x,int* num_blocks_y) {
+
+// Initially sets the blocks_x to be the num_blocks, and adds rows as needed (block size limit of 65535).
+// If an additional row is added, the row length is cut in
+// half. If the block count is odd, there will be 1 too many blocks,
+// which must be managed at runtime with an if statement.
+
+ *num_blocks_x = num_blocks;
+ *num_blocks_y = 1;
+
+ while(*num_blocks_x > MAXIMUM_GRID_DIM) {
+ *num_blocks_x = (int) ceil(*num_blocks_x * 0.5f);
+ *num_blocks_y = *num_blocks_y * 2;
+ }
+
+ return;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
void get_free_memory(double* free_db, double* used_db, double* total_db) {
// gets memory usage in byte
@@ -285,6 +324,56 @@
/* ----------------------------------------------------------------------------------------------- */
+// Auxiliary functions
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/*
+__global__ void memset_to_realw_kernel(realw* array, int size, realw value){
+
+ unsigned int tid = threadIdx.x;
+ unsigned int bx = blockIdx.y*gridDim.x+blockIdx.x;
+ unsigned int i = tid + bx*blockDim.x;
+
+ if( i < size ){
+ array[i] = *value;
+ }
+}
+*/
+
+/* ----------------------------------------------------------------------------------------------- */
+
+realw get_device_array_maximum_value(realw* array, int size){
+
+// get maximum of array on GPU by copying over to CPU and handle it there
+
+ realw max = 0.0f;
+
+ // checks if anything to do
+ if( size > 0 ){
+ realw* h_array;
+
+ // explicitly wait for cuda kernels to finish
+ // (cudaMemcpy implicitly synchronizes all other cuda operations)
+ synchronize_cuda();
+
+ h_array = (realw*)calloc(size,sizeof(realw));
+ print_CUDA_error_if_any(cudaMemcpy(h_array,array,sizeof(realw)*size,cudaMemcpyDeviceToHost),33001);
+
+ // finds maximum value in array
+ max = h_array[0];
+ for( int i=1; i < size; i++){
+ if( abs(h_array[i]) > max ) max = abs(h_array[i]);
+ }
+ free(h_array);
+ }
+ return max;
+}
+
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
// scalar arrays (acoustic/fluid outer core)
/* ----------------------------------------------------------------------------------------------- */
@@ -340,7 +429,7 @@
void FC_FUNC_(check_norm_acoustic_from_device,
CHECK_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
long* Mesh_pointer_f,
- int* SIMULATION_TYPE) {
+ int* FORWARD_OR_ADJOINT) {
TRACE("check_norm_acoustic_from_device");
//double start_time = get_time();
@@ -390,22 +479,19 @@
int size = mp->NGLOB_OUTER_CORE;
int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
- h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
- cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
+ int num_blocks_x,num_blocks_y;
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
- if(*SIMULATION_TYPE == 1 ){
+ h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
+ cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
+
+ if(*FORWARD_OR_ADJOINT == 1 ){
get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_displ_outer_core,size,d_max);
- }else if(*SIMULATION_TYPE == 3 ){
+ }else if(*FORWARD_OR_ADJOINT == 3 ){
get_maximum_scalar_kernel<<<grid,threads>>>(mp->d_b_displ_outer_core,size,d_max);
}
@@ -513,7 +599,7 @@
void FC_FUNC_(check_norm_elastic_from_device,
CHECK_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
long* Mesh_pointer_f,
- int* SIMULATION_TYPE) {
+ int* FORWARD_OR_ADJOINT) {
TRACE("check_norm_elastic_from_device");
//double start_time = get_time();
@@ -522,7 +608,6 @@
realw max,max_crust_mantle,max_inner_core;
realw *d_max;
- int num_blocks_x,num_blocks_y;
int size,size_padded;
dim3 grid,threads;
@@ -535,22 +620,19 @@
size = mp->NGLOB_CRUST_MANTLE;
size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
- num_blocks_x = size_padded/blocksize;
- num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
- h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
- cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
+ int num_blocks_x,num_blocks_y;
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
grid = dim3(num_blocks_x,num_blocks_y);
threads = dim3(blocksize,1,1);
- if(*SIMULATION_TYPE == 1 ){
+ h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
+ cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
+
+ if(*FORWARD_OR_ADJOINT == 1 ){
get_maximum_vector_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,size,d_max);
- }else if(*SIMULATION_TYPE == 3 ){
+ }else if(*FORWARD_OR_ADJOINT == 3 ){
get_maximum_vector_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,size,d_max);
}
@@ -573,22 +655,18 @@
size = mp->NGLOB_INNER_CORE;
size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
- num_blocks_x = size_padded/blocksize;
- num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
- h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
- cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
grid = dim3(num_blocks_x,num_blocks_y);
threads = dim3(blocksize,1,1);
- if(*SIMULATION_TYPE == 1 ){
+ h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
+ cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
+
+ if(*FORWARD_OR_ADJOINT == 1 ){
get_maximum_vector_kernel<<<grid,threads>>>(mp->d_displ_inner_core,size,d_max);
- }else if(*SIMULATION_TYPE == 3 ){
+ }else if(*FORWARD_OR_ADJOINT == 3 ){
get_maximum_vector_kernel<<<grid,threads>>>(mp->d_b_displ_inner_core,size,d_max);
}
@@ -632,10 +710,9 @@
realw max,max_eps;
realw *d_max;
- int num_blocks_x,num_blocks_y;
+ int num_blocks_x, num_blocks_y;
int size,size_padded;
- dim3 grid;
- dim3 threads;
+ dim3 grid,threads;
// launch simple reduction kernel
realw* h_max;
@@ -645,19 +722,15 @@
size = NGLL3*(mp->NSPEC_CRUST_MANTLE_STRAIN_ONLY);
size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
- num_blocks_x = size_padded/blocksize;
- num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
- h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
- cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
grid = dim3(num_blocks_x,num_blocks_y);
threads = dim3(blocksize,1,1);
+ h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
+ cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
+
max = 0.0f;
// determines max for: eps_trace_over_3_crust_mantle
@@ -681,19 +754,15 @@
size = NGLL3*(mp->NSPEC_CRUST_MANTLE);
size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
- num_blocks_x = size_padded/blocksize;
- num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
- h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
- cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
grid = dim3(num_blocks_x,num_blocks_y);
threads = dim3(blocksize,1,1);
+ h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
+ cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
+
max_eps = 0.0f;
// determines max for: epsilondev_xx_crust_mantle
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu 2013-09-13 11:23:45 UTC (rev 22782)
@@ -86,12 +86,12 @@
/* ----------------------------------------------------------------------------------------------- */
extern "C"
-void FC_FUNC_(compute_add_sources_el_cuda,
- COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f,
- int* NSOURCESf,
- double* h_stf_pre_compute) {
+void FC_FUNC_(compute_add_sources_cuda,
+ COMPUTE_ADD_SOURCES_CUDA)(long* Mesh_pointer_f,
+ int* NSOURCESf,
+ double* h_stf_pre_compute) {
- TRACE("compute_add_sources_el_cuda");
+ TRACE("compute_add_sources_cuda");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
@@ -100,18 +100,15 @@
int NSOURCES = *NSOURCESf;
- int num_blocks_x = NSOURCES;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(NSOURCES,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(NGLLX,NGLLX,NGLLX);
// copies source time function buffer values to GPU
print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
- NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
+ NSOURCES*sizeof(double),cudaMemcpyHostToDevice),71018);
// adds source contributions
compute_add_sources_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
@@ -124,7 +121,7 @@
NSOURCES);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("compute_add_sources_kernel");
+ exit_on_cuda_error("compute_add_sources_cuda");
#endif
}
@@ -135,11 +132,13 @@
/* ----------------------------------------------------------------------------------------------- */
extern "C"
-void FC_FUNC_(compute_add_sources_el_s3_cuda,
- COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer_f,
- int* NSOURCESf,
- double* h_stf_pre_compute) {
- TRACE("compute_add_sources_el_s3_cuda");
+void FC_FUNC_(compute_add_sources_backward_cuda,
+ COMPUTE_ADD_SOURCES_BACKWARD_CUDA)(long* Mesh_pointer_f,
+ int* NSOURCESf,
+ double* h_stf_pre_compute) {
+ TRACE("compute_add_sources_backward_cuda");
+ // debug
+ DEBUG_EMPTY_BACKWARD();
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
@@ -148,18 +147,15 @@
int NSOURCES = *NSOURCESf;
- int num_blocks_x = NSOURCES;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(NSOURCES,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(NGLLX,NGLLX,NGLLX);
// copies source time function buffer values to GPU
print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
- NSOURCES*sizeof(double),cudaMemcpyHostToDevice),19);
+ NSOURCES*sizeof(double),cudaMemcpyHostToDevice),71019);
compute_add_sources_kernel<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
mp->d_ibool_crust_mantle,
@@ -171,7 +167,7 @@
NSOURCES);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("compute_add_sources_el_s3_cuda");
+ exit_on_cuda_error("compute_add_sources_backward_cuda");
#endif
}
@@ -182,13 +178,13 @@
/* ----------------------------------------------------------------------------------------------- */
-__global__ void add_sources_el_SIM_TYPE_2_OR_3_kernel(realw* accel,
- int nrec,
- realw* adj_sourcearrays,
- int* ibool,
- int* ispec_selected_rec,
- int* pre_computed_irec,
- int nadj_rec_local) {
+__global__ void compute_add_sources_adjoint_cuda_kernel(realw* accel,
+ int nrec,
+ realw* adj_sourcearrays,
+ int* ibool,
+ int* ispec_selected_rec,
+ int* pre_computed_irec,
+ int nadj_rec_local) {
int ispec,iglob;
int irec,i,j,k;
@@ -220,28 +216,24 @@
/* ----------------------------------------------------------------------------------------------- */
extern "C"
-void FC_FUNC_(add_sources_el_sim_type_2_or_3,
- ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
- int* nrec,
- realw* h_adj_sourcearrays,
- int* h_islice_selected_rec,
- int* h_ispec_selected_rec,
- int* time_index) {
+void FC_FUNC_(compute_add_sources_adjoint_cuda,
+ COMPUTE_ADD_SOURCES_ADJOINT_CUDA)(long* Mesh_pointer,
+ int* nrec,
+ realw* h_adj_sourcearrays,
+ int* h_islice_selected_rec,
+ int* h_ispec_selected_rec,
+ int* time_index) {
- TRACE("add_sources_el_sim_type_2_or_3");
+ TRACE("compute_add_sources_adjoint_cuda");
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
// check if anything to do
if( mp->nadj_rec_local == 0 ) return;
- // make sure grid dimension is less than MAXIMUM_GRID_DIM in x dimension
- int num_blocks_x = mp->nadj_rec_local;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->nadj_rec_local,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y,1);
dim3 threads(NGLLX,NGLLX,NGLLX);
@@ -284,23 +276,23 @@
if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
// copies extracted array values onto GPU
- cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
- (mp->nadj_rec_local)*NDIM*NGLL3*sizeof(realw),cudaMemcpyHostToDevice);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
+ (mp->nadj_rec_local)*NDIM*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),71000);
// the irec_local variable needs to be precomputed (as
// h_pre_comp..), because normally it is in the loop updating accel,
// and due to how it's incremented, it cannot be parallelized
- add_sources_el_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
- *nrec,
- mp->d_adj_sourcearrays,
- mp->d_ibool_crust_mantle,
- mp->d_ispec_selected_rec,
- mp->d_pre_computed_irec,
- mp->nadj_rec_local);
+ compute_add_sources_adjoint_cuda_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
+ *nrec,
+ mp->d_adj_sourcearrays,
+ mp->d_ibool_crust_mantle,
+ mp->d_ispec_selected_rec,
+ mp->d_pre_computed_irec,
+ mp->nadj_rec_local);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("add_sources_SIM_TYPE_2_OR_3_kernel");
+ exit_on_cuda_error("compute_add_sources_adjoint_cuda");
#endif
}
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_coupling_cuda.cu 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_coupling_cuda.cu 2013-09-13 11:23:45 UTC (rev 22782)
@@ -56,6 +56,7 @@
int i = threadIdx.x;
int j = threadIdx.y;
+
int iface = blockIdx.x + gridDim.x*blockIdx.y;
int k,k_corresp,iglob_cm,iglob_oc,ispec,ispec_selected;
@@ -106,6 +107,60 @@
/* ----------------------------------------------------------------------------------------------- */
+extern "C"
+void FC_FUNC_(compute_coupling_fluid_cmb_cuda,
+ COMPUTE_COUPLING_FLUID_CMB_CUDA)(long* Mesh_pointer_f,
+ int* FORWARD_OR_ADJOINT) {
+
+ TRACE("compute_coupling_fluid_cmb_cuda");
+ //double start_time = get_time();
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->nspec2D_top_outer_core,&num_blocks_x,&num_blocks_y);
+
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(NGLLX,NGLLX,1);
+
+ // launches GPU kernel
+ if( *FORWARD_OR_ADJOINT == 1 ){
+ compute_coupling_fluid_CMB_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
+ mp->d_accel_outer_core,
+ mp->d_ibool_crust_mantle,
+ mp->d_ibelm_bottom_crust_mantle,
+ mp->d_normal_top_outer_core,
+ mp->d_jacobian2D_top_outer_core,
+ mp->d_wgllwgll_xy,
+ mp->d_ibool_outer_core,
+ mp->d_ibelm_top_outer_core,
+ mp->nspec2D_top_outer_core);
+ }else if( *FORWARD_OR_ADJOINT == 3 ){
+ // debug
+ DEBUG_EMPTY_BACKWARD();
+
+ // adjoint simulations
+ compute_coupling_fluid_CMB_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,
+ mp->d_b_accel_outer_core,
+ mp->d_ibool_crust_mantle,
+ mp->d_ibelm_bottom_crust_mantle,
+ mp->d_normal_top_outer_core,
+ mp->d_jacobian2D_top_outer_core,
+ mp->d_wgllwgll_xy,
+ mp->d_ibool_outer_core,
+ mp->d_ibelm_top_outer_core,
+ mp->nspec2D_top_outer_core);
+ }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ //double end_time = get_time();
+ //printf("Elapsed time: %e\n",end_time-start_time);
+ exit_on_cuda_error("compute_coupling_fluid_CMB_kernel");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
__global__ void compute_coupling_fluid_ICB_kernel(realw* displ_inner_core,
realw* accel_outer_core,
int* ibool_inner_core,
@@ -119,6 +174,7 @@
int i = threadIdx.x;
int j = threadIdx.y;
+
int iface = blockIdx.x + gridDim.x*blockIdx.y;
int k,k_corresp,iglob_ic,iglob_oc,ispec,ispec_selected;
@@ -168,61 +224,7 @@
}
}
-/* ----------------------------------------------------------------------------------------------- */
-extern "C"
-void FC_FUNC_(compute_coupling_fluid_cmb_cuda,
- COMPUTE_COUPLING_FLUID_CMB_CUDA)(long* Mesh_pointer_f,
- int* FORWARD_OR_ADJOINT) {
-
- TRACE("compute_coupling_fluid_cmb_cuda");
- //double start_time = get_time();
-
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
- int num_blocks_x = mp->nspec2D_top_outer_core;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
-
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(5,5,1);
-
- // launches GPU kernel
- if( *FORWARD_OR_ADJOINT == 1 ){
- compute_coupling_fluid_CMB_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
- mp->d_accel_outer_core,
- mp->d_ibool_crust_mantle,
- mp->d_ibelm_bottom_crust_mantle,
- mp->d_normal_top_outer_core,
- mp->d_jacobian2D_top_outer_core,
- mp->d_wgllwgll_xy,
- mp->d_ibool_outer_core,
- mp->d_ibelm_top_outer_core,
- mp->nspec2D_top_outer_core);
- }else if( *FORWARD_OR_ADJOINT == 3 ){
- // adjoint simulations
- compute_coupling_fluid_CMB_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,
- mp->d_b_accel_outer_core,
- mp->d_ibool_crust_mantle,
- mp->d_ibelm_bottom_crust_mantle,
- mp->d_normal_top_outer_core,
- mp->d_jacobian2D_top_outer_core,
- mp->d_wgllwgll_xy,
- mp->d_ibool_outer_core,
- mp->d_ibelm_top_outer_core,
- mp->nspec2D_top_outer_core);
- }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- //double end_time = get_time();
- //printf("Elapsed time: %e\n",end_time-start_time);
- exit_on_cuda_error("compute_coupling_fluid_CMB_kernel");
-#endif
-}
-
/* ----------------------------------------------------------------------------------------------- */
extern "C"
@@ -235,15 +237,11 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- int num_blocks_x = mp->nspec2D_bottom_outer_core;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->nspec2D_bottom_outer_core,&num_blocks_x,&num_blocks_y);
dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(5,5,1);
+ dim3 threads(NGLLX,NGLLX,1);
// launches GPU kernel
if( *FORWARD_OR_ADJOINT == 1 ){
@@ -258,6 +256,9 @@
mp->d_ibelm_bottom_outer_core,
mp->nspec2D_bottom_outer_core);
}else if( *FORWARD_OR_ADJOINT == 3 ){
+ // debug
+ DEBUG_EMPTY_BACKWARD();
+
// adjoint simulations
compute_coupling_fluid_ICB_kernel<<<grid,threads>>>(mp->d_b_displ_inner_core,
mp->d_b_accel_outer_core,
@@ -297,7 +298,7 @@
realw RHO_TOP_OC,
realw minus_g_cmb,
int GRAVITY,
- int NSPEC_BOTTOM_CM) {
+ int NSPEC2D_BOTTOM_CM) {
int i = threadIdx.x;
int j = threadIdx.y;
@@ -309,7 +310,7 @@
realw weight;
// for surfaces elements exactly at the bottom of the crust mantle (outer core top)
- if( iface < NSPEC_BOTTOM_CM ){
+ if( iface < NSPEC2D_BOTTOM_CM ){
// "-1" from index values to convert from Fortran-> C indexing
ispec = ibelm_bottom_crust_mantle[iface] - 1;
@@ -352,6 +353,68 @@
/* ----------------------------------------------------------------------------------------------- */
+extern "C"
+void FC_FUNC_(compute_coupling_cmb_fluid_cuda,
+ COMPUTE_COUPLING_CMB_FLUID_CUDA)(long* Mesh_pointer_f,
+ int* FORWARD_OR_ADJOINT) {
+
+ TRACE("compute_coupling_cmb_fluid_cuda");
+ //double start_time = get_time();
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->nspec2D_bottom_crust_mantle,&num_blocks_x,&num_blocks_y);
+
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(5,5,1);
+
+ // launches GPU kernel
+ if( *FORWARD_OR_ADJOINT == 1 ){
+ compute_coupling_CMB_fluid_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
+ mp->d_accel_crust_mantle,
+ mp->d_accel_outer_core,
+ mp->d_ibool_crust_mantle,
+ mp->d_ibelm_bottom_crust_mantle,
+ mp->d_normal_top_outer_core,
+ mp->d_jacobian2D_top_outer_core,
+ mp->d_wgllwgll_xy,
+ mp->d_ibool_outer_core,
+ mp->d_ibelm_top_outer_core,
+ mp->RHO_TOP_OC,
+ mp->minus_g_cmb,
+ mp->gravity,
+ mp->nspec2D_bottom_crust_mantle);
+ }else if( *FORWARD_OR_ADJOINT == 3 ){
+ // debug
+ DEBUG_EMPTY_BACKWARD();
+
+ // adjoint simulations
+ compute_coupling_CMB_fluid_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,
+ mp->d_b_accel_crust_mantle,
+ mp->d_b_accel_outer_core,
+ mp->d_ibool_crust_mantle,
+ mp->d_ibelm_bottom_crust_mantle,
+ mp->d_normal_top_outer_core,
+ mp->d_jacobian2D_top_outer_core,
+ mp->d_wgllwgll_xy,
+ mp->d_ibool_outer_core,
+ mp->d_ibelm_top_outer_core,
+ mp->RHO_TOP_OC,
+ mp->minus_g_cmb,
+ mp->gravity,
+ mp->nspec2D_bottom_crust_mantle);
+ }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ //double end_time = get_time();
+ //printf("Elapsed time: %e\n",end_time-start_time);
+ exit_on_cuda_error("compute_coupling_CMB_fluid_cuda");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
__global__ void compute_coupling_ICB_fluid_kernel(realw* displ_inner_core,
realw* accel_inner_core,
realw* accel_outer_core,
@@ -365,10 +428,11 @@
realw RHO_BOTTOM_OC,
realw minus_g_icb,
int GRAVITY,
- int NSPEC_TOP_IC) {
+ int NSPEC2D_TOP_IC) {
int i = threadIdx.x;
int j = threadIdx.y;
+
int iface = blockIdx.x + gridDim.x*blockIdx.y;
int k,k_corresp,iglob_ic,iglob_oc,ispec,ispec_selected;
@@ -377,7 +441,7 @@
realw weight;
// for surfaces elements exactly at the top of the inner core (outer core bottom)
- if( iface < NSPEC_TOP_IC ){
+ if( iface < NSPEC2D_TOP_IC ){
// "-1" from index values to convert from Fortran-> C indexing
ispec = ibelm_top_inner_core[iface] - 1;
@@ -401,9 +465,9 @@
// compute pressure, taking gravity into account
if( GRAVITY ){
pressure = RHO_BOTTOM_OC * ( - accel_outer_core[iglob_oc]
- + minus_g_icb * (displ_inner_core[iglob_ic*3]*nx
- + displ_inner_core[iglob_ic*3+1]*ny
- + displ_inner_core[iglob_ic*3+2]*nz) );
+ + minus_g_icb * ( displ_inner_core[iglob_ic*3]*nx
+ + displ_inner_core[iglob_ic*3+1]*ny
+ + displ_inner_core[iglob_ic*3+2]*nz) );
}else{
pressure = - RHO_BOTTOM_OC * accel_outer_core[iglob_oc];
}
@@ -419,69 +483,7 @@
}
}
-/* ----------------------------------------------------------------------------------------------- */
-extern "C"
-void FC_FUNC_(compute_coupling_cmb_fluid_cuda,
- COMPUTE_COUPLING_CMB_FLUID_CUDA)(long* Mesh_pointer_f,
- int* FORWARD_OR_ADJOINT) {
-
- TRACE("compute_coupling_cmb_fluid_cuda");
- //double start_time = get_time();
-
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
- int num_blocks_x = mp->nspec2D_bottom_crust_mantle;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
-
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(5,5,1);
-
- // launches GPU kernel
- if( *FORWARD_OR_ADJOINT == 1 ){
- compute_coupling_CMB_fluid_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
- mp->d_accel_crust_mantle,
- mp->d_accel_outer_core,
- mp->d_ibool_crust_mantle,
- mp->d_ibelm_bottom_crust_mantle,
- mp->d_normal_top_outer_core,
- mp->d_jacobian2D_top_outer_core,
- mp->d_wgllwgll_xy,
- mp->d_ibool_outer_core,
- mp->d_ibelm_top_outer_core,
- mp->RHO_TOP_OC,
- mp->minus_g_cmb,
- mp->gravity,
- mp->nspec2D_bottom_crust_mantle);
- }else if( *FORWARD_OR_ADJOINT == 3 ){
- // adjoint simulations
- compute_coupling_CMB_fluid_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,
- mp->d_b_accel_crust_mantle,
- mp->d_b_accel_outer_core,
- mp->d_ibool_crust_mantle,
- mp->d_ibelm_bottom_crust_mantle,
- mp->d_normal_top_outer_core,
- mp->d_jacobian2D_top_outer_core,
- mp->d_wgllwgll_xy,
- mp->d_ibool_outer_core,
- mp->d_ibelm_top_outer_core,
- mp->RHO_TOP_OC,
- mp->minus_g_cmb,
- mp->gravity,
- mp->nspec2D_bottom_crust_mantle);
- }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- //double end_time = get_time();
- //printf("Elapsed time: %e\n",end_time-start_time);
- exit_on_cuda_error("compute_coupling_CMB_fluid_cuda");
-#endif
-}
-
/* ----------------------------------------------------------------------------------------------- */
extern "C"
@@ -494,15 +496,11 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- int num_blocks_x = mp->nspec2D_top_inner_core;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->nspec2D_top_inner_core,&num_blocks_x,&num_blocks_y);
dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(5,5,1);
+ dim3 threads(NGLLX,NGLLX,1);
// launches GPU kernel
if( *FORWARD_OR_ADJOINT == 1 ){
@@ -521,6 +519,9 @@
mp->gravity,
mp->nspec2D_top_inner_core);
}else if( *FORWARD_OR_ADJOINT == 3 ){
+ // debug
+ DEBUG_EMPTY_BACKWARD();
+
// adjoint simulations
compute_coupling_ICB_fluid_kernel<<<grid,threads>>>(mp->d_b_displ_inner_core,
mp->d_b_accel_inner_core,
@@ -592,7 +593,7 @@
additional_term_y = (rmass - rmassy_crust_mantle[iglob]) * force_normal_comp;
additional_term_z = (rmass - rmassz_crust_mantle[iglob]) * force_normal_comp;
- // since we access this global point only once...
+ // since we access this global point only once, no need to use atomics ...
accel_crust_mantle[iglob*3] += additional_term_x * nx;
accel_crust_mantle[iglob*3+1] += additional_term_y * ny;
accel_crust_mantle[iglob*3+2] += additional_term_z * nz;
@@ -606,8 +607,6 @@
extern "C"
void FC_FUNC_(compute_coupling_ocean_cuda,
COMPUTE_COUPLING_OCEAN_CUDA)(long* Mesh_pointer_f,
- int* NCHUNKS_VAL,
- int* exact_mass_matrix_for_rotation,
int* FORWARD_OR_ADJOINT) {
TRACE("compute_coupling_ocean_cuda");
@@ -617,60 +616,35 @@
int blocksize = BLOCKSIZE_TRANSFER;
int size_padded = ((int)ceil(((double)mp->npoin_oceans)/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
- if( ( *NCHUNKS_VAL != 6 && mp->absorbing_conditions) || (mp->rotation && *exact_mass_matrix_for_rotation) ){
- // uses corrected mass matrices
- if( *FORWARD_OR_ADJOINT == 1 ){
- compute_coupling_ocean_cuda_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
- mp->d_rmassx_crust_mantle,
- mp->d_rmassy_crust_mantle,
- mp->d_rmassz_crust_mantle,
- mp->d_rmass_ocean_load,
- mp->npoin_oceans,
- mp->d_ibool_ocean_load,
- mp->d_normal_ocean_load);
- }else if( *FORWARD_OR_ADJOINT == 3){
- // for backward/reconstructed potentials
- compute_coupling_ocean_cuda_kernel<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
- mp->d_b_rmassx_crust_mantle,
- mp->d_b_rmassy_crust_mantle,
- mp->d_b_rmassz_crust_mantle,
- mp->d_rmass_ocean_load,
- mp->npoin_oceans,
- mp->d_ibool_ocean_load,
- mp->d_normal_ocean_load);
- }
- }else{
- // uses only rmassz
- if( *FORWARD_OR_ADJOINT == 1 ){
- compute_coupling_ocean_cuda_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
- mp->d_rmassz_crust_mantle,
- mp->d_rmassz_crust_mantle,
- mp->d_rmassz_crust_mantle,
- mp->d_rmass_ocean_load,
- mp->npoin_oceans,
- mp->d_ibool_ocean_load,
- mp->d_normal_ocean_load);
- }else if( *FORWARD_OR_ADJOINT == 3){
- // for backward/reconstructed potentials
- compute_coupling_ocean_cuda_kernel<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
- mp->d_b_rmassz_crust_mantle,
- mp->d_b_rmassz_crust_mantle,
- mp->d_b_rmassz_crust_mantle,
- mp->d_rmass_ocean_load,
- mp->npoin_oceans,
- mp->d_ibool_ocean_load,
- mp->d_normal_ocean_load);
- }
+ // uses corrected mass matrices
+ if( *FORWARD_OR_ADJOINT == 1 ){
+ compute_coupling_ocean_cuda_kernel<<<grid,threads>>>(mp->d_accel_crust_mantle,
+ mp->d_rmassx_crust_mantle,
+ mp->d_rmassy_crust_mantle,
+ mp->d_rmassz_crust_mantle,
+ mp->d_rmass_ocean_load,
+ mp->npoin_oceans,
+ mp->d_ibool_ocean_load,
+ mp->d_normal_ocean_load);
+ }else if( *FORWARD_OR_ADJOINT == 3){
+ // debug
+ DEBUG_EMPTY_BACKWARD();
+
+ // for backward/reconstructed potentials
+ compute_coupling_ocean_cuda_kernel<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
+ mp->d_b_rmassx_crust_mantle,
+ mp->d_b_rmassy_crust_mantle,
+ mp->d_b_rmassz_crust_mantle,
+ mp->d_rmass_ocean_load,
+ mp->npoin_oceans,
+ mp->d_ibool_ocean_load,
+ mp->d_normal_ocean_load);
}
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu 2013-09-13 11:23:45 UTC (rev 22782)
@@ -713,7 +713,6 @@
realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
realw* epsilondev_xz,realw* epsilondev_yz,
realw* epsilon_trace_over_3,
- int SIMULATION_TYPE,
int ATTENUATION,
int PARTIAL_PHYS_DISPERSION_ONLY,
int USE_3D_ATTENUATION_ARRAYS,
@@ -862,7 +861,6 @@
if (active) {
#ifndef MANUALLY_UNROLLED_LOOPS
-
tempx1l = 0.f;
tempx2l = 0.f;
tempx3l = 0.f;
@@ -891,7 +889,6 @@
tempy3l += s_dummyy_loc[l*NGLL2+J*NGLLX+I]*fac3;
tempz3l += s_dummyz_loc[l*NGLL2+J*NGLLX+I]*fac3;
}
-
#else
tempx1l = s_dummyx_loc[K*NGLL2+J*NGLLX]*sh_hprime_xx[I]
@@ -1305,6 +1302,13 @@
realw* d_R_xy,
realw* d_R_xz,
realw* d_R_yz,
+ realw* d_c11store,realw* d_c12store,realw* d_c13store,
+ realw* d_c14store,realw* d_c15store,realw* d_c16store,
+ realw* d_c22store,realw* d_c23store,realw* d_c24store,
+ realw* d_c25store,realw* d_c26store,realw* d_c33store,
+ realw* d_c34store,realw* d_c35store,realw* d_c36store,
+ realw* d_c44store,realw* d_c45store,realw* d_c46store,
+ realw* d_c55store,realw* d_c56store,realw* d_c66store,
realw* d_b_epsilondev_xx,
realw* d_b_epsilondev_yy,
realw* d_b_epsilondev_xy,
@@ -1316,30 +1320,21 @@
realw* d_b_R_xy,
realw* d_b_R_xz,
realw* d_b_R_yz,
- realw* d_c11store,realw* d_c12store,realw* d_c13store,
- realw* d_c14store,realw* d_c15store,realw* d_c16store,
- realw* d_c22store,realw* d_c23store,realw* d_c24store,
- realw* d_c25store,realw* d_c26store,realw* d_c33store,
- realw* d_c34store,realw* d_c35store,realw* d_c36store,
- realw* d_c44store,realw* d_c45store,realw* d_c46store,
- realw* d_c55store,realw* d_c56store,realw* d_c66store){
+ int FORWARD_OR_ADJOINT){
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("before kernel Kernel_2_crust_mantle");
#endif
- /* if the grid can handle the number of blocks, we let it be 1D */
- /* grid_2_x = nb_elem_color; */
- /* nb_elem_color is just how many blocks we are computing now */
+ // if the grid can handle the number of blocks, we let it be 1D
+ // grid_2_x = nb_elem_color;
+ // nb_elem_color is just how many blocks we are computing now
- int num_blocks_x = nb_blocks_to_compute;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
-
int blocksize = NGLL3_PADDED;
+
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(nb_blocks_to_compute,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
@@ -1350,56 +1345,57 @@
// cudaEventCreate(&stop);
// cudaEventRecord( start, 0 );
- Kernel_2_crust_mantle_impl<<<grid,threads>>>(nb_blocks_to_compute,
- mp->NGLOB_CRUST_MANTLE,
- d_ibool,
- d_ispec_is_tiso,
- mp->d_phase_ispec_inner_crust_mantle,
- mp->num_phase_ispec_crust_mantle,
- d_iphase,
- mp->deltat,
- mp->use_mesh_coloring_gpu,
- mp->d_displ_crust_mantle,
- mp->d_veloc_crust_mantle,
- mp->d_accel_crust_mantle,
- d_xix, d_xiy, d_xiz,
- d_etax, d_etay, d_etaz,
- d_gammax, d_gammay, d_gammaz,
- mp->d_hprime_xx,
- mp->d_hprimewgll_xx,
- mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
- d_kappavstore, d_muvstore,
- d_kappahstore, d_muhstore,
- d_eta_anisostore,
- mp->compute_and_store_strain,
- d_epsilondev_xx,d_epsilondev_yy,d_epsilondev_xy,
- d_epsilondev_xz,d_epsilondev_yz,
- d_epsilon_trace_over_3,
- mp->simulation_type,
- mp->attenuation,
- mp->partial_phys_dispersion_only,
- mp->use_3d_attenuation_arrays,
- d_one_minus_sum_beta,d_factor_common,
- d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
- mp->d_alphaval,mp->d_betaval,mp->d_gammaval,
- mp->anisotropic_3D_mantle,
- d_c11store,d_c12store,d_c13store,
- d_c14store,d_c15store,d_c16store,
- d_c22store,d_c23store,d_c24store,
- d_c25store,d_c26store,d_c33store,
- d_c34store,d_c35store,d_c36store,
- d_c44store,d_c45store,d_c46store,
- d_c55store,d_c56store,d_c66store,
- mp->gravity,
- mp->d_xstore_crust_mantle,mp->d_ystore_crust_mantle,mp->d_zstore_crust_mantle,
- mp->d_minus_gravity_table,
- mp->d_minus_deriv_gravity_table,
- mp->d_density_table,
- mp->d_wgll_cube,
- mp->NSPEC_CRUST_MANTLE_STRAIN_ONLY);
+ if( FORWARD_OR_ADJOINT == 1 ){
+ Kernel_2_crust_mantle_impl<<<grid,threads>>>(nb_blocks_to_compute,
+ mp->NGLOB_CRUST_MANTLE,
+ d_ibool,
+ d_ispec_is_tiso,
+ mp->d_phase_ispec_inner_crust_mantle,
+ mp->num_phase_ispec_crust_mantle,
+ d_iphase,
+ mp->deltat,
+ mp->use_mesh_coloring_gpu,
+ mp->d_displ_crust_mantle,
+ mp->d_veloc_crust_mantle,
+ mp->d_accel_crust_mantle,
+ d_xix, d_xiy, d_xiz,
+ d_etax, d_etay, d_etaz,
+ d_gammax, d_gammay, d_gammaz,
+ mp->d_hprime_xx,
+ mp->d_hprimewgll_xx,
+ mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+ d_kappavstore, d_muvstore,
+ d_kappahstore, d_muhstore,
+ d_eta_anisostore,
+ mp->compute_and_store_strain,
+ d_epsilondev_xx,d_epsilondev_yy,d_epsilondev_xy,
+ d_epsilondev_xz,d_epsilondev_yz,
+ d_epsilon_trace_over_3,
+ mp->attenuation,
+ mp->partial_phys_dispersion_only,
+ mp->use_3d_attenuation_arrays,
+ d_one_minus_sum_beta,d_factor_common,
+ d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
+ mp->d_alphaval,mp->d_betaval,mp->d_gammaval,
+ mp->anisotropic_3D_mantle,
+ d_c11store,d_c12store,d_c13store,
+ d_c14store,d_c15store,d_c16store,
+ d_c22store,d_c23store,d_c24store,
+ d_c25store,d_c26store,d_c33store,
+ d_c34store,d_c35store,d_c36store,
+ d_c44store,d_c45store,d_c46store,
+ d_c55store,d_c56store,d_c66store,
+ mp->gravity,
+ mp->d_xstore_crust_mantle,mp->d_ystore_crust_mantle,mp->d_zstore_crust_mantle,
+ mp->d_minus_gravity_table,
+ mp->d_minus_deriv_gravity_table,
+ mp->d_density_table,
+ mp->d_wgll_cube,
+ mp->NSPEC_CRUST_MANTLE_STRAIN_ONLY);
+ }else if( FORWARD_OR_ADJOINT == 3 ){
+ // debug
+ DEBUG_EMPTY_BACKWARD();
-
- if(mp->simulation_type == 3) {
Kernel_2_crust_mantle_impl<<< grid,threads>>>(nb_blocks_to_compute,
mp->NGLOB_CRUST_MANTLE,
d_ibool,
@@ -1425,7 +1421,6 @@
d_b_epsilondev_xx,d_b_epsilondev_yy,d_b_epsilondev_xy,
d_b_epsilondev_xz,d_b_epsilondev_yz,
d_b_epsilon_trace_over_3,
- mp->simulation_type,
mp->attenuation,
mp->partial_phys_dispersion_only,
mp->use_3d_attenuation_arrays,
@@ -1469,7 +1464,8 @@
extern "C"
void FC_FUNC_(compute_forces_crust_mantle_cuda,
COMPUTE_FORCES_CRUST_MANTLE_CUDA)(long* Mesh_pointer_f,
- int* iphase) {
+ int* iphase,
+ int* FORWARD_OR_ADJOINT_f) {
TRACE("compute_forces_crust_mantle_cuda");
@@ -1479,6 +1475,8 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+ int FORWARD_OR_ADJOINT = *FORWARD_OR_ADJOINT_f;
+
int num_elements;
if( *iphase == 1 )
@@ -1563,15 +1561,9 @@
*iphase,
mp->d_ibool_crust_mantle + offset_nonpadded,
mp->d_ispec_is_tiso_crust_mantle + offset_ispec,
- mp->d_xix_crust_mantle + offset,
- mp->d_xiy_crust_mantle + offset,
- mp->d_xiz_crust_mantle + offset,
- mp->d_etax_crust_mantle + offset,
- mp->d_etay_crust_mantle + offset,
- mp->d_etaz_crust_mantle + offset,
- mp->d_gammax_crust_mantle + offset,
- mp->d_gammay_crust_mantle + offset,
- mp->d_gammaz_crust_mantle + offset,
+ mp->d_xix_crust_mantle + offset,mp->d_xiy_crust_mantle + offset,mp->d_xiz_crust_mantle + offset,
+ mp->d_etax_crust_mantle + offset,mp->d_etay_crust_mantle + offset,mp->d_etaz_crust_mantle + offset,
+ mp->d_gammax_crust_mantle + offset,mp->d_gammay_crust_mantle + offset,mp->d_gammaz_crust_mantle + offset,
mp->d_kappavstore_crust_mantle + offset,
mp->d_muvstore_crust_mantle + offset,
mp->d_kappahstore_crust_mantle + offset,
@@ -1590,6 +1582,13 @@
mp->d_R_xy_crust_mantle + offset_nonpadded_att1,
mp->d_R_xz_crust_mantle + offset_nonpadded_att1,
mp->d_R_yz_crust_mantle + offset_nonpadded_att1,
+ mp->d_c11store_crust_mantle + offset,mp->d_c12store_crust_mantle + offset,mp->d_c13store_crust_mantle + offset,
+ mp->d_c14store_crust_mantle + offset,mp->d_c15store_crust_mantle + offset,mp->d_c16store_crust_mantle + offset,
+ mp->d_c22store_crust_mantle + offset,mp->d_c23store_crust_mantle + offset,mp->d_c24store_crust_mantle + offset,
+ mp->d_c25store_crust_mantle + offset,mp->d_c26store_crust_mantle + offset,mp->d_c33store_crust_mantle + offset,
+ mp->d_c34store_crust_mantle + offset,mp->d_c35store_crust_mantle + offset,mp->d_c36store_crust_mantle + offset,
+ mp->d_c44store_crust_mantle + offset,mp->d_c45store_crust_mantle + offset,mp->d_c46store_crust_mantle + offset,
+ mp->d_c55store_crust_mantle + offset,mp->d_c56store_crust_mantle + offset,mp->d_c66store_crust_mantle + offset,
mp->d_b_epsilondev_xx_crust_mantle + offset_nonpadded,
mp->d_b_epsilondev_yy_crust_mantle + offset_nonpadded,
mp->d_b_epsilondev_xy_crust_mantle + offset_nonpadded,
@@ -1601,27 +1600,7 @@
mp->d_b_R_xy_crust_mantle + offset_nonpadded_att1,
mp->d_b_R_xz_crust_mantle + offset_nonpadded_att1,
mp->d_b_R_yz_crust_mantle + offset_nonpadded_att1,
- mp->d_c11store_crust_mantle + offset,
- mp->d_c12store_crust_mantle + offset,
- mp->d_c13store_crust_mantle + offset,
- mp->d_c14store_crust_mantle + offset,
- mp->d_c15store_crust_mantle + offset,
- mp->d_c16store_crust_mantle + offset,
- mp->d_c22store_crust_mantle + offset,
- mp->d_c23store_crust_mantle + offset,
- mp->d_c24store_crust_mantle + offset,
- mp->d_c25store_crust_mantle + offset,
- mp->d_c26store_crust_mantle + offset,
- mp->d_c33store_crust_mantle + offset,
- mp->d_c34store_crust_mantle + offset,
- mp->d_c35store_crust_mantle + offset,
- mp->d_c36store_crust_mantle + offset,
- mp->d_c44store_crust_mantle + offset,
- mp->d_c45store_crust_mantle + offset,
- mp->d_c46store_crust_mantle + offset,
- mp->d_c55store_crust_mantle + offset,
- mp->d_c56store_crust_mantle + offset,
- mp->d_c66store_crust_mantle + offset);
+ FORWARD_OR_ADJOINT);
// for padded and aligned arrays
offset += nb_blocks_to_compute * NGLL3_PADDED;
@@ -1650,7 +1629,6 @@
}else{
// no mesh coloring: uses atomic updates
-
Kernel_2_crust_mantle(num_elements,mp,
*iphase,
mp->d_ibool_crust_mantle,
@@ -1674,6 +1652,13 @@
mp->d_R_xy_crust_mantle,
mp->d_R_xz_crust_mantle,
mp->d_R_yz_crust_mantle,
+ mp->d_c11store_crust_mantle,mp->d_c12store_crust_mantle,mp->d_c13store_crust_mantle,
+ mp->d_c14store_crust_mantle,mp->d_c15store_crust_mantle,mp->d_c16store_crust_mantle,
+ mp->d_c22store_crust_mantle,mp->d_c23store_crust_mantle,mp->d_c24store_crust_mantle,
+ mp->d_c25store_crust_mantle,mp->d_c26store_crust_mantle,mp->d_c33store_crust_mantle,
+ mp->d_c34store_crust_mantle,mp->d_c35store_crust_mantle,mp->d_c36store_crust_mantle,
+ mp->d_c44store_crust_mantle,mp->d_c45store_crust_mantle,mp->d_c46store_crust_mantle,
+ mp->d_c55store_crust_mantle,mp->d_c56store_crust_mantle,mp->d_c66store_crust_mantle,
mp->d_b_epsilondev_xx_crust_mantle,
mp->d_b_epsilondev_yy_crust_mantle,
mp->d_b_epsilondev_xy_crust_mantle,
@@ -1685,13 +1670,7 @@
mp->d_b_R_xy_crust_mantle,
mp->d_b_R_xz_crust_mantle,
mp->d_b_R_yz_crust_mantle,
- mp->d_c11store_crust_mantle,mp->d_c12store_crust_mantle,mp->d_c13store_crust_mantle,
- mp->d_c14store_crust_mantle,mp->d_c15store_crust_mantle,mp->d_c16store_crust_mantle,
- mp->d_c22store_crust_mantle,mp->d_c23store_crust_mantle,mp->d_c24store_crust_mantle,
- mp->d_c25store_crust_mantle,mp->d_c26store_crust_mantle,mp->d_c33store_crust_mantle,
- mp->d_c34store_crust_mantle,mp->d_c35store_crust_mantle,mp->d_c36store_crust_mantle,
- mp->d_c44store_crust_mantle,mp->d_c45store_crust_mantle,mp->d_c46store_crust_mantle,
- mp->d_c55store_crust_mantle,mp->d_c56store_crust_mantle,mp->d_c66store_crust_mantle);
+ FORWARD_OR_ADJOINT);
}
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu 2013-09-13 11:23:45 UTC (rev 22782)
@@ -326,7 +326,6 @@
realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
realw* epsilondev_xz,realw* epsilondev_yz,
realw* epsilon_trace_over_3,
- int SIMULATION_TYPE,
int ATTENUATION,
int PARTIAL_PHYS_DISPERSION_ONLY,
int USE_3D_ATTENUATION_ARRAYS,
@@ -937,6 +936,8 @@
realw* d_R_xy,
realw* d_R_xz,
realw* d_R_yz,
+ realw* d_c11store,realw* d_c12store,realw* d_c13store,
+ realw* d_c33store,realw* d_c44store,
realw* d_b_epsilondev_xx,
realw* d_b_epsilondev_yy,
realw* d_b_epsilondev_xy,
@@ -948,25 +949,21 @@
realw* d_b_R_xy,
realw* d_b_R_xz,
realw* d_b_R_yz,
- realw* d_c11store,realw* d_c12store,realw* d_c13store,
- realw* d_c33store,realw* d_c44store){
+ int FORWARD_OR_ADJOINT){
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("before kernel Kernel_2_inner_core");
#endif
- /* if the grid can handle the number of blocks, we let it be 1D */
- /* grid_2_x = nb_elem_color; */
- /* nb_elem_color is just how many blocks we are computing now */
+ // if the grid can handle the number of blocks, we let it be 1D
+ // grid_2_x = nb_elem_color;
+ // nb_elem_color is just how many blocks we are computing now
- int num_blocks_x = nb_blocks_to_compute;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
-
int blocksize = NGLL3_PADDED;
+
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(nb_blocks_to_compute,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
@@ -976,55 +973,55 @@
// cudaEventCreate(&start);
// cudaEventCreate(&stop);
// cudaEventRecord( start, 0 );
+ if( FORWARD_OR_ADJOINT == 1 ){
+ Kernel_2_inner_core_impl<<<grid,threads>>>(nb_blocks_to_compute,
+ mp->NGLOB_INNER_CORE,
+ d_ibool,
+ d_idoubling,
+ mp->d_phase_ispec_inner_inner_core,
+ mp->num_phase_ispec_inner_core,
+ d_iphase,
+ mp->deltat,
+ mp->use_mesh_coloring_gpu,
+ mp->d_displ_inner_core,
+ mp->d_veloc_inner_core,
+ mp->d_accel_inner_core,
+ d_xix, d_xiy, d_xiz,
+ d_etax, d_etay, d_etaz,
+ d_gammax, d_gammay, d_gammaz,
+ mp->d_hprime_xx,
+ mp->d_hprimewgll_xx,
+ mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+ d_kappav, d_muv,
+ mp->compute_and_store_strain,
+ d_epsilondev_xx,
+ d_epsilondev_yy,
+ d_epsilondev_xy,
+ d_epsilondev_xz,
+ d_epsilondev_yz,
+ d_epsilon_trace_over_3,
+ mp->attenuation,
+ mp->partial_phys_dispersion_only,
+ mp->use_3d_attenuation_arrays,
+ d_one_minus_sum_beta,
+ d_factor_common,
+ d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
+ mp->d_alphaval,mp->d_betaval,mp->d_gammaval,
+ mp->anisotropic_inner_core,
+ d_c11store,d_c12store,d_c13store,
+ d_c33store,d_c44store,
+ mp->gravity,
+ mp->d_xstore_inner_core,mp->d_ystore_inner_core,mp->d_zstore_inner_core,
+ mp->d_minus_gravity_table,
+ mp->d_minus_deriv_gravity_table,
+ mp->d_density_table,
+ mp->d_wgll_cube,
+ mp->NSPEC_INNER_CORE_STRAIN_ONLY,
+ mp->NSPEC_INNER_CORE);
+ }else if( FORWARD_OR_ADJOINT == 3 ){
+ // debug
+ DEBUG_EMPTY_BACKWARD();
- Kernel_2_inner_core_impl<<<grid,threads>>>(nb_blocks_to_compute,
- mp->NGLOB_INNER_CORE,
- d_ibool,
- d_idoubling,
- mp->d_phase_ispec_inner_inner_core,
- mp->num_phase_ispec_inner_core,
- d_iphase,
- mp->deltat,
- mp->use_mesh_coloring_gpu,
- mp->d_displ_inner_core,
- mp->d_veloc_inner_core,
- mp->d_accel_inner_core,
- d_xix, d_xiy, d_xiz,
- d_etax, d_etay, d_etaz,
- d_gammax, d_gammay, d_gammaz,
- mp->d_hprime_xx,
- mp->d_hprimewgll_xx,
- mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
- d_kappav, d_muv,
- mp->compute_and_store_strain,
- d_epsilondev_xx,
- d_epsilondev_yy,
- d_epsilondev_xy,
- d_epsilondev_xz,
- d_epsilondev_yz,
- d_epsilon_trace_over_3,
- mp->simulation_type,
- mp->attenuation,
- mp->partial_phys_dispersion_only,
- mp->use_3d_attenuation_arrays,
- d_one_minus_sum_beta,
- d_factor_common,
- d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
- mp->d_alphaval,mp->d_betaval,mp->d_gammaval,
- mp->anisotropic_inner_core,
- d_c11store,d_c12store,d_c13store,
- d_c33store,d_c44store,
- mp->gravity,
- mp->d_xstore_inner_core,mp->d_ystore_inner_core,mp->d_zstore_inner_core,
- mp->d_minus_gravity_table,
- mp->d_minus_deriv_gravity_table,
- mp->d_density_table,
- mp->d_wgll_cube,
- mp->NSPEC_INNER_CORE_STRAIN_ONLY,
- mp->NSPEC_INNER_CORE);
-
-
- if(mp->simulation_type == 3) {
Kernel_2_inner_core_impl<<< grid,threads>>>(nb_blocks_to_compute,
mp->NGLOB_INNER_CORE,
d_ibool,
@@ -1051,7 +1048,6 @@
d_b_epsilondev_xz,
d_b_epsilondev_yz,
d_b_epsilon_trace_over_3,
- mp->simulation_type,
mp->attenuation,
mp->partial_phys_dispersion_only,
mp->use_3d_attenuation_arrays,
@@ -1092,7 +1088,8 @@
extern "C"
void FC_FUNC_(compute_forces_inner_core_cuda,
COMPUTE_FORCES_INNER_CORE_CUDA)(long* Mesh_pointer_f,
- int* iphase) {
+ int* iphase,
+ int* FORWARD_OR_ADJOINT_f) {
TRACE("compute_forces_inner_core_cuda");
@@ -1102,6 +1099,8 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+ int FORWARD_OR_ADJOINT = *FORWARD_OR_ADJOINT_f;
+
int num_elements;
if( *iphase == 1 )
@@ -1231,15 +1230,9 @@
*iphase,
mp->d_ibool_inner_core + offset_nonpadded,
mp->d_idoubling_inner_core + offset_ispec,
- mp->d_xix_inner_core + offset,
- mp->d_xiy_inner_core + offset,
- mp->d_xiz_inner_core + offset,
- mp->d_etax_inner_core + offset,
- mp->d_etay_inner_core + offset,
- mp->d_etaz_inner_core + offset,
- mp->d_gammax_inner_core + offset,
- mp->d_gammay_inner_core + offset,
- mp->d_gammaz_inner_core + offset,
+ mp->d_xix_inner_core + offset,mp->d_xiy_inner_core + offset,mp->d_xiz_inner_core + offset,
+ mp->d_etax_inner_core + offset,mp->d_etay_inner_core + offset,mp->d_etaz_inner_core + offset,
+ mp->d_gammax_inner_core + offset,mp->d_gammay_inner_core + offset,mp->d_gammaz_inner_core + offset,
mp->d_kappavstore_inner_core + offset,
mp->d_muvstore_inner_core + offset,
mp->d_epsilondev_xx_inner_core + offset_nonpadded,
@@ -1255,6 +1248,11 @@
mp->d_R_xy_inner_core + offset_nonpadded_att1,
mp->d_R_xz_inner_core + offset_nonpadded_att1,
mp->d_R_yz_inner_core + offset_nonpadded_att1,
+ mp->d_c11store_inner_core + offset,
+ mp->d_c12store_inner_core + offset,
+ mp->d_c13store_inner_core + offset,
+ mp->d_c33store_inner_core + offset,
+ mp->d_c44store_inner_core + offset,
mp->d_b_epsilondev_xx_inner_core + offset_nonpadded,
mp->d_b_epsilondev_yy_inner_core + offset_nonpadded,
mp->d_b_epsilondev_xy_inner_core + offset_nonpadded,
@@ -1266,11 +1264,7 @@
mp->d_b_R_xy_inner_core + offset_nonpadded_att1,
mp->d_b_R_xz_inner_core + offset_nonpadded_att1,
mp->d_b_R_yz_inner_core + offset_nonpadded_att1,
- mp->d_c11store_inner_core + offset,
- mp->d_c12store_inner_core + offset,
- mp->d_c13store_inner_core + offset,
- mp->d_c33store_inner_core + offset,
- mp->d_c44store_inner_core + offset);
+ FORWARD_OR_ADJOINT);
// for padded and aligned arrays
offset += nb_blocks_to_compute * NGLL3_PADDED;
@@ -1320,6 +1314,8 @@
mp->d_R_xy_inner_core,
mp->d_R_xz_inner_core,
mp->d_R_yz_inner_core,
+ mp->d_c11store_inner_core,mp->d_c12store_inner_core,mp->d_c13store_inner_core,
+ mp->d_c33store_inner_core,mp->d_c44store_inner_core,
mp->d_b_epsilondev_xx_inner_core,
mp->d_b_epsilondev_yy_inner_core,
mp->d_b_epsilondev_xy_inner_core,
@@ -1331,8 +1327,7 @@
mp->d_b_R_xy_inner_core,
mp->d_b_R_xz_inner_core,
mp->d_b_R_yz_inner_core,
- mp->d_c11store_inner_core,mp->d_c12store_inner_core,mp->d_c13store_inner_core,
- mp->d_c33store_inner_core,mp->d_c44store_inner_core);
+ FORWARD_OR_ADJOINT);
}
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu 2013-09-13 11:23:45 UTC (rev 22782)
@@ -474,25 +474,18 @@
exit_on_cuda_error("before outer_core kernel Kernel_2");
#endif
- /* if the grid can handle the number of blocks, we let it be 1D */
- /* grid_2_x = nb_elem_color; */
- /* nb_elem_color is just how many blocks we are computing now */
+ // if the grid can handle the number of blocks, we let it be 1D
+ // grid_2_x = nb_elem_color;
+ // nb_elem_color is just how many blocks we are computing now
- int num_blocks_x = nb_blocks_to_compute;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
-
int blocksize = NGLL3_PADDED;
+
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(nb_blocks_to_compute,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
- // 2d grid
- //int threads_2 = NGLL3_PADDED;//BLOCK_SIZE_K2;
- //dim3 grid_2(num_blocks_x,num_blocks_y);
-
// Cuda timing
// cudaEvent_t start, stop;
// realw time;
@@ -502,60 +495,63 @@
if( FORWARD_OR_ADJOINT == 1 ){
Kernel_2_outer_core_impl<<<grid,threads>>>(nb_blocks_to_compute,
- mp->NGLOB_OUTER_CORE,
- d_ibool,
- mp->d_phase_ispec_inner_outer_core,
- mp->num_phase_ispec_outer_core,
- d_iphase,
- mp->use_mesh_coloring_gpu,
- mp->d_displ_outer_core,
- mp->d_accel_outer_core,
- d_xix, d_xiy, d_xiz,
- d_etax, d_etay, d_etaz,
- d_gammax, d_gammay, d_gammaz,
- mp->d_hprime_xx,
- mp->d_hprimewgll_xx,
- mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
- mp->gravity,
- mp->d_xstore_outer_core,mp->d_ystore_outer_core,mp->d_zstore_outer_core,
- mp->d_d_ln_density_dr_table,
- mp->d_minus_rho_g_over_kappa_fluid,
- mp->d_wgll_cube,
- mp->rotation,
- time,
- mp->two_omega_earth,
- mp->deltat,
- d_A_array_rotation,
- d_B_array_rotation,
- mp->NSPEC_OUTER_CORE);
+ mp->NGLOB_OUTER_CORE,
+ d_ibool,
+ mp->d_phase_ispec_inner_outer_core,
+ mp->num_phase_ispec_outer_core,
+ d_iphase,
+ mp->use_mesh_coloring_gpu,
+ mp->d_displ_outer_core,
+ mp->d_accel_outer_core,
+ d_xix, d_xiy, d_xiz,
+ d_etax, d_etay, d_etaz,
+ d_gammax, d_gammay, d_gammaz,
+ mp->d_hprime_xx,
+ mp->d_hprimewgll_xx,
+ mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+ mp->gravity,
+ mp->d_xstore_outer_core,mp->d_ystore_outer_core,mp->d_zstore_outer_core,
+ mp->d_d_ln_density_dr_table,
+ mp->d_minus_rho_g_over_kappa_fluid,
+ mp->d_wgll_cube,
+ mp->rotation,
+ time,
+ mp->two_omega_earth,
+ mp->deltat,
+ d_A_array_rotation,
+ d_B_array_rotation,
+ mp->NSPEC_OUTER_CORE);
}else if( FORWARD_OR_ADJOINT == 3 ){
+ // debug
+ DEBUG_EMPTY_BACKWARD();
+
Kernel_2_outer_core_impl<<<grid,threads>>>(nb_blocks_to_compute,
- mp->NGLOB_OUTER_CORE,
- d_ibool,
- mp->d_phase_ispec_inner_outer_core,
- mp->num_phase_ispec_outer_core,
- d_iphase,
- mp->use_mesh_coloring_gpu,
- mp->d_b_displ_outer_core,
- mp->d_b_accel_outer_core,
- d_xix, d_xiy, d_xiz,
- d_etax, d_etay, d_etaz,
- d_gammax, d_gammay, d_gammaz,
- mp->d_hprime_xx,
- mp->d_hprimewgll_xx,
- mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
- mp->gravity,
- mp->d_xstore_outer_core,mp->d_ystore_outer_core,mp->d_zstore_outer_core,
- mp->d_d_ln_density_dr_table,
- mp->d_minus_rho_g_over_kappa_fluid,
- mp->d_wgll_cube,
- mp->rotation,
- time,
- mp->b_two_omega_earth,
- mp->b_deltat,
- d_b_A_array_rotation,
- d_b_B_array_rotation,
- mp->NSPEC_OUTER_CORE);
+ mp->NGLOB_OUTER_CORE,
+ d_ibool,
+ mp->d_phase_ispec_inner_outer_core,
+ mp->num_phase_ispec_outer_core,
+ d_iphase,
+ mp->use_mesh_coloring_gpu,
+ mp->d_b_displ_outer_core,
+ mp->d_b_accel_outer_core,
+ d_xix, d_xiy, d_xiz,
+ d_etax, d_etay, d_etaz,
+ d_gammax, d_gammay, d_gammaz,
+ mp->d_hprime_xx,
+ mp->d_hprimewgll_xx,
+ mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+ mp->gravity,
+ mp->d_xstore_outer_core,mp->d_ystore_outer_core,mp->d_zstore_outer_core,
+ mp->d_d_ln_density_dr_table,
+ mp->d_minus_rho_g_over_kappa_fluid,
+ mp->d_wgll_cube,
+ mp->rotation,
+ time,
+ mp->b_two_omega_earth,
+ mp->b_deltat,
+ d_b_A_array_rotation,
+ d_b_B_array_rotation,
+ mp->NSPEC_OUTER_CORE);
}
// cudaEventRecord( stop, 0 );
@@ -583,7 +579,7 @@
COMPUTE_FORCES_OUTER_CORE_CUDA)(long* Mesh_pointer_f,
int* iphase,
realw* time_f,
- int* FORWARD_OR_ADJOINT) {
+ int* FORWARD_OR_ADJOINT_f) {
TRACE("compute_forces_outer_core_cuda");
@@ -592,8 +588,11 @@
//double start_time = get_time();
Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+
realw time = *time_f;
+ int FORWARD_OR_ADJOINT = *FORWARD_OR_ADJOINT_f;
+
int num_elements;
if( *iphase == 1 )
@@ -612,7 +611,7 @@
int nb_colors,nb_blocks_to_compute;
int istart;
- int color_offset,color_offset_nonpadded;
+ int offset,offset_nonpadded;
// sets up color loop
if( mp->NSPEC_OUTER_CORE > COLORING_MIN_NSPEC_OUTER_CORE ){
@@ -622,16 +621,16 @@
istart = 0;
// array offsets
- color_offset = 0;
- color_offset_nonpadded = 0;
+ offset = 0;
+ offset_nonpadded = 0;
}else{
// inner element colors (start after outer elements)
nb_colors = mp->num_colors_outer_outer_core + mp->num_colors_inner_outer_core;
istart = mp->num_colors_outer_outer_core;
// array offsets (inner elements start after outer ones)
- color_offset = mp->nspec_outer_outer_core * NGLL3_PADDED;
- color_offset_nonpadded = mp->nspec_outer_outer_core * NGLL3;
+ offset = mp->nspec_outer_outer_core * NGLL3_PADDED;
+ offset_nonpadded = mp->nspec_outer_outer_core * NGLL3;
}
}else{
@@ -643,16 +642,16 @@
istart = 0;
// array offsets
- color_offset = 0;
- color_offset_nonpadded = 0;
+ offset = 0;
+ offset_nonpadded = 0;
}else{
// inner element colors (start after outer elements)
nb_colors = 1;
istart = 0;
// array offsets (inner elements start after outer ones)
- color_offset = mp->nspec_outer_outer_core * NGLL3_PADDED;
- color_offset_nonpadded = mp->nspec_outer_outer_core * NGLL3;
+ offset = mp->nspec_outer_outer_core * NGLL3_PADDED;
+ offset_nonpadded = mp->nspec_outer_outer_core * NGLL3;
}
}
@@ -668,27 +667,21 @@
Kernel_2_outer_core(nb_blocks_to_compute,mp,
*iphase,
- mp->d_ibool_outer_core + color_offset_nonpadded,
- mp->d_xix_outer_core + color_offset,
- mp->d_xiy_outer_core + color_offset,
- mp->d_xiz_outer_core + color_offset,
- mp->d_etax_outer_core + color_offset,
- mp->d_etay_outer_core + color_offset,
- mp->d_etaz_outer_core + color_offset,
- mp->d_gammax_outer_core + color_offset,
- mp->d_gammay_outer_core + color_offset,
- mp->d_gammaz_outer_core + color_offset,
+ mp->d_ibool_outer_core + offset_nonpadded,
+ mp->d_xix_outer_core + offset,mp->d_xiy_outer_core + offset,mp->d_xiz_outer_core + offset,
+ mp->d_etax_outer_core + offset,mp->d_etay_outer_core + offset,mp->d_etaz_outer_core + offset,
+ mp->d_gammax_outer_core + offset,mp->d_gammay_outer_core + offset,mp->d_gammaz_outer_core + offset,
time,
- mp->d_A_array_rotation + color_offset_nonpadded,
- mp->d_B_array_rotation + color_offset_nonpadded,
- mp->d_b_A_array_rotation + color_offset_nonpadded,
- mp->d_b_B_array_rotation + color_offset_nonpadded,
- *FORWARD_OR_ADJOINT);
+ mp->d_A_array_rotation + offset_nonpadded,
+ mp->d_B_array_rotation + offset_nonpadded,
+ mp->d_b_A_array_rotation + offset_nonpadded,
+ mp->d_b_B_array_rotation + offset_nonpadded,
+ FORWARD_OR_ADJOINT);
// for padded and aligned arrays
- color_offset += nb_blocks_to_compute * NGLL3_PADDED;
+ offset += nb_blocks_to_compute * NGLL3_PADDED;
// for no-aligned arrays
- color_offset_nonpadded += nb_blocks_to_compute * NGLL3;
+ offset_nonpadded += nb_blocks_to_compute * NGLL3;
}
}else{
@@ -705,7 +698,7 @@
mp->d_B_array_rotation,
mp->d_b_A_array_rotation,
mp->d_b_B_array_rotation,
- *FORWARD_OR_ADJOINT);
+ FORWARD_OR_ADJOINT);
}
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu 2013-09-13 11:23:45 UTC (rev 22782)
@@ -223,19 +223,18 @@
void FC_FUNC_(compute_kernels_cm_cuda,
COMPUTE_KERNELS_CM_CUDA)(long* Mesh_pointer,realw* deltat_f) {
-TRACE("compute_kernels_cm_cuda");
+ TRACE("compute_kernels_cm_cuda");
+ // debug
+ DEBUG_EMPTY_BACKWARD();
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
int blocksize = NGLL3;
realw deltat = *deltat_f;
- int num_blocks_x = mp->NSPEC_CRUST_MANTLE;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->NSPEC_CRUST_MANTLE,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
@@ -311,12 +310,9 @@
int blocksize = NGLL3;
realw deltat = *deltat_f;
- int num_blocks_x = mp->NSPEC_INNER_CORE;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->NSPEC_INNER_CORE,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
@@ -532,12 +528,8 @@
int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ
realw deltat = *deltat_f;
- int num_blocks_x = mp->NSPEC_OUTER_CORE;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->NSPEC_OUTER_CORE,&num_blocks_x,&num_blocks_y);
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
@@ -624,12 +616,8 @@
realw deltat = *deltat_f;
- int num_blocks_x = mp->nspec2D_top_crust_mantle;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->nspec2D_top_crust_mantle,&num_blocks_x,&num_blocks_y);
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(NGLL2,1,1);
@@ -702,12 +690,8 @@
int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ
realw deltat = *deltat_f;
- int num_blocks_x = mp->NSPEC_CRUST_MANTLE;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->NSPEC_CRUST_MANTLE,&num_blocks_x,&num_blocks_y);
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu 2013-09-13 11:23:45 UTC (rev 22782)
@@ -245,12 +245,9 @@
// > NGLLSQUARE==NGLL2==25, no further check inside kernel
int blocksize = NGLL2;
- int num_blocks_x = num_abs_boundary_faces;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(num_abs_boundary_faces,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
@@ -455,12 +452,9 @@
// > NGLLSQUARE==NGLL2==25, no further check inside kernel
int blocksize = NGLL2;
- int num_blocks_x = num_abs_boundary_faces;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(num_abs_boundary_faces,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
@@ -488,3 +482,117 @@
#endif
}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// undo_attenuation simulation: stacey for backward/reconstructed wavefield
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(compute_stacey_acoustic_undoatt_cuda,
+ COMPUTE_STACEY_ACOUSTIC_UNDOATT_CUDA)(long* Mesh_pointer_f,
+ int* itype) {
+ TRACE("compute_stacey_acoustic_undoatt_cuda");
+ //double start_time = get_time();
+
+ int num_abs_boundary_faces;
+ int* d_abs_boundary_ispec;
+ realw* d_abs_boundary_jacobian2D;
+ realw* d_wgllwgll;
+ realw* d_b_absorb_potential = NULL;
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+ // checks if anything to do (note: C inequality is !=, not Fortran /=)
+ if( mp->simulation_type != 3 ) return;
+ if( mp->save_forward ) return;
+
+ // absorbing boundary type
+ int interface_type = *itype;
+ switch( interface_type ){
+ case 4:
+ // xmin
+ num_abs_boundary_faces = mp->nspec2D_xmin_outer_core;
+ d_abs_boundary_ispec = mp->d_ibelm_xmin_outer_core;
+ d_abs_boundary_jacobian2D = mp->d_jacobian2D_xmin_outer_core;
+ d_wgllwgll = mp->d_wgllwgll_yz;
+ break;
+
+ case 5:
+ // xmax
+ num_abs_boundary_faces = mp->nspec2D_xmax_outer_core;
+ d_abs_boundary_ispec = mp->d_ibelm_xmax_outer_core;
+ d_abs_boundary_jacobian2D = mp->d_jacobian2D_xmax_outer_core;
+ d_wgllwgll = mp->d_wgllwgll_yz;
+ break;
+
+ case 6:
+ // ymin
+ num_abs_boundary_faces = mp->nspec2D_ymin_outer_core;
+ d_abs_boundary_ispec = mp->d_ibelm_ymin_outer_core;
+ d_abs_boundary_jacobian2D = mp->d_jacobian2D_ymin_outer_core;
+ d_wgllwgll = mp->d_wgllwgll_xz;
+ break;
+
+ case 7:
+ // ymax
+ num_abs_boundary_faces = mp->nspec2D_ymax_outer_core;
+ d_abs_boundary_ispec = mp->d_ibelm_ymax_outer_core;
+ d_abs_boundary_jacobian2D = mp->d_jacobian2D_ymax_outer_core;
+ d_wgllwgll = mp->d_wgllwgll_xz;
+ break;
+
+ case 8:
+ // zmin
+ num_abs_boundary_faces = mp->nspec2D_zmin_outer_core;
+ d_abs_boundary_ispec = mp->d_ibelm_bottom_outer_core;
+ d_abs_boundary_jacobian2D = mp->d_jacobian2D_bottom_outer_core;
+ d_wgllwgll = mp->d_wgllwgll_xy;
+ break;
+
+ default:
+ exit_on_cuda_error("compute_stacey_acoustic_undoatt_cuda: unknown interface type");
+ break;
+ }
+
+ // checks if anything to do
+ if( num_abs_boundary_faces == 0 ) return;
+
+ // way 1: Elapsed time: 4.385948e-03
+ // > NGLLSQUARE==NGLL2==25, but we handle this inside kernel
+ // int blocksize = 32;
+
+ // way 2: Elapsed time: 4.379034e-03
+ // > NGLLSQUARE==NGLL2==25, no further check inside kernel
+ int blocksize = NGLL2;
+
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(num_abs_boundary_faces,&num_blocks_x,&num_blocks_y);
+
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
+
+ compute_stacey_acoustic_kernel<<<grid,threads>>>(mp->d_b_veloc_outer_core,
+ mp->d_b_accel_outer_core,
+ interface_type,
+ num_abs_boundary_faces,
+ d_abs_boundary_ispec,
+ mp->d_nkmin_xi_outer_core,
+ mp->d_nkmin_eta_outer_core,
+ mp->d_njmin_outer_core,
+ mp->d_njmax_outer_core,
+ mp->d_nimin_outer_core,
+ mp->d_nimax_outer_core,
+ d_abs_boundary_jacobian2D,
+ d_wgllwgll,
+ mp->d_ibool_outer_core,
+ mp->d_vp_outer_core,
+ mp->save_forward,
+ d_b_absorb_potential);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("compute_stacey_acoustic_undoatt_cuda");
+#endif
+}
+
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu 2013-09-13 11:23:45 UTC (rev 22782)
@@ -252,12 +252,9 @@
// > NGLLSQUARE==NGLL2==25, no further check inside kernel
int blocksize = NGLL2;
- int num_blocks_x = num_abs_boundary_faces;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(num_abs_boundary_faces,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
@@ -401,7 +398,7 @@
realw* absorb_field,
int* itype) {
-TRACE("compute_stacey_elastic_backward_cuda");
+ TRACE("compute_stacey_elastic_backward_cuda");
int num_abs_boundary_faces;
int* d_abs_boundary_ispec;
@@ -456,21 +453,16 @@
// > NGLLSQUARE==NGLL2==25, no further check inside kernel
int blocksize = NGLL2;
- int num_blocks_x = num_abs_boundary_faces;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(num_abs_boundary_faces,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
// adjoint simulations: needs absorbing boundary buffer
- if( num_abs_boundary_faces > 0 ){
- // copies array to GPU
- print_CUDA_error_if_any(cudaMemcpy(d_b_absorb_field,absorb_field,
- NDIM*NGLL2*num_abs_boundary_faces*sizeof(realw),cudaMemcpyHostToDevice),7700);
- }
+ // copies array to GPU
+ print_CUDA_error_if_any(cudaMemcpy(d_b_absorb_field,absorb_field,
+ NDIM*NGLL2*num_abs_boundary_faces*sizeof(realw),cudaMemcpyHostToDevice),7700);
// absorbing boundary contributions
compute_stacey_elastic_backward_kernel<<<grid,threads>>>(mp->d_b_accel_crust_mantle,
@@ -489,3 +481,117 @@
#endif
}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// undo_attenuation simulation: stacey for backward/reconstructed wavefield
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(compute_stacey_elastic_undoatt_cuda,
+ COMPUTE_STACEY_ELASTIC_UNDOATT_CUDA)(long* Mesh_pointer_f,
+ int* itype) {
+
+ TRACE("compute_stacey_elastic_undoatt_cuda");
+
+ int num_abs_boundary_faces;
+ int* d_abs_boundary_ispec;
+ realw* d_abs_boundary_normal;
+ realw* d_abs_boundary_jacobian2D;
+ realw* d_wgllwgll;
+ realw* d_b_absorb_field = NULL;
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+ // checks if anything to do (note: C inequality is !=, not Fortran /=)
+ if( mp->simulation_type != 3 ) return;
+ if( mp->save_forward ) return;
+
+ // absorbing boundary type
+ int interface_type = *itype;
+ switch( interface_type ){
+ case 0:
+ // xmin
+ num_abs_boundary_faces = mp->nspec2D_xmin_crust_mantle;
+ d_abs_boundary_ispec = mp->d_ibelm_xmin_crust_mantle;
+ d_abs_boundary_normal = mp->d_normal_xmin_crust_mantle;
+ d_abs_boundary_jacobian2D = mp->d_jacobian2D_xmin_crust_mantle;
+ d_wgllwgll = mp->d_wgllwgll_yz;
+ break;
+
+ case 1:
+ // xmax
+ num_abs_boundary_faces = mp->nspec2D_xmax_crust_mantle;
+ d_abs_boundary_ispec = mp->d_ibelm_xmax_crust_mantle;
+ d_abs_boundary_normal = mp->d_normal_xmax_crust_mantle;
+ d_abs_boundary_jacobian2D = mp->d_jacobian2D_xmax_crust_mantle;
+ d_wgllwgll = mp->d_wgllwgll_yz;
+ break;
+
+ case 2:
+ // ymin
+ num_abs_boundary_faces = mp->nspec2D_ymin_crust_mantle;
+ d_abs_boundary_ispec = mp->d_ibelm_ymin_crust_mantle;
+ d_abs_boundary_normal = mp->d_normal_ymin_crust_mantle;
+ d_abs_boundary_jacobian2D = mp->d_jacobian2D_ymin_crust_mantle;
+ d_wgllwgll = mp->d_wgllwgll_xz;
+ break;
+
+ case 3:
+ // ymax
+ num_abs_boundary_faces = mp->nspec2D_ymax_crust_mantle;
+ d_abs_boundary_ispec = mp->d_ibelm_ymax_crust_mantle;
+ d_abs_boundary_normal = mp->d_normal_ymax_crust_mantle;
+ d_abs_boundary_jacobian2D = mp->d_jacobian2D_ymax_crust_mantle;
+ d_wgllwgll = mp->d_wgllwgll_xz;
+ break;
+
+ default:
+ exit_on_cuda_error("compute_stacey_elastic_undoatt_cuda: unknown interface type");
+ break;
+ }
+
+ // checks if anything to do
+ if( num_abs_boundary_faces == 0 ) return;
+
+ // way 1
+ // > NGLLSQUARE==NGLL2==25, but we handle this inside kernel
+ //int blocksize = 32;
+
+ // way 2: seems slightly faster
+ // > NGLLSQUARE==NGLL2==25, no further check inside kernel
+ int blocksize = NGLL2;
+
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(num_abs_boundary_faces,&num_blocks_x,&num_blocks_y);
+
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
+
+ // absorbing boundary contributions
+ compute_stacey_elastic_kernel<<<grid,threads>>>(mp->d_b_veloc_crust_mantle,
+ mp->d_b_accel_crust_mantle,
+ interface_type,
+ num_abs_boundary_faces,
+ d_abs_boundary_ispec,
+ mp->d_nkmin_xi_crust_mantle,
+ mp->d_nkmin_eta_crust_mantle,
+ mp->d_njmin_crust_mantle,
+ mp->d_njmax_crust_mantle,
+ mp->d_nimin_crust_mantle,
+ mp->d_nimax_crust_mantle,
+ d_abs_boundary_normal,
+ d_abs_boundary_jacobian2D,
+ d_wgllwgll,
+ mp->d_ibool_crust_mantle,
+ mp->d_rho_vp_crust_mantle,
+ mp->d_rho_vs_crust_mantle,
+ mp->save_forward,
+ d_b_absorb_field);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("compute_stacey_elastic_undoatt_cuda");
+#endif
+}
+
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu 2013-09-13 11:23:45 UTC (rev 22782)
@@ -88,7 +88,7 @@
// Gets number of GPU devices
device_count = 0;
cudaGetDeviceCount(&device_count);
- exit_on_cuda_error("CUDA runtime error: cudaGetDeviceCount failed\ncheck if driver and runtime libraries work together\nexiting...\n");
+ exit_on_cuda_error("CUDA runtime error: cudaGetDeviceCount failed\n\nplease check if driver and runtime libraries work together\nor on titan enable environment: CRAY_CUDA_PROXY=1 to use single GPU with multiple MPI processes\n\nexiting...\n");
// returns device count to fortran
if (device_count == 0) exit_on_error("CUDA runtime error: there is no device supporting CUDA\n");
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h 2013-09-13 11:23:45 UTC (rev 22782)
@@ -70,20 +70,46 @@
#define PRINT5(var,offset) // for(i=0;i<10;i++) printf("var(%d)=%f\n",i,var[offset+i]);
#endif
+// daniel debug: run backward simulations with empty arrays to check
+#define DEBUG_BACKWARD_SIMULATIONS 0
+#if DEBUG_BACKWARD_SIMULATIONS == 1
+#define DEBUG_EMPTY_BACKWARD() return;
+#else
+#define DEBUG_EMPTY_BACKWARD()
+#endif
+
+
+// error checking after cuda function calls
+#define ENABLE_VERY_SLOW_ERROR_CHECKING
+
// maximum function
#define MAX(x,y) (((x) < (y)) ? (y) : (x))
+/* ----------------------------------------------------------------------------------------------- */
+
+// type of "working" variables: see also CUSTOM_REAL
+// double precision temporary variables leads to 10% performance decrease
+// in Kernel_2_impl (not very much..)
+typedef float realw;
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
// utility functions: defined in check_fields_cuda.cu
+
+/* ----------------------------------------------------------------------------------------------- */
+
double get_time();
void get_free_memory(double* free_db, double* used_db, double* total_db);
void print_CUDA_error_if_any(cudaError_t err, int num);
void pause_for_debugger(int pause);
void exit_on_cuda_error(char* kernel_name);
void exit_on_error(char* info);
+void synchronize_cuda();
+void synchronize_mpi();
+void get_blocks_xy(int num_blocks,int* num_blocks_x,int* num_blocks_y);
+realw get_device_array_maximum_value(realw* array,int size);
-// error checking after cuda function calls
-#define ENABLE_VERY_SLOW_ERROR_CHECKING
-
/* ----------------------------------------------------------------------------------------------- */
// cuda constant arrays
@@ -122,14 +148,6 @@
/* ----------------------------------------------------------------------------------------------- */
-// type of "working" variables: see also CUSTOM_REAL
-// double precision temporary variables leads to 10% performance decrease
-// in Kernel_2_impl (not very much..)
-typedef float realw;
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
// (optional) pre-processing directive used in kernels: if defined check that it is also set in src/shared/constants.h:
// leads up to ~ 5% performance increase
//#define USE_MESH_COLORING_GPU
@@ -246,7 +264,6 @@
realw* d_b_rmassy_crust_mantle;
realw* d_b_rmassz_crust_mantle;
-
// global indexing
int* d_ibool_crust_mantle;
int* d_ispec_is_tiso_crust_mantle;
@@ -283,12 +300,6 @@
// backward/reconstructed elastic wavefield
realw* d_b_displ_crust_mantle; realw* d_b_veloc_crust_mantle; realw* d_b_accel_crust_mantle;
-//#ifdef USE_TEXTURES_FIELDS
-// // Texture references for fast non-coalesced scattered access
-// const textureReference* d_displ_cm_tex_ref_ptr;
-// const textureReference* d_accel_cm_tex_ref_ptr;
-//#endif
-
// attenuation
realw* d_R_xx_crust_mantle;
realw* d_R_yy_crust_mantle;
@@ -370,12 +381,6 @@
// backward/reconstructed elastic wavefield
realw* d_b_displ_outer_core; realw* d_b_veloc_outer_core; realw* d_b_accel_outer_core;
-//#ifdef USE_TEXTURES_FIELDS
-// // Texture references for fast non-coalesced scattered access
-// const textureReference* d_displ_oc_tex_ref_ptr;
-// const textureReference* d_accel_oc_tex_ref_ptr;
-//#endif
-
// kernels
realw* d_rho_kl_outer_core;
realw* d_alpha_kl_outer_core;
@@ -447,12 +452,6 @@
// backward/reconstructed elastic wavefield
realw* d_b_displ_inner_core; realw* d_b_veloc_inner_core; realw* d_b_accel_inner_core;
-//#ifdef USE_TEXTURES_FIELDS
-// // Texture references for fast non-coalesced scattered access
-// const textureReference* d_displ_ic_tex_ref_ptr;
-// const textureReference* d_accel_ic_tex_ref_ptr;
-//#endif
-
// attenuation
realw* d_R_xx_inner_core;
realw* d_R_yy_inner_core;
@@ -466,7 +465,6 @@
realw* d_b_R_xz_inner_core;
realw* d_b_R_yz_inner_core;
-
realw* d_factor_common_inner_core;
realw* d_one_minus_sum_beta_inner_core;
@@ -534,11 +532,6 @@
//realw* d_hprime_yy; // only needed if NGLLX != NGLLY != NGLLZ
//realw* d_hprime_zz; // only needed if NGLLX != NGLLY != NGLLZ
-//#ifdef USE_TEXTURES_CONSTANTS
-// const textureReference* d_hprime_xx_tex_ptr;
-// realw* d_hprime_xx_tex;
-//#endif
-
realw* d_hprimewgll_xx;
//realw* d_hprimewgll_yy; // only needed if NGLLX != NGLLY != NGLLZ
//realw* d_hprimewgll_zz; // only needed if NGLLX != NGLLY != NGLLZ
@@ -635,19 +628,25 @@
int max_nibool_interfaces_cm;
int* d_nibool_interfaces_crust_mantle;
int* d_ibool_interfaces_crust_mantle;
+
realw* d_send_accel_buffer_crust_mantle;
+ realw* d_b_send_accel_buffer_crust_mantle;
int num_interfaces_inner_core;
int max_nibool_interfaces_ic;
int* d_nibool_interfaces_inner_core;
int* d_ibool_interfaces_inner_core;
+
realw* d_send_accel_buffer_inner_core;
+ realw* d_b_send_accel_buffer_inner_core;
int num_interfaces_outer_core;
int max_nibool_interfaces_oc;
int* d_nibool_interfaces_outer_core;
int* d_ibool_interfaces_outer_core;
+
realw* d_send_accel_buffer_outer_core;
+ realw* d_b_send_accel_buffer_outer_core;
// ------------------------------------------------------------------ //
// absorbing boundaries
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/noise_tomography_cuda.cu 2013-09-13 11:23:45 UTC (rev 22782)
@@ -155,12 +155,9 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
- int num_blocks_x = mp->nspec2D_top_crust_mantle;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->nspec2D_top_crust_mantle,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y,1);
dim3 threads(NGLL2,1,1);
@@ -305,12 +302,9 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- int num_blocks_x = mp->nspec2D_top_crust_mantle;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->nspec2D_top_crust_mantle,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y,1);
dim3 threads(NGLL2,1,1);
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu 2013-09-13 11:23:45 UTC (rev 22782)
@@ -912,6 +912,11 @@
// allocates mpi buffer for exchange with cpu
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_crust_mantle),
NDIM*(mp->max_nibool_interfaces_cm)*(mp->num_interfaces_crust_mantle)*sizeof(realw)),4004);
+ if( mp->simulation_type == 3){
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_send_accel_buffer_crust_mantle),
+ NDIM*(mp->max_nibool_interfaces_cm)*(mp->num_interfaces_crust_mantle)*sizeof(realw)),4004);
+ }
+
}
// inner core mesh
@@ -927,6 +932,11 @@
// allocates mpi buffer for exchange with cpu
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_inner_core),
NDIM*(mp->max_nibool_interfaces_ic)*(mp->num_interfaces_inner_core)*sizeof(realw)),4004);
+ if( mp->simulation_type == 3){
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_send_accel_buffer_inner_core),
+ NDIM*(mp->max_nibool_interfaces_ic)*(mp->num_interfaces_inner_core)*sizeof(realw)),4004);
+ }
+
}
// outer core mesh
@@ -943,6 +953,10 @@
// allocates mpi buffer for exchange with cpu
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer_outer_core),
(mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw)),4004);
+ if( mp->simulation_type == 3){
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_send_accel_buffer_outer_core),
+ (mp->max_nibool_interfaces_oc)*(mp->num_interfaces_outer_core)*sizeof(realw)),4004);
+ }
}
}
@@ -1318,6 +1332,13 @@
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ_crust_mantle),sizeof(realw)*size),4011);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc_crust_mantle),sizeof(realw)*size),4012);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel_crust_mantle),sizeof(realw)*size),4013);
+ // debug
+ #if DEBUG_BACKWARD_SIMULATIONS == 1
+ //debugging with empty arrays
+ print_CUDA_error_if_any(cudaMemset(mp->d_b_displ_crust_mantle,0,sizeof(realw)*size),5111);
+ print_CUDA_error_if_any(cudaMemset(mp->d_b_veloc_crust_mantle,0,sizeof(realw)*size),5111);
+ print_CUDA_error_if_any(cudaMemset(mp->d_b_accel_crust_mantle,0,sizeof(realw)*size),5111);
+ #endif
}
#ifdef USE_TEXTURES_FIELDS
@@ -1522,26 +1543,32 @@
mp->nspec2D_top_outer_core = *NSPEC2D_TOP_OC;
mp->nspec2D_bottom_outer_core = *NSPEC2D_BOTTOM_OC;
+ copy_todevice_int((void**)&mp->d_ibelm_top_outer_core,h_ibelm_top_outer_core,mp->nspec2D_top_outer_core);
int size_toc = NGLL2*(mp->nspec2D_top_outer_core);
- copy_todevice_int((void**)&mp->d_ibelm_top_outer_core,h_ibelm_top_outer_core,mp->nspec2D_top_outer_core);
copy_todevice_realw((void**)&mp->d_jacobian2D_top_outer_core,h_jacobian2D_top_outer_core,size_toc);
copy_todevice_realw((void**)&mp->d_normal_top_outer_core,h_normal_top_outer_core,NDIM*size_toc);
+ copy_todevice_int((void**)&mp->d_ibelm_bottom_outer_core,h_ibelm_bottom_outer_core,mp->nspec2D_bottom_outer_core);
int size_boc = NGLL2*(mp->nspec2D_bottom_outer_core);
- copy_todevice_int((void**)&mp->d_ibelm_bottom_outer_core,h_ibelm_bottom_outer_core,mp->nspec2D_bottom_outer_core);
copy_todevice_realw((void**)&mp->d_jacobian2D_bottom_outer_core,h_jacobian2D_bottom_outer_core,size_boc);
copy_todevice_realw((void**)&mp->d_normal_bottom_outer_core,h_normal_bottom_outer_core,NDIM*size_boc);
// wavefield
- int size = mp->NGLOB_OUTER_CORE;
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ_outer_core),sizeof(realw)*size),5001);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc_outer_core),sizeof(realw)*size),5002);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel_outer_core),sizeof(realw)*size),5003);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ_outer_core),sizeof(realw)*size_glob),5001);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc_outer_core),sizeof(realw)*size_glob),5002);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel_outer_core),sizeof(realw)*size_glob),5003);
// backward/reconstructed wavefield
if( mp->simulation_type == 3 ){
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ_outer_core),sizeof(realw)*size),5011);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc_outer_core),sizeof(realw)*size),5022);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel_outer_core),sizeof(realw)*size),5033);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ_outer_core),sizeof(realw)*size_glob),5011);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc_outer_core),sizeof(realw)*size_glob),5022);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel_outer_core),sizeof(realw)*size_glob),5033);
+ // debug
+ #if DEBUG_BACKWARD_SIMULATIONS == 1
+ //debugging with empty arrays
+ print_CUDA_error_if_any(cudaMemset(mp->d_b_displ_outer_core,0,sizeof(realw)*size_glob),5111);
+ print_CUDA_error_if_any(cudaMemset(mp->d_b_veloc_outer_core,0,sizeof(realw)*size_glob),5111);
+ print_CUDA_error_if_any(cudaMemset(mp->d_b_accel_outer_core,0,sizeof(realw)*size_glob),5111);
+ #endif
}
#ifdef USE_TEXTURES_FIELDS
@@ -1551,21 +1578,21 @@
print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_displ_oc_tex_ref_ptr, "d_displ_oc_tex"), 5021);
cudaChannelFormatDesc channelDesc1 = cudaCreateChannelDesc<realw>();
print_CUDA_error_if_any(cudaBindTexture(0, mp->d_displ_oc_tex_ref_ptr, mp->d_displ_outer_core,
- &channelDesc1, sizeof(realw)*size), 5021);
+ &channelDesc1, sizeof(realw)*size_glob), 5021);
const textureReference* d_accel_oc_tex_ref_ptr;
print_CUDA_error_if_any(cudaGetTextureReference(&d_accel_oc_tex_ref_ptr, "d_accel_oc_tex"), 5023);
cudaChannelFormatDesc channelDesc2 = cudaCreateChannelDesc<realw>();
print_CUDA_error_if_any(cudaBindTexture(0, d_accel_oc_tex_ref_ptr, mp->d_accel_outer_core,
- &channelDesc2, sizeof(realw)*size), 5023);
+ &channelDesc2, sizeof(realw)*size_glob), 5023);
#else
cudaChannelFormatDesc channelDesc1 = cudaCreateChannelDesc<float>();
print_CUDA_error_if_any(cudaBindTexture(0, &d_displ_oc_tex, mp->d_displ_outer_core,
- &channelDesc1, sizeof(realw)*size), 5021);
+ &channelDesc1, sizeof(realw)*size_glob), 5021);
cudaChannelFormatDesc channelDesc2 = cudaCreateChannelDesc<float>();
print_CUDA_error_if_any(cudaBindTexture(0, &d_accel_oc_tex, mp->d_accel_outer_core,
- &channelDesc2, sizeof(realw)*size), 5023);
+ &channelDesc2, sizeof(realw)*size_glob), 5023);
#endif
}
#endif
@@ -1579,7 +1606,7 @@
mp->d_b_rmass_outer_core = mp->d_rmass_outer_core;
//kernels
- size = NGLL3*(mp->NSPEC_OUTER_CORE);
+ int size = NGLL3*(mp->NSPEC_OUTER_CORE);
// density kernel
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_kl_outer_core),
@@ -1750,6 +1777,8 @@
mp->nspec_outer_inner_core = *nspec_outer;
mp->nspec_inner_inner_core = *nspec_inner;
+
+ // boundary elements on top
mp->nspec2D_top_inner_core = *NSPEC2D_TOP_IC;
copy_todevice_int((void**)&mp->d_ibelm_top_inner_core,h_ibelm_top_inner_core,mp->nspec2D_top_inner_core);
@@ -1763,6 +1792,13 @@
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ_inner_core),sizeof(realw)*size),6011);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc_inner_core),sizeof(realw)*size),6012);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel_inner_core),sizeof(realw)*size),6013);
+ // debug
+ #if DEBUG_BACKWARD_SIMULATIONS == 1
+ // debugging with empty arrays
+ print_CUDA_error_if_any(cudaMemset(mp->d_b_displ_inner_core,0,sizeof(realw)*size),5111);
+ print_CUDA_error_if_any(cudaMemset(mp->d_b_veloc_inner_core,0,sizeof(realw)*size),5111);
+ print_CUDA_error_if_any(cudaMemset(mp->d_b_accel_inner_core,0,sizeof(realw)*size),5111);
+ #endif
}
#ifdef USE_TEXTURES_FIELDS
@@ -2074,16 +2110,19 @@
cudaFree(mp->d_nibool_interfaces_crust_mantle);
cudaFree(mp->d_ibool_interfaces_crust_mantle);
cudaFree(mp->d_send_accel_buffer_crust_mantle);
+ if( mp->simulation_type == 3 ) cudaFree(mp->d_b_send_accel_buffer_crust_mantle);
}
if( mp->num_interfaces_inner_core > 0 ){
cudaFree(mp->d_nibool_interfaces_inner_core);
cudaFree(mp->d_ibool_interfaces_inner_core);
cudaFree(mp->d_send_accel_buffer_inner_core);
+ if( mp->simulation_type == 3 ) cudaFree(mp->d_b_send_accel_buffer_inner_core);
}
if( mp->num_interfaces_outer_core > 0 ){
cudaFree(mp->d_nibool_interfaces_outer_core);
cudaFree(mp->d_ibool_interfaces_outer_core);
cudaFree(mp->d_send_accel_buffer_outer_core);
+ if( mp->simulation_type == 3 ) cudaFree(mp->d_b_send_accel_buffer_outer_core);
}
//------------------------------------------
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c 2013-09-13 11:23:45 UTC (rev 22782)
@@ -57,9 +57,9 @@
void FC_FUNC_(transfer_boun_accel_from_device,
TRANSFER_BOUN_ACCEL_FROM_DEVICE)(long* Mesh_pointer_f,
- realw* send_accel_buffer,
- int* IREGION,
- int* FORWARD_OR_ADJOINT){}
+ realw* send_accel_buffer,
+ int* IREGION,
+ int* FORWARD_OR_ADJOINT){}
void FC_FUNC_(transfer_asmbl_accel_to_device,
TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer,
@@ -84,12 +84,12 @@
void FC_FUNC_(check_norm_acoustic_from_device,
CHECK_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
long* Mesh_pointer_f,
- int* SIMULATION_TYPE) {}
+ int* FORWARD_OR_ADJOINT) {}
void FC_FUNC_(check_norm_elastic_from_device,
CHECK_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
long* Mesh_pointer_f,
- int* SIMULATION_TYPE) {}
+ int* FORWARD_OR_ADJOINT) {}
void FC_FUNC_(check_norm_strain_from_device,
CHECK_NORM_STRAIN_FROM_DEVICE)(realw* strain_norm,
@@ -131,23 +131,23 @@
// src/cuda/compute_add_sources_elastic_cuda.cu
//
-void FC_FUNC_(compute_add_sources_el_cuda,
- COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f,
- int* NSOURCESf,
- double* h_stf_pre_compute) {}
+void FC_FUNC_(compute_add_sources_cuda,
+ COMPUTE_ADD_SOURCES_CUDA)(long* Mesh_pointer_f,
+ int* NSOURCESf,
+ double* h_stf_pre_compute) {}
-void FC_FUNC_(compute_add_sources_el_s3_cuda,
- COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer_f,
- int* NSOURCESf,
- double* h_stf_pre_compute) {}
+void FC_FUNC_(compute_add_sources_backward_cuda,
+ COMPUTE_ADD_SOURCES_BACKWARD_CUDA)(long* Mesh_pointer_f,
+ int* NSOURCESf,
+ double* h_stf_pre_compute) {}
-void FC_FUNC_(add_sources_el_sim_type_2_or_3,
- ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
- int* nrec,
- realw* h_adj_sourcearrays,
- int* h_islice_selected_rec,
- int* h_ispec_selected_rec,
- int* time_index) {}
+void FC_FUNC_(compute_add_sources_adjoint_cuda,
+ COMPUTE_ADD_SOURCES_ADJOINT_CUDA)(long* Mesh_pointer,
+ int* nrec,
+ realw* h_adj_sourcearrays,
+ int* h_islice_selected_rec,
+ int* h_ispec_selected_rec,
+ int* time_index) {}
//
@@ -172,8 +172,6 @@
void FC_FUNC_(compute_coupling_ocean_cuda,
COMPUTE_COUPLING_OCEAN_CUDA)(long* Mesh_pointer_f,
- int* NCHUNKS_VAL,
- int* exact_mass_matrix_for_rotation,
int* FORWARD_OR_ADJOINT) {}
@@ -183,7 +181,8 @@
void FC_FUNC_(compute_forces_crust_mantle_cuda,
COMPUTE_FORCES_CRUST_MANTLE_CUDA)(long* Mesh_pointer_f,
- int* iphase) {}
+ int* iphase,
+ int* FORWARD_OR_ADJOINT_f) {}
//
@@ -192,7 +191,8 @@
void FC_FUNC_(compute_forces_inner_core_cuda,
COMPUTE_FORCES_INNER_CORE_CUDA)(long* Mesh_pointer_f,
- int* iphase) {}
+ int* iphase,
+ int* FORWARD_OR_ADJOINT_f) {}
//
@@ -203,7 +203,7 @@
COMPUTE_FORCES_OUTER_CORE_CUDA)(long* Mesh_pointer_f,
int* iphase,
realw* time_f,
- int* FORWARD_OR_ADJOINT) {}
+ int* FORWARD_OR_ADJOINT_f) {}
//
@@ -243,7 +243,11 @@
realw* absorb_potential,
int* itype) {}
+void FC_FUNC_(compute_stacey_acoustic_undoatt_cuda,
+ COMPUTE_STACEY_ACOUSTIC_UNDOATT_CUDA)(long* Mesh_pointer_f,
+ int* itype) {}
+
//
// src/cuda/compute_stacey_elastic_cuda.cu
//
@@ -258,7 +262,11 @@
realw* absorb_field,
int* itype) {}
+void FC_FUNC_(compute_stacey_elastic_undoatt_cuda,
+ COMPUTE_STACEY_ELASTIC_UNDOATT_CUDA)(long* Mesh_pointer_f,
+ int* itype) {}
+
//
// src/cuda/initialize_cuda.cu
//
@@ -685,19 +693,19 @@
void FC_FUNC_(transfer_b_rmemory_cm_to_device,
TRANSFER_B_RMEMORY_CM_TO_DEVICE)(long* Mesh_pointer,
- realw* b_R_xx,
- realw* b_R_yy,
- realw* b_R_xy,
- realw* b_R_xz,
- realw* b_R_yz) {}
+ realw* b_R_xx,
+ realw* b_R_yy,
+ realw* b_R_xy,
+ realw* b_R_xz,
+ realw* b_R_yz) {}
void FC_FUNC_(transfer_b_rmemory_ic_to_device,
TRANSFER_B_RMEMORY_IC_TO_DEVICE)(long* Mesh_pointer,
- realw* b_R_xx,
- realw* b_R_yy,
- realw* b_R_xy,
- realw* b_R_xz,
- realw* b_R_yz) {}
+ realw* b_R_xx,
+ realw* b_R_yy,
+ realw* b_R_xy,
+ realw* b_R_xz,
+ realw* b_R_yz) {}
void FC_FUNC_(transfer_rotation_from_device,
TRANSFER_ROTATION_FROM_DEVICE)(long* Mesh_pointer,
@@ -747,42 +755,44 @@
void FC_FUNC_(update_displacement_ic_cuda,
UPDATE_DISPLACMENT_IC_CUDA)(long* Mesh_pointer_f,
- realw* deltat_F,
- realw* deltatsqover2_F,
- realw* deltatover2_F,
+ realw* deltat_f,
+ realw* deltatsqover2_f,
+ realw* deltatover2_f,
int* FORWARD_OR_ADJOINT) {}
void FC_FUNC_(update_displacement_cm_cuda,
UPDATE_DISPLACMENT_CM_CUDA)(long* Mesh_pointer_f,
- realw* deltat_F,
- realw* deltatsqover2_F,
- realw* deltatover2_F,
+ realw* deltat_f,
+ realw* deltatsqover2_f,
+ realw* deltatover2_f,
int* FORWARD_OR_ADJOINT) {}
void FC_FUNC_(update_displacement_oc_cuda,
UPDATE_DISPLACEMENT_OC_cuda)(long* Mesh_pointer_f,
- realw* deltat_F,
- realw* deltatsqover2_F,
- realw* deltatover2_F,
+ realw* deltat_f,
+ realw* deltatsqover2_f,
+ realw* deltatover2_f,
int* FORWARD_OR_ADJOINT) {}
-void FC_FUNC_(update_accel_3_a_cuda,
- UPDATE_ACCEL_3_A_CUDA)(long* Mesh_pointer,
- realw* deltatover2_F,
- int* NCHUNKS_VAL,
- int* FORWARD_OR_ADJOINT) {}
+void FC_FUNC_(multiply_accel_elastic_cuda,
+ MULTIPLY_ACCEL_ELASTIC_CUDA)(long* Mesh_pointer,
+ int* FORWARD_OR_ADJOINT) {}
-void FC_FUNC_(update_veloc_3_b_cuda,
- UPDATE_VELOC_3_B_CUDA)(long* Mesh_pointer,
- realw* deltatover2_F,
- int* FORWARD_OR_ADJOINT) {}
+void FC_FUNC_(update_veloc_elastic_cuda,
+ UPDATE_VELOC_ELASTIC_CUDA)(long* Mesh_pointer,
+ realw* deltatover2_f,
+ int* FORWARD_OR_ADJOINT) {}
-void FC_FUNC_(kernel_3_outer_core_cuda,
- KERNEL_3_OUTER_CORE_CUDA)(long* Mesh_pointer,
- realw* deltatover2_F,
- int* FORWARD_OR_ADJOINT) {}
+void FC_FUNC_(multiply_accel_acoustic_cuda,
+ MULTIPLY_ACCEL_ACOUSTIC_CUDA)(long* Mesh_pointer,
+ int* FORWARD_OR_ADJOINT) {}
+void FC_FUNC_(update_veloc_acoustic_cuda,
+ UPDATE_VELOC_ACOUSTIC_CUDA)(long* Mesh_pointer,
+ realw* deltatover2_f,
+ int* FORWARD_OR_ADJOINT) {}
+
//
// src/cuda/write_seismograms_cuda.cu
//
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/transfer_fields_cuda.cu 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/transfer_fields_cuda.cu 2013-09-13 11:23:45 UTC (rev 22782)
@@ -106,6 +106,8 @@
long* Mesh_pointer_f) {
TRACE("transfer_fields_b_cm_to_device");
+ // debug
+ DEBUG_EMPTY_BACKWARD();
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_displ_crust_mantle,b_displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40003);
@@ -121,6 +123,8 @@
long* Mesh_pointer_f) {
TRACE("transfer_fields_b_ic_to_device");
+ // debug
+ DEBUG_EMPTY_BACKWARD();
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_displ_inner_core,b_displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40003);
@@ -136,6 +140,8 @@
long* Mesh_pointer_f) {
TRACE("transfer_fields_b_oc_to_device");
+ // debug
+ DEBUG_EMPTY_BACKWARD();
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_displ_outer_core,b_displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40003);
@@ -483,13 +489,13 @@
int size = NGLL3*mp->NSPEC_CRUST_MANTLE;
- cudaMemcpy(eps_trace_over_3,mp->d_eps_trace_over_3_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost);
+ print_CUDA_error_if_any(cudaMemcpy(eps_trace_over_3,mp->d_eps_trace_over_3_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost),320001);
- cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost);
+ print_CUDA_error_if_any(cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost),320002);
+ print_CUDA_error_if_any(cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost),320003);
+ print_CUDA_error_if_any(cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost),320004);
+ print_CUDA_error_if_any(cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost),320005);
+ print_CUDA_error_if_any(cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz_crust_mantle,size*sizeof(realw),cudaMemcpyDeviceToHost),320006);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -510,16 +516,21 @@
realw* epsilondev_xz,
realw* epsilondev_yz) {
TRACE("transfer_b_strain_cm_to_device");
+ // debug
+ DEBUG_EMPTY_BACKWARD();
+
//get mesh pointer out of fortran integer container
Mesh* mp = (Mesh*)(*Mesh_pointer);
int size = NGLL3*mp->NSPEC_CRUST_MANTLE;
- cudaMemcpy(mp->d_b_epsilondev_xx_crust_mantle,epsilondev_xx,size*sizeof(realw),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_epsilondev_yy_crust_mantle,epsilondev_yy,size*sizeof(realw),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_epsilondev_xy_crust_mantle,epsilondev_xy,size*sizeof(realw),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_epsilondev_xz_crust_mantle,epsilondev_xz,size*sizeof(realw),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_epsilondev_yz_crust_mantle,epsilondev_yz,size*sizeof(realw),cudaMemcpyHostToDevice);
+ if( ! mp->undo_attenuation ){
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xx_crust_mantle,epsilondev_xx,size*sizeof(realw),cudaMemcpyHostToDevice),330001);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yy_crust_mantle,epsilondev_yy,size*sizeof(realw),cudaMemcpyHostToDevice),330002);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xy_crust_mantle,epsilondev_xy,size*sizeof(realw),cudaMemcpyHostToDevice),330003);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xz_crust_mantle,epsilondev_xz,size*sizeof(realw),cudaMemcpyHostToDevice),330004);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yz_crust_mantle,epsilondev_yz,size*sizeof(realw),cudaMemcpyHostToDevice),330005);
+ }
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("after transfer_b_strain_cm_to_device");
@@ -545,13 +556,13 @@
int size = NGLL3*mp->NSPEC_INNER_CORE;
- cudaMemcpy(eps_trace_over_3,mp->d_eps_trace_over_3_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost);
+ print_CUDA_error_if_any(cudaMemcpy(eps_trace_over_3,mp->d_eps_trace_over_3_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost),340001);
- cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost);
+ print_CUDA_error_if_any(cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost),340002);
+ print_CUDA_error_if_any(cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost),340003);
+ print_CUDA_error_if_any(cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost),340004);
+ print_CUDA_error_if_any(cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost),340005);
+ print_CUDA_error_if_any(cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz_inner_core,size*sizeof(realw),cudaMemcpyDeviceToHost),340006);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -572,16 +583,21 @@
realw* epsilondev_xz,
realw* epsilondev_yz) {
TRACE("transfer_b_strain_cm_to_device");
+ // debug
+ DEBUG_EMPTY_BACKWARD();
+
//get mesh pointer out of fortran integer container
Mesh* mp = (Mesh*)(*Mesh_pointer);
int size = NGLL3*mp->NSPEC_INNER_CORE;
- cudaMemcpy(mp->d_b_epsilondev_xx_inner_core,epsilondev_xx,size*sizeof(realw),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_epsilondev_yy_inner_core,epsilondev_yy,size*sizeof(realw),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_epsilondev_xy_inner_core,epsilondev_xy,size*sizeof(realw),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_epsilondev_xz_inner_core,epsilondev_xz,size*sizeof(realw),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_epsilondev_yz_inner_core,epsilondev_yz,size*sizeof(realw),cudaMemcpyHostToDevice);
+ if( ! mp->undo_attenuation ){
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xx_inner_core,epsilondev_xx,size*sizeof(realw),cudaMemcpyHostToDevice),350001);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yy_inner_core,epsilondev_yy,size*sizeof(realw),cudaMemcpyHostToDevice),350002);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xy_inner_core,epsilondev_xy,size*sizeof(realw),cudaMemcpyHostToDevice),350003);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xz_inner_core,epsilondev_xz,size*sizeof(realw),cudaMemcpyHostToDevice),350004);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yz_inner_core,epsilondev_yz,size*sizeof(realw),cudaMemcpyHostToDevice),350005);
+ }
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("after transfer_b_strain_ic_to_device");
@@ -599,25 +615,30 @@
extern "C"
void FC_FUNC_(transfer_b_rmemory_cm_to_device,
TRANSFER_B_RMEMORY_CM_TO_DEVICE)(long* Mesh_pointer,
- realw* b_R_xx,
- realw* b_R_yy,
- realw* b_R_xy,
- realw* b_R_xz,
- realw* b_R_yz) {
+ realw* b_R_xx,
+ realw* b_R_yy,
+ realw* b_R_xy,
+ realw* b_R_xz,
+ realw* b_R_yz) {
TRACE("transfer_b_Rmemory_cm_to_device");
+ // debug
+ DEBUG_EMPTY_BACKWARD();
+
//get mesh pointer out of fortran integer container
Mesh* mp = (Mesh*)(*Mesh_pointer);
int size = N_SLS*NGLL3*mp->NSPEC_CRUST_MANTLE;
- cudaMemcpy(mp->d_b_R_xx_crust_mantle,b_R_xx,size*sizeof(realw),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_R_yy_crust_mantle,b_R_yy,size*sizeof(realw),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_R_xy_crust_mantle,b_R_xy,size*sizeof(realw),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_R_xz_crust_mantle,b_R_xz,size*sizeof(realw),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_R_yz_crust_mantle,b_R_yz,size*sizeof(realw),cudaMemcpyHostToDevice);
+ if( ! mp->partial_phys_dispersion_only ){
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xx_crust_mantle,b_R_xx,size*sizeof(realw),cudaMemcpyHostToDevice),360001);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yy_crust_mantle,b_R_yy,size*sizeof(realw),cudaMemcpyHostToDevice),360002);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xy_crust_mantle,b_R_xy,size*sizeof(realw),cudaMemcpyHostToDevice),360003);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xz_crust_mantle,b_R_xz,size*sizeof(realw),cudaMemcpyHostToDevice),360004);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yz_crust_mantle,b_R_yz,size*sizeof(realw),cudaMemcpyHostToDevice),360005);
+ }
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("after transfer_b_Rmemory_cm_to_device");
+ exit_on_cuda_error("after transfer_b_rmemory_cm_to_device");
#endif
}
@@ -629,25 +650,30 @@
extern "C"
void FC_FUNC_(transfer_b_rmemory_ic_to_device,
TRANSFER_B_RMEMORY_IC_TO_DEVICE)(long* Mesh_pointer,
- realw* b_R_xx,
- realw* b_R_yy,
- realw* b_R_xy,
- realw* b_R_xz,
- realw* b_R_yz) {
- TRACE("transfer_b_Rmemory_ic_to_device");
+ realw* b_R_xx,
+ realw* b_R_yy,
+ realw* b_R_xy,
+ realw* b_R_xz,
+ realw* b_R_yz) {
+ TRACE("transfer_b_rmemory_ic_to_device");
+ // debug
+ DEBUG_EMPTY_BACKWARD();
+
//get mesh pointer out of fortran integer container
Mesh* mp = (Mesh*)(*Mesh_pointer);
int size = N_SLS*NGLL3*mp->NSPEC_INNER_CORE;
- cudaMemcpy(mp->d_b_R_xx_inner_core,b_R_xx,size*sizeof(realw),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_R_yy_inner_core,b_R_yy,size*sizeof(realw),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_R_xy_inner_core,b_R_xy,size*sizeof(realw),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_R_xz_inner_core,b_R_xz,size*sizeof(realw),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_R_yz_inner_core,b_R_yz,size*sizeof(realw),cudaMemcpyHostToDevice);
+ if( ! mp->partial_phys_dispersion_only ){
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xx_inner_core,b_R_xx,size*sizeof(realw),cudaMemcpyHostToDevice),370001);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yy_inner_core,b_R_yy,size*sizeof(realw),cudaMemcpyHostToDevice),370002);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xy_inner_core,b_R_xy,size*sizeof(realw),cudaMemcpyHostToDevice),370003);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xz_inner_core,b_R_xz,size*sizeof(realw),cudaMemcpyHostToDevice),370004);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yz_inner_core,b_R_yz,size*sizeof(realw),cudaMemcpyHostToDevice),370005);
+ }
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("after transfer_b_Rmemory_ic_to_device");
+ exit_on_cuda_error("after transfer_b_rmemory_ic_to_device");
#endif
}
@@ -688,6 +714,9 @@
realw* A_array_rotation,
realw* B_array_rotation) {
TRACE("transfer_b_rotation_to_device");
+ // debug
+ DEBUG_EMPTY_BACKWARD();
+
//get mesh pointer out of fortran integer container
Mesh* mp = (Mesh*)(*Mesh_pointer);
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/update_displacement_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/update_displacement_cuda.cu 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/update_displacement_cuda.cu 2013-09-13 11:23:45 UTC (rev 22782)
@@ -48,10 +48,11 @@
realw deltat,
realw deltatsqover2,
realw deltatover2) {
+
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
- /* because of block and grid sizing problems, there is a small */
- /* amount of buffer at the end of the calculation */
+ // because of block and grid sizing problems, there is a small
+ // amount of buffer at the end of the calculation
if(id < size) {
displ[id] = displ[id] + deltat*veloc[id] + deltatsqover2*accel[id];
veloc[id] = veloc[id] + deltatover2*accel[id];
@@ -69,9 +70,9 @@
extern "C"
void FC_FUNC_(update_displacement_ic_cuda,
UPDATE_DISPLACMENT_IC_CUDA)(long* Mesh_pointer_f,
- realw* deltat_F,
- realw* deltatsqover2_F,
- realw* deltatover2_F,
+ realw* deltat_f,
+ realw* deltatsqover2_f,
+ realw* deltatover2_f,
int* FORWARD_OR_ADJOINT) {
TRACE("update_displacement_ic_cuda");
@@ -80,23 +81,30 @@
int size = NDIM * mp->NGLOB_INNER_CORE;
- realw deltat = *deltat_F;
- realw deltatsqover2 = *deltatsqover2_F;
- realw deltatover2 = *deltatover2_F;
+ // debug
+#if DEBUG_BACKWARD_SIMULATIONS == 1
+ realw max_d,max_v,max_a;
+ max_d = get_device_array_maximum_value(mp->d_b_displ_inner_core, size);
+ max_v = get_device_array_maximum_value(mp->d_b_veloc_inner_core, size);
+ max_a = get_device_array_maximum_value(mp->d_b_accel_inner_core, size);
+ printf("rank %d - max inner_core displ: %f veloc: %f accel: %f\n",mp->myrank,max_d,max_v,max_a);
+ fflush(stdout);
+ synchronize_mpi();
+#endif
int blocksize = BLOCKSIZE_KERNEL1;
int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
+ realw deltat = *deltat_f;
+ realw deltatsqover2 = *deltatsqover2_f;
+ realw deltatover2 = *deltatover2_f;
+
if( *FORWARD_OR_ADJOINT == 1 ){
//launch kernel
UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ_inner_core,
@@ -104,6 +112,9 @@
mp->d_accel_inner_core,
size,deltat,deltatsqover2,deltatover2);
}else if( *FORWARD_OR_ADJOINT == 3 ){
+ // debug
+ DEBUG_EMPTY_BACKWARD();
+
// kernel for backward fields
UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_b_displ_inner_core,
mp->d_b_veloc_inner_core,
@@ -126,9 +137,9 @@
extern "C"
void FC_FUNC_(update_displacement_cm_cuda,
UPDATE_DISPLACMENT_CM_CUDA)(long* Mesh_pointer_f,
- realw* deltat_F,
- realw* deltatsqover2_F,
- realw* deltatover2_F,
+ realw* deltat_f,
+ realw* deltatsqover2_f,
+ realw* deltatover2_f,
int* FORWARD_OR_ADJOINT) {
TRACE("update_displacement_cm_cuda");
@@ -137,23 +148,30 @@
int size = NDIM * mp->NGLOB_CRUST_MANTLE;
- realw deltat = *deltat_F;
- realw deltatsqover2 = *deltatsqover2_F;
- realw deltatover2 = *deltatover2_F;
+ // debug
+#if DEBUG_BACKWARD_SIMULATIONS == 1
+ realw max_d,max_v,max_a;
+ max_d = get_device_array_maximum_value(mp->d_b_displ_crust_mantle, size);
+ max_v = get_device_array_maximum_value(mp->d_b_veloc_crust_mantle, size);
+ max_a = get_device_array_maximum_value(mp->d_b_accel_crust_mantle, size);
+ printf("rank %d - max crust_mantle displ: %f veloc: %f accel: %f\n",mp->myrank,max_d,max_v,max_a);
+ fflush(stdout);
+ synchronize_mpi();
+#endif
int blocksize = BLOCKSIZE_KERNEL1;
int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
+ realw deltat = *deltat_f;
+ realw deltatsqover2 = *deltatsqover2_f;
+ realw deltatover2 = *deltatover2_f;
+
if( *FORWARD_OR_ADJOINT == 1 ){
//launch kernel
UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_displ_crust_mantle,
@@ -161,6 +179,9 @@
mp->d_accel_crust_mantle,
size,deltat,deltatsqover2,deltatover2);
}else if( *FORWARD_OR_ADJOINT == 3 ){
+ // debug
+ DEBUG_EMPTY_BACKWARD();
+
// kernel for backward fields
UpdateDispVeloc_kernel<<<grid,threads>>>(mp->d_b_displ_crust_mantle,
mp->d_b_veloc_crust_mantle,
@@ -214,9 +235,9 @@
extern "C"
void FC_FUNC_(update_displacement_oc_cuda,
UPDATE_DISPLACEMENT_OC_cuda)(long* Mesh_pointer_f,
- realw* deltat_F,
- realw* deltatsqover2_F,
- realw* deltatover2_F,
+ realw* deltat_f,
+ realw* deltatsqover2_f,
+ realw* deltatover2_f,
int* FORWARD_OR_ADJOINT) {
TRACE("update_displacement_oc_cuda");
@@ -225,30 +246,40 @@
int size = mp->NGLOB_OUTER_CORE;
- realw deltat = *deltat_F;
- realw deltatsqover2 = *deltatsqover2_F;
- realw deltatover2 = *deltatover2_F;
+ // debug
+#if DEBUG_BACKWARD_SIMULATIONS == 1
+ realw max_d,max_v,max_a;
+ max_d = get_device_array_maximum_value(mp->d_b_displ_outer_core, size);
+ max_v = get_device_array_maximum_value(mp->d_b_veloc_outer_core, size);
+ max_a = get_device_array_maximum_value(mp->d_b_accel_outer_core, size);
+ printf("rank %d - max outer_core displ: %f veloc: %f accel: %f\n",mp->myrank,max_d,max_v,max_a);
+ fflush(stdout);
+ synchronize_mpi();
+#endif
int blocksize = BLOCKSIZE_KERNEL1;
int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
+ realw deltat = *deltat_f;
+ realw deltatsqover2 = *deltatsqover2_f;
+ realw deltatover2 = *deltatover2_f;
+
if( *FORWARD_OR_ADJOINT == 1 ){
//launch kernel
UpdatePotential_kernel<<<grid,threads>>>(mp->d_displ_outer_core,
mp->d_veloc_outer_core,
mp->d_accel_outer_core,
size,deltat,deltatsqover2,deltatover2);
- }else if( *FORWARD_OR_ADJOINT == 1 ){
+ }else if( *FORWARD_OR_ADJOINT == 3 ){
+ // debug
+ DEBUG_EMPTY_BACKWARD();
+
UpdatePotential_kernel<<<grid,threads>>>(mp->d_b_displ_outer_core,
mp->d_b_veloc_outer_core,
mp->d_b_accel_outer_core,
@@ -265,10 +296,12 @@
// KERNEL 3
//
-// crust/mantle and inner core regions
+// elastic domains: crust/mantle and inner core regions
/* ----------------------------------------------------------------------------------------------- */
+// unused...
+/*
__global__ void kernel_3_cuda_device(realw* veloc,
realw* accel,
int size,
@@ -279,8 +312,8 @@
realw* rmassz) {
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
- /* because of block and grid sizing problems, there is a small */
- /* amount of buffer at the end of the calculation */
+ // because of block and grid sizing problems, there is a small
+ // amount of buffer at the end of the calculation
if(id < size) {
// note: update adds rotational acceleration in case two_omega_earth is non-zero
accel[3*id] = accel[3*id]*rmassx[id] + two_omega_earth*veloc[3*id+1]; // (2,i);
@@ -292,244 +325,190 @@
veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2];
}
}
+*/
/* ----------------------------------------------------------------------------------------------- */
-__global__ void kernel_3_accel_cuda_device(realw* accel,
- realw* veloc,
- int size,
- realw two_omega_earth,
- realw* rmassx,
- realw* rmassy,
- realw* rmassz) {
+__global__ void multiply_accel_elastic_cuda_device(realw* accel,
+ realw* veloc,
+ int size,
+ realw two_omega_earth,
+ realw* rmassx,
+ realw* rmassy,
+ realw* rmassz) {
+
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
- /* because of block and grid sizing problems, there is a small */
- /* amount of buffer at the end of the calculation */
+ // because of block and grid sizing problems, there is a small
+ // amount of buffer at the end of the calculation
if(id < size) {
// note: update adds rotational acceleration in case two_omega_earth is non-zero
- accel[3*id] = accel[3*id]*rmassx[id] + two_omega_earth*veloc[3*id+1]; // (2,i);
- accel[3*id+1] = accel[3*id+1]*rmassy[id] - two_omega_earth*veloc[3*id]; //(1,i);
- accel[3*id+2] = accel[3*id+2]*rmassz[id];
+ accel[3*id] = accel[3*id]*rmassx[id] + two_omega_earth*veloc[3*id + 1]; // (2,i);
+ accel[3*id + 1] = accel[3*id + 1]*rmassy[id] - two_omega_earth*veloc[3*id]; //(1,i);
+ accel[3*id + 2] = accel[3*id + 2]*rmassz[id];
}
}
/* ----------------------------------------------------------------------------------------------- */
-__global__ void kernel_3_veloc_cuda_device(realw* veloc,
- realw* accel,
- int size,
- realw deltatover2) {
- int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+extern "C"
+void FC_FUNC_(multiply_accel_elastic_cuda,
+ MULTIPLY_ACCEL_ELASTIC_CUDA)(long* Mesh_pointer,
+ int* FORWARD_OR_ADJOINT) {
+ TRACE("multiply_accel_elastic_cuda");
- /* because of block and grid sizing problems, there is a small */
- /* amount of buffer at the end of the calculation */
- if(id < size) {
- veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
- veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1];
- veloc[3*id+2] = veloc[3*id+2] + deltatover2*accel[3*id+2];
- }
-}
+ int size_padded,num_blocks_x,num_blocks_y;
+ dim3 grid,threads;
-/* ----------------------------------------------------------------------------------------------- */
+ Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
-extern "C"
-void FC_FUNC_(update_accel_3_a_cuda,
- UPDATE_ACCEL_3_A_CUDA)(long* Mesh_pointer,
- realw* deltatover2_F,
- int* NCHUNKS_VAL,
- int* FORWARD_OR_ADJOINT) {
- TRACE("update_accel_3_a_cuda");
+ int blocksize = BLOCKSIZE_KERNEL3;
- Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+ // multiplies accel with inverse of mass matrix
- realw deltatover2 = *deltatover2_F;
+ // crust/mantle region
+ size_padded = ((int)ceil(((double)mp->NGLOB_CRUST_MANTLE)/((double)blocksize)))*blocksize;
- int blocksize = BLOCKSIZE_KERNEL3;
- int size_padded = ((int)ceil(((double)mp->NGLOB_CRUST_MANTLE)/((double)blocksize)))*blocksize;
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
+ grid = dim3(num_blocks_x,num_blocks_y);
+ threads = dim3(blocksize,1,1);
+
+ if( *FORWARD_OR_ADJOINT == 1 ){
+ multiply_accel_elastic_cuda_device<<< grid, threads>>>(mp->d_accel_crust_mantle,
+ mp->d_veloc_crust_mantle,
+ mp->NGLOB_CRUST_MANTLE,
+ mp->two_omega_earth,
+ mp->d_rmassx_crust_mantle,
+ mp->d_rmassy_crust_mantle,
+ mp->d_rmassz_crust_mantle);
+ }else if( *FORWARD_OR_ADJOINT == 3 ){
+ // debug
+ DEBUG_EMPTY_BACKWARD();
+
+ multiply_accel_elastic_cuda_device<<< grid, threads>>>(mp->d_b_accel_crust_mantle,
+ mp->d_b_veloc_crust_mantle,
+ mp->NGLOB_CRUST_MANTLE,
+ mp->b_two_omega_earth,
+ mp->d_b_rmassx_crust_mantle,
+ mp->d_b_rmassy_crust_mantle,
+ mp->d_b_rmassz_crust_mantle);
}
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(blocksize,1,1);
+ // inner core region
+ size_padded = ((int)ceil(((double)mp->NGLOB_INNER_CORE)/((double)blocksize)))*blocksize;
- // crust/mantle region only
- // check whether we can update accel and veloc, or only accel at this point
- if( mp->oceans == 0 ){
- // updates both, accel and veloc
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
- if( *NCHUNKS_VAL != 6 && mp->absorbing_conditions){
- // uses corrected mass matrices rmassx,rmassy,rmassz
- if( *FORWARD_OR_ADJOINT == 1 ){
- kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_crust_mantle,
- mp->d_accel_crust_mantle,
- mp->NGLOB_CRUST_MANTLE,
- deltatover2,
- mp->two_omega_earth,
- mp->d_rmassx_crust_mantle,
- mp->d_rmassy_crust_mantle,
- mp->d_rmassz_crust_mantle);
- }else if( *FORWARD_OR_ADJOINT == 3 ){
- kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_crust_mantle,
- mp->d_b_accel_crust_mantle,
- mp->NGLOB_CRUST_MANTLE,
- deltatover2,
- mp->b_two_omega_earth,
- mp->d_rmassx_crust_mantle,
- mp->d_rmassy_crust_mantle,
- mp->d_rmassz_crust_mantle);
- }
- }else{
- // uses only rmassz
- if( *FORWARD_OR_ADJOINT == 1 ){
- kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_crust_mantle,
- mp->d_accel_crust_mantle,
- mp->NGLOB_CRUST_MANTLE,
- deltatover2,
- mp->two_omega_earth,
- mp->d_rmassz_crust_mantle,
- mp->d_rmassz_crust_mantle,
- mp->d_rmassz_crust_mantle);
- }else if( *FORWARD_OR_ADJOINT == 3 ){
- kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_crust_mantle,
- mp->d_b_accel_crust_mantle,
- mp->NGLOB_CRUST_MANTLE,
- deltatover2,
- mp->b_two_omega_earth,
- mp->d_rmassz_crust_mantle,
- mp->d_rmassz_crust_mantle,
- mp->d_rmassz_crust_mantle);
- }
- }
+ grid = dim3(num_blocks_x,num_blocks_y);
+ threads = dim3(blocksize,1,1);
- }else{
- // updates only accel
+ if( *FORWARD_OR_ADJOINT == 1 ){
+ multiply_accel_elastic_cuda_device<<< grid, threads>>>(mp->d_accel_inner_core,
+ mp->d_veloc_inner_core,
+ mp->NGLOB_INNER_CORE,
+ mp->two_omega_earth,
+ mp->d_rmassx_inner_core,
+ mp->d_rmassy_inner_core,
+ mp->d_rmassz_inner_core);
+ }else if( *FORWARD_OR_ADJOINT == 3 ){
+ // debug
+ DEBUG_EMPTY_BACKWARD();
- if( *NCHUNKS_VAL != 6 && mp->absorbing_conditions){
- // uses corrected mass matrices
- if( *FORWARD_OR_ADJOINT == 1 ){
- kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_accel_crust_mantle,
- mp->d_veloc_crust_mantle,
- mp->NGLOB_CRUST_MANTLE,
- mp->two_omega_earth,
- mp->d_rmassx_crust_mantle,
- mp->d_rmassy_crust_mantle,
- mp->d_rmassz_crust_mantle);
- }else if( *FORWARD_OR_ADJOINT == 3 ){
- kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_b_accel_crust_mantle,
- mp->d_b_veloc_crust_mantle,
- mp->NGLOB_CRUST_MANTLE,
- mp->b_two_omega_earth,
- mp->d_rmassx_crust_mantle,
- mp->d_rmassy_crust_mantle,
- mp->d_rmassz_crust_mantle);
- }
- }else{
- // uses only rmassz
- if( *FORWARD_OR_ADJOINT == 1 ){
- kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_accel_crust_mantle,
- mp->d_veloc_crust_mantle,
- mp->NGLOB_CRUST_MANTLE,
- mp->two_omega_earth,
- mp->d_rmassz_crust_mantle,
- mp->d_rmassz_crust_mantle,
- mp->d_rmassz_crust_mantle);
- }else if( *FORWARD_OR_ADJOINT == 3 ){
- kernel_3_accel_cuda_device<<< grid, threads>>>(mp->d_b_accel_crust_mantle,
- mp->d_b_veloc_crust_mantle,
- mp->NGLOB_CRUST_MANTLE,
- mp->b_two_omega_earth,
- mp->d_rmassz_crust_mantle,
- mp->d_rmassz_crust_mantle,
- mp->d_rmassz_crust_mantle);
- }
- }
+ multiply_accel_elastic_cuda_device<<< grid, threads>>>(mp->d_b_accel_inner_core,
+ mp->d_b_veloc_inner_core,
+ mp->NGLOB_INNER_CORE,
+ mp->b_two_omega_earth,
+ mp->d_b_rmassx_inner_core,
+ mp->d_b_rmassy_inner_core,
+ mp->d_b_rmassz_inner_core);
}
+
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
//printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
- exit_on_cuda_error("after update_accel_3_a");
+ exit_on_cuda_error("after multiply_accel_elastic_cuda");
#endif
}
/* ----------------------------------------------------------------------------------------------- */
+__global__ void update_veloc_elastic_cuda_device(realw* veloc,
+ realw* accel,
+ int size,
+ realw deltatover2) {
+
+ int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+ // because of block and grid sizing problems, there is a small
+ // amount of buffer at the end of the calculation
+ if(id < size) {
+ veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
+ veloc[3*id + 1] = veloc[3*id + 1] + deltatover2*accel[3*id + 1];
+ veloc[3*id + 2] = veloc[3*id + 2] + deltatover2*accel[3*id + 2];
+ }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
extern "C"
-void FC_FUNC_(update_veloc_3_b_cuda,
- UPDATE_VELOC_3_B_CUDA)(long* Mesh_pointer,
- realw* deltatover2_F,
- int* FORWARD_OR_ADJOINT) {
- TRACE("update_veloc_3_b_cuda");
+void FC_FUNC_(update_veloc_elastic_cuda,
+ UPDATE_VELOC_ELASTIC_CUDA)(long* Mesh_pointer,
+ realw* deltatover2_f,
+ int* FORWARD_OR_ADJOINT) {
+
+ TRACE("update_veloc_elastic_cuda");
+
int size_padded,num_blocks_x,num_blocks_y;
+ dim3 grid,threads;
Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
- realw deltatover2 = *deltatover2_F;
+ realw deltatover2 = *deltatover2_f;
int blocksize = BLOCKSIZE_KERNEL3;
+ // updates velocity
+
// crust/mantle region
- // in case of ocean loads, we still have to update the velocity for crust/mantle region
- if( mp->oceans ){
- size_padded = ((int)ceil(((double)mp->NGLOB_CRUST_MANTLE)/((double)blocksize)))*blocksize;
- num_blocks_x = size_padded/blocksize;
- num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
- dim3 grid1(num_blocks_x,num_blocks_y);
- dim3 threads1(blocksize,1,1);
+ size_padded = ((int)ceil(((double)mp->NGLOB_CRUST_MANTLE)/((double)blocksize)))*blocksize;
- // updates only veloc at this point
- if( *FORWARD_OR_ADJOINT == 1 ){
- kernel_3_veloc_cuda_device<<< grid1, threads1>>>(mp->d_veloc_crust_mantle,
- mp->d_accel_crust_mantle,
- mp->NGLOB_CRUST_MANTLE,
- deltatover2);
- }else if( *FORWARD_OR_ADJOINT == 3 ){
- kernel_3_veloc_cuda_device<<< grid1, threads1>>>(mp->d_b_veloc_crust_mantle,
- mp->d_b_accel_crust_mantle,
- mp->NGLOB_CRUST_MANTLE,
- deltatover2);
- }
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
+ grid = dim3(num_blocks_x,num_blocks_y);
+ threads = dim3(blocksize,1,1);
+
+ if( *FORWARD_OR_ADJOINT == 1 ){
+ update_veloc_elastic_cuda_device<<< grid, threads>>>(mp->d_veloc_crust_mantle,
+ mp->d_accel_crust_mantle,
+ mp->NGLOB_CRUST_MANTLE,
+ deltatover2);
+ }else if( *FORWARD_OR_ADJOINT == 3 ){
+ update_veloc_elastic_cuda_device<<< grid, threads>>>(mp->d_b_veloc_crust_mantle,
+ mp->d_b_accel_crust_mantle,
+ mp->NGLOB_CRUST_MANTLE,
+ deltatover2);
}
// inner core region
size_padded = ((int)ceil(((double)mp->NGLOB_INNER_CORE)/((double)blocksize)))*blocksize;
- num_blocks_x = size_padded/blocksize;
- num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(blocksize,1,1);
- // updates both, accel and veloc
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
+ grid = dim3(num_blocks_x,num_blocks_y);
+ threads = dim3(blocksize,1,1);
+
if( *FORWARD_OR_ADJOINT == 1 ){
- kernel_3_cuda_device<<< grid, threads>>>(mp->d_veloc_inner_core,
- mp->d_accel_inner_core,
- mp->NGLOB_INNER_CORE,
- deltatover2,
- mp->two_omega_earth,
- mp->d_rmassx_inner_core,
- mp->d_rmassy_inner_core,
- mp->d_rmassz_inner_core);
+ update_veloc_elastic_cuda_device<<< grid, threads>>>(mp->d_veloc_inner_core,
+ mp->d_accel_inner_core,
+ mp->NGLOB_INNER_CORE,
+ deltatover2);
}else if( *FORWARD_OR_ADJOINT == 3 ){
- kernel_3_cuda_device<<< grid, threads>>>(mp->d_b_veloc_inner_core,
- mp->d_b_accel_inner_core,
- mp->NGLOB_INNER_CORE,
- deltatover2,
- mp->b_two_omega_earth,
- mp->d_rmassx_inner_core,
- mp->d_rmassy_inner_core,
- mp->d_rmassz_inner_core);
+ update_veloc_elastic_cuda_device<<< grid, threads>>>(mp->d_b_veloc_inner_core,
+ mp->d_b_accel_inner_core,
+ mp->NGLOB_INNER_CORE,
+ deltatover2);
}
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -542,23 +521,77 @@
// KERNEL 3
//
-// for outer_core
+// acoustic domains: for fluid outer_core
/* ----------------------------------------------------------------------------------------------- */
-__global__ void kernel_3_outer_core_cuda_device(realw* veloc,
- realw* accel,int size,
- realw deltatover2,
- realw* rmass) {
+__global__ void multiply_accel_acoustic_cuda_device(realw* accel,
+ int size,
+ realw* rmass) {
+
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
- /* because of block and grid sizing problems, there is a small */
- /* amount of buffer at the end of the calculation */
+ // because of block and grid sizing problems, there is a small
+ // amount of buffer at the end of the calculation
if(id < size) {
// multiplies pressure with the inverse of the mass matrix
accel[id] = accel[id]*rmass[id];
+ }
+}
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(multiply_accel_acoustic_cuda,
+ MULTIPLY_ACCEL_ACOUSTIC_CUDA)(long* Mesh_pointer,
+ int* FORWARD_OR_ADJOINT) {
+ TRACE("multiply_accel_acoustic_cuda");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+
+ int blocksize = BLOCKSIZE_KERNEL3;
+
+ int size_padded = ((int)ceil(((double)mp->NGLOB_OUTER_CORE)/((double)blocksize)))*blocksize;
+
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
+
+ // multiplies accel with inverse of mass matrix
+ if( *FORWARD_OR_ADJOINT == 1 ){
+ multiply_accel_acoustic_cuda_device<<< grid, threads>>>(mp->d_accel_outer_core,
+ mp->NGLOB_OUTER_CORE,
+ mp->d_rmass_outer_core);
+ }else if( *FORWARD_OR_ADJOINT == 3 ){
+ // debug
+ DEBUG_EMPTY_BACKWARD();
+
+ multiply_accel_acoustic_cuda_device<<< grid, threads>>>(mp->d_b_accel_outer_core,
+ mp->NGLOB_OUTER_CORE,
+ mp->d_b_rmass_outer_core);
+ }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("after multiply_accel_acoustic_cuda");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+__global__ void update_veloc_acoustic_cuda_device(realw* veloc,
+ realw* accel,
+ int size,
+ realw deltatover2) {
+
+ int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+
+ // because of block and grid sizing problems, there is a small
+ // amount of buffer at the end of the calculation
+ if(id < size) {
// Newmark time scheme: corrector term
veloc[id] = veloc[id] + deltatover2*accel[id];
}
@@ -568,44 +601,46 @@
/* ----------------------------------------------------------------------------------------------- */
extern "C"
-void FC_FUNC_(kernel_3_outer_core_cuda,
- KERNEL_3_OUTER_CORE_CUDA)(long* Mesh_pointer,
- realw* deltatover2_F,
- int* FORWARD_OR_ADJOINT) {
+void FC_FUNC_(update_veloc_acoustic_cuda,
+ UPDATE_VELOC_ACOUSTIC_CUDA)(long* Mesh_pointer,
+ realw* deltatover2_f,
+ int* FORWARD_OR_ADJOINT) {
- TRACE("kernel_3_outer_core_cuda");
+ TRACE("update_veloc_acoustic_cuda");
Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
- realw deltatover2 = *deltatover2_F;
+ realw deltatover2 = *deltatover2_f;
int blocksize = BLOCKSIZE_KERNEL3;
int size_padded = ((int)ceil(((double)mp->NGLOB_OUTER_CORE)/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
+ // updates velocity
if( *FORWARD_OR_ADJOINT == 1 ){
- kernel_3_outer_core_cuda_device<<< grid, threads>>>(mp->d_veloc_outer_core,
- mp->d_accel_outer_core,
- mp->NGLOB_OUTER_CORE,
- deltatover2,mp->d_rmass_outer_core);
+ update_veloc_acoustic_cuda_device<<< grid, threads>>>(mp->d_veloc_outer_core,
+ mp->d_accel_outer_core,
+ mp->NGLOB_OUTER_CORE,
+ deltatover2);
}else if( *FORWARD_OR_ADJOINT == 3){
- kernel_3_outer_core_cuda_device<<< grid, threads>>>(mp->d_b_veloc_outer_core,
- mp->d_b_accel_outer_core,
- mp->NGLOB_OUTER_CORE,
- deltatover2,mp->d_rmass_outer_core);
+ // debug
+ DEBUG_EMPTY_BACKWARD();
+
+ update_veloc_acoustic_cuda_device<<< grid, threads>>>(mp->d_b_veloc_outer_core,
+ mp->d_b_accel_outer_core,
+ mp->NGLOB_OUTER_CORE,
+ deltatover2);
}
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
//printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
- exit_on_cuda_error("after kernel_3_outer_core");
+ exit_on_cuda_error("after update_veloc_acoustic_cuda");
#endif
}
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/write_seismograms_cuda.cu 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/write_seismograms_cuda.cu 2013-09-13 11:23:45 UTC (rev 22782)
@@ -105,12 +105,9 @@
int blocksize = NGLL3;
- int num_blocks_x = mp->nrec_local;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->nrec_local,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
@@ -162,12 +159,9 @@
int blocksize = NGLL3;
- int num_blocks_x = mp->nrec_local;
- int num_blocks_y = 1;
- while(num_blocks_x > MAXIMUM_GRID_DIM) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->nrec_local,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_scalar.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_scalar.f90 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_scalar.f90 2013-09-13 11:23:45 UTC (rev 22782)
@@ -341,9 +341,7 @@
enddo
! adding contributions of neighbours
- call transfer_asmbl_pot_to_device(Mesh_pointer, &
- buffer_recv_scalar, &
- FORWARD_OR_ADJOINT)
+ call transfer_asmbl_pot_to_device(Mesh_pointer,buffer_recv_scalar,FORWARD_OR_ADJOINT)
! note: adding contributions of neighbours has been done just above for cuda
!do iinterface = 1, num_interfaces
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/assemble_MPI_vector.f90 2013-09-13 11:23:45 UTC (rev 22782)
@@ -78,17 +78,16 @@
! send messages
do iinterface = 1, num_interfaces
call isend_cr(buffer_send_vector(1,1,iinterface), &
- NDIM*nibool_interfaces(iinterface), &
- my_neighbours(iinterface), &
- itag, &
- request_send_vector(iinterface) &
- )
+ NDIM*nibool_interfaces(iinterface), &
+ my_neighbours(iinterface), &
+ itag, &
+ request_send_vector(iinterface))
+
call irecv_cr(buffer_recv_vector(1,1,iinterface), &
- NDIM*nibool_interfaces(iinterface), &
- my_neighbours(iinterface), &
- itag, &
- request_recv_vector(iinterface) &
- )
+ NDIM*nibool_interfaces(iinterface), &
+ my_neighbours(iinterface), &
+ itag, &
+ request_recv_vector(iinterface))
enddo
endif
@@ -136,7 +135,7 @@
integer :: FORWARD_OR_ADJOINT
! local parameters
- integer iinterface
+ integer :: iinterface
! send only if more than one partition
if(NPROC > 1) then
@@ -144,25 +143,24 @@
! preparation of the contribution between partitions using MPI
! transfers mpi buffers to CPU
call transfer_boun_accel_from_device(Mesh_pointer, &
- buffer_send_vector,&
- IREGION,FORWARD_OR_ADJOINT)
+ buffer_send_vector,&
+ IREGION,FORWARD_OR_ADJOINT)
- ! send messages
- do iinterface = 1, num_interfaces
- call isend_cr(buffer_send_vector(1,1,iinterface), &
- NDIM*nibool_interfaces(iinterface), &
- my_neighbours(iinterface), &
- itag, &
- request_send_vector(iinterface) &
- )
- call irecv_cr(buffer_recv_vector(1,1,iinterface), &
- NDIM*nibool_interfaces(iinterface), &
- my_neighbours(iinterface), &
- itag, &
- request_recv_vector(iinterface) &
- )
- enddo
+ ! send messages
+ do iinterface = 1, num_interfaces
+ call isend_cr(buffer_send_vector(1,1,iinterface), &
+ NDIM*nibool_interfaces(iinterface), &
+ my_neighbours(iinterface), &
+ itag, &
+ request_send_vector(iinterface))
+ call irecv_cr(buffer_recv_vector(1,1,iinterface), &
+ NDIM*nibool_interfaces(iinterface), &
+ my_neighbours(iinterface), &
+ itag, &
+ request_recv_vector(iinterface))
+ enddo
+
endif
end subroutine assemble_MPI_vector_send_cuda
@@ -252,8 +250,7 @@
integer :: num_interfaces,max_nibool_interfaces
- real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces,num_interfaces) :: &
- buffer_recv_vector
+ real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces,num_interfaces) :: buffer_recv_vector
integer, dimension(num_interfaces) :: request_send_vector,request_recv_vector
integer :: IREGION
@@ -275,8 +272,8 @@
! adding contributions of neighbours
call transfer_asmbl_accel_to_device(Mesh_pointer, &
- buffer_recv_vector, &
- IREGION,FORWARD_OR_ADJOINT)
+ buffer_recv_vector, &
+ IREGION,FORWARD_OR_ADJOINT)
! This step is done via previous function transfer_and_assemble...
! do iinterface = 1, num_interfaces
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_add_sources.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_add_sources.f90 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_add_sources.f90 2013-09-13 11:23:45 UTC (rev 22782)
@@ -133,7 +133,7 @@
enddo
endif
! adds sources: only implements SIMTYPE=1 and NOISE_TOM=0
- call compute_add_sources_el_cuda(Mesh_pointer,NSOURCES,stf_pre_compute)
+ call compute_add_sources_cuda(Mesh_pointer,NSOURCES,stf_pre_compute)
endif
@@ -288,9 +288,9 @@
else
! on GPU
- call add_sources_el_sim_type_2_or_3(Mesh_pointer,nrec,adj_sourcearrays, &
- islice_selected_rec,ispec_selected_rec, &
- iadj_vec(it))
+ call compute_add_sources_adjoint_cuda(Mesh_pointer,nrec,adj_sourcearrays, &
+ islice_selected_rec,ispec_selected_rec, &
+ iadj_vec(it))
endif
end subroutine compute_add_sources_adjoint
@@ -414,7 +414,7 @@
enddo
endif
! adds sources: only implements SIMTYPE=3 (and NOISE_TOM=0)
- call compute_add_sources_el_s3_cuda(Mesh_pointer,NSOURCES,stf_pre_compute)
+ call compute_add_sources_backward_cuda(Mesh_pointer,NSOURCES,stf_pre_compute)
endif
end subroutine compute_add_sources_backward
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_coupling.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_coupling.f90 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_coupling.f90 2013-09-13 11:23:45 UTC (rev 22782)
@@ -333,7 +333,8 @@
if(GRAVITY_VAL) then
pressure = RHO_BOTTOM_OC * (- accel_outer_core(iglob) &
+ minus_g_icb *(displ_inner_core(1,iglob_inner_core)*nx &
- + displ_inner_core(2,iglob_inner_core)*ny + displ_inner_core(3,iglob_inner_core)*nz))
+ + displ_inner_core(2,iglob_inner_core)*ny &
+ + displ_inner_core(3,iglob_inner_core)*nz))
else
pressure = - RHO_BOTTOM_OC * accel_outer_core(iglob)
endif
@@ -425,8 +426,8 @@
! we divide by rmass_crust_mantle() which is 1 / M
! we use the total force which includes the Coriolis term above
force_normal_comp = accel_crust_mantle(1,iglob)*nx / rmassx_crust_mantle(iglob) &
- + accel_crust_mantle(2,iglob)*ny / rmassy_crust_mantle(iglob) &
- + accel_crust_mantle(3,iglob)*nz / rmassz_crust_mantle(iglob)
+ + accel_crust_mantle(2,iglob)*ny / rmassy_crust_mantle(iglob) &
+ + accel_crust_mantle(3,iglob)*nz / rmassz_crust_mantle(iglob)
additional_term_x = (rmass_ocean_load(iglob) - rmassx_crust_mantle(iglob)) * force_normal_comp
additional_term_y = (rmass_ocean_load(iglob) - rmassy_crust_mantle(iglob)) * force_normal_comp
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_acoustic_calling_routine.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_acoustic_calling_routine.F90 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_acoustic_calling_routine.F90 2013-09-13 11:23:45 UTC (rev 22782)
@@ -204,8 +204,17 @@
enddo ! iphase
! multiply by the inverse of the mass matrix
- call multiply_accel_acoustic(NGLOB_OUTER_CORE,accel_outer_core,rmass_outer_core)
+ if(.NOT. GPU_MODE) then
+ ! on CPU
+ call multiply_accel_acoustic(NGLOB_OUTER_CORE,accel_outer_core,rmass_outer_core)
+ else
+ ! on GPU
+ ! includes FORWARD_OR_ADJOINT == 1
+ call multiply_accel_acoustic_cuda(Mesh_pointer,1)
+ endif
+
+
! time schemes
if( USE_LDDRK ) then
! runge-kutta scheme
@@ -407,25 +416,32 @@
if(.NOT. GPU_MODE) then
! on CPU
call assemble_MPI_scalar_w(NPROCTOT_VAL,NGLOB_OUTER_CORE, &
- b_accel_outer_core, &
- b_buffer_recv_scalar_outer_core,num_interfaces_outer_core,&
- max_nibool_interfaces_oc, &
- nibool_interfaces_outer_core,ibool_interfaces_outer_core, &
- b_request_send_scalar_oc,b_request_recv_scalar_oc)
+ b_accel_outer_core, &
+ b_buffer_recv_scalar_outer_core,num_interfaces_outer_core,&
+ max_nibool_interfaces_oc, &
+ nibool_interfaces_outer_core,ibool_interfaces_outer_core, &
+ b_request_send_scalar_oc,b_request_recv_scalar_oc)
else
! on GPU
call assemble_MPI_scalar_write_cuda(Mesh_pointer,NPROCTOT_VAL, &
- b_buffer_recv_scalar_outer_core, &
- num_interfaces_outer_core,max_nibool_interfaces_oc, &
- b_request_send_scalar_oc,b_request_recv_scalar_oc, &
- 3) ! <-- 3 == adjoint b_accel
+ b_buffer_recv_scalar_outer_core, &
+ num_interfaces_outer_core,max_nibool_interfaces_oc, &
+ b_request_send_scalar_oc,b_request_recv_scalar_oc, &
+ 3) ! <-- 3 == adjoint b_accel
endif
endif ! iphase == 1
enddo ! iphase
! multiply by the inverse of the mass matrix
- call multiply_accel_acoustic(NGLOB_OUTER_CORE_ADJOINT,b_accel_outer_core,b_rmass_outer_core)
+ if(.NOT. GPU_MODE) then
+ ! on CPU
+ call multiply_accel_acoustic(NGLOB_OUTER_CORE_ADJOINT,b_accel_outer_core,b_rmass_outer_core)
+ else
+ ! on GPU
+ ! includes FORWARD_OR_ADJOINT == 3
+ call multiply_accel_acoustic_cuda(Mesh_pointer,3)
+ endif
! time schemes
if( USE_LDDRK ) then
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90 2013-09-13 11:23:45 UTC (rev 22782)
@@ -73,80 +73,79 @@
! compute internal forces in the solid regions
! note: for anisotropy and gravity, x y and z contain r theta and phi
if( .NOT. GPU_MODE ) then
- ! on CPU
- if( USE_DEVILLE_PRODUCTS_VAL ) then
- ! uses Deville (2002) optimizations
- ! crust/mantle region
- call compute_forces_crust_mantle_Dev( NSPEC_CRUST_MANTLE_STR_OR_ATT,NGLOB_CRUST_MANTLE, &
- NSPEC_CRUST_MANTLE_ATTENUATION, &
- deltat, &
- displ_crust_mantle,veloc_crust_mantle,accel_crust_mantle, &
- phase_is_inner, &
- R_xx_crust_mantle,R_yy_crust_mantle,R_xy_crust_mantle, &
- R_xz_crust_mantle,R_yz_crust_mantle, &
- R_xx_crust_mantle_lddrk,R_yy_crust_mantle_lddrk,R_xy_crust_mantle_lddrk, &
- R_xz_crust_mantle_lddrk,R_yz_crust_mantle_lddrk, &
- epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
- epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
- eps_trace_over_3_crust_mantle, &
- alphaval,betaval,gammaval, &
- factor_common_crust_mantle,size(factor_common_crust_mantle,5), .false. )
- ! inner core region
- call compute_forces_inner_core_Dev( NSPEC_INNER_CORE_STR_OR_ATT,NGLOB_INNER_CORE, &
- NSPEC_INNER_CORE_ATTENUATION, &
- deltat, &
- displ_inner_core,veloc_inner_core,accel_inner_core, &
- phase_is_inner, &
- R_xx_inner_core,R_yy_inner_core,R_xy_inner_core, &
- R_xz_inner_core,R_yz_inner_core, &
- R_xx_inner_core_lddrk,R_yy_inner_core_lddrk,R_xy_inner_core_lddrk, &
- R_xz_inner_core_lddrk,R_yz_inner_core_lddrk, &
- epsilondev_xx_inner_core,epsilondev_yy_inner_core,epsilondev_xy_inner_core, &
- epsilondev_xz_inner_core,epsilondev_yz_inner_core, &
- eps_trace_over_3_inner_core,&
- alphaval,betaval,gammaval, &
- factor_common_inner_core,size(factor_common_inner_core,5), .false. )
- else
- ! no Deville optimization
- ! crust/mantle region
- call compute_forces_crust_mantle( NSPEC_CRUST_MANTLE_STR_OR_ATT,NGLOB_CRUST_MANTLE, &
- NSPEC_CRUST_MANTLE_ATTENUATION, &
- deltat, &
- displ_crust_mantle,veloc_crust_mantle,accel_crust_mantle, &
- phase_is_inner, &
- R_xx_crust_mantle,R_yy_crust_mantle,R_xy_crust_mantle, &
- R_xz_crust_mantle,R_yz_crust_mantle, &
- R_xx_crust_mantle_lddrk,R_yy_crust_mantle_lddrk,R_xy_crust_mantle_lddrk, &
- R_xz_crust_mantle_lddrk,R_yz_crust_mantle_lddrk, &
- epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
- epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
- eps_trace_over_3_crust_mantle, &
- alphaval,betaval,gammaval, &
- factor_common_crust_mantle,size(factor_common_crust_mantle,5), .false. )
- ! inner core region
- call compute_forces_inner_core( NSPEC_INNER_CORE_STR_OR_ATT,NGLOB_INNER_CORE, &
- NSPEC_INNER_CORE_ATTENUATION, &
- deltat, &
- displ_inner_core,veloc_inner_core,accel_inner_core, &
- phase_is_inner, &
- R_xx_inner_core,R_yy_inner_core,R_xy_inner_core, &
- R_xz_inner_core,R_yz_inner_core, &
- R_xx_inner_core_lddrk,R_yy_inner_core_lddrk,R_xy_inner_core_lddrk, &
- R_xz_inner_core_lddrk,R_yz_inner_core_lddrk, &
- epsilondev_xx_inner_core,epsilondev_yy_inner_core,epsilondev_xy_inner_core, &
- epsilondev_xz_inner_core,epsilondev_yz_inner_core, &
- eps_trace_over_3_inner_core,&
- alphaval,betaval,gammaval, &
- factor_common_inner_core,size(factor_common_inner_core,5), .false. )
-
- endif
+ ! on CPU
+ if( USE_DEVILLE_PRODUCTS_VAL ) then
+ ! uses Deville (2002) optimizations
+ ! crust/mantle region
+ call compute_forces_crust_mantle_Dev(NSPEC_CRUST_MANTLE_STR_OR_ATT,NGLOB_CRUST_MANTLE, &
+ NSPEC_CRUST_MANTLE_ATTENUATION, &
+ deltat, &
+ displ_crust_mantle,veloc_crust_mantle,accel_crust_mantle, &
+ phase_is_inner, &
+ R_xx_crust_mantle,R_yy_crust_mantle,R_xy_crust_mantle, &
+ R_xz_crust_mantle,R_yz_crust_mantle, &
+ R_xx_crust_mantle_lddrk,R_yy_crust_mantle_lddrk,R_xy_crust_mantle_lddrk, &
+ R_xz_crust_mantle_lddrk,R_yz_crust_mantle_lddrk, &
+ epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+ epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
+ eps_trace_over_3_crust_mantle, &
+ alphaval,betaval,gammaval, &
+ factor_common_crust_mantle,size(factor_common_crust_mantle,5), .false. )
+ ! inner core region
+ call compute_forces_inner_core_Dev(NSPEC_INNER_CORE_STR_OR_ATT,NGLOB_INNER_CORE, &
+ NSPEC_INNER_CORE_ATTENUATION, &
+ deltat, &
+ displ_inner_core,veloc_inner_core,accel_inner_core, &
+ phase_is_inner, &
+ R_xx_inner_core,R_yy_inner_core,R_xy_inner_core, &
+ R_xz_inner_core,R_yz_inner_core, &
+ R_xx_inner_core_lddrk,R_yy_inner_core_lddrk,R_xy_inner_core_lddrk, &
+ R_xz_inner_core_lddrk,R_yz_inner_core_lddrk, &
+ epsilondev_xx_inner_core,epsilondev_yy_inner_core,epsilondev_xy_inner_core, &
+ epsilondev_xz_inner_core,epsilondev_yz_inner_core, &
+ eps_trace_over_3_inner_core,&
+ alphaval,betaval,gammaval, &
+ factor_common_inner_core,size(factor_common_inner_core,5), .false. )
+ else
+ ! no Deville optimization
+ ! crust/mantle region
+ call compute_forces_crust_mantle(NSPEC_CRUST_MANTLE_STR_OR_ATT,NGLOB_CRUST_MANTLE, &
+ NSPEC_CRUST_MANTLE_ATTENUATION, &
+ deltat, &
+ displ_crust_mantle,veloc_crust_mantle,accel_crust_mantle, &
+ phase_is_inner, &
+ R_xx_crust_mantle,R_yy_crust_mantle,R_xy_crust_mantle, &
+ R_xz_crust_mantle,R_yz_crust_mantle, &
+ R_xx_crust_mantle_lddrk,R_yy_crust_mantle_lddrk,R_xy_crust_mantle_lddrk, &
+ R_xz_crust_mantle_lddrk,R_yz_crust_mantle_lddrk, &
+ epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
+ epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle, &
+ eps_trace_over_3_crust_mantle, &
+ alphaval,betaval,gammaval, &
+ factor_common_crust_mantle,size(factor_common_crust_mantle,5), .false. )
+ ! inner core region
+ call compute_forces_inner_core(NSPEC_INNER_CORE_STR_OR_ATT,NGLOB_INNER_CORE, &
+ NSPEC_INNER_CORE_ATTENUATION, &
+ deltat, &
+ displ_inner_core,veloc_inner_core,accel_inner_core, &
+ phase_is_inner, &
+ R_xx_inner_core,R_yy_inner_core,R_xy_inner_core, &
+ R_xz_inner_core,R_yz_inner_core, &
+ R_xx_inner_core_lddrk,R_yy_inner_core_lddrk,R_xy_inner_core_lddrk, &
+ R_xz_inner_core_lddrk,R_yz_inner_core_lddrk, &
+ epsilondev_xx_inner_core,epsilondev_yy_inner_core,epsilondev_xy_inner_core, &
+ epsilondev_xz_inner_core,epsilondev_yz_inner_core, &
+ eps_trace_over_3_inner_core,&
+ alphaval,betaval,gammaval, &
+ factor_common_inner_core,size(factor_common_inner_core,5), .false. )
+ endif
else
- ! on GPU
- ! contains both forward SIM_TYPE==1 and backward SIM_TYPE==3 simulations
- ! for crust/mantle
- call compute_forces_crust_mantle_cuda(Mesh_pointer,iphase)
- ! for inner core
- call compute_forces_inner_core_cuda(Mesh_pointer,iphase)
+ ! on GPU
+ ! contains forward FORWARD_OR_ADJOINT == 1
+ ! for crust/mantle
+ call compute_forces_crust_mantle_cuda(Mesh_pointer,iphase,1)
+ ! for inner core
+ call compute_forces_inner_core_cuda(Mesh_pointer,iphase,1)
endif ! GPU_MODE
! computes additional contributions to acceleration field
@@ -272,8 +271,7 @@
nibool_interfaces_crust_mantle,&
my_neighbours_crust_mantle, &
request_send_vector_cm,request_recv_vector_cm, &
- IREGION_CRUST_MANTLE, &
- 1) ! <-- 1 == fwd accel
+ IREGION_CRUST_MANTLE,1) ! <-- 1 == fwd accel
! inner core
call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
buffer_send_vector_inner_core,buffer_recv_vector_inner_core, &
@@ -281,8 +279,7 @@
nibool_interfaces_inner_core,&
my_neighbours_inner_core, &
request_send_vector_ic,request_recv_vector_ic, &
- IREGION_INNER_CORE, &
- 1)
+ IREGION_INNER_CORE,1)
endif ! GPU_MODE
else
! waits for send/receive requests to be completed and assembles values
@@ -309,15 +306,13 @@
buffer_recv_vector_crust_mantle, &
num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
request_send_vector_cm,request_recv_vector_cm, &
- IREGION_CRUST_MANTLE, &
- 1) ! <-- 1 == fwd accel
+ IREGION_CRUST_MANTLE,1) ! <-- 1 == fwd accel
! inner core
call assemble_MPI_vector_write_cuda(Mesh_pointer,NPROCTOT_VAL, &
buffer_recv_vector_inner_core, &
num_interfaces_inner_core,max_nibool_interfaces_ic, &
request_send_vector_ic,request_recv_vector_ic, &
- IREGION_INNER_CORE, &
- 1)
+ IREGION_INNER_CORE,1)
endif
endif ! iphase == 1
@@ -337,7 +332,7 @@
else
! on GPU
! includes FORWARD_OR_ADJOINT == 1
- call update_accel_3_a_cuda(Mesh_pointer,deltatover2,NCHUNKS_VAL,1)
+ call multiply_accel_elastic_cuda(Mesh_pointer,1)
endif
! couples ocean with crust mantle
@@ -353,8 +348,7 @@
NSPEC2D_TOP(IREGION_CRUST_MANTLE) )
else
! on GPU
- call compute_coupling_ocean_cuda(Mesh_pointer,NCHUNKS_VAL,EXACT_MASS_MATRIX_FOR_ROTATION, &
- 1) ! <- 1 == forward arrays
+ call compute_coupling_ocean_cuda(Mesh_pointer,1) ! <- 1 == forward arrays
endif
endif
@@ -463,78 +457,76 @@
! uses Deville (2002) optimizations
! crust/mantle region
call compute_forces_crust_mantle_Dev( NSPEC_CRUST_MANTLE_ADJOINT,NGLOB_CRUST_MANTLE_ADJOINT, &
- NSPEC_CRUST_MANTLE_STR_AND_ATT, &
- b_deltat, &
- b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle, &
- phase_is_inner, &
- b_R_xx_crust_mantle,b_R_yy_crust_mantle,b_R_xy_crust_mantle, &
- b_R_xz_crust_mantle,b_R_yz_crust_mantle, &
- b_R_xx_crust_mantle_lddrk,b_R_yy_crust_mantle_lddrk,b_R_xy_crust_mantle_lddrk, &
- b_R_xz_crust_mantle_lddrk,b_R_yz_crust_mantle_lddrk, &
- b_epsilondev_xx_crust_mantle,b_epsilondev_yy_crust_mantle,&
- b_epsilondev_xy_crust_mantle, &
- b_epsilondev_xz_crust_mantle,b_epsilondev_yz_crust_mantle, &
- b_eps_trace_over_3_crust_mantle, &
- b_alphaval,b_betaval,b_gammaval, &
- factor_common_crust_mantle,size(factor_common_crust_mantle,5), .true. )
+ NSPEC_CRUST_MANTLE_STR_AND_ATT, &
+ b_deltat, &
+ b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle, &
+ phase_is_inner, &
+ b_R_xx_crust_mantle,b_R_yy_crust_mantle,b_R_xy_crust_mantle, &
+ b_R_xz_crust_mantle,b_R_yz_crust_mantle, &
+ b_R_xx_crust_mantle_lddrk,b_R_yy_crust_mantle_lddrk,b_R_xy_crust_mantle_lddrk, &
+ b_R_xz_crust_mantle_lddrk,b_R_yz_crust_mantle_lddrk, &
+ b_epsilondev_xx_crust_mantle,b_epsilondev_yy_crust_mantle,&
+ b_epsilondev_xy_crust_mantle, &
+ b_epsilondev_xz_crust_mantle,b_epsilondev_yz_crust_mantle, &
+ b_eps_trace_over_3_crust_mantle, &
+ b_alphaval,b_betaval,b_gammaval, &
+ factor_common_crust_mantle,size(factor_common_crust_mantle,5), .true. )
! inner core region
call compute_forces_inner_core_Dev( NSPEC_INNER_CORE_ADJOINT,NGLOB_INNER_CORE_ADJOINT, &
- NSPEC_INNER_CORE_STR_AND_ATT, &
- b_deltat, &
- b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core, &
- phase_is_inner, &
- b_R_xx_inner_core,b_R_yy_inner_core,b_R_xy_inner_core, &
- b_R_xz_inner_core,b_R_yz_inner_core, &
- b_R_xx_inner_core_lddrk,b_R_yy_inner_core_lddrk,b_R_xy_inner_core_lddrk, &
- b_R_xz_inner_core_lddrk,b_R_yz_inner_core_lddrk, &
- b_epsilondev_xx_inner_core,b_epsilondev_yy_inner_core,b_epsilondev_xy_inner_core, &
- b_epsilondev_xz_inner_core,b_epsilondev_yz_inner_core, &
- b_eps_trace_over_3_inner_core,&
- b_alphaval,b_betaval,b_gammaval, &
- factor_common_inner_core,size(factor_common_inner_core,5), .true. )
-
+ NSPEC_INNER_CORE_STR_AND_ATT, &
+ b_deltat, &
+ b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core, &
+ phase_is_inner, &
+ b_R_xx_inner_core,b_R_yy_inner_core,b_R_xy_inner_core, &
+ b_R_xz_inner_core,b_R_yz_inner_core, &
+ b_R_xx_inner_core_lddrk,b_R_yy_inner_core_lddrk,b_R_xy_inner_core_lddrk, &
+ b_R_xz_inner_core_lddrk,b_R_yz_inner_core_lddrk, &
+ b_epsilondev_xx_inner_core,b_epsilondev_yy_inner_core,b_epsilondev_xy_inner_core, &
+ b_epsilondev_xz_inner_core,b_epsilondev_yz_inner_core, &
+ b_eps_trace_over_3_inner_core,&
+ b_alphaval,b_betaval,b_gammaval, &
+ factor_common_inner_core,size(factor_common_inner_core,5), .true. )
else
! no Deville optimization
! crust/mantle region
call compute_forces_crust_mantle( NSPEC_CRUST_MANTLE_ADJOINT,NGLOB_CRUST_MANTLE_ADJOINT, &
- NSPEC_CRUST_MANTLE_STR_AND_ATT, &
- b_deltat, &
- b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle, &
- phase_is_inner, &
- b_R_xx_crust_mantle,b_R_yy_crust_mantle,b_R_xy_crust_mantle, &
- b_R_xz_crust_mantle,b_R_yz_crust_mantle, &
- b_R_xx_crust_mantle_lddrk,b_R_yy_crust_mantle_lddrk,b_R_xy_crust_mantle_lddrk, &
- b_R_xz_crust_mantle_lddrk,b_R_yz_crust_mantle_lddrk, &
- b_epsilondev_xx_crust_mantle,b_epsilondev_yy_crust_mantle,&
- b_epsilondev_xy_crust_mantle, &
- b_epsilondev_xz_crust_mantle,b_epsilondev_yz_crust_mantle, &
- b_eps_trace_over_3_crust_mantle, &
- b_alphaval,b_betaval,b_gammaval, &
- factor_common_crust_mantle,size(factor_common_crust_mantle,5), .true. )
-
+ NSPEC_CRUST_MANTLE_STR_AND_ATT, &
+ b_deltat, &
+ b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle, &
+ phase_is_inner, &
+ b_R_xx_crust_mantle,b_R_yy_crust_mantle,b_R_xy_crust_mantle, &
+ b_R_xz_crust_mantle,b_R_yz_crust_mantle, &
+ b_R_xx_crust_mantle_lddrk,b_R_yy_crust_mantle_lddrk,b_R_xy_crust_mantle_lddrk, &
+ b_R_xz_crust_mantle_lddrk,b_R_yz_crust_mantle_lddrk, &
+ b_epsilondev_xx_crust_mantle,b_epsilondev_yy_crust_mantle,&
+ b_epsilondev_xy_crust_mantle, &
+ b_epsilondev_xz_crust_mantle,b_epsilondev_yz_crust_mantle, &
+ b_eps_trace_over_3_crust_mantle, &
+ b_alphaval,b_betaval,b_gammaval, &
+ factor_common_crust_mantle,size(factor_common_crust_mantle,5), .true. )
! inner core region
call compute_forces_inner_core( NSPEC_INNER_CORE_ADJOINT,NGLOB_INNER_CORE_ADJOINT, &
- NSPEC_INNER_CORE_STR_AND_ATT, &
- b_deltat, &
- b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core, &
- phase_is_inner, &
- b_R_xx_inner_core,b_R_yy_inner_core,b_R_xy_inner_core, &
- b_R_xz_inner_core,b_R_yz_inner_core, &
- b_R_xx_inner_core_lddrk,b_R_yy_inner_core_lddrk,b_R_xy_inner_core_lddrk, &
- b_R_xz_inner_core_lddrk,b_R_yz_inner_core_lddrk, &
- b_epsilondev_xx_inner_core,b_epsilondev_yy_inner_core,b_epsilondev_xy_inner_core, &
- b_epsilondev_xz_inner_core,b_epsilondev_yz_inner_core, &
- b_eps_trace_over_3_inner_core,&
- b_alphaval,b_betaval,b_gammaval, &
- factor_common_inner_core,size(factor_common_inner_core,5), .true. )
+ NSPEC_INNER_CORE_STR_AND_ATT, &
+ b_deltat, &
+ b_displ_inner_core,b_veloc_inner_core,b_accel_inner_core, &
+ phase_is_inner, &
+ b_R_xx_inner_core,b_R_yy_inner_core,b_R_xy_inner_core, &
+ b_R_xz_inner_core,b_R_yz_inner_core, &
+ b_R_xx_inner_core_lddrk,b_R_yy_inner_core_lddrk,b_R_xy_inner_core_lddrk, &
+ b_R_xz_inner_core_lddrk,b_R_yz_inner_core_lddrk, &
+ b_epsilondev_xx_inner_core,b_epsilondev_yy_inner_core,b_epsilondev_xy_inner_core, &
+ b_epsilondev_xz_inner_core,b_epsilondev_yz_inner_core, &
+ b_eps_trace_over_3_inner_core,&
+ b_alphaval,b_betaval,b_gammaval, &
+ factor_common_inner_core,size(factor_common_inner_core,5), .true. )
endif
else
! on GPU
- ! contains both forward SIM_TYPE==1 and backward SIM_TYPE==3 simulations
+ ! contains backward/adjoint FORWARD_OR_ADJOINT == 3
! for crust/mantle
- call compute_forces_crust_mantle_cuda(Mesh_pointer,iphase)
+ call compute_forces_crust_mantle_cuda(Mesh_pointer,iphase,3)
! for inner core
- call compute_forces_inner_core_cuda(Mesh_pointer,iphase)
+ call compute_forces_inner_core_cuda(Mesh_pointer,iphase,3)
endif ! GPU_MODE
! computes additional contributions to acceleration field
@@ -647,8 +639,7 @@
nibool_interfaces_crust_mantle,&
my_neighbours_crust_mantle, &
b_request_send_vector_cm,b_request_recv_vector_cm, &
- IREGION_CRUST_MANTLE, &
- 3) ! <-- 3 == adjoint b_accel
+ IREGION_CRUST_MANTLE,3) ! <-- 3 == adjoint b_accel
! inner core
call assemble_MPI_vector_send_cuda(Mesh_pointer,NPROCTOT_VAL, &
b_buffer_send_vector_inner_core,b_buffer_recv_vector_inner_core, &
@@ -656,8 +647,7 @@
nibool_interfaces_inner_core,&
my_neighbours_inner_core, &
b_request_send_vector_ic,b_request_recv_vector_ic, &
- IREGION_INNER_CORE, &
- 3)
+ IREGION_INNER_CORE,3)
endif ! GPU
else
! waits for send/receive requests to be completed and assembles values
@@ -687,15 +677,13 @@
b_buffer_recv_vector_cm, &
num_interfaces_crust_mantle,max_nibool_interfaces_cm, &
b_request_send_vector_cm,b_request_recv_vector_cm, &
- IREGION_CRUST_MANTLE, &
- 3) ! <-- 3 == adjoint b_accel
+ IREGION_CRUST_MANTLE,3) ! <-- 3 == adjoint b_accel
! inner core
call assemble_MPI_vector_write_cuda(Mesh_pointer,NPROCTOT_VAL,&
b_buffer_recv_vector_inner_core, &
num_interfaces_inner_core,max_nibool_interfaces_ic, &
b_request_send_vector_ic,b_request_recv_vector_ic, &
- IREGION_INNER_CORE, &
- 3)
+ IREGION_INNER_CORE,3)
endif
endif ! iphase == 1
@@ -716,7 +704,7 @@
else
! on GPU
! includes FORWARD_OR_ADJOINT == 3
- call update_accel_3_a_cuda(Mesh_pointer,b_deltatover2,NCHUNKS_VAL,3)
+ call multiply_accel_elastic_cuda(Mesh_pointer,3)
endif
! couples ocean with crust mantle
@@ -732,8 +720,7 @@
NSPEC2D_TOP(IREGION_CRUST_MANTLE) )
else
! on GPU
- call compute_coupling_ocean_cuda(Mesh_pointer,NCHUNKS_VAL,EXACT_MASS_MATRIX_FOR_ROTATION, &
- 3) ! <- 3 == backward/reconstructed arrays
+ call compute_coupling_ocean_cuda(Mesh_pointer,3) ! <- 3 == backward/reconstructed arrays
endif
endif
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_crust_mantle.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_crust_mantle.f90 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_crust_mantle.f90 2013-09-13 11:23:45 UTC (rev 22782)
@@ -128,8 +128,8 @@
else
! on GPU
if( nspec2D_xmin_crust_mantle > 0 ) call compute_stacey_elastic_cuda(Mesh_pointer, &
- absorb_xmin_crust_mantle, &
- 0) ! <= xmin
+ absorb_xmin_crust_mantle, &
+ 0) ! <= xmin
endif
! writes absorbing boundary values
@@ -190,8 +190,8 @@
else
! on GPU
if( nspec2D_xmax_crust_mantle > 0 ) call compute_stacey_elastic_cuda(Mesh_pointer, &
- absorb_xmax_crust_mantle, &
- 1) ! <= xmin
+ absorb_xmax_crust_mantle, &
+ 1) ! <= xmax
endif
@@ -251,8 +251,8 @@
else
! on GPU
if( nspec2D_ymin_crust_mantle > 0 ) call compute_stacey_elastic_cuda(Mesh_pointer, &
- absorb_ymin_crust_mantle, &
- 2) ! <= ymin
+ absorb_ymin_crust_mantle, &
+ 2) ! <= ymin
endif
! writes absorbing boundary values
@@ -309,8 +309,8 @@
else
! on GPU
if( nspec2D_ymax_crust_mantle > 0 ) call compute_stacey_elastic_cuda(Mesh_pointer, &
- absorb_ymax_crust_mantle, &
- 3) ! <= ymax
+ absorb_ymax_crust_mantle, &
+ 3) ! <= ymax
endif
! writes absorbing boundary values
@@ -401,8 +401,8 @@
else
! on GPU
if( nspec2D_xmin_crust_mantle > 0 ) call compute_stacey_elastic_backward_cuda(Mesh_pointer, &
- absorb_xmin_crust_mantle, &
- 0) ! <= xmin
+ absorb_xmin_crust_mantle, &
+ 0) ! <= xmin
endif
endif ! NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC
@@ -438,8 +438,8 @@
else
! on GPU
if( nspec2D_xmax_crust_mantle > 0 ) call compute_stacey_elastic_backward_cuda(Mesh_pointer, &
- absorb_xmax_crust_mantle, &
- 1) ! <= xmin
+ absorb_xmax_crust_mantle, &
+ 1) ! <= xmax
endif
endif ! NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AB
@@ -473,8 +473,8 @@
else
! on GPU
if( nspec2D_ymin_crust_mantle > 0 ) call compute_stacey_elastic_backward_cuda(Mesh_pointer, &
- absorb_ymin_crust_mantle, &
- 2) ! <= ymin
+ absorb_ymin_crust_mantle, &
+ 2) ! <= ymin
endif
! ymax
@@ -506,8 +506,8 @@
else
! on GPU
if( nspec2D_ymax_crust_mantle > 0 ) call compute_stacey_elastic_backward_cuda(Mesh_pointer, &
- absorb_ymax_crust_mantle, &
- 3) ! <= ymax
+ absorb_ymax_crust_mantle, &
+ 3) ! <= ymax
endif
end subroutine compute_stacey_crust_mantle_backward
@@ -542,9 +542,7 @@
njmin_crust_mantle,njmax_crust_mantle, &
nkmin_xi_crust_mantle,nkmin_eta_crust_mantle, &
nspec2D_xmin_crust_mantle,nspec2D_xmax_crust_mantle, &
- nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle, &
- absorb_xmin_crust_mantle,absorb_xmax_crust_mantle, &
- absorb_ymin_crust_mantle,absorb_ymax_crust_mantle
+ nspec2D_ymin_crust_mantle,nspec2D_ymax_crust_mantle
implicit none
@@ -566,9 +564,6 @@
if( SIMULATION_TYPE /= 3 ) return
if( SAVE_FORWARD ) return
- ! daniel debug
- if( GPU_MODE ) stop 'error compute_stacey_crust_mantle_backward_undoatt not implemented yet for GPU simulations'
-
! crust & mantle
! xmin
@@ -615,9 +610,7 @@
else
! on GPU
- if( nspec2D_xmin_crust_mantle > 0 ) call compute_stacey_elastic_cuda(Mesh_pointer, &
- absorb_xmin_crust_mantle, &
- 0) ! <= xmin
+ if( nspec2D_xmin_crust_mantle > 0 ) call compute_stacey_elastic_undoatt_cuda(Mesh_pointer,0) ! <= xmin
endif
endif ! NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AC
@@ -666,9 +659,7 @@
else
! on GPU
- if( nspec2D_xmax_crust_mantle > 0 ) call compute_stacey_elastic_cuda(Mesh_pointer, &
- absorb_xmax_crust_mantle, &
- 1) ! <= xmin
+ if( nspec2D_xmax_crust_mantle > 0 ) call compute_stacey_elastic_undoatt_cuda(Mesh_pointer,1) ! <= xmax
endif
endif ! NCHUNKS_VAL == 1 .or. ichunk == CHUNK_AB
@@ -715,9 +706,7 @@
else
! on GPU
- if( nspec2D_ymin_crust_mantle > 0 ) call compute_stacey_elastic_cuda(Mesh_pointer, &
- absorb_ymin_crust_mantle, &
- 2) ! <= ymin
+ if( nspec2D_ymin_crust_mantle > 0 ) call compute_stacey_elastic_undoatt_cuda(Mesh_pointer,2) ! <= ymin
endif
! ymax
@@ -762,9 +751,7 @@
else
! on GPU
- if( nspec2D_ymax_crust_mantle > 0 ) call compute_stacey_elastic_cuda(Mesh_pointer, &
- absorb_ymax_crust_mantle, &
- 3) ! <= ymax
+ if( nspec2D_ymax_crust_mantle > 0 ) call compute_stacey_elastic_undoatt_cuda(Mesh_pointer,3) ! <= ymax
endif
end subroutine compute_stacey_crust_mantle_backward_undoatt
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_stacey_outer_core.f90 2013-09-13 11:23:45 UTC (rev 22782)
@@ -548,9 +548,6 @@
if( SIMULATION_TYPE /= 3 ) return
if( SAVE_FORWARD ) return
- ! daniel debug
- if( GPU_MODE ) stop 'error compute_stacey_outer_core_backward_undoatt not implemented yet for GPU simulations'
-
! outer core
! xmin
@@ -583,9 +580,7 @@
else
! on GPU
- if( nspec2D_xmin_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
- absorb_xmin_outer_core, &
- 4) ! <= xmin
+ if( nspec2D_xmin_outer_core > 0 ) call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,4) ! <= xmin
endif
endif
@@ -620,9 +615,7 @@
else
! on GPU
- if( nspec2D_xmax_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
- absorb_xmax_outer_core, &
- 5) ! <= xmax
+ if( nspec2D_xmax_outer_core > 0 ) call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,5) ! <= xmax
endif
endif
@@ -655,9 +648,7 @@
else
! on GPU
- if( nspec2D_ymin_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
- absorb_ymin_outer_core, &
- 6) ! <= ymin
+ if( nspec2D_ymin_outer_core > 0 ) call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,6) ! <= ymin
endif
! ymax
@@ -688,9 +679,7 @@
else
! on GPU
- if( nspec2D_ymax_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
- absorb_ymax_outer_core, &
- 7) ! <= ymax
+ if( nspec2D_ymax_outer_core > 0 ) call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,7) ! <= ymax
endif
! zmin
@@ -720,9 +709,7 @@
else
! on GPU
- if( nspec2D_zmin_outer_core > 0 ) call compute_stacey_acoustic_cuda(Mesh_pointer, &
- absorb_zmin_outer_core, &
- 8) ! <= zmin
+ if( nspec2D_zmin_outer_core > 0 ) call compute_stacey_acoustic_undoatt_cuda(Mesh_pointer,8) ! <= zmin
endif
end subroutine compute_stacey_outer_core_backward_undoatt
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90 2013-09-13 11:23:45 UTC (rev 22782)
@@ -2161,23 +2161,23 @@
! inner core region
if(myrank == 0 ) write(IMAIN,*) " loading inner core region"
call prepare_inner_core_device(Mesh_pointer, &
- xix_inner_core,xiy_inner_core,xiz_inner_core, &
- etax_inner_core,etay_inner_core,etaz_inner_core, &
- gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
- rhostore_inner_core,kappavstore_inner_core,muvstore_inner_core, &
- rmassx_inner_core,rmassy_inner_core,rmassz_inner_core, &
- b_rmassx_inner_core,b_rmassy_inner_core, &
- ibool_inner_core, &
- xstore_inner_core,ystore_inner_core,zstore_inner_core, &
- c11store_inner_core,c12store_inner_core,c13store_inner_core, &
- c33store_inner_core,c44store_inner_core, &
- idoubling_inner_core, &
- num_phase_ispec_inner_core,phase_ispec_inner_inner_core, &
- nspec_outer_inner_core,nspec_inner_inner_core, &
- NSPEC2D_TOP(IREGION_INNER_CORE), &
- ibelm_top_inner_core, &
- num_colors_outer_inner_core,num_colors_inner_inner_core, &
- num_elem_colors_inner_core)
+ xix_inner_core,xiy_inner_core,xiz_inner_core, &
+ etax_inner_core,etay_inner_core,etaz_inner_core, &
+ gammax_inner_core,gammay_inner_core,gammaz_inner_core, &
+ rhostore_inner_core,kappavstore_inner_core,muvstore_inner_core, &
+ rmassx_inner_core,rmassy_inner_core,rmassz_inner_core, &
+ b_rmassx_inner_core,b_rmassy_inner_core, &
+ ibool_inner_core, &
+ xstore_inner_core,ystore_inner_core,zstore_inner_core, &
+ c11store_inner_core,c12store_inner_core,c13store_inner_core, &
+ c33store_inner_core,c44store_inner_core, &
+ idoubling_inner_core, &
+ num_phase_ispec_inner_core,phase_ispec_inner_inner_core, &
+ nspec_outer_inner_core,nspec_inner_inner_core, &
+ NSPEC2D_TOP(IREGION_INNER_CORE), &
+ ibelm_top_inner_core, &
+ num_colors_outer_inner_core,num_colors_inner_inner_core, &
+ num_elem_colors_inner_core)
! transfer forward and backward fields to device with initial values
if(myrank == 0 ) write(IMAIN,*) " transfering initial wavefield"
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_forward_arrays.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_forward_arrays.f90 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_forward_arrays.f90 2013-09-13 11:23:45 UTC (rev 22782)
@@ -198,6 +198,7 @@
! transfers fields onto GPU
if(GPU_MODE) then
+ ! transfers initialized wavefields to GPU device
call transfer_b_fields_cm_to_device(NDIM*NGLOB_CRUST_MANTLE, &
b_displ_crust_mantle,b_veloc_crust_mantle,b_accel_crust_mantle, &
Mesh_pointer)
@@ -209,7 +210,7 @@
call transfer_b_fields_oc_to_device(NGLOB_OUTER_CORE, &
b_displ_outer_core,b_veloc_outer_core,b_accel_outer_core, &
Mesh_pointer)
-
+ ! strain
if( .not. UNDO_ATTENUATION ) then
call transfer_b_strain_cm_to_device(Mesh_pointer, &
b_epsilondev_xx_crust_mantle,b_epsilondev_yy_crust_mantle, &
@@ -221,20 +222,20 @@
b_epsilondev_xy_inner_core,b_epsilondev_xz_inner_core, &
b_epsilondev_yz_inner_core)
endif
-
+ ! rotation
if (ROTATION_VAL) then
call transfer_b_rotation_to_device(Mesh_pointer,b_A_array_rotation,b_B_array_rotation)
endif
-
+ ! attenuation memory variables
if (ATTENUATION_VAL) then
call transfer_b_rmemory_cm_to_device(Mesh_pointer, &
- b_R_xx_crust_mantle,b_R_yy_crust_mantle, &
- b_R_xy_crust_mantle,b_R_xz_crust_mantle, &
- b_R_yz_crust_mantle)
+ b_R_xx_crust_mantle,b_R_yy_crust_mantle, &
+ b_R_xy_crust_mantle,b_R_xz_crust_mantle, &
+ b_R_yz_crust_mantle)
call transfer_b_rmemory_ic_to_device(Mesh_pointer, &
- b_R_xx_inner_core,b_R_yy_inner_core, &
- b_R_xy_inner_core,b_R_xz_inner_core, &
- b_R_yz_inner_core)
+ b_R_xx_inner_core,b_R_yy_inner_core, &
+ b_R_xy_inner_core,b_R_xz_inner_core, &
+ b_R_yz_inner_core)
endif
endif
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.F90 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/read_mesh_databases.F90 2013-09-13 11:23:45 UTC (rev 22782)
@@ -819,18 +819,18 @@
endif
allocate(buffer_send_vector_crust_mantle(NDIM,max_nibool_interfaces_cm,num_interfaces_crust_mantle), &
- buffer_recv_vector_crust_mantle(NDIM,max_nibool_interfaces_cm,num_interfaces_crust_mantle), &
- request_send_vector_cm(num_interfaces_crust_mantle), &
- request_recv_vector_cm(num_interfaces_crust_mantle), &
- stat=ier)
+ buffer_recv_vector_crust_mantle(NDIM,max_nibool_interfaces_cm,num_interfaces_crust_mantle), &
+ request_send_vector_cm(num_interfaces_crust_mantle), &
+ request_recv_vector_cm(num_interfaces_crust_mantle), &
+ stat=ier)
if( ier /= 0 ) call exit_mpi(myrank,'error allocating array buffer_send_vector_crust_mantle etc.')
if( SIMULATION_TYPE == 3 ) then
allocate(b_buffer_send_vector_cm(NDIM,max_nibool_interfaces_cm,num_interfaces_crust_mantle), &
- b_buffer_recv_vector_cm(NDIM,max_nibool_interfaces_cm,num_interfaces_crust_mantle), &
- b_request_send_vector_cm(num_interfaces_crust_mantle), &
- b_request_recv_vector_cm(num_interfaces_crust_mantle), &
- stat=ier)
+ b_buffer_recv_vector_cm(NDIM,max_nibool_interfaces_cm,num_interfaces_crust_mantle), &
+ b_request_send_vector_cm(num_interfaces_crust_mantle), &
+ b_request_recv_vector_cm(num_interfaces_crust_mantle), &
+ stat=ier)
if( ier /= 0 ) call exit_mpi(myrank,'error allocating array b_buffer_send_vector_cm etc.')
endif
@@ -842,18 +842,18 @@
endif
allocate(buffer_send_scalar_outer_core(max_nibool_interfaces_oc,num_interfaces_outer_core), &
- buffer_recv_scalar_outer_core(max_nibool_interfaces_oc,num_interfaces_outer_core), &
- request_send_scalar_oc(num_interfaces_outer_core), &
- request_recv_scalar_oc(num_interfaces_outer_core), &
- stat=ier)
+ buffer_recv_scalar_outer_core(max_nibool_interfaces_oc,num_interfaces_outer_core), &
+ request_send_scalar_oc(num_interfaces_outer_core), &
+ request_recv_scalar_oc(num_interfaces_outer_core), &
+ stat=ier)
if( ier /= 0 ) call exit_mpi(myrank,'error allocating array buffer_send_vector_outer_core etc.')
if( SIMULATION_TYPE == 3 ) then
allocate(b_buffer_send_scalar_outer_core(max_nibool_interfaces_oc,num_interfaces_outer_core), &
- b_buffer_recv_scalar_outer_core(max_nibool_interfaces_oc,num_interfaces_outer_core), &
- b_request_send_scalar_oc(num_interfaces_outer_core), &
- b_request_recv_scalar_oc(num_interfaces_outer_core), &
- stat=ier)
+ b_buffer_recv_scalar_outer_core(max_nibool_interfaces_oc,num_interfaces_outer_core), &
+ b_request_send_scalar_oc(num_interfaces_outer_core), &
+ b_request_recv_scalar_oc(num_interfaces_outer_core), &
+ stat=ier)
if( ier /= 0 ) call exit_mpi(myrank,'error allocating array b_buffer_send_vector_outer_core etc.')
endif
@@ -865,18 +865,18 @@
endif
allocate(buffer_send_vector_inner_core(NDIM,max_nibool_interfaces_ic,num_interfaces_inner_core), &
- buffer_recv_vector_inner_core(NDIM,max_nibool_interfaces_ic,num_interfaces_inner_core), &
- request_send_vector_ic(num_interfaces_inner_core), &
- request_recv_vector_ic(num_interfaces_inner_core), &
- stat=ier)
+ buffer_recv_vector_inner_core(NDIM,max_nibool_interfaces_ic,num_interfaces_inner_core), &
+ request_send_vector_ic(num_interfaces_inner_core), &
+ request_recv_vector_ic(num_interfaces_inner_core), &
+ stat=ier)
if( ier /= 0 ) call exit_mpi(myrank,'error allocating array buffer_send_vector_inner_core etc.')
if( SIMULATION_TYPE == 3 ) then
allocate(b_buffer_send_vector_inner_core(NDIM,max_nibool_interfaces_ic,num_interfaces_inner_core), &
- b_buffer_recv_vector_inner_core(NDIM,max_nibool_interfaces_ic,num_interfaces_inner_core), &
- b_request_send_vector_ic(num_interfaces_inner_core), &
- b_request_recv_vector_ic(num_interfaces_inner_core), &
- stat=ier)
+ b_buffer_recv_vector_inner_core(NDIM,max_nibool_interfaces_ic,num_interfaces_inner_core), &
+ b_request_send_vector_ic(num_interfaces_inner_core), &
+ b_request_recv_vector_ic(num_interfaces_inner_core), &
+ stat=ier)
if( ier /= 0 ) call exit_mpi(myrank,'error allocating array b_buffer_send_vector_inner_core etc.')
endif
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/update_displacement_Newmark.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/update_displacement_Newmark.f90 2013-09-13 01:26:30 UTC (rev 22781)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/update_displacement_Newmark.f90 2013-09-13 11:23:45 UTC (rev 22782)
@@ -86,12 +86,12 @@
else
! on GPU
! Includes FORWARD_OR_ADJOINT == 1
+ ! crust/mantle region
+ call update_displacement_cm_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,1)
! outer core region
call update_displacement_oc_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,1)
! inner core region
call update_displacement_ic_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,1)
- ! crust/mantle region
- call update_displacement_cm_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,1)
endif
end subroutine update_displacement_Newmark
@@ -129,12 +129,12 @@
else
! on GPU
! Includes FORWARD_OR_ADJOINT == 3
+ ! crust/mantle region
+ call update_displacement_cm_cuda(Mesh_pointer,b_deltat,b_deltatsqover2,b_deltatover2,3)
! outer core region
call update_displacement_oc_cuda(Mesh_pointer,b_deltat,b_deltatsqover2,b_deltatover2,3)
! inner core region
call update_displacement_ic_cuda(Mesh_pointer,b_deltat,b_deltatsqover2,b_deltatover2,3)
- ! crust/mantle region
- call update_displacement_cm_cuda(Mesh_pointer,b_deltat,b_deltatsqover2,b_deltatover2,3)
endif
end subroutine update_displacement_Newmark_backward
@@ -231,7 +231,7 @@
else
! on GPU
! includes FORWARD_OR_ADJOINT == 1
- call kernel_3_outer_core_cuda(Mesh_pointer,deltatover2,1)
+ call update_veloc_acoustic_cuda(Mesh_pointer,deltatover2,1)
endif
end subroutine update_veloc_acoustic_newmark
@@ -257,7 +257,7 @@
else
! on GPU
! includes FORWARD_OR_ADJOINT == 3
- call kernel_3_outer_core_cuda(Mesh_pointer,b_deltatover2,3)
+ call update_veloc_acoustic_cuda(Mesh_pointer,b_deltatover2,3)
endif
end subroutine update_veloc_acoustic_newmark_backward
@@ -321,7 +321,7 @@
else
! on GPU
! includes FORWARD_OR_ADJOINT == 1
- call update_veloc_3_b_cuda(Mesh_pointer,deltatover2,1)
+ call update_veloc_elastic_cuda(Mesh_pointer,deltatover2,1)
endif
@@ -350,7 +350,7 @@
else
! on GPU
! includes FORWARD_OR_ADJOINT == 3
- call update_veloc_3_b_cuda(Mesh_pointer,b_deltatover2,3)
+ call update_veloc_elastic_cuda(Mesh_pointer,b_deltatover2,3)
endif
More information about the CIG-COMMITS
mailing list