[cig-commits] r20512 - in seismo/3D/SPECFEM3D/trunk/src: cuda specfem3D
danielpeter at geodynamics.org
danielpeter at geodynamics.org
Tue Jul 10 11:22:43 PDT 2012
Author: danielpeter
Date: 2012-07-10 11:22:43 -0700 (Tue, 10 Jul 2012)
New Revision: 20512
Modified:
seismo/3D/SPECFEM3D/trunk/src/cuda/check_fields_cuda.cu
seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_acoustic_cuda.cu
seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_elastic_cuda.cu
seismo/3D/SPECFEM3D/trunk/src/cuda/it_update_displacement_cuda.cu
seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h
seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu
seismo/3D/SPECFEM3D/trunk/src/specfem3D/assemble_MPI_vector.f90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic.F90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/iterate_time.f90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/prepare_timerun.F90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/setup_sources_receivers.f90
Log:
bug fix in stability check for large meshes on GPUs; updates GPU handling for single mpi-process runs; outcomments unused GPU mesh constants
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/check_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/check_fields_cuda.cu 2012-07-10 10:09:17 UTC (rev 20511)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/check_fields_cuda.cu 2012-07-10 18:22:43 UTC (rev 20512)
@@ -303,12 +303,13 @@
*/
// reduction example:
- __shared__ realw sdata[256] ;
+ __shared__ realw sdata[BLOCKSIZE_TRANSFER] ;
// load shared mem
unsigned int tid = threadIdx.x;
- unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
-
+ unsigned int bx = blockIdx.y*gridDim.x+blockIdx.x;
+ unsigned int i = tid + bx*blockDim.x;
+
// loads absolute values into shared memory
sdata[tid] = (i < size) ? fabs(array[i]) : 0.0 ;
@@ -327,7 +328,7 @@
}
// write result for this block to global mem
- if (tid == 0) d_max[blockIdx.x] = sdata[0];
+ if (tid == 0) d_max[bx] = sdata[0];
}
@@ -381,15 +382,21 @@
// way 2 b: timing Elapsed time: 1.236916e-03
// launch simple reduction kernel
realw* h_max;
- int blocksize = 256;
-
- int num_blocks_x = (int) ceil(mp->NGLOB_AB/blocksize);
+ int blocksize = BLOCKSIZE_TRANSFER;
+ int size_padded = ((int)ceil(((double)mp->NGLOB_AB)/((double)blocksize)))*blocksize;
+ int num_blocks_x = size_padded/blocksize;
+ int num_blocks_y = 1;
+ while(num_blocks_x > 65535) {
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+ num_blocks_y = num_blocks_y*2;
+ }
+
//printf("num_blocks_x %i \n",num_blocks_x);
- h_max = (realw*) calloc(num_blocks_x,sizeof(realw));
- cudaMalloc((void**)&d_max,num_blocks_x*sizeof(realw));
+ h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
+ cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
- dim3 grid(num_blocks_x,1);
+ dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
if(*SIMULATION_TYPE == 1 ){
@@ -404,11 +411,12 @@
d_max);
}
- print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(realw),cudaMemcpyDeviceToHost),222);
+ print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
+ cudaMemcpyDeviceToHost),222);
// determines max for all blocks
max = h_max[0];
- for(int i=1;i<num_blocks_x;i++) {
+ for(int i=1;i<num_blocks_x*num_blocks_y;i++) {
if( max < h_max[i]) max = h_max[i];
}
@@ -457,7 +465,7 @@
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
//double end_time = get_time();
//printf("Elapsed time: %e\n",end_time-start_time);
- exit_on_cuda_error("after get_norm_acoustic_from_device");
+ exit_on_cuda_error("get_norm_acoustic_from_device");
#endif
}
@@ -470,11 +478,12 @@
__global__ void get_maximum_vector_kernel(realw* array, int size, realw* d_max){
// reduction example:
- __shared__ realw sdata[256] ;
+ __shared__ realw sdata[BLOCKSIZE_TRANSFER] ;
// load shared mem
unsigned int tid = threadIdx.x;
- unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
+ unsigned int bx = blockIdx.y*gridDim.x+blockIdx.x;
+ unsigned int i = tid + bx*blockDim.x;
// loads values into shared memory: assume array is a vector array
sdata[tid] = (i < size) ? sqrt(array[i*3]*array[i*3]
@@ -496,7 +505,7 @@
}
// write result for this block to global mem
- if (tid == 0) d_max[blockIdx.x] = sdata[0];
+ if (tid == 0) d_max[bx] = sdata[0];
}
@@ -505,8 +514,8 @@
extern "C"
void FC_FUNC_(get_norm_elastic_from_device,
GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
- long* Mesh_pointer_f,
- int* SIMULATION_TYPE) {
+ long* Mesh_pointer_f,
+ int* SIMULATION_TYPE) {
TRACE("get_norm_elastic_from_device");
//double start_time = get_time();
@@ -515,19 +524,24 @@
realw max;
realw *d_max;
- max = 0;
+ max = 0.0;
// launch simple reduction kernel
realw* h_max;
- int blocksize = 256;
-
- int num_blocks_x = (int) ceil(mp->NGLOB_AB/blocksize);
+ int blocksize = BLOCKSIZE_TRANSFER;
+ int size_padded = ((int)ceil(((double)mp->NGLOB_AB)/((double)blocksize)))*blocksize;
+ int num_blocks_x = size_padded/blocksize;
+ int num_blocks_y = 1;
+ while(num_blocks_x > 65535) {
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+ num_blocks_y = num_blocks_y*2;
+ }
+
//printf("num_blocks_x %i \n",num_blocks_x);
+ h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
+ cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
- h_max = (realw*) calloc(num_blocks_x,sizeof(realw));
- cudaMalloc((void**)&d_max,num_blocks_x*sizeof(realw));
-
- dim3 grid(num_blocks_x,1);
+ dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
if(*SIMULATION_TYPE == 1 ){
@@ -542,11 +556,18 @@
d_max);
}
- print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(realw),cudaMemcpyDeviceToHost),222);
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ //double end_time = get_time();
+ //printf("Elapsed time: %e\n",end_time-start_time);
+ exit_on_cuda_error("kernel get_norm_elastic_from_device");
+#endif
+
+ print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
+ cudaMemcpyDeviceToHost),222);
// determines max for all blocks
max = h_max[0];
- for(int i=1;i<num_blocks_x;i++) {
+ for(int i=1;i<num_blocks_x*num_blocks_y;i++) {
if( max < h_max[i]) max = h_max[i];
}
@@ -559,7 +580,7 @@
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
//double end_time = get_time();
//printf("Elapsed time: %e\n",end_time-start_time);
- exit_on_cuda_error("after get_norm_elastic_from_device");
+ exit_on_cuda_error("get_norm_elastic_from_device");
#endif
}
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_acoustic_cuda.cu 2012-07-10 10:09:17 UTC (rev 20511)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_acoustic_cuda.cu 2012-07-10 18:22:43 UTC (rev 20512)
@@ -185,6 +185,8 @@
// cudaEventCreate(&start);
// cudaEventCreate(&stop);
// cudaEventRecord( start, 0 );
+
+ if( *num_interfaces_ext_mesh == 0 ) return;
// copies buffer onto GPU
cudaMemcpy(mp->d_send_potential_dot_dot_buffer, buffer_recv_scalar_ext_mesh,
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_elastic_cuda.cu 2012-07-10 10:09:17 UTC (rev 20511)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_elastic_cuda.cu 2012-07-10 18:22:43 UTC (rev 20512)
@@ -104,51 +104,53 @@
if( *num_interfaces_ext_mesh == 0 ) return;
- int blocksize = BLOCKSIZE_TRANSFER;
- int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ if( mp->size_mpi_buffer > 0 ){
+ int blocksize = BLOCKSIZE_TRANSFER;
+ int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
+ int num_blocks_x = size_padded/blocksize;
+ int num_blocks_y = 1;
+ while(num_blocks_x > 65535) {
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+ num_blocks_y = num_blocks_y*2;
+ }
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(blocksize,1,1);
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
- //timing for memory xfer
- // cudaEvent_t start, stop;
- // realw time;
- // cudaEventCreate(&start);
- // cudaEventCreate(&stop);
- // cudaEventRecord( start, 0 );
- if(*FORWARD_OR_ADJOINT == 1) {
- prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,mp->d_send_accel_buffer,
- mp->num_interfaces_ext_mesh,
- mp->max_nibool_interfaces_ext_mesh,
- mp->d_nibool_interfaces_ext_mesh,
- mp->d_ibool_interfaces_ext_mesh);
- }
- else if(*FORWARD_OR_ADJOINT == 3) {
- prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel,mp->d_send_accel_buffer,
- mp->num_interfaces_ext_mesh,
- mp->max_nibool_interfaces_ext_mesh,
- mp->d_nibool_interfaces_ext_mesh,
- mp->d_ibool_interfaces_ext_mesh);
- }
+ //timing for memory xfer
+ // cudaEvent_t start, stop;
+ // realw time;
+ // cudaEventCreate(&start);
+ // cudaEventCreate(&stop);
+ // cudaEventRecord( start, 0 );
+ if(*FORWARD_OR_ADJOINT == 1) {
+ prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,mp->d_send_accel_buffer,
+ mp->num_interfaces_ext_mesh,
+ mp->max_nibool_interfaces_ext_mesh,
+ mp->d_nibool_interfaces_ext_mesh,
+ mp->d_ibool_interfaces_ext_mesh);
+ }
+ else if(*FORWARD_OR_ADJOINT == 3) {
+ prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel,mp->d_send_accel_buffer,
+ mp->num_interfaces_ext_mesh,
+ mp->max_nibool_interfaces_ext_mesh,
+ mp->d_nibool_interfaces_ext_mesh,
+ mp->d_ibool_interfaces_ext_mesh);
+ }
+ // copies buffer from GPU to CPU host
+ cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer,
+ mp->size_mpi_buffer*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer,
- 3*mp->max_nibool_interfaces_ext_mesh*mp->num_interfaces_ext_mesh*sizeof(realw),
- cudaMemcpyDeviceToHost);
-
- // finish timing of kernel+memcpy
- // cudaEventRecord( stop, 0 );
- // cudaEventSynchronize( stop );
- // cudaEventElapsedTime( &time, start, stop );
- // cudaEventDestroy( start );
- // cudaEventDestroy( stop );
- // printf("boundary xfer d->h Time: %f ms\n",time);
+ // finish timing of kernel+memcpy
+ // cudaEventRecord( stop, 0 );
+ // cudaEventSynchronize( stop );
+ // cudaEventElapsedTime( &time, start, stop );
+ // cudaEventDestroy( start );
+ // cudaEventDestroy( stop );
+ // printf("boundary xfer d->h Time: %f ms\n",time);
+ }
+
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("transfer_boun_accel_from_device");
#endif
@@ -167,17 +169,19 @@
Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+ if( mp->size_mpi_buffer > 0 ){
+
//daniel: todo - check below with this...
- int blocksize = BLOCKSIZE_TRANSFER;
- int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(blocksize,1,1);
+ int blocksize = BLOCKSIZE_TRANSFER;
+ int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
+ int num_blocks_x = size_padded/blocksize;
+ int num_blocks_y = 1;
+ while(num_blocks_x > 65535) {
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+ num_blocks_y = num_blocks_y*2;
+ }
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
/*
//daniel: todo - check originally used...
@@ -193,25 +197,24 @@
dim3 threads(blocksize,1,1);
*/
- prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,mp->d_send_accel_buffer,
+ prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,mp->d_send_accel_buffer,
mp->num_interfaces_ext_mesh,
mp->max_nibool_interfaces_ext_mesh,
mp->d_nibool_interfaces_ext_mesh,
mp->d_ibool_interfaces_ext_mesh);
// wait until kernel is finished before starting async memcpy
#if CUDA_VERSION >= 4000
- cudaDeviceSynchronize();
+ cudaDeviceSynchronize();
#else
- cudaThreadSynchronize();
+ cudaThreadSynchronize();
#endif
- cudaMemcpyAsync(mp->h_send_accel_buffer,mp->d_send_accel_buffer,
- 3* mp->max_nibool_interfaces_ext_mesh* mp->num_interfaces_ext_mesh*sizeof(realw),
- cudaMemcpyDeviceToHost,mp->copy_stream);
- // cudaMemcpyAsync(mp->h_send_accel_buffer,mp->d_send_accel_buffer,
- // 3* mp->max_nibool_interfaces_ext_mesh* mp->num_interfaces_ext_mesh*sizeof(realw),
- // cudaMemcpyDeviceToHost,mp->compute_stream);
-
+ cudaMemcpyAsync(mp->h_send_accel_buffer,mp->d_send_accel_buffer,
+ mp->size_mpi_buffer*sizeof(realw),cudaMemcpyDeviceToHost,mp->copy_stream);
+ // cudaMemcpyAsync(mp->h_send_accel_buffer,mp->d_send_accel_buffer,
+ // 3* mp->max_nibool_interfaces_ext_mesh* mp->num_interfaces_ext_mesh*sizeof(realw),
+ // cudaMemcpyDeviceToHost,mp->compute_stream);
+ }
}
/* ----------------------------------------------------------------------------------------------- */
@@ -271,19 +274,16 @@
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
- memcpy(mp->h_recv_accel_buffer,buffer_recv_vector_ext_mesh,mp->size_mpi_recv_buffer*sizeof(realw));
-
- // cudaMemcpyAsync(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
- // 3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw),
- // cudaMemcpyHostToDevice,mp->compute_stream);
- //printf("xfer to device\n");
- cudaMemcpyAsync(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
- 3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw),
- cudaMemcpyHostToDevice,mp->copy_stream);
-
-
-
-
+ if( mp->size_mpi_buffer > 0 ){
+ // copy on host memory
+ memcpy(mp->h_recv_accel_buffer,buffer_recv_vector_ext_mesh,mp->size_mpi_buffer*sizeof(realw));
+
+ // cudaMemcpyAsync(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
+ // mp->size_mpi_buffer*sizeof(realw),cudaMemcpyHostToDevice,mp->compute_stream);
+ //printf("xfer to device\n");
+ cudaMemcpyAsync(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
+ mp->size_mpi_buffer*sizeof(realw),cudaMemcpyHostToDevice,mp->copy_stream);
+ }
}
/* ----------------------------------------------------------------------------------------------- */
@@ -371,55 +371,58 @@
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
- //daniel: todo - check if this copy is only needed for adjoint simulation, otherwise it is called asynchronously?
- if(*FORWARD_OR_ADJOINT == 1 ){
- // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
- cudaStreamSynchronize(mp->copy_stream);
- }
- else if(*FORWARD_OR_ADJOINT == 3 ){
- cudaMemcpy(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
- 3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw),
- cudaMemcpyHostToDevice);
- }
+ if( mp->size_mpi_buffer > 0 ){
+
+ //daniel: todo - check if this copy is only needed for adjoint simulation, otherwise it is called asynchronously?
+ if(*FORWARD_OR_ADJOINT == 1 ){
+ // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
+ cudaStreamSynchronize(mp->copy_stream);
+ }
+ else if(*FORWARD_OR_ADJOINT == 3 ){
+ cudaMemcpy(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
+ mp->size_mpi_buffer*sizeof(realw),cudaMemcpyHostToDevice);
+ }
- int blocksize = BLOCKSIZE_TRANSFER;
- int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int blocksize = BLOCKSIZE_TRANSFER;
+ int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
+ int num_blocks_x = size_padded/blocksize;
+ int num_blocks_y = 1;
+ while(num_blocks_x > 65535) {
+ num_blocks_x = (int) ceil(num_blocks_x*0.5f);
+ num_blocks_y = num_blocks_y*2;
+ }
- //double start_time = get_time();
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(blocksize,1,1);
- // cudaEvent_t start, stop;
- // realw time;
- // cudaEventCreate(&start);
- // cudaEventCreate(&stop);
- // cudaEventRecord( start, 0 );
- if(*FORWARD_OR_ADJOINT == 1) { //assemble forward accel
- assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel, mp->d_send_accel_buffer,
- mp->num_interfaces_ext_mesh,
- mp->max_nibool_interfaces_ext_mesh,
- mp->d_nibool_interfaces_ext_mesh,
- mp->d_ibool_interfaces_ext_mesh);
+ //double start_time = get_time();
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
+ // cudaEvent_t start, stop;
+ // realw time;
+ // cudaEventCreate(&start);
+ // cudaEventCreate(&stop);
+ // cudaEventRecord( start, 0 );
+ if(*FORWARD_OR_ADJOINT == 1) { //assemble forward accel
+ assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel, mp->d_send_accel_buffer,
+ mp->num_interfaces_ext_mesh,
+ mp->max_nibool_interfaces_ext_mesh,
+ mp->d_nibool_interfaces_ext_mesh,
+ mp->d_ibool_interfaces_ext_mesh);
+ }
+ else if(*FORWARD_OR_ADJOINT == 3) { //assemble adjoint accel
+ assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel, mp->d_send_accel_buffer,
+ mp->num_interfaces_ext_mesh,
+ mp->max_nibool_interfaces_ext_mesh,
+ mp->d_nibool_interfaces_ext_mesh,
+ mp->d_ibool_interfaces_ext_mesh);
+ }
+
+ // cudaEventRecord( stop, 0 );
+ // cudaEventSynchronize( stop );
+ // cudaEventElapsedTime( &time, start, stop );
+ // cudaEventDestroy( start );
+ // cudaEventDestroy( stop );
+ // printf("Boundary Assemble Kernel Execution Time: %f ms\n",time);
}
- else if(*FORWARD_OR_ADJOINT == 3) { //assemble adjoint accel
- assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel, mp->d_send_accel_buffer,
- mp->num_interfaces_ext_mesh,
- mp->max_nibool_interfaces_ext_mesh,
- mp->d_nibool_interfaces_ext_mesh,
- mp->d_ibool_interfaces_ext_mesh);
- }
-
- // cudaEventRecord( stop, 0 );
- // cudaEventSynchronize( stop );
- // cudaEventElapsedTime( &time, start, stop );
- // cudaEventDestroy( start );
- // cudaEventDestroy( stop );
- // printf("Boundary Assemble Kernel Execution Time: %f ms\n",time);
+
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
//double end_time = get_time();
//printf("Elapsed time: %e\n",end_time-start_time);
@@ -1629,6 +1632,14 @@
color_offset_nonpadded += nb_blocks_to_compute * NGLL3;
// for factor_common array
color_offset_nonpadded_att2 += nb_blocks_to_compute * NGLL3 * N_SLS;
+
+ //daniel: we use the same stream, so kernels are executed one after the other
+ // synchronizes in case we run on only 1 process to avoid race-conditions
+ //if( mp->NPROC == 1 ){
+ // // Wait until previous compute stream finishes.
+ // cudaStreamSynchronize(mp->compute_stream);
+ //}
+
}
}else{
@@ -1706,7 +1717,7 @@
// // There have been problems using the pinned-memory with MPI, so
// // we copy the buffer into a non-pinned region.
// memcpy(mp->send_buffer,mp->h_send_accel_buffer,
-// mp->size_mpi_send_buffer*sizeof(float));
+// mp->size_mpi_buffer*sizeof(float));
//
// // memory copy is now finished, so non-blocking MPI send can proceed
// // MPI based halo exchange
@@ -1744,19 +1755,16 @@
// Wait until async-memcpy of outer elements is finished and start MPI.
if( *iphase != 2 ){ exit_on_cuda_error("sync_copy_from_device must be called for iphase == 2"); }
- //if(*iphase==2) {
+ if( mp->size_mpi_buffer > 0 ){
+ // waits for asynchronous copy to finish
+ cudaStreamSynchronize(mp->copy_stream);
- // waits for asynchronous copy to finish
- cudaStreamSynchronize(mp->copy_stream);
-
- // There have been problems using the pinned-memory with MPI, so
- // we copy the buffer into a non-pinned region.
- memcpy(send_buffer,mp->h_send_accel_buffer,
- mp->size_mpi_send_buffer*sizeof(float));
-
+ // There have been problems using the pinned-memory with MPI, so
+ // we copy the buffer into a non-pinned region.
+ memcpy(send_buffer,mp->h_send_accel_buffer,
+ mp->size_mpi_buffer*sizeof(float));
+ }
// memory copy is now finished, so non-blocking MPI send can proceed
-
- //}
}
/* ----------------------------------------------------------------------------------------------- */
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/it_update_displacement_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/it_update_displacement_cuda.cu 2012-07-10 10:09:17 UTC (rev 20511)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/it_update_displacement_cuda.cu 2012-07-10 18:22:43 UTC (rev 20512)
@@ -28,15 +28,15 @@
#include <stdio.h>
#include <cuda.h>
-#include <cublas.h>
+//#include <cublas.h>
#include "config.h"
#include "mesh_constants_cuda.h"
-#define CUBLAS_ERROR(s,n) if (s != CUBLAS_STATUS_SUCCESS) { \
-fprintf (stderr, "CUBLAS Memory Write Error @ %d\n",n); \
-exit(EXIT_FAILURE); }
+//#define CUBLAS_ERROR(s,n) if (s != CUBLAS_STATUS_SUCCESS) { \
+//fprintf (stderr, "CUBLAS Memory Write Error @ %d\n",n); \
+//exit(EXIT_FAILURE); }
/* ----------------------------------------------------------------------------------------------- */
@@ -84,12 +84,6 @@
//int i,device;
int size = *size_F;
- realw deltat = *deltat_F;
- realw deltatsqover2 = *deltatsqover2_F;
- realw deltatover2 = *deltatover2_F;
- realw b_deltat = *b_deltat_F;
- realw b_deltatsqover2 = *b_deltatsqover2_F;
- realw b_deltatover2 = *b_deltatover2_F;
//cublasStatus status;
int blocksize = BLOCKSIZE_KERNEL1;
@@ -110,7 +104,11 @@
// exit_on_cuda_error("Before UpdateDispVeloc_kernel");
//#endif
- //launch kernel
+ realw deltat = *deltat_F;
+ realw deltatsqover2 = *deltatsqover2_F;
+ realw deltatover2 = *deltatover2_F;
+
+ //launch kernel
UpdateDispVeloc_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ,mp->d_veloc,mp->d_accel,
size,deltat,deltatsqover2,deltatover2);
@@ -123,7 +121,10 @@
// kernel for backward fields
if(*SIMULATION_TYPE == 3) {
-
+ realw b_deltat = *b_deltat_F;
+ realw b_deltatsqover2 = *b_deltatsqover2_F;
+ realw b_deltatover2 = *b_deltatover2_F;
+
UpdateDispVeloc_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ,mp->d_b_veloc,mp->d_b_accel,
size,b_deltat,b_deltatsqover2,b_deltatover2);
@@ -186,12 +187,6 @@
//int i,device;
int size = *size_F;
- realw deltat = *deltat_F;
- realw deltatsqover2 = *deltatsqover2_F;
- realw deltatover2 = *deltatover2_F;
- realw b_deltat = *b_deltat_F;
- realw b_deltatsqover2 = *b_deltatsqover2_F;
- realw b_deltatover2 = *b_deltatover2_F;
//cublasStatus status;
int blocksize = BLOCKSIZE_KERNEL1;
@@ -207,6 +202,10 @@
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
+ realw deltat = *deltat_F;
+ realw deltatsqover2 = *deltatsqover2_F;
+ realw deltatover2 = *deltatover2_F;
+
//launch kernel
UpdatePotential_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_potential_acoustic,
mp->d_potential_dot_acoustic,
@@ -214,6 +213,10 @@
size,deltat,deltatsqover2,deltatover2);
if(*SIMULATION_TYPE == 3) {
+ realw b_deltat = *b_deltat_F;
+ realw b_deltatsqover2 = *b_deltatsqover2_F;
+ realw b_deltatover2 = *b_deltatover2_F;
+
UpdatePotential_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_potential_acoustic,
mp->d_b_potential_dot_acoustic,
mp->d_b_potential_dot_dot_acoustic,
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h 2012-07-10 10:09:17 UTC (rev 20511)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h 2012-07-10 18:22:43 UTC (rev 20512)
@@ -71,8 +71,9 @@
#endif
// error checking after cuda function calls
-/* #define ENABLE_VERY_SLOW_ERROR_CHECKING */
+//#define ENABLE_VERY_SLOW_ERROR_CHECKING
+// helper functions
#define MAX(x,y) (((x) < (y)) ? (y) : (x))
double get_time();
@@ -124,7 +125,7 @@
// requires CUDA version >= 4.0, see check below
// Use textures for d_displ and d_accel -- 10% performance boost
#define USE_TEXTURES_FIELDS
-//
+
// Using texture memory for the hprime-style constants is slower on
// Fermi generation hardware, but *may* be faster on Kepler
// generation.
@@ -181,7 +182,7 @@
int NGLOB_AB;
int myrank;
- int NPROCS;
+ int NPROC;
// interpolators
realw* d_xix; realw* d_xiy; realw* d_xiz;
@@ -215,26 +216,23 @@
float* h_recv_accel_buffer;
float* h_recv_b_accel_buffer;
float* recv_buffer;
- int size_mpi_send_buffer;
- int size_mpi_recv_buffer;
+ int size_mpi_buffer;
-
-
// buffers and constants for the MPI-send required for async-memcpy
// + non-blocking MPI
- float* buffer_recv_vector_ext_mesh;
+ //daniel: check if needed
+ //float* buffer_recv_vector_ext_mesh;
int num_interfaces_ext_mesh;
int max_nibool_interfaces_ext_mesh;
- int* nibool_interfaces_ext_mesh;
- int* my_neighbours_ext_mesh;
- int* request_send_vector_ext_mesh;
- int* request_recv_vector_ext_mesh;
+ //int* nibool_interfaces_ext_mesh;
+ //int* my_neighbours_ext_mesh;
+ //int* request_send_vector_ext_mesh;
+ //int* request_recv_vector_ext_mesh;
-
// overlapped memcpy streams
cudaStream_t compute_stream;
cudaStream_t copy_stream;
- cudaStream_t b_copy_stream;
+ //cudaStream_t b_copy_stream;
// ------------------------------------------------------------------ //
// elastic wavefield parameters
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu 2012-07-10 10:09:17 UTC (rev 20511)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu 2012-07-10 18:22:43 UTC (rev 20512)
@@ -80,7 +80,7 @@
char hostname[256];
gethostname(hostname, sizeof(hostname));
printf("PID %d on %s:%d ready for attach\n", getpid(), hostname,myrank);
- FILE *file = fopen("/scratch/eiger/rietmann/attach_gdb.txt","w+");
+ FILE *file = fopen("./attach_gdb.txt","w+");
if (file != NULL){
fprintf(file,"PID %d on %s:%d ready for attach\n", getpid(), hostname,myrank);
fclose(file);
@@ -97,16 +97,35 @@
// sync and check to catch errors from previous async operations
cudaThreadSynchronize();
cudaError_t err = cudaGetLastError();
- if (err != cudaSuccess)
- {
- fprintf(stderr,"Error after %s: %s\n", kernel_name, cudaGetErrorString(err));
- pause_for_debugger(0);
- //free(kernel_name);
+ if (err != cudaSuccess){
+ fprintf(stderr,"Error after %s: %s\n", kernel_name, cudaGetErrorString(err));
+
+ //debugging
+ //pause_for_debugger(0);
+
+ // outputs error file
+ FILE* fp;
+ int myrank;
+ char filename[BUFSIZ];
#ifdef WITH_MPI
- MPI_Abort(MPI_COMM_WORLD,1);
+ MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+#else
+ myrank = 0;
+#endif
+ sprintf(filename,"../in_out_files/OUTPUT_FILES/error_message_%06d.txt",myrank);
+ fp = fopen(filename,"a+");
+ if (fp != NULL){
+ fprintf(fp,"Error after %s: %s\n", kernel_name, cudaGetErrorString(err));
+ fclose(fp);
+ }
+
+ // stops program
+ //free(kernel_name);
+#ifdef WITH_MPI
+ MPI_Abort(MPI_COMM_WORLD,1);
#endif
- exit(EXIT_FAILURE);
- }
+ exit(EXIT_FAILURE);
+ }
}
/* ----------------------------------------------------------------------------------------------- */
@@ -115,7 +134,25 @@
{
printf("\nERROR: %s\n",info);
fflush(stdout);
+
+ // outputs error file
+ FILE* fp;
+ int myrank;
+ char filename[BUFSIZ];
#ifdef WITH_MPI
+ MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+#else
+ myrank = 0;
+#endif
+ sprintf(filename,"../in_out_files/OUTPUT_FILES/error_message_%06d.txt",myrank);
+ fp = fopen(filename,"a+");
+ if (fp != NULL){
+ fprintf(fp,"ERROR: %s\n",info);
+ fclose(fp);
+ }
+
+ // stops program
+#ifdef WITH_MPI
MPI_Abort(MPI_COMM_WORLD,1);
#endif
//free(info);
@@ -131,7 +168,25 @@
{
printf("\nCUDA error !!!!! <%s> !!!!! \nat CUDA call error code: # %d\n",cudaGetErrorString(err),num);
fflush(stdout);
+
+ // outputs error file
+ FILE* fp;
+ int myrank;
+ char filename[BUFSIZ];
#ifdef WITH_MPI
+ MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
+#else
+ myrank = 0;
+#endif
+ sprintf(filename,"../in_out_files/OUTPUT_FILES/error_message_%06d.txt",myrank);
+ fp = fopen(filename,"a+");
+ if (fp != NULL){
+ fprintf(fp,"\nCUDA error !!!!! <%s> !!!!! \nat CUDA call error code: # %d\n",cudaGetErrorString(err),num);
+ fclose(fp);
+ }
+
+ // stops program
+#ifdef WITH_MPI
MPI_Abort(MPI_COMM_WORLD,1);
#endif
exit(EXIT_FAILURE);
@@ -491,9 +546,8 @@
if (device_count == 0) exit_on_error("CUDA runtime error: there is no device supporting CUDA\n");
*ncuda_devices = device_count;
-
// Sets the active device
- if(device_count > 1) {
+ if(device_count >= 1) {
// generalized for more GPUs per node
// note: without previous context release, cudaSetDevice will complain with the cuda error
// "setting the device when a process is active is not allowed"
@@ -631,16 +685,15 @@
exit_on_error("NGLLX must be 5 for CUDA devices");
}
-
+ // sets number of processes
#ifdef WITH_MPI
int nproc;
MPI_Comm_size(MPI_COMM_WORLD,&nproc);
- mp->NPROCS=nproc;
+ mp->NPROC = nproc;
#else
- mp->NPROCS = 1;
+ mp->NPROC = 1;
#endif
-
// sets global parameters
mp->NSPEC_AB = *NSPEC_AB;
mp->NGLOB_AB = *NGLOB_AB;
@@ -670,36 +723,6 @@
}
#endif
-
- // Allocate pinned mpi-buffers.
- // MPI buffers use pinned memory allocated by cudaMallocHost, which
- // enables the use of asynchronous memory copies from host <->
- // device
- int size_mpi_buffer = 3 * (*num_interfaces_ext_mesh) * (*max_nibool_interfaces_ext_mesh);
- print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_accel_buffer),sizeof(float)*(size_mpi_buffer)),8004);
- mp->send_buffer = (float*)malloc((size_mpi_buffer)*sizeof(float));
- mp->size_mpi_send_buffer = size_mpi_buffer;
-
- print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_recv_accel_buffer),sizeof(float)*(size_mpi_buffer)),8004);
- mp->recv_buffer = (float*)malloc((size_mpi_buffer)*sizeof(float));
- mp->size_mpi_recv_buffer = size_mpi_buffer;
-
- print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_b_accel_buffer),sizeof(float)*(size_mpi_buffer)),8004);
- // mp->b_send_buffer = (float*)malloc((size_mpi_buffer)*sizeof(float));
-
- mp->num_interfaces_ext_mesh = *num_interfaces_ext_mesh;
- mp->max_nibool_interfaces_ext_mesh = *max_nibool_interfaces_ext_mesh;
- mp->nibool_interfaces_ext_mesh = h_nibool_interfaces_ext_mesh;
- mp->my_neighbours_ext_mesh = my_neighbours_ext_mesh;
- mp->request_send_vector_ext_mesh = request_send_vector_ext_mesh;
- mp->request_recv_vector_ext_mesh = request_recv_vector_ext_mesh;
- mp->buffer_recv_vector_ext_mesh = buffer_recv_vector_ext_mesh;
-
- // setup two streams, one for compute and one for host<->device memory copies
- cudaStreamCreate(&mp->compute_stream);
- cudaStreamCreate(&mp->copy_stream);
- cudaStreamCreate(&mp->b_copy_stream);
-
/* Assuming NGLLX=5. Padded is then 128 (5^3+3) */
int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
@@ -764,6 +787,41 @@
cudaMemcpyHostToDevice),1204);
}
+ // Allocate pinned mpi-buffers.
+ // MPI buffers use pinned memory allocated by cudaMallocHost, which
+ // enables the use of asynchronous memory copies from host <->
+ // device
+ int size_mpi_buffer = 3 * (mp->num_interfaces_ext_mesh) * (mp->max_nibool_interfaces_ext_mesh);
+ // send buffer
+ mp->size_mpi_buffer = size_mpi_buffer;
+ if( mp->size_mpi_buffer > 0 ){
+ print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_accel_buffer),sizeof(float)*(mp->size_mpi_buffer)),8004);
+ mp->send_buffer = (float*)malloc((mp->size_mpi_buffer)*sizeof(float));
+ // adjoint
+ print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_b_accel_buffer),sizeof(float)*(size_mpi_buffer)),8004);
+ // mp->b_send_buffer = (float*)malloc((size_mpi_buffer)*sizeof(float));
+
+ // receive buffer
+ print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_recv_accel_buffer),sizeof(float)*(mp->size_mpi_buffer)),8004);
+ mp->recv_buffer = (float*)malloc((mp->size_mpi_buffer)*sizeof(float));
+ }
+
+ //daniel: check if needed
+ //mp->nibool_interfaces_ext_mesh = h_nibool_interfaces_ext_mesh;
+ //mp->my_neighbours_ext_mesh = my_neighbours_ext_mesh;
+ //mp->request_send_vector_ext_mesh = request_send_vector_ext_mesh;
+ //mp->request_recv_vector_ext_mesh = request_recv_vector_ext_mesh;
+ //mp->buffer_recv_vector_ext_mesh = buffer_recv_vector_ext_mesh;
+
+ // setup two streams, one for compute and one for host<->device memory copies
+ // compute stream
+ cudaStreamCreate(&mp->compute_stream);
+ // copy stream (needed to transfer mpi buffers)
+ if( mp->size_mpi_buffer > 0 ){
+ cudaStreamCreate(&mp->copy_stream);
+ //cudaStreamCreate(&mp->b_copy_stream);
+ }
+
// inner elements
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ispec_is_inner,mp->NSPEC_AB*sizeof(int)),1205);
print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_inner, h_ispec_is_inner,
@@ -906,9 +964,12 @@
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_dot_acoustic),sizeof(realw)*size_glob),2003);
// mpi buffer
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_potential_dot_dot_buffer),
+ if( mp->num_interfaces_ext_mesh > 0 ){
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_potential_dot_dot_buffer),
(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),2004);
-
+ }
+
+ // mass matrix
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_acoustic),sizeof(realw)*size_glob),2005);
print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_acoustic,rmass_acoustic,
sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
@@ -1145,9 +1206,11 @@
#endif
// mpi buffer
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),
- 3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),4004);
-
+ if( mp->size_mpi_buffer > 0 ){
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),
+ mp->size_mpi_buffer*sizeof(realw)),4004);
+ }
+
// mass matrix
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass),sizeof(realw)*mp->NGLOB_AB),4005);
// transfer element data
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/assemble_MPI_vector.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/assemble_MPI_vector.f90 2012-07-10 10:09:17 UTC (rev 20511)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/assemble_MPI_vector.f90 2012-07-10 18:22:43 UTC (rev 20512)
@@ -446,15 +446,15 @@
! assemble only if more than one partition
if(NPROC > 1) then
- ! wait for communications completion (recv)
- !write(IMAIN,*) "sending MPI_wait"
- do iinterface = 1, num_interfaces_ext_mesh
- call wait_req(request_recv_vector_ext_mesh(iinterface))
- enddo
+ ! wait for communications completion (recv)
+ !write(IMAIN,*) "sending MPI_wait"
+ do iinterface = 1, num_interfaces_ext_mesh
+ call wait_req(request_recv_vector_ext_mesh(iinterface))
+ enddo
- ! send contributions to GPU
- call transfer_boundary_to_device_a(Mesh_pointer, buffer_recv_vector_ext_mesh, &
- num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh)
+ ! send contributions to GPU
+ call transfer_boundary_to_device_a(Mesh_pointer, buffer_recv_vector_ext_mesh, &
+ num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh)
endif
! This step is done via previous function transfer_and_assemble...
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic.F90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic.F90 2012-07-10 10:09:17 UTC (rev 20511)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_elastic.F90 2012-07-10 18:22:43 UTC (rev 20512)
@@ -255,13 +255,11 @@
my_neighbours_ext_mesh, &
request_send_vector_ext_mesh,request_recv_vector_ext_mesh)
else ! GPU_MODE==1
-
! transfers boundary region to host asynchronously. The
! MPI-send is done from within compute_forces_elastic_cuda,
! once the inner element kernels are launched, and the
! memcpy has finished. see compute_forces_elastic_cuda:1655
call transfer_boundary_from_device_a(Mesh_pointer,nspec_outer_elastic)
-
endif ! GPU_MODE
! adjoint simulations
@@ -284,7 +282,6 @@
nibool_interfaces_ext_mesh,&
my_neighbours_ext_mesh, &
b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh)
-
endif ! GPU
endif !adjoint
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/iterate_time.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/iterate_time.f90 2012-07-10 10:09:17 UTC (rev 20511)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/iterate_time.f90 2012-07-10 18:22:43 UTC (rev 20512)
@@ -136,31 +136,6 @@
!=====================================================================
- subroutine it_print_elapsed_time()
- use specfem_par
- use specfem_par_elastic
- use specfem_par_acoustic
- implicit none
-
- ! local parameters
- double precision :: tCPU
- integer :: ihours,iminutes,iseconds,int_tCPU
-
- if(myrank == 0) then
- ! elapsed time since beginning of the simulation
- tCPU = wtime() - time_start
- int_tCPU = int(tCPU)
- ihours = int_tCPU / 3600
- iminutes = (int_tCPU - 3600*ihours) / 60
- iseconds = int_tCPU - 3600*ihours - 60*iminutes
- write(IMAIN,*) 'Time-Loop Complete. Timing info:'
- write(IMAIN,*) 'Total elapsed time in seconds = ',tCPU
- write(IMAIN,"(' Total elapsed time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds
- endif
- end subroutine it_print_elapsed_time
-
-!=====================================================================
-
subroutine it_check_stability()
! computes the maximum of the norm of the displacement
@@ -209,8 +184,8 @@
! check stability of the code, exit if unstable
! negative values can occur with some compilers when the unstable value is greater
! than the greatest possible floating-point number of the machine
- if(Usolidnorm > STABILITY_THRESHOLD .or. Usolidnorm < 0) &
- call exit_MPI(myrank,'forward simulation became unstable and blew up')
+ !if(Usolidnorm > STABILITY_THRESHOLD .or. Usolidnorm < 0.0_CUSTOM_REAL) &
+ ! call exit_MPI(myrank,'single forward simulation became unstable and blew up')
! compute the maximum of the maxima for all the slices using an MPI reduction
call max_all_cr(Usolidnorm,Usolidnorm_all)
@@ -262,8 +237,8 @@
! check stability of the code, exit if unstable
! negative values can occur with some compilers when the unstable value is greater
! than the greatest possible floating-point number of the machine
- if(b_Usolidnorm > STABILITY_THRESHOLD .or. b_Usolidnorm < 0) &
- call exit_MPI(myrank,'backward simulation became unstable and blew up')
+ !if(b_Usolidnorm > STABILITY_THRESHOLD .or. b_Usolidnorm < 0.0_CUSTOM_REAL) &
+ ! call exit_MPI(myrank,'single backward simulation became unstable and blew up')
! compute max of all slices
call max_all_cr(b_Usolidnorm,b_Usolidnorm_all)
@@ -365,10 +340,10 @@
! check stability of the code, exit if unstable
! negative values can occur with some compilers when the unstable value is greater
! than the greatest possible floating-point number of the machine
- if(Usolidnorm_all > STABILITY_THRESHOLD .or. Usolidnorm_all < 0.0 &
- .or. Usolidnormp_all > STABILITY_THRESHOLD .or. Usolidnormp_all < 0.0 &
- .or. Usolidnorms_all > STABILITY_THRESHOLD .or. Usolidnorms_all < 0.0 &
- .or. Usolidnormw_all > STABILITY_THRESHOLD .or. Usolidnormw_all < 0.0) &
+ if(Usolidnorm_all > STABILITY_THRESHOLD .or. Usolidnorm_all < 0.0_CUSTOM_REAL &
+ .or. Usolidnormp_all > STABILITY_THRESHOLD .or. Usolidnormp_all < 0.0_CUSTOM_REAL &
+ .or. Usolidnorms_all > STABILITY_THRESHOLD .or. Usolidnorms_all < 0.0_CUSTOM_REAL &
+ .or. Usolidnormw_all > STABILITY_THRESHOLD .or. Usolidnormw_all < 0.0_CUSTOM_REAL) &
call exit_MPI(myrank,'forward simulation became unstable and blew up')
! adjoint simulations
if(SIMULATION_TYPE == 3 .and. (b_Usolidnorm_all > STABILITY_THRESHOLD &
@@ -745,7 +720,33 @@
end subroutine it_store_attenuation_arrays
+!=====================================================================
+ subroutine it_print_elapsed_time()
+
+ use specfem_par
+ use specfem_par_elastic
+ use specfem_par_acoustic
+ implicit none
+
+ ! local parameters
+ double precision :: tCPU
+ integer :: ihours,iminutes,iseconds,int_tCPU
+
+ if(myrank == 0) then
+ ! elapsed time since beginning of the simulation
+ tCPU = wtime() - time_start
+ int_tCPU = int(tCPU)
+ ihours = int_tCPU / 3600
+ iminutes = (int_tCPU - 3600*ihours) / 60
+ iseconds = int_tCPU - 3600*ihours - 60*iminutes
+ write(IMAIN,*) 'Time-Loop Complete. Timing info:'
+ write(IMAIN,*) 'Total elapsed time in seconds = ',tCPU
+ write(IMAIN,"(' Total elapsed time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds
+ endif
+
+ end subroutine it_print_elapsed_time
+
!=====================================================================
subroutine it_transfer_from_GPU()
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/prepare_timerun.F90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/prepare_timerun.F90 2012-07-10 10:09:17 UTC (rev 20511)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/prepare_timerun.F90 2012-07-10 18:22:43 UTC (rev 20512)
@@ -33,10 +33,92 @@
use specfem_par_elastic
use specfem_par_poroelastic
use specfem_par_movie
+ implicit none
+ ! local parameters
+ double precision :: tCPU
+ ! get MPI starting time
+ time_start = wtime()
+
+ ! user output infos
+ call prepare_timerun_user_output()
+
+ ! sets up mass matrices
+ call prepare_timerun_mass_matrices()
+
+ ! initializes arrays
+ call prepare_timerun_init_wavefield()
+
+ ! sets up time increments
+ call prepare_timerun_constants()
+
+ ! prepares attenuation arrays
+ call prepare_timerun_attenuation()
+
+ ! prepares gravity arrays
+ call prepare_timerun_gravity()
+
+ ! initializes PML arrays
+ if( ABSORBING_CONDITIONS ) then
+ if (SIMULATION_TYPE /= 1 .and. ABSORB_USE_PML ) then
+ write(IMAIN,*) 'NOTE: adjoint simulations and PML not supported yet...'
+ else
+ if( ABSORB_USE_PML ) then
+ call PML_initialize()
+ endif
+ endif
+ endif
+
+ ! prepares ADJOINT simulations
+ call prepare_timerun_adjoint()
+
+ ! prepares noise simulations
+ call prepare_timerun_noise()
+
+ ! prepares GPU arrays
+ if(GPU_MODE) call prepare_timerun_GPU()
+
+#ifdef OPENMP_MODE
+ ! prepares arrays for OpenMP
+ call prepare_timerun_OpenMP()
+#endif
+
+ ! synchronize all the processes
+ call sync_all()
+
+ ! elapsed time since beginning of preparation
+ if(myrank == 0) then
+ tCPU = wtime() - time_start
+ write(IMAIN,*)
+ write(IMAIN,*) 'Elapsed time for preparing timerun in seconds = ',tCPU
+ write(IMAIN,*)
+ write(IMAIN,*) 'time loop:'
+ write(IMAIN,*)
+ write(IMAIN,*) ' time step: ',sngl(DT),' s'
+ write(IMAIN,*) 'number of time steps: ',NSTEP
+ write(IMAIN,*) 'total simulated time: ',sngl(NSTEP*DT),' seconds'
+ write(IMAIN,*) 'start time:',sngl(-t0),' seconds'
+ write(IMAIN,*)
+
+ !daniel debug: time estimation
+ ! elastic elements: time per element t_per_element = 1.40789368e-05 s
+ ! total time = nspec * nstep * t_per_element
+ endif
+
+ end subroutine prepare_timerun
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+ subroutine prepare_timerun_user_output()
+
+ use specfem_par
+ use specfem_par_acoustic
+ use specfem_par_elastic
+ use specfem_par_poroelastic
+ use specfem_par_movie
implicit none
- character(len=256) :: plot_file
- integer :: ier
! flag for any movie simulation
if( EXTERNAL_MESH_MOVIE_SURFACE .or. EXTERNAL_MESH_CREATE_SHAKEMAP .or. &
@@ -102,7 +184,6 @@
else
write(IMAIN,*) 'no poroelastic simulation'
endif
- write(IMAIN,*)
write(IMAIN,*)
if(MOVIE_SIMULATION) then
@@ -110,17 +191,98 @@
else
write(IMAIN,*) 'no movie simulation'
endif
+
write(IMAIN,*)
+ write(IMAIN,*)
endif
+ end subroutine prepare_timerun_user_output
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+ subroutine prepare_timerun_mass_matrices()
+
+ use specfem_par
+ use specfem_par_acoustic
+ use specfem_par_elastic
+ use specfem_par_poroelastic
+ implicit none
+
! synchronize all the processes before assembling the mass matrix
! to make sure all the nodes have finished to read their databases
call sync_all()
- ! sets up mass matrices
- call prepare_timerun_mass_matrices()
+ ! the mass matrix needs to be assembled with MPI here once and for all
+ if(ACOUSTIC_SIMULATION) then
+ call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_acoustic, &
+ num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+ nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
+ my_neighbours_ext_mesh)
+ ! fill mass matrix with fictitious non-zero values to make sure it can be inverted globally
+ where(rmass_acoustic <= 0._CUSTOM_REAL) rmass_acoustic = 1._CUSTOM_REAL
+ rmass_acoustic(:) = 1._CUSTOM_REAL / rmass_acoustic(:)
+
+ endif
+
+ if(ELASTIC_SIMULATION) then
+ call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass, &
+ num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+ nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+ my_neighbours_ext_mesh)
+
+ ! fill mass matrix with fictitious non-zero values to make sure it can be inverted globally
+ where(rmass <= 0._CUSTOM_REAL) rmass = 1._CUSTOM_REAL
+ rmass(:) = 1._CUSTOM_REAL / rmass(:)
+
+ if(OCEANS ) then
+ if( minval(rmass_ocean_load(:)) <= 0._CUSTOM_REAL) &
+ call exit_MPI(myrank,'negative ocean load mass matrix term')
+ rmass_ocean_load(:) = 1. / rmass_ocean_load(:)
+ endif
+
+ endif
+
+ if(POROELASTIC_SIMULATION) then
+ call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_solid_poroelastic, &
+ num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+ nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+ my_neighbours_ext_mesh)
+
+ call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_fluid_poroelastic, &
+ num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+ nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+ my_neighbours_ext_mesh)
+
+ ! fills mass matrix with fictitious non-zero values to make sure it can be inverted globally
+ where(rmass_solid_poroelastic <= 0._CUSTOM_REAL) rmass_solid_poroelastic = 1._CUSTOM_REAL
+ where(rmass_fluid_poroelastic <= 0._CUSTOM_REAL) rmass_fluid_poroelastic = 1._CUSTOM_REAL
+ rmass_solid_poroelastic(:) = 1._CUSTOM_REAL / rmass_solid_poroelastic(:)
+ rmass_fluid_poroelastic(:) = 1._CUSTOM_REAL / rmass_fluid_poroelastic(:)
+
+ endif
+
+ !debug
+ !if(myrank == 0) write(IMAIN,*) 'end assembling MPI mass matrix'
+
+ end subroutine prepare_timerun_mass_matrices
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+ subroutine prepare_timerun_init_wavefield()
+
+! initializes arrays
+
+ use specfem_par
+ use specfem_par_acoustic
+ use specfem_par_elastic
+ use specfem_par_poroelastic
+ implicit none
+
! initialize acoustic arrays to zero
if( ACOUSTIC_SIMULATION ) then
potential_acoustic(:) = 0._CUSTOM_REAL
@@ -152,6 +314,23 @@
if(FIX_UNDERFLOW_PROBLEM) displw_poroelastic(:,:) = VERYSMALLVAL
endif
+ end subroutine prepare_timerun_init_wavefield
+
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+ subroutine prepare_timerun_constants()
+
+! precomputes constants for time integration
+
+ use specfem_par
+ implicit none
+ ! local parameters
+ character(len=256) :: plot_file
+ integer :: ier
+
! distinguish between single and double precision for reals
if(CUSTOM_REAL == SIZE_REAL) then
deltat = sngl(DT)
@@ -161,6 +340,19 @@
deltatover2 = deltat/2._CUSTOM_REAL
deltatsqover2 = deltat*deltat/2._CUSTOM_REAL
+ ! adjoint runs: timing
+ if (SIMULATION_TYPE == 3) then
+ ! backward/reconstructed wavefields: time stepping is in time-reversed sense
+ ! (negative time increments)
+ if(CUSTOM_REAL == SIZE_REAL) then
+ b_deltat = - sngl(DT)
+ else
+ b_deltat = - DT
+ endif
+ b_deltatover2 = b_deltat/2._CUSTOM_REAL
+ b_deltatsqover2 = b_deltat*b_deltat/2._CUSTOM_REAL
+ endif
+
! seismograms
if (nrec_local > 0) then
! allocate seismogram array
@@ -177,26 +369,6 @@
seismograms_a(:,:,:) = 0._CUSTOM_REAL
endif
- ! synchronize all the processes
- call sync_all()
-
- ! prepares attenuation arrays
- call prepare_timerun_attenuation()
-
- ! prepares gravity arrays
- call prepare_timerun_gravity()
-
- ! initializes PML arrays
- if( ABSORBING_CONDITIONS ) then
- if (SIMULATION_TYPE /= 1 .and. ABSORB_USE_PML ) then
- write(IMAIN,*) 'NOTE: adjoint simulations and PML not supported yet...'
- else
- if( ABSORB_USE_PML ) then
- call PML_initialize()
- endif
- endif
- endif
-
! opens source time function file
if(PRINT_SOURCE_TIME_FUNCTION .and. myrank == 0) then
! print the source-time function
@@ -209,111 +381,16 @@
write(plot_file,"('/plot_source_time_function',i2,'.txt')") NSOURCES
endif
endif
- open(unit=IOSTF,file=trim(OUTPUT_FILES)//plot_file,status='unknown')
+ open(unit=IOSTF,file=trim(OUTPUT_FILES)//plot_file,status='unknown',iostat=ier)
+ if( ier /= 0 ) call exit_mpi(myrank,'error opening plot_source_time_function file')
endif
- ! user output
- if(myrank == 0) then
- write(IMAIN,*)
- write(IMAIN,*) ' time step: ',sngl(DT),' s'
- write(IMAIN,*) 'number of time steps: ',NSTEP
- write(IMAIN,*) 'total simulated time: ',sngl(NSTEP*DT),' seconds'
- write(IMAIN,*) 'start time:',sngl(-t0),' seconds'
- write(IMAIN,*)
+ end subroutine prepare_timerun_constants
- !daniel debug: time estimation
- ! elastic elements: time per element t_per_element = 1.40789368e-05 s
- ! total time = nspec * nstep * t_per_element
-
- endif
-
- ! prepares ADJOINT simulations
- call prepare_timerun_adjoint()
-
- ! prepares noise simulations
- call prepare_timerun_noise()
-
- ! prepares GPU arrays
- if(GPU_MODE) call prepare_timerun_GPU()
-
-#ifdef OPENMP_MODE
- ! prepares arrays for OpenMP
- call prepare_timerun_OpenMP()
-#endif
-
- end subroutine prepare_timerun
-
!
!-------------------------------------------------------------------------------------------------
!
- subroutine prepare_timerun_mass_matrices()
-
- use specfem_par
- use specfem_par_acoustic
- use specfem_par_elastic
- use specfem_par_poroelastic
- implicit none
-
-! the mass matrix needs to be assembled with MPI here once and for all
- if(ACOUSTIC_SIMULATION) then
- call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_acoustic, &
- num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
- nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
- my_neighbours_ext_mesh)
-
- ! fill mass matrix with fictitious non-zero values to make sure it can be inverted globally
- where(rmass_acoustic <= 0._CUSTOM_REAL) rmass_acoustic = 1._CUSTOM_REAL
- rmass_acoustic(:) = 1._CUSTOM_REAL / rmass_acoustic(:)
-
- endif ! ACOUSTIC_SIMULATION
-
- if(ELASTIC_SIMULATION) then
- call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass, &
- num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
- nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
- my_neighbours_ext_mesh)
-
- ! fill mass matrix with fictitious non-zero values to make sure it can be inverted globally
- where(rmass <= 0._CUSTOM_REAL) rmass = 1._CUSTOM_REAL
- rmass(:) = 1._CUSTOM_REAL / rmass(:)
-
- if(OCEANS ) then
- if( minval(rmass_ocean_load(:)) <= 0._CUSTOM_REAL) &
- call exit_MPI(myrank,'negative ocean load mass matrix term')
- rmass_ocean_load(:) = 1. / rmass_ocean_load(:)
- endif
-
- endif ! ELASTIC_SIMULATION
-
- if(POROELASTIC_SIMULATION) then
- call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_solid_poroelastic, &
- num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
- nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
- my_neighbours_ext_mesh)
-
- call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_fluid_poroelastic, &
- num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
- nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
- my_neighbours_ext_mesh)
-
- ! fills mass matrix with fictitious non-zero values to make sure it can be inverted globally
- where(rmass_solid_poroelastic <= 0._CUSTOM_REAL) rmass_solid_poroelastic = 1._CUSTOM_REAL
- where(rmass_fluid_poroelastic <= 0._CUSTOM_REAL) rmass_fluid_poroelastic = 1._CUSTOM_REAL
- rmass_solid_poroelastic(:) = 1._CUSTOM_REAL / rmass_solid_poroelastic(:)
- rmass_fluid_poroelastic(:) = 1._CUSTOM_REAL / rmass_fluid_poroelastic(:)
-
- endif ! POROELASTIC_SIMULATION
-
- if(myrank == 0) write(IMAIN,*) 'end assembling MPI mass matrix'
-
-
- end subroutine prepare_timerun_mass_matrices
-
-!
-!-------------------------------------------------------------------------------------------------
-!
-
subroutine prepare_timerun_attenuation()
use specfem_par
@@ -568,21 +645,6 @@
seismograms_eps(:,:,:,:) = 0._CUSTOM_REAL
endif
-! timing
- if (SIMULATION_TYPE == 3) then
-
- ! backward/reconstructed wavefields: time stepping is in time-reversed sense
- ! (negative time increments)
- if(CUSTOM_REAL == SIZE_REAL) then
- b_deltat = - sngl(DT)
- else
- b_deltat = - DT
- endif
- b_deltatover2 = b_deltat/2._CUSTOM_REAL
- b_deltatsqover2 = b_deltat*b_deltat/2._CUSTOM_REAL
-
- endif
-
! attenuation backward memories
if( ATTENUATION .and. SIMULATION_TYPE == 3 ) then
@@ -879,7 +941,6 @@
if(myrank == 0 ) then
write(IMAIN,*)
write(IMAIN,*) "GPU Preparing Fields and Constants on Device."
- write(IMAIN,*)
endif
! prepares general fields on GPU
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/setup_sources_receivers.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/setup_sources_receivers.f90 2012-07-10 10:09:17 UTC (rev 20511)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/setup_sources_receivers.f90 2012-07-10 18:22:43 UTC (rev 20512)
@@ -51,9 +51,9 @@
write(IMAIN,*)
write(IMAIN,*) 'Total number of samples for seismograms = ',NSTEP
write(IMAIN,*)
- write(IMAIN,*)
write(IMAIN,*) 'found a total of ',nrec_tot_found,' receivers in all the slices'
if(NSOURCES > 1) write(IMAIN,*) 'Using ',NSOURCES,' point sources'
+ write(IMAIN,*)
endif
end subroutine setup_sources_receivers
More information about the CIG-COMMITS
mailing list