[cig-commits] r22718 - in seismo/3D/SPECFEM3D/trunk: . src/cuda src/generate_databases src/shared src/specfem3D
danielpeter at geodynamics.org
danielpeter at geodynamics.org
Tue Aug 20 07:13:27 PDT 2013
Author: danielpeter
Date: 2013-08-20 07:13:26 -0700 (Tue, 20 Aug 2013)
New Revision: 22718
Added:
seismo/3D/SPECFEM3D/trunk/src/specfem3D/check_stability.f90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_total_energy.f90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/update_displacement_scheme.f90
Modified:
seismo/3D/SPECFEM3D/trunk/src/cuda/check_fields_cuda.cu
seismo/3D/SPECFEM3D/trunk/src/cuda/compute_add_sources_acoustic_cuda.cu
seismo/3D/SPECFEM3D/trunk/src/cuda/compute_add_sources_viscoelastic_cuda.cu
seismo/3D/SPECFEM3D/trunk/src/cuda/compute_coupling_cuda.cu
seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_acoustic_cuda.cu
seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_viscoelastic_cuda.cu
seismo/3D/SPECFEM3D/trunk/src/cuda/compute_kernels_cuda.cu
seismo/3D/SPECFEM3D/trunk/src/cuda/compute_stacey_acoustic_cuda.cu
seismo/3D/SPECFEM3D/trunk/src/cuda/compute_stacey_viscoelastic_cuda.cu
seismo/3D/SPECFEM3D/trunk/src/cuda/it_update_displacement_cuda.cu
seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h
seismo/3D/SPECFEM3D/trunk/src/cuda/noise_tomography_cuda.cu
seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_constants_cuda.h
seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu
seismo/3D/SPECFEM3D/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c
seismo/3D/SPECFEM3D/trunk/src/cuda/transfer_fields_cuda.cu
seismo/3D/SPECFEM3D/trunk/src/cuda/write_seismograms_cuda.cu
seismo/3D/SPECFEM3D/trunk/src/generate_databases/get_MPI.f90
seismo/3D/SPECFEM3D/trunk/src/generate_databases/get_coupling_surfaces.f90
seismo/3D/SPECFEM3D/trunk/src/generate_databases/memory_eval.f90
seismo/3D/SPECFEM3D/trunk/src/shared/assemble_MPI_scalar.f90
seismo/3D/SPECFEM3D/trunk/src/shared/constants.h.in
seismo/3D/SPECFEM3D/trunk/src/shared/detect_surface.f90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/Makefile.in
seismo/3D/SPECFEM3D/trunk/src/specfem3D/assemble_MPI_vector.f90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_acoustic.f90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_poroelastic.f90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_viscoelastic.f90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_arrays_source.f90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_viscoelastic_ac.f90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_acoustic_calling_routine.f90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_poroelastic_calling_routine.f90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_acoustic.f90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_poroelastic.f90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_viscoelastic.f90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/fault_solver_common.f90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/finalize_simulation.f90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/initialize_simulation.f90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/iterate_time.F90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/prepare_timerun.F90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/read_mesh_databases.f90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/save_adjoint_kernels.f90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/setup_sources_receivers.f90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/specfem3D_par.f90
seismo/3D/SPECFEM3D/trunk/src/specfem3D/write_seismograms.f90
seismo/3D/SPECFEM3D/trunk/todo_list_please_dont_remove.txt
Log:
adds new files update_displacement_scheme.f90, check_stability.f90 and compute_total_energy.f90 for corresponding routines; renames some routines & variables to remove obsolete _ext_mesh ending; bug fix for cuda routines and kernel simulations
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/check_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/check_fields_cuda.cu 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/check_fields_cuda.cu 2013-08-20 14:13:26 UTC (rev 22718)
@@ -39,7 +39,6 @@
#include "config.h"
#include "mesh_constants_cuda.h"
-//#include "prepare_constants_cuda.h"
/* ----------------------------------------------------------------------------------------------- */
@@ -64,6 +63,7 @@
pause_for_debugger(1);
}
+
/* ----------------------------------------------------------------------------------------------- */
void pause_for_debugger(int pause) {
@@ -94,7 +94,7 @@
void exit_on_cuda_error(char* kernel_name) {
// sync and check to catch errors from previous async operations
- cudaThreadSynchronize();
+ synchronize_cuda();
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess){
fprintf(stderr,"Error after %s: %s\n", kernel_name, cudaGetErrorString(err));
@@ -193,8 +193,50 @@
/* ----------------------------------------------------------------------------------------------- */
+void synchronize_cuda(){
+#if CUDA_VERSION >= 4000
+ cudaDeviceSynchronize();
+#else
+ cudaThreadSynchronize();
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+void synchronize_mpi(){
+#ifdef WITH_MPI
+ MPI_Barrier(MPI_COMM_WORLD);
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+void get_blocks_xy(int num_blocks,int* num_blocks_x,int* num_blocks_y) {
+
+// Initially sets the blocks_x to be the num_blocks, and adds rows as needed (block size limit of 65535).
+// If an additional row is added, the row length is cut in
+// half. If the block count is odd, there will be 1 too many blocks,
+// which must be managed at runtime with an if statement.
+
+ *num_blocks_x = num_blocks;
+ *num_blocks_y = 1;
+
+ while(*num_blocks_x > MAXIMUM_GRID_DIM) {
+ *num_blocks_x = (int) ceil(*num_blocks_x * 0.5f);
+ *num_blocks_y = *num_blocks_y * 2;
+ }
+
+ return;
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
void get_free_memory(double* free_db, double* used_db, double* total_db) {
+ TRACE("get_free_memory");
+
// gets memory usage in byte
size_t free_byte ;
size_t total_byte ;
@@ -215,13 +257,15 @@
// Saves GPU memory usage to file
void output_free_memory(int myrank,char* info_str) {
+ TRACE("output_free_memory");
+
FILE* fp;
char filename[BUFSIZ];
double free_db,used_db,total_db;
get_free_memory(&free_db,&used_db,&total_db);
- sprintf(filename,"../OUTPUT_FILES/gpu_device_mem_usage_proc_%06d.txt",myrank);
+ sprintf(filename,"../OUTPUT_FILES/gpu_memory_usage_proc_%06d.txt",myrank);
fp = fopen(filename,"a+");
if (fp != NULL){
fprintf(fp,"%d: @%s GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", myrank, info_str,
@@ -235,12 +279,14 @@
// Fortran-callable version of above method
extern "C"
void FC_FUNC_(output_free_device_memory,
- OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {
+ OUTPUT_FREE_DEVICE_MEMORY)(int* myrank_f) {
TRACE("output_free_device_memory");
- char info[6];
- sprintf(info,"f %d:",*myrank);
- output_free_memory(*myrank,info);
+ char info[64];
+ int myrank = *myrank_f;
+
+ sprintf(info,"f %d:",myrank);
+ output_free_memory(myrank,info);
}
@@ -264,7 +310,411 @@
}
+
/* ----------------------------------------------------------------------------------------------- */
+
+// Auxiliary functions
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/*
+__global__ void memset_to_realw_kernel(realw* array, int size, realw value){
+
+ unsigned int tid = threadIdx.x;
+ unsigned int bx = blockIdx.y*gridDim.x+blockIdx.x;
+ unsigned int i = tid + bx*blockDim.x;
+
+ if( i < size ){
+ array[i] = *value;
+ }
+}
+*/
+
+/* ----------------------------------------------------------------------------------------------- */
+
+realw get_device_array_maximum_value(realw* array, int size){
+
+// get maximum of array on GPU by copying over to CPU and handle it there
+
+ realw max = 0.0f;
+
+ // checks if anything to do
+ if( size > 0 ){
+ realw* h_array;
+
+ // explicitly wait for cuda kernels to finish
+ // (cudaMemcpy implicitly synchronizes all other cuda operations)
+ synchronize_cuda();
+
+ h_array = (realw*)calloc(size,sizeof(realw));
+ print_CUDA_error_if_any(cudaMemcpy(h_array,array,sizeof(realw)*size,cudaMemcpyDeviceToHost),33001);
+
+ // finds maximum value in array
+ max = h_array[0];
+ for( int i=1; i < size; i++){
+ if( abs(h_array[i]) > max ) max = abs(h_array[i]);
+ }
+ free(h_array);
+ }
+ return max;
+}
+
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ACOUSTIC simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+__global__ void get_maximum_kernel(realw* array, int size, realw* d_max){
+
+ /* simplest version: uses only 1 thread
+ realw max;
+ max = 0;
+ // finds maximum value in array
+ if( size > 0 ){
+ max = abs(array[0]);
+ for( int i=1; i < size; i++){
+ if( abs(array[i]) > max ) max = abs(array[i]);
+ }
+ }
+ *d_max = max;
+ */
+
+ // reduction example:
+ __shared__ realw sdata[BLOCKSIZE_TRANSFER] ;
+
+ // load shared mem
+ unsigned int tid = threadIdx.x;
+ unsigned int bx = blockIdx.y*gridDim.x+blockIdx.x;
+ unsigned int i = tid + bx*blockDim.x;
+
+ // loads absolute values into shared memory
+ sdata[tid] = (i < size) ? fabs(array[i]) : 0.0 ;
+
+ __syncthreads();
+
+ // do reduction in shared mem
+ for(unsigned int s=blockDim.x/2; s>0; s>>=1)
+ {
+ if (tid < s){
+ // summation:
+ //sdata[tid] += sdata[tid + s];
+ // maximum:
+ if( sdata[tid] < sdata[tid + s] ) sdata[tid] = sdata[tid + s];
+ }
+ __syncthreads();
+ }
+
+ // write result for this block to global mem
+ if (tid == 0) d_max[bx] = sdata[0];
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(get_norm_acoustic_from_device,
+ GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,long* Mesh_pointer,int* sim_type) {
+
+ TRACE("get_norm_acoustic_from_device");
+ //double start_time = get_time();
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+ realw max = 0.0;
+ realw *d_max;
+
+ //initializes
+ *norm = 0.0f;
+
+ /* way 1 : timing Elapsed time: 8.464813e-03
+ realw* h_array;
+ h_array = (realw*)calloc(mp->NGLOB_AB,sizeof(realw));
+
+ print_CUDA_error_if_any(cudaMemcpy(h_array,mp->d_potential_dot_dot_acoustic,
+ sizeof(realw)*(mp->NGLOB_AB),cudaMemcpyDeviceToHost),131);
+
+ // finds maximum value in array
+ max = h_array[0];
+ for( int i=1; i < mp->NGLOB_AB; i++){
+ if( abs(h_array[i]) > max ) max = abs(h_array[i]);
+ }
+ free(h_array);
+ */
+
+ /* way 2: timing Elapsed time: 8.818102e-02
+ // launch simple kernel
+ cudaMalloc((void**)&d_max,sizeof(realw));
+
+ dim3 grid(1,1);
+ dim3 threads(1,1,1);
+
+ get_maximum_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
+ mp->NGLOB_AB,
+ d_max);
+ print_CUDA_error_if_any(cudaMemcpy(&max,d_max, sizeof(realw), cudaMemcpyDeviceToHost),222);
+
+ cudaFree(d_max);
+ */
+
+ // way 2 b: timing Elapsed time: 1.236916e-03
+ // launch simple reduction kernel
+ realw* h_max;
+ int blocksize = BLOCKSIZE_TRANSFER;
+
+ int size = mp->NGLOB_AB;
+ int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
+
+ //printf("num_blocks_x %i \n",num_blocks_x);
+
+ // on host (allocates & initializes to zero)
+ h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
+
+ // allocates memory on device
+ print_CUDA_error_if_any(cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw)),78001);
+ // initializes values to zero
+ print_CUDA_error_if_any(cudaMemset(d_max,0,num_blocks_x*num_blocks_y*sizeof(realw)),77002);
+
+
+ if(*sim_type == 1 ){
+ get_maximum_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_potential_dot_dot_acoustic,size,d_max);
+ }else if(*sim_type == 3 ){
+ get_maximum_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_potential_dot_dot_acoustic,size,d_max);
+ }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("kernel get_maximum_kernel");
+#endif
+
+ // synchronizes
+ //synchronize_cuda();
+ // explicitly waits for stream to finish
+ // (cudaMemcpy implicitly synchronizes all other cuda operations)
+ cudaStreamSynchronize(mp->compute_stream);
+
+ print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
+ cudaMemcpyDeviceToHost),222);
+
+ // determines max for all blocks
+ max = h_max[0];
+ for(int i=1;i<num_blocks_x*num_blocks_y;i++) {
+ if( max < h_max[i]) max = h_max[i];
+ }
+
+ cudaFree(d_max);
+ free(h_max);
+
+ /* way 3: doesn't work properly...
+ cublasStatus status;
+
+ // Initialize CUBLAS
+ status = cublasInit();
+ if (status != CUBLAS_STATUS_SUCCESS) {
+ fprintf (stderr, "!!!! CUBLAS initialization error\n");
+ exit(1);
+ }
+
+ // cublas function: cublasIsamax
+ // finds the smallest index of the maximum magnitude element of single
+ // precision vector x
+ int incr = 1;
+ int imax = 0;
+ imax = cublasIsamax(mp->NGLOB_AB,(realw*)mp->d_potential_dot_dot_acoustic, incr);
+ status= cublasGetError();
+ if (status != CUBLAS_STATUS_SUCCESS) {
+ fprintf (stderr, "!!!! CUBLAS error in cublasIsamax\n");
+ exit(1);
+ }
+
+ print_CUDA_error_if_any(cudaMemcpy(&max,&(mp->d_potential_dot_dot_acoustic[imax]),
+ sizeof(realw), cudaMemcpyDeviceToHost),222);
+
+ printf("maximum %i %i %f \n",mp->NGLOB_AB,imax,max);
+
+ // Shutdown
+ status = cublasShutdown();
+ if (status != CUBLAS_STATUS_SUCCESS) {
+ fprintf (stderr, "!!!! shutdown error (A)\n");
+ exit(1);
+ }
+
+ */
+
+ // return result
+ *norm = max;
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ //double end_time = get_time();
+ //printf("Elapsed time: %e\n",end_time-start_time);
+ exit_on_cuda_error("get_norm_acoustic_from_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ELASTIC simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void get_maximum_vector_kernel(realw* array, int size, realw* d_max){
+
+ // reduction example:
+ __shared__ realw sdata[BLOCKSIZE_TRANSFER] ;
+
+ // load shared mem
+ unsigned int tid = threadIdx.x;
+ unsigned int bx = blockIdx.y*gridDim.x+blockIdx.x;
+ unsigned int i = tid + bx*blockDim.x;
+
+ // loads values into shared memory: assume array is a vector array
+ sdata[tid] = (i < size) ? (array[i*3]*array[i*3] + array[i*3+1]*array[i*3+1] + array[i*3+2]*array[i*3+2]) : 0.0 ;
+
+ __syncthreads();
+
+ // do reduction in shared mem
+ for(unsigned int s=blockDim.x/2; s>0; s>>=1)
+ {
+ if (tid < s){
+ // summation:
+ //sdata[tid] += sdata[tid + s];
+ // maximum:
+ if( sdata[tid] < sdata[tid + s] ) sdata[tid] = sdata[tid + s];
+ }
+ __syncthreads();
+ }
+
+ // write result for this block to global mem
+ if (tid == 0) d_max[bx] = sdata[0];
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(get_norm_elastic_from_device,
+ GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
+ long* Mesh_pointer,
+ int* type) {
+
+ TRACE("\tget_norm_elastic_from_device");
+ //double start_time = get_time();
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+ realw max,res;
+ realw *d_max;
+
+ //initializes
+ *norm = 0.0f;
+
+ // launch simple reduction kernel
+ realw* h_max;
+ int blocksize = BLOCKSIZE_TRANSFER;
+
+ int size = mp->NGLOB_AB;
+ int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
+
+ // on host (allocates & initializes to zero)
+ h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
+
+ // allocates memory on device
+ print_CUDA_error_if_any(cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw)),77001);
+ // initializes values to zero
+ print_CUDA_error_if_any(cudaMemset(d_max,0,num_blocks_x*num_blocks_y*sizeof(realw)),77002);
+
+ if(*type == 1 ){
+ get_maximum_vector_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ,size,d_max);
+ }else if(*type == 3 ){
+ get_maximum_vector_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ,size,d_max);
+ }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ //double end_time = get_time();
+ //printf("Elapsed time: %e\n",end_time-start_time);
+ exit_on_cuda_error("kernel get_norm_elastic_from_device");
+#endif
+
+ // synchronizes
+ //synchronize_cuda();
+ // explicitly waits for stream to finish
+ // (cudaMemcpy implicitly synchronizes all other cuda operations)
+ cudaStreamSynchronize(mp->compute_stream);
+
+ // copies reduction array back to CPU
+ print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
+ cudaMemcpyDeviceToHost),222);
+
+ // determines max for all blocks
+ max = h_max[0];
+ for(int i=1;i<num_blocks_x*num_blocks_y;i++) {
+ if( max < h_max[i]) max = h_max[i];
+ }
+ res = sqrt(max);
+
+ // return result
+ *norm = res;
+
+ // debug
+ //printf("rank % d - type: %d norm: %f \n",mp->myrank,*type,res);
+
+ cudaFree(d_max);
+ free(h_max);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ //double end_time = get_time();
+ //printf("Elapsed time: %e\n",end_time-start_time);
+ exit_on_cuda_error("get_norm_elastic_from_device");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// unused ...
+
+/* ----------------------------------------------------------------------------------------------- */
+
+/*
+extern "C"
+void FC_FUNC_(get_max_accel,
+ GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {
+
+TRACE("get_max_accel");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer);
+ int procid;
+#ifdef WITH_MPI
+ MPI_Comm_rank(MPI_COMM_WORLD,&procid);
+#else
+ procid = 0;
+#endif
+ int size = *sizef;
+ int it = *itf;
+ realw* accel_cpy = (realw*)malloc(size*sizeof(realw));
+ cudaMemcpy(accel_cpy,mp->d_accel,size*sizeof(realw),cudaMemcpyDeviceToHost);
+ realw maxval=0;
+ for(int i=0;i<size;++i) {
+ maxval = MAX(maxval,accel_cpy[i]);
+ }
+ printf("%d/%d: max=%e\n",it,procid,maxval);
+ free(accel_cpy);
+}
+*/
+
+
+/* ----------------------------------------------------------------------------------------------- */
//daniel: helper function
/*
__global__ void check_phase_ispec_kernel(int num_phase_ispec,
@@ -295,9 +745,9 @@
}
}
- void check_phase_ispec(long* Mesh_pointer_f,int type){
+ void check_phase_ispec(long* Mesh_pointer,int type){
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
printf("check phase_ispec for type=%d\n",type);
@@ -331,7 +781,7 @@
#endif
}
- */
+*/
/* ----------------------------------------------------------------------------------------------- */
//daniel: helper function
@@ -358,9 +808,9 @@
printf("check_ispec_is done: count = %d %d\n",count0,count1);
}
- void check_ispec_is(long* Mesh_pointer_f,int type){
+ void check_ispec_is(long* Mesh_pointer,int type){
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
printf("check ispec_is for type=%d\n",type);
@@ -395,7 +845,7 @@
exit_on_cuda_error("check_ispec_is");
#endif
}
- */
+*/
/* ----------------------------------------------------------------------------------------------- */
//daniel: helper function
/*
@@ -424,9 +874,9 @@
printf("check_array_ispec done: count = %d %d \n",count0,count1);
}
- void check_array_ispec(long* Mesh_pointer_f,int type){
+ void check_array_ispec(long* Mesh_pointer,int type){
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
printf("check array_ispec for type=%d\n",type);
@@ -455,7 +905,7 @@
#endif
}
- */
+*/
/* ----------------------------------------------------------------------------------------------- */
@@ -463,13 +913,16 @@
/* ----------------------------------------------------------------------------------------------- */
+//max: helper functions
+
+/*
extern "C"
void FC_FUNC_(check_max_norm_displ_gpu,
- CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {
+ CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer,int* announceID) {
-TRACE("check_max_norm_displ_gpu");
+ TRACE("check_max_norm_displ_gpu");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
cudaMemcpy(displ, mp->d_displ,*size*sizeof(realw),cudaMemcpyDeviceToHost);
realw maxnorm=0;
@@ -479,9 +932,10 @@
}
printf("%d: maxnorm of forward displ = %e\n",*announceID,maxnorm);
}
+*/
/* ----------------------------------------------------------------------------------------------- */
-
+/*
extern "C"
void FC_FUNC_(check_max_norm_vector,
CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {
@@ -504,9 +958,11 @@
}
printf("%d:maxnorm of vector %d [%d] = %e\n",procid,*announceID,maxloc,maxnorm);
}
+*/
/* ----------------------------------------------------------------------------------------------- */
+/*
extern "C"
void FC_FUNC_(check_max_norm_displ,
CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {
@@ -520,16 +976,18 @@
}
printf("%d: maxnorm of forward displ = %e\n",*announceID,maxnorm);
}
+*/
/* ----------------------------------------------------------------------------------------------- */
+/*
extern "C"
void FC_FUNC_(check_max_norm_b_displ_gpu,
- CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {
+ CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer,int* announceID) {
-TRACE("check_max_norm_b_displ_gpu");
+ TRACE("check_max_norm_b_displ_gpu");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
realw* b_accel = (realw*)malloc(*size*sizeof(realw));
@@ -547,16 +1005,18 @@
printf("%d: maxnorm of backward displ = %e\n",*announceID,maxnorm);
printf("%d: maxnorm of backward accel = %e\n",*announceID,maxnorm_accel);
}
+*/
/* ----------------------------------------------------------------------------------------------- */
+/*
extern "C"
void FC_FUNC_(check_max_norm_b_accel_gpu,
- CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {
+ CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer,int* announceID) {
-TRACE("check_max_norm_b_accel_gpu");
+ TRACE("check_max_norm_b_accel_gpu");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
cudaMemcpy(b_accel, mp->d_b_accel,*size*sizeof(realw),cudaMemcpyDeviceToHost);
@@ -567,16 +1027,18 @@
}
printf("%d: maxnorm of backward accel = %e\n",*announceID,maxnorm);
}
+*/
/* ----------------------------------------------------------------------------------------------- */
+/*
extern "C"
void FC_FUNC_(check_max_norm_b_veloc_gpu,
- CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {
+ CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer,int* announceID) {
-TRACE("check_max_norm_b_veloc_gpu");
+ TRACE("check_max_norm_b_veloc_gpu");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
cudaMemcpy(b_veloc, mp->d_b_veloc,*size*sizeof(realw),cudaMemcpyDeviceToHost);
@@ -587,9 +1049,11 @@
}
printf("%d: maxnorm of backward veloc = %e\n",*announceID,maxnorm);
}
+*/
/* ----------------------------------------------------------------------------------------------- */
+/*
extern "C"
void FC_FUNC_(check_max_norm_b_displ,
CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {
@@ -603,9 +1067,11 @@
}
printf("%d:maxnorm of backward displ = %e\n",*announceID,maxnorm);
}
+*/
/* ----------------------------------------------------------------------------------------------- */
+/*
extern "C"
void FC_FUNC_(check_max_norm_b_accel,
CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {
@@ -619,9 +1085,11 @@
}
printf("%d:maxnorm of backward accel = %e\n",*announceID,maxnorm);
}
+*/
/* ----------------------------------------------------------------------------------------------- */
+/*
extern "C"
void FC_FUNC_(check_error_vectors,
CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {
@@ -660,331 +1128,5 @@
}
}
+*/
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// Auxiliary functions
-
-/* ----------------------------------------------------------------------------------------------- */
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(get_max_accel,
- GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {
-
-TRACE("get_max_accel");
-
- Mesh* mp = (Mesh*)(*Mesh_pointer);
- int procid;
-#ifdef WITH_MPI
- MPI_Comm_rank(MPI_COMM_WORLD,&procid);
-#else
- procid = 0;
-#endif
- int size = *sizef;
- int it = *itf;
- realw* accel_cpy = (realw*)malloc(size*sizeof(realw));
- cudaMemcpy(accel_cpy,mp->d_accel,size*sizeof(realw),cudaMemcpyDeviceToHost);
- realw maxval=0;
- for(int i=0;i<size;++i) {
- maxval = MAX(maxval,accel_cpy[i]);
- }
- printf("%d/%d: max=%e\n",it,procid,maxval);
- free(accel_cpy);
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// ACOUSTIC simulations
-
-/* ----------------------------------------------------------------------------------------------- */
-
-__global__ void get_maximum_kernel(realw* array, int size, realw* d_max){
-
- /* simplest version: uses only 1 thread
- realw max;
- max = 0;
- // finds maximum value in array
- if( size > 0 ){
- max = abs(array[0]);
- for( int i=1; i < size; i++){
- if( abs(array[i]) > max ) max = abs(array[i]);
- }
- }
- *d_max = max;
- */
-
- // reduction example:
- __shared__ realw sdata[BLOCKSIZE_TRANSFER] ;
-
- // load shared mem
- unsigned int tid = threadIdx.x;
- unsigned int bx = blockIdx.y*gridDim.x+blockIdx.x;
- unsigned int i = tid + bx*blockDim.x;
-
- // loads absolute values into shared memory
- sdata[tid] = (i < size) ? fabs(array[i]) : 0.0 ;
-
- __syncthreads();
-
- // do reduction in shared mem
- for(unsigned int s=blockDim.x/2; s>0; s>>=1)
- {
- if (tid < s){
- // summation:
- //sdata[tid] += sdata[tid + s];
- // maximum:
- if( sdata[tid] < sdata[tid + s] ) sdata[tid] = sdata[tid + s];
- }
- __syncthreads();
- }
-
- // write result for this block to global mem
- if (tid == 0) d_max[bx] = sdata[0];
-
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(get_norm_acoustic_from_device,
- GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
- long* Mesh_pointer_f) {
-
-TRACE("get_norm_acoustic_from_device");
- //double start_time = get_time();
-
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- realw max;
- realw *d_max;
-
- max = 0.0;
-
- /* way 1 : timing Elapsed time: 8.464813e-03
- realw* h_array;
- h_array = (realw*)calloc(mp->NGLOB_AB,sizeof(realw));
-
- print_CUDA_error_if_any(cudaMemcpy(h_array,mp->d_potential_dot_dot_acoustic,
- sizeof(realw)*(mp->NGLOB_AB),cudaMemcpyDeviceToHost),131);
-
- // finds maximum value in array
- max = h_array[0];
- for( int i=1; i < mp->NGLOB_AB; i++){
- if( abs(h_array[i]) > max ) max = abs(h_array[i]);
- }
- free(h_array);
- */
-
- /* way 2: timing Elapsed time: 8.818102e-02
- // launch simple kernel
- cudaMalloc((void**)&d_max,sizeof(realw));
-
- dim3 grid(1,1);
- dim3 threads(1,1,1);
-
- get_maximum_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
- mp->NGLOB_AB,
- d_max);
- print_CUDA_error_if_any(cudaMemcpy(&max,d_max, sizeof(realw), cudaMemcpyDeviceToHost),222);
-
- cudaFree(d_max);
- */
-
- // way 2 b: timing Elapsed time: 1.236916e-03
- // launch simple reduction kernel
- realw* h_max;
- int blocksize = BLOCKSIZE_TRANSFER;
-
- int size = mp->NGLOB_AB;
- int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
-
- //printf("num_blocks_x %i \n",num_blocks_x);
-
- h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
- cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
-
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(blocksize,1,1);
-
- if(mp->simulation_type == 1 ){
- get_maximum_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,size,d_max);
- }else if(mp->simulation_type == 3 ){
- get_maximum_kernel<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,size,d_max);
- }
-
- print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
- cudaMemcpyDeviceToHost),222);
-
- // determines max for all blocks
- max = h_max[0];
- for(int i=1;i<num_blocks_x*num_blocks_y;i++) {
- if( max < h_max[i]) max = h_max[i];
- }
-
- cudaFree(d_max);
- free(h_max);
-
- /* way 3: doesn't work properly...
- cublasStatus status;
-
- // Initialize CUBLAS
- status = cublasInit();
- if (status != CUBLAS_STATUS_SUCCESS) {
- fprintf (stderr, "!!!! CUBLAS initialization error\n");
- exit(1);
- }
-
- // cublas function: cublasIsamax
- // finds the smallest index of the maximum magnitude element of single
- // precision vector x
- int incr = 1;
- int imax = 0;
- imax = cublasIsamax(mp->NGLOB_AB,(realw*)mp->d_potential_dot_dot_acoustic, incr);
- status= cublasGetError();
- if (status != CUBLAS_STATUS_SUCCESS) {
- fprintf (stderr, "!!!! CUBLAS error in cublasIsamax\n");
- exit(1);
- }
-
- print_CUDA_error_if_any(cudaMemcpy(&max,&(mp->d_potential_dot_dot_acoustic[imax]),
- sizeof(realw), cudaMemcpyDeviceToHost),222);
-
- printf("maximum %i %i %f \n",mp->NGLOB_AB,imax,max);
-
- // Shutdown
- status = cublasShutdown();
- if (status != CUBLAS_STATUS_SUCCESS) {
- fprintf (stderr, "!!!! shutdown error (A)\n");
- exit(1);
- }
-
- */
-
- // return result
- *norm = max;
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- //double end_time = get_time();
- //printf("Elapsed time: %e\n",end_time-start_time);
- exit_on_cuda_error("get_norm_acoustic_from_device");
-#endif
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// ELASTIC simulations
-
-/* ----------------------------------------------------------------------------------------------- */
-
-__global__ void get_maximum_vector_kernel(realw* array, int size, realw* d_max){
-
- // reduction example:
- __shared__ realw sdata[BLOCKSIZE_TRANSFER] ;
-
- // load shared mem
- unsigned int tid = threadIdx.x;
- unsigned int bx = blockIdx.y*gridDim.x+blockIdx.x;
- unsigned int i = tid + bx*blockDim.x;
-
- // loads values into shared memory: assume array is a vector array
- sdata[tid] = (i < size) ? sqrt(array[i*3]*array[i*3]
- + array[i*3+1]*array[i*3+1]
- + array[i*3+2]*array[i*3+2]) : 0.0 ;
-
- __syncthreads();
-
- // do reduction in shared mem
- for(unsigned int s=blockDim.x/2; s>0; s>>=1)
- {
- if (tid < s){
- // summation:
- //sdata[tid] += sdata[tid + s];
- // maximum:
- if( sdata[tid] < sdata[tid + s] ) sdata[tid] = sdata[tid + s];
- }
- __syncthreads();
- }
-
- // write result for this block to global mem
- if (tid == 0) d_max[bx] = sdata[0];
-
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(get_norm_elastic_from_device,
- GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
- long* Mesh_pointer_f) {
-
- TRACE("get_norm_elastic_from_device");
- //double start_time = get_time();
-
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- realw max;
- realw *d_max;
-
- max = 0.0;
-
- // launch simple reduction kernel
- realw* h_max;
- int blocksize = BLOCKSIZE_TRANSFER;
-
- int size = mp->NGLOB_AB;
- int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
-
- //printf("num_blocks_x %i \n",num_blocks_x);
- h_max = (realw*) calloc(num_blocks_x*num_blocks_y,sizeof(realw));
- cudaMalloc((void**)&d_max,num_blocks_x*num_blocks_y*sizeof(realw));
-
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(blocksize,1,1);
-
- if(mp->simulation_type == 1 ){
- get_maximum_vector_kernel<<<grid,threads>>>(mp->d_displ,size,d_max);
- }else if(mp->simulation_type == 3 ){
- get_maximum_vector_kernel<<<grid,threads>>>(mp->d_b_displ,size,d_max);
- }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- //double end_time = get_time();
- //printf("Elapsed time: %e\n",end_time-start_time);
- exit_on_cuda_error("kernel get_norm_elastic_from_device");
-#endif
-
- print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*num_blocks_y*sizeof(realw),
- cudaMemcpyDeviceToHost),222);
-
- // determines max for all blocks
- max = h_max[0];
- for(int i=1;i<num_blocks_x*num_blocks_y;i++) {
- if( max < h_max[i]) max = h_max[i];
- }
-
- cudaFree(d_max);
- free(h_max);
-
- // return result
- *norm = max;
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- //double end_time = get_time();
- //printf("Elapsed time: %e\n",end_time-start_time);
- exit_on_cuda_error("get_norm_elastic_from_device");
-#endif
-}
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/compute_add_sources_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/compute_add_sources_acoustic_cuda.cu 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/compute_add_sources_acoustic_cuda.cu 2013-08-20 14:13:26 UTC (rev 22718)
@@ -37,7 +37,6 @@
#include "config.h"
#include "mesh_constants_cuda.h"
-// #include "epik_user.h"
/* ----------------------------------------------------------------------------------------------- */
@@ -64,10 +63,8 @@
int isource = blockIdx.x + gridDim.x*blockIdx.y; // bx
- int ispec;
- int iglob;
- realw stf;
- realw kappal;
+ int ispec,iglob;
+ realw stf,kappal;
if( isource < NSOURCES ){
@@ -77,13 +74,15 @@
if(ispec_is_inner[ispec] == phase_is_inner && ispec_is_acoustic[ispec] ) {
+ iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1;
+
stf = (realw) stf_pre_compute[isource];
- iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
atomicAdd(&potential_dot_dot_acoustic[iglob],
-sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf/kappal);
+ // debug: without atomic operation
// potential_dot_dot_acoustic[iglob] +=
// -sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf/kappal;
}
@@ -96,49 +95,43 @@
extern "C"
void FC_FUNC_(compute_add_sources_ac_cuda,
- COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f,
- int* phase_is_innerf,
- int* NSOURCESf,
- double* h_stf_pre_compute,
- int* myrankf) {
+ COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer,
+ int* phase_is_innerf,
+ int* NSOURCESf,
+ double* h_stf_pre_compute) {
-TRACE("compute_add_sources_ac_cuda");
+ TRACE("compute_add_sources_ac_cuda");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
// check if anything to do
if( mp->nsources_local == 0 ) return;
- int phase_is_inner = *phase_is_innerf;
int NSOURCES = *NSOURCESf;
- int myrank = *myrankf;
+ int phase_is_inner = *phase_is_innerf;
- int num_blocks_x = NSOURCES;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
-
// copies pre-computed source time factors onto GPU
print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(NSOURCES,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(5,5,5);
- compute_add_sources_acoustic_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
- mp->d_ibool,
- mp->d_ispec_is_inner,
- phase_is_inner,
- mp->d_sourcearrays,
- mp->d_stf_pre_compute,
- myrank,
- mp->d_islice_selected_source,
- mp->d_ispec_selected_source,
- mp->d_ispec_is_acoustic,
- mp->d_kappastore,
- NSOURCES);
+ compute_add_sources_acoustic_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_potential_dot_dot_acoustic,
+ mp->d_ibool,
+ mp->d_ispec_is_inner,
+ phase_is_inner,
+ mp->d_sourcearrays,
+ mp->d_stf_pre_compute,
+ mp->myrank,
+ mp->d_islice_selected_source,
+ mp->d_ispec_selected_source,
+ mp->d_ispec_is_acoustic,
+ mp->d_kappastore,
+ NSOURCES);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("compute_add_sources_ac_cuda");
@@ -149,49 +142,43 @@
extern "C"
void FC_FUNC_(compute_add_sources_ac_s3_cuda,
- COMPUTE_ADD_SOURCES_AC_s3_CUDA)(long* Mesh_pointer_f,
- int* phase_is_innerf,
- int* NSOURCESf,
- double* h_stf_pre_compute,
- int* myrankf) {
+ COMPUTE_ADD_SOURCES_AC_s3_CUDA)(long* Mesh_pointer,
+ int* phase_is_innerf,
+ int* NSOURCESf,
+ double* h_stf_pre_compute) {
-TRACE("compute_add_sources_ac_s3_cuda");
+ TRACE("compute_add_sources_ac_s3_cuda");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
// check if anything to do
if( mp->nsources_local == 0 ) return;
- int phase_is_inner = *phase_is_innerf;
int NSOURCES = *NSOURCESf;
- int myrank = *myrankf;
+ int phase_is_inner = *phase_is_innerf;
- int num_blocks_x = NSOURCES;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
-
// copies source time factors onto GPU
print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(NSOURCES,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(5,5,5);
- compute_add_sources_acoustic_kernel<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
- mp->d_ibool,
- mp->d_ispec_is_inner,
- phase_is_inner,
- mp->d_sourcearrays,
- mp->d_stf_pre_compute,
- myrank,
- mp->d_islice_selected_source,
- mp->d_ispec_selected_source,
- mp->d_ispec_is_acoustic,
- mp->d_kappastore,
- NSOURCES);
+ compute_add_sources_acoustic_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_potential_dot_dot_acoustic,
+ mp->d_ibool,
+ mp->d_ispec_is_inner,
+ phase_is_inner,
+ mp->d_sourcearrays,
+ mp->d_stf_pre_compute,
+ mp->myrank,
+ mp->d_islice_selected_source,
+ mp->d_ispec_selected_source,
+ mp->d_ispec_is_acoustic,
+ mp->d_kappastore,
+ NSOURCES);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("compute_add_sources_ac_s3_cuda");
@@ -273,27 +260,21 @@
int* h_ispec_is_inner,
int* h_ispec_is_acoustic,
int* h_ispec_selected_rec,
- int* myrank,
int* nrec,
int* time_index,
int* h_islice_selected_rec,
int* nadj_rec_local,
int* NTSTEP_BETWEEN_READ_ADJSRC) {
-TRACE("add_sources_ac_sim_2_or_3_cuda");
+ TRACE("add_sources_ac_sim_2_or_3_cuda");
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
// checks
if( *nadj_rec_local != mp->nadj_rec_local) exit_on_cuda_error("add_sources_ac_sim_type_2_or_3: nadj_rec_local not equal\n");
- // make sure grid dimension is less than 65535 in x dimension
- int num_blocks_x = mp->nadj_rec_local;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->nadj_rec_local,&num_blocks_x,&num_blocks_y);
dim3 grid(num_blocks_x,num_blocks_y,1);
dim3 threads(5,5,5);
@@ -304,17 +285,18 @@
int ispec,i,j,k;
int irec_local = 0;
for(int irec = 0; irec < *nrec; irec++) {
- if(*myrank == h_islice_selected_rec[irec]) {
+ if(mp->myrank == h_islice_selected_rec[irec]) {
irec_local++;
// takes only acoustic sources
- ispec = h_ispec_selected_rec[irec]-1;
- if( h_ispec_is_acoustic[ispec] ){
+ ispec = h_ispec_selected_rec[irec] - 1;
+ if( h_ispec_is_acoustic[ispec] ){
if( h_ispec_is_inner[ispec] == *phase_is_inner) {
for(k=0;k<5;k++) {
for(j=0;j<5;j++) {
for(i=0;i<5;i++) {
+
mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,0,irec_local-1)]
= h_adj_sourcearrays[INDEX6(mp->nadj_rec_local,
*NTSTEP_BETWEEN_READ_ADJSRC,
@@ -347,20 +329,20 @@
// copies extracted array values onto GPU
print_CUDA_error_if_any(cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
- (mp->nadj_rec_local)*3*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),99099);
+ (mp->nadj_rec_local)*3*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),99099);
// launches cuda kernel for acoustic adjoint sources
- add_sources_ac_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
- *nrec,
- mp->d_adj_sourcearrays,
- mp->d_ibool,
- mp->d_ispec_is_inner,
- mp->d_ispec_is_acoustic,
- mp->d_ispec_selected_rec,
- *phase_is_inner,
- mp->d_pre_computed_irec,
- mp->nadj_rec_local,
- mp->d_kappastore);
+ add_sources_ac_SIM_TYPE_2_OR_3_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_potential_dot_dot_acoustic,
+ *nrec,
+ mp->d_adj_sourcearrays,
+ mp->d_ibool,
+ mp->d_ispec_is_inner,
+ mp->d_ispec_is_acoustic,
+ mp->d_ispec_selected_rec,
+ *phase_is_inner,
+ mp->d_pre_computed_irec,
+ mp->nadj_rec_local,
+ mp->d_kappastore);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("add_sources_acoustic_SIM_TYPE_2_OR_3_kernel");
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/compute_add_sources_viscoelastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/compute_add_sources_viscoelastic_cuda.cu 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/compute_add_sources_viscoelastic_cuda.cu 2013-08-20 14:13:26 UTC (rev 22718)
@@ -62,8 +62,8 @@
int k = threadIdx.z;
int isource = blockIdx.x + gridDim.x*blockIdx.y; // bx
- int ispec;
- int iglob;
+
+ int ispec,iglob;
realw stf;
if(isource < NSOURCES) { // when NSOURCES > 65535, but mod(nspec_top,2) > 0, we end up with an extra block.
@@ -94,51 +94,45 @@
extern "C"
void FC_FUNC_(compute_add_sources_el_cuda,
- COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f,
- int* phase_is_innerf,
- int* NSOURCESf,
- double* h_stf_pre_compute,
- int* myrankf) {
+ COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer,
+ double* h_stf_pre_compute,
+ int* h_NSOURCES,
+ int* h_phase_is_inner) {
-TRACE("compute_add_sources_el_cuda");
+ TRACE("\tcompute_add_sources_el_cuda");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
// check if anything to do
if( mp->nsources_local == 0 ) return;
- int phase_is_inner = *phase_is_innerf;
- int NSOURCES = *NSOURCESf;
- int myrank = *myrankf;
+ int NSOURCES = *h_NSOURCES;
+ int phase_is_inner = *h_phase_is_inner;
- int num_blocks_x = NSOURCES;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
-
- //double* d_stf_pre_compute;
print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("compute_add_sources_el_cuda copy");
+#endif
+
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(NSOURCES,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(5,5,5);
- compute_add_sources_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,
- mp->d_ibool,
- mp->d_ispec_is_inner,
- phase_is_inner,
- mp->d_sourcearrays,
- mp->d_stf_pre_compute,
- myrank,
- mp->d_islice_selected_source,
- mp->d_ispec_selected_source,
- mp->d_ispec_is_elastic,
- NSOURCES);
+ compute_add_sources_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,mp->d_ibool,
+ mp->d_ispec_is_inner,phase_is_inner,
+ mp->d_sourcearrays,
+ mp->d_stf_pre_compute,
+ mp->myrank,
+ mp->d_islice_selected_source,mp->d_ispec_selected_source,
+ mp->d_ispec_is_elastic,
+ NSOURCES);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("compute_add_sources_kernel");
+ exit_on_cuda_error("compute_add_sources_el_cuda");
#endif
}
@@ -148,41 +142,38 @@
void FC_FUNC_(compute_add_sources_el_s3_cuda,
COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer,
double* h_stf_pre_compute,
- int* NSOURCESf,
- int* phase_is_inner,
- int* myrank) {
- TRACE("compute_add_sources_el_s3_cuda");
+ int* h_NSOURCES,
+ int* h_phase_is_inner) {
+
+ TRACE("\tcompute_add_sources_el_s3_cuda");
// EPIK_TRACER("compute_add_sources_el_s3_cuda");
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
- int NSOURCES = *NSOURCESf;
+ int NSOURCES = *h_NSOURCES;
+ int phase_is_inner = *h_phase_is_inner;
print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("compute_add_sources_el_s3_cuda");
+ exit_on_cuda_error("compute_add_sources_el_s3_cuda copy");
#endif
- int num_blocks_x = NSOURCES;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(NSOURCES,&num_blocks_x,&num_blocks_y);
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(5,5,5);
compute_add_sources_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel,mp->d_ibool,
- mp->d_ispec_is_inner, *phase_is_inner,
- mp->d_sourcearrays,
- mp->d_stf_pre_compute,
- *myrank,
- mp->d_islice_selected_source,mp->d_ispec_selected_source,
- mp->d_ispec_is_elastic,
- NSOURCES);
+ mp->d_ispec_is_inner, phase_is_inner,
+ mp->d_sourcearrays,
+ mp->d_stf_pre_compute,
+ mp->myrank,
+ mp->d_islice_selected_source,mp->d_ispec_selected_source,
+ mp->d_ispec_is_elastic,
+ NSOURCES);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("compute_add_sources_el_s3_cuda");
@@ -219,30 +210,28 @@
extern "C"
void FC_FUNC_(add_source_master_rec_noise_cu,
- ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer_f,
- int* myrank_f,
- int* it_f,
- int* irec_master_noise_f,
- int* islice_selected_rec) {
+ ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer,
+ int* it_f,
+ int* irec_master_noise_f,
+ int* islice_selected_rec) {
-TRACE("add_source_master_rec_noise_cu");
+TRACE("\tadd_source_master_rec_noise_cu");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
int it = *it_f-1; // -1 for Fortran -> C indexing differences
int irec_master_noise = *irec_master_noise_f;
- int myrank = *myrank_f;
dim3 grid(1,1,1);
dim3 threads(NGLL3,1,1);
- if(myrank == islice_selected_rec[irec_master_noise-1]) {
+ if(mp->myrank == islice_selected_rec[irec_master_noise-1]) {
add_source_master_rec_noise_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_ibool,
- mp->d_ispec_selected_rec,
- irec_master_noise,
- mp->d_accel,
- mp->d_noise_sourcearray,
- it);
+ mp->d_ispec_selected_rec,
+ irec_master_noise,
+ mp->d_accel,
+ mp->d_noise_sourcearray,
+ it);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("add_source_master_rec_noise_cuda_kernel");
@@ -313,27 +302,21 @@
int* h_ispec_is_inner,
int* h_ispec_is_elastic,
int* h_ispec_selected_rec,
- int* myrank,
int* nrec,
int* time_index,
int* h_islice_selected_rec,
int* nadj_rec_local,
int* NTSTEP_BETWEEN_READ_ADJSRC) {
-TRACE("add_sources_el_sim_type_2_or_3");
+ TRACE("\tadd_sources_el_sim_type_2_or_3");
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
// checks
if( *nadj_rec_local != mp->nadj_rec_local) exit_on_error("add_sources_el_sim_type_2_or_3: nadj_rec_local not equal\n");
- // make sure grid dimension is less than 65535 in x dimension
- int num_blocks_x = mp->nadj_rec_local;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->nadj_rec_local,&num_blocks_x,&num_blocks_y);
dim3 grid(num_blocks_x,num_blocks_y,1);
dim3 threads(5,5,5);
@@ -344,11 +327,12 @@
int ispec,i,j,k;
int irec_local = 0;
for(int irec = 0; irec < *nrec; irec++) {
- if(*myrank == h_islice_selected_rec[irec]) {
+ if(mp->myrank == h_islice_selected_rec[irec]) {
irec_local++;
// takes only elastic sources
- ispec = h_ispec_selected_rec[irec]-1;
+ ispec = h_ispec_selected_rec[irec] - 1;
+
if( h_ispec_is_elastic[ispec] ){
if( h_ispec_is_inner[ispec] == *phase_is_inner) {
@@ -356,34 +340,25 @@
for(j=0;j<5;j++) {
for(i=0;i<5;i++) {
- mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
- i,j,k,0,
- irec_local-1)]
+ mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,0,irec_local-1)]
= h_adj_sourcearrays[INDEX6(*nadj_rec_local,
*NTSTEP_BETWEEN_READ_ADJSRC,
3,5,5,
- irec_local-1,
- *time_index-1,
+ irec_local-1,(*time_index)-1,
0,i,j,k)];
- mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
- i,j,k,1,
- irec_local-1)]
+ mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,1,irec_local-1)]
= h_adj_sourcearrays[INDEX6(*nadj_rec_local,
*NTSTEP_BETWEEN_READ_ADJSRC,
3,5,5,
- irec_local-1,
- *time_index-1,
+ irec_local-1,(*time_index)-1,
1,i,j,k)];
- mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
- i,j,k,2,
- irec_local-1)]
+ mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,2,irec_local-1)]
= h_adj_sourcearrays[INDEX6(*nadj_rec_local,
*NTSTEP_BETWEEN_READ_ADJSRC,
3,5,5,
- irec_local-1,
- *time_index-1,
+ irec_local-1,(*time_index)-1,
2,i,j,k)];
}
}
@@ -396,8 +371,8 @@
if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
// copies extracted array values onto GPU
- cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
- (mp->nadj_rec_local)*3*NGLL3*sizeof(realw),cudaMemcpyHostToDevice);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
+ (mp->nadj_rec_local)*3*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),98001);
// the irec_local variable needs to be precomputed (as
@@ -405,15 +380,15 @@
// and due to how it's incremented, it cannot be parallelized
add_sources_el_SIM_TYPE_2_OR_3_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,
- *nrec,
- mp->d_adj_sourcearrays,
- mp->d_ibool,
- mp->d_ispec_is_inner,
- mp->d_ispec_is_elastic,
- mp->d_ispec_selected_rec,
- *phase_is_inner,
- mp->d_pre_computed_irec,
- mp->nadj_rec_local);
+ *nrec,
+ mp->d_adj_sourcearrays,
+ mp->d_ibool,
+ mp->d_ispec_is_inner,
+ mp->d_ispec_is_elastic,
+ mp->d_ispec_selected_rec,
+ *phase_is_inner,
+ mp->d_pre_computed_irec,
+ mp->nadj_rec_local);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("add_sources_SIM_TYPE_2_OR_3_kernel");
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/compute_coupling_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/compute_coupling_cuda.cu 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/compute_coupling_cuda.cu 2013-08-20 14:13:26 UTC (rev 22718)
@@ -115,25 +115,22 @@
extern "C"
void FC_FUNC_(compute_coupling_ac_el_cuda,
- COMPUTE_COUPLING_AC_EL_CUDA)(long* Mesh_pointer_f,
+ COMPUTE_COUPLING_AC_EL_CUDA)(long* Mesh_pointer,
int* phase_is_innerf,
int* num_coupling_ac_el_facesf) {
TRACE("compute_coupling_ac_el_cuda");
//double start_time = get_time();
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
int phase_is_inner = *phase_is_innerf;
int num_coupling_ac_el_faces = *num_coupling_ac_el_facesf;
// way 1: exact blocksize to match NGLLSQUARE
int blocksize = NGLL2;
- int num_blocks_x = num_coupling_ac_el_faces;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(num_coupling_ac_el_faces,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
@@ -275,25 +272,21 @@
extern "C"
void FC_FUNC_(compute_coupling_el_ac_cuda,
- COMPUTE_COUPLING_EL_AC_CUDA)(long* Mesh_pointer_f,
+ COMPUTE_COUPLING_EL_AC_CUDA)(long* Mesh_pointer,
int* phase_is_innerf,
int* num_coupling_ac_el_facesf) {
TRACE("compute_coupling_el_ac_cuda");
//double start_time = get_time();
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
int phase_is_inner = *phase_is_innerf;
int num_coupling_ac_el_faces = *num_coupling_ac_el_facesf;
// way 1: exact blocksize to match NGLLSQUARE
int blocksize = 25;
- int num_blocks_x = num_coupling_ac_el_faces;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(num_coupling_ac_el_faces,&num_blocks_x,&num_blocks_y);
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
@@ -359,6 +352,7 @@
// gets spectral element face id
int igll = threadIdx.x ; // threadIdx.y*blockDim.x will be always = 0 for thread block (25,1,1)
int iface = blockIdx.x + gridDim.x*blockIdx.y;
+
realw nx,ny,nz;
realw force_normal_comp;
@@ -408,11 +402,11 @@
extern "C"
void FC_FUNC_(compute_coupling_ocean_cuda,
- COMPUTE_COUPLING_OCEAN_CUDA)(long* Mesh_pointer_f) {
+ COMPUTE_COUPLING_OCEAN_CUDA)(long* Mesh_pointer) {
- TRACE("compute_coupling_ocean_cuda");
+ TRACE("\tcompute_coupling_ocean_cuda");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
// checks if anything to do
if( mp->num_free_surface_faces == 0 ) return;
@@ -420,12 +414,8 @@
// block sizes: exact blocksize to match NGLLSQUARE
int blocksize = NGLL2;
- int num_blocks_x = mp->num_free_surface_faces;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->num_free_surface_faces,&num_blocks_x,&num_blocks_y);
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
@@ -440,14 +430,14 @@
#endif
compute_coupling_ocean_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,
- mp->d_rmassx,mp->d_rmassy,mp->d_rmassz,
- mp->d_rmass_ocean_load,
- mp->num_free_surface_faces,
- mp->d_free_surface_ispec,
- mp->d_free_surface_ijk,
- mp->d_free_surface_normal,
- mp->d_ibool,
- mp->d_updated_dof_ocean_load);
+ mp->d_rmassx,mp->d_rmassy,mp->d_rmassz,
+ mp->d_rmass_ocean_load,
+ mp->num_free_surface_faces,
+ mp->d_free_surface_ispec,
+ mp->d_free_surface_ijk,
+ mp->d_free_surface_normal,
+ mp->d_ibool,
+ mp->d_updated_dof_ocean_load);
// for backward/reconstructed potentials
if(mp->simulation_type == 3) {
// re-initializes array
@@ -455,14 +445,14 @@
sizeof(int)*mp->NGLOB_AB),88502);
compute_coupling_ocean_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel,
- mp->d_rmassx,mp->d_rmassy,mp->d_rmassz,
- mp->d_rmass_ocean_load,
- mp->num_free_surface_faces,
- mp->d_free_surface_ispec,
- mp->d_free_surface_ijk,
- mp->d_free_surface_normal,
- mp->d_ibool,
- mp->d_updated_dof_ocean_load);
+ mp->d_rmassx,mp->d_rmassy,mp->d_rmassz,
+ mp->d_rmass_ocean_load,
+ mp->num_free_surface_faces,
+ mp->d_free_surface_ispec,
+ mp->d_free_surface_ijk,
+ mp->d_free_surface_normal,
+ mp->d_ibool,
+ mp->d_updated_dof_ocean_load);
}
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_acoustic_cuda.cu 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_acoustic_cuda.cu 2013-08-20 14:13:26 UTC (rev 22718)
@@ -36,7 +36,37 @@
#include "config.h"
#include "mesh_constants_cuda.h"
+#ifdef USE_TEXTURES_FIELDS
+realw_texture d_potential_tex;
+realw_texture d_potential_dot_dot_tex;
+//backward/reconstructed
+realw_texture d_b_potential_tex;
+realw_texture d_b_potential_dot_dot_tex;
+//note: texture variables are implicitly static, and cannot be passed as arguments to cuda kernels;
+// thus, 1) we thus use if-statements (FORWARD_OR_ADJOINT) to determine from which texture to fetch from
+// 2) we use templates
+// since if-statements are a bit slower as the variable is only known at runtime, we use option 2)
+
+// templates definitions
+template<int FORWARD_OR_ADJOINT> __device__ float texfetch_potential(int x);
+template<int FORWARD_OR_ADJOINT> __device__ float texfetch_potential_dot_dot(int x);
+
+// templates for texture fetching
+// FORWARD_OR_ADJOINT == 1 <- forward arrays
+template<> __device__ float texfetch_potential<1>(int x) { return tex1Dfetch(d_potential_tex, x); }
+template<> __device__ float texfetch_potential_dot_dot<1>(int x) { return tex1Dfetch(d_potential_dot_dot_tex, x); }
+// FORWARD_OR_ADJOINT == 3 <- backward/reconstructed arrays
+template<> __device__ float texfetch_potential<3>(int x) { return tex1Dfetch(d_b_potential_tex, x); }
+template<> __device__ float texfetch_potential_dot_dot<3>(int x) { return tex1Dfetch(d_b_potential_dot_dot_tex, x); }
+
+#endif
+
+#ifdef USE_TEXTURES_CONSTANTS
+extern realw_texture d_hprime_xx_tex;
+#endif
+
+
/* ----------------------------------------------------------------------------------------------- */
// prepares a device array with with all inter-element edge-nodes -- this
@@ -49,12 +79,17 @@
int* d_ibool_interfaces_ext_mesh) {
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
- int iinterface=0;
+ int ientry,iglob;
- for( iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
+ for(int iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
if(id<d_nibool_interfaces_ext_mesh[iinterface]) {
- d_send_potential_dot_dot_buffer[(id + max_nibool_interfaces_ext_mesh*iinterface)] =
- d_potential_dot_dot_acoustic[(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)];
+
+ // entry in interface array
+ ientry = id + max_nibool_interfaces_ext_mesh*iinterface;
+ // global index in wavefield
+ iglob = d_ibool_interfaces_ext_mesh[ientry] - 1;
+
+ d_send_potential_dot_dot_buffer[ientry] = d_potential_dot_dot_acoustic[iglob];
}
}
@@ -66,58 +101,68 @@
// prepares and transfers the inter-element edge-nodes to the host to be MPI'd
extern "C"
void FC_FUNC_(transfer_boun_pot_from_device,
- TRANSFER_BOUN_POT_FROM_DEVICE)(
- int* size,
- long* Mesh_pointer_f,
- realw* potential_dot_dot_acoustic,
- realw* send_potential_dot_dot_buffer,
- int* num_interfaces_ext_mesh,
- int* max_nibool_interfaces_ext_mesh,
- int* nibool_interfaces_ext_mesh,
- int* ibool_interfaces_ext_mesh,
- int* FORWARD_OR_ADJOINT){
+ TRANSFER_BOUN_POT_FROM_DEVICE)(long* Mesh_pointer,
+ realw* potential_dot_dot_acoustic,
+ realw* send_potential_dot_dot_buffer,
+ int* FORWARD_OR_ADJOINT){
TRACE("transfer_boun_pot_from_device");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
- if( *num_interfaces_ext_mesh == 0 ) return;
+ // checks if anything to do
+ if( mp->size_mpi_buffer_potential > 0 ){
- int blocksize = BLOCKSIZE_TRANSFER;
- int size_padded = ((int)ceil(((double)(mp->max_nibool_interfaces_ext_mesh))/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int blocksize = BLOCKSIZE_TRANSFER;
+ int size_padded = ((int)ceil(((double)(mp->max_nibool_interfaces_ext_mesh))/((double)blocksize)))*blocksize;
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(blocksize,1,1);
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
- if(*FORWARD_OR_ADJOINT == 1) {
- prepare_boundary_potential_on_device<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
- mp->d_send_potential_dot_dot_buffer,
- mp->num_interfaces_ext_mesh,
- mp->max_nibool_interfaces_ext_mesh,
- mp->d_nibool_interfaces_ext_mesh,
- mp->d_ibool_interfaces_ext_mesh);
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
+
+ if(*FORWARD_OR_ADJOINT == 1) {
+ prepare_boundary_potential_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_potential_dot_dot_acoustic,
+ mp->d_send_potential_dot_dot_buffer,
+ mp->num_interfaces_ext_mesh,
+ mp->max_nibool_interfaces_ext_mesh,
+ mp->d_nibool_interfaces_ext_mesh,
+ mp->d_ibool_interfaces_ext_mesh);
+
+ // synchronizes
+ //synchronize_cuda();
+ // explicitly waits until previous compute stream finishes
+ // (cudaMemcpy implicitly synchronizes all other cuda operations)
+ cudaStreamSynchronize(mp->compute_stream);
+
+ print_CUDA_error_if_any(cudaMemcpy(send_potential_dot_dot_buffer,mp->d_send_potential_dot_dot_buffer,
+ mp->size_mpi_buffer_potential*sizeof(realw),cudaMemcpyDeviceToHost),98000);
+ }
+ else if(*FORWARD_OR_ADJOINT == 3) {
+ // backward/reconstructed wavefield buffer
+ prepare_boundary_potential_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_potential_dot_dot_acoustic,
+ mp->d_b_send_potential_dot_dot_buffer,
+ mp->num_interfaces_ext_mesh,
+ mp->max_nibool_interfaces_ext_mesh,
+ mp->d_nibool_interfaces_ext_mesh,
+ mp->d_ibool_interfaces_ext_mesh);
+
+ // synchronizes
+ //synchronize_cuda();
+ // explicitly waits until previous compute stream finishes
+ // (cudaMemcpy implicitly synchronizes all other cuda operations)
+ cudaStreamSynchronize(mp->compute_stream);
+
+ print_CUDA_error_if_any(cudaMemcpy(send_potential_dot_dot_buffer,mp->d_b_send_potential_dot_dot_buffer,
+ mp->size_mpi_buffer_potential*sizeof(realw),cudaMemcpyDeviceToHost),98000);
+ }
}
- else if(*FORWARD_OR_ADJOINT == 3) {
- prepare_boundary_potential_on_device<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
- mp->d_send_potential_dot_dot_buffer,
- mp->num_interfaces_ext_mesh,
- mp->max_nibool_interfaces_ext_mesh,
- mp->d_nibool_interfaces_ext_mesh,
- mp->d_ibool_interfaces_ext_mesh);
- }
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("after prepare_boundary_potential_on_device");
#endif
- print_CUDA_error_if_any(cudaMemcpy(send_potential_dot_dot_buffer,mp->d_send_potential_dot_dot_buffer,
- (mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw),cudaMemcpyDeviceToHost),98000);
// finish timing of kernel+memcpy
// cudaEventRecord( stop, 0 );
@@ -143,17 +188,21 @@
int* d_ibool_interfaces_ext_mesh) {
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
- int iinterface=0;
+ int ientry,iglob;
- for( iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
+ for( int iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
if(id<d_nibool_interfaces_ext_mesh[iinterface]) {
+ // entry in interface array
+ ientry = id + max_nibool_interfaces_ext_mesh*iinterface;
+ // global index in wavefield
+ iglob = d_ibool_interfaces_ext_mesh[ientry] - 1;
+
// for testing atomic operations against not atomic operations (0.1ms vs. 0.04 ms)
// d_potential_dot_dot_acoustic[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)] +=
// d_send_potential_dot_dot_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)];
- atomicAdd(&d_potential_dot_dot_acoustic[(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)],
- d_send_potential_dot_dot_buffer[(id + max_nibool_interfaces_ext_mesh*iinterface)]);
+ atomicAdd(&d_potential_dot_dot_acoustic[iglob],d_send_potential_dot_dot_buffer[ientry]);
}
}
// ! This step is done via previous function transfer_and_assemble...
@@ -170,19 +219,15 @@
extern "C"
void FC_FUNC_(transfer_asmbl_pot_to_device,
- TRANSFER_ASMBL_POT_TO_DEVICE)(
- long* Mesh_pointer,
- realw* potential_dot_dot_acoustic,
- realw* buffer_recv_scalar_ext_mesh,
- int* num_interfaces_ext_mesh,
- int* max_nibool_interfaces_ext_mesh,
- int* nibool_interfaces_ext_mesh,
- int* ibool_interfaces_ext_mesh,
- int* FORWARD_OR_ADJOINT) {
+ TRANSFER_ASMBL_POT_TO_DEVICE)(long* Mesh_pointer,
+ realw* potential_dot_dot_acoustic,
+ realw* buffer_recv_scalar_ext_mesh,
+ int* FORWARD_OR_ADJOINT) {
TRACE("transfer_asmbl_pot_to_device");
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
//double start_time = get_time();
// cudaEvent_t start, stop;
// realw time;
@@ -190,43 +235,49 @@
// cudaEventCreate(&stop);
// cudaEventRecord( start, 0 );
- if( *num_interfaces_ext_mesh == 0 ) return;
+ // checks if anything to do
+ if( mp->size_mpi_buffer_potential > 0 ){
- // copies buffer onto GPU
- cudaMemcpy(mp->d_send_potential_dot_dot_buffer, buffer_recv_scalar_ext_mesh,
- (mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw), cudaMemcpyHostToDevice);
+ // assembles on GPU
+ int blocksize = BLOCKSIZE_TRANSFER;
+ int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
- // assembles on GPU
- int blocksize = BLOCKSIZE_TRANSFER;
- int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(blocksize,1,1);
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
- if(*FORWARD_OR_ADJOINT == 1) {
- //assemble forward field
- assemble_boundary_potential_on_device<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
- mp->d_send_potential_dot_dot_buffer,
- mp->num_interfaces_ext_mesh,
- mp->max_nibool_interfaces_ext_mesh,
- mp->d_nibool_interfaces_ext_mesh,
- mp->d_ibool_interfaces_ext_mesh);
+ // synchronizes
+ synchronize_cuda();
+
+ if(*FORWARD_OR_ADJOINT == 1) {
+ // copies buffer onto GPU
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_send_potential_dot_dot_buffer, buffer_recv_scalar_ext_mesh,
+ mp->size_mpi_buffer_potential*sizeof(realw), cudaMemcpyHostToDevice),98010);
+
+ //assemble forward field
+ assemble_boundary_potential_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_potential_dot_dot_acoustic,
+ mp->d_send_potential_dot_dot_buffer,
+ mp->num_interfaces_ext_mesh,
+ mp->max_nibool_interfaces_ext_mesh,
+ mp->d_nibool_interfaces_ext_mesh,
+ mp->d_ibool_interfaces_ext_mesh);
+ }
+ else if(*FORWARD_OR_ADJOINT == 3) {
+ // copies buffer onto GPU
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_potential_dot_dot_buffer, buffer_recv_scalar_ext_mesh,
+ mp->size_mpi_buffer_potential*sizeof(realw), cudaMemcpyHostToDevice),98011);
+
+ //assemble reconstructed/backward field
+ assemble_boundary_potential_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_potential_dot_dot_acoustic,
+ mp->d_b_send_potential_dot_dot_buffer,
+ mp->num_interfaces_ext_mesh,
+ mp->max_nibool_interfaces_ext_mesh,
+ mp->d_nibool_interfaces_ext_mesh,
+ mp->d_ibool_interfaces_ext_mesh);
+ }
}
- else if(*FORWARD_OR_ADJOINT == 3) {
- //assemble reconstructed/backward field
- assemble_boundary_potential_on_device<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
- mp->d_send_potential_dot_dot_buffer,
- mp->num_interfaces_ext_mesh,
- mp->max_nibool_interfaces_ext_mesh,
- mp->d_nibool_interfaces_ext_mesh,
- mp->d_ibool_interfaces_ext_mesh);
- }
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
// cudaEventRecord( stop, 0 );
@@ -244,34 +295,32 @@
/* ----------------------------------------------------------------------------------------------- */
-/* KERNEL 2 */
+// KERNEL 2 - acoustic compute forces kernel
/* ----------------------------------------------------------------------------------------------- */
+template<int FORWARD_OR_ADJOINT> __global__ void Kernel_2_acoustic_impl(int nb_blocks_to_compute,
+ int NGLOB, int* d_ibool,
+ int* d_phase_ispec_inner_acoustic,
+ int num_phase_ispec_acoustic,
+ int d_iphase,
+ int use_mesh_coloring_gpu,
+ realw* d_potential_acoustic, realw* d_potential_dot_dot_acoustic,
+ realw* d_xix, realw* d_xiy, realw* d_xiz,
+ realw* d_etax, realw* d_etay, realw* d_etaz,
+ realw* d_gammax, realw* d_gammay, realw* d_gammaz,
+ realw* d_hprime_xx,
+ realw* hprimewgll_xx,
+ realw* wgllwgll_xy,realw* wgllwgll_xz,realw* wgllwgll_yz,
+ realw* d_rhostore,
+ int gravity,
+ realw* minus_g,
+ realw* d_kappastore,
+ realw* wgll_cube){
-__global__ void Kernel_2_acoustic_impl(int nb_blocks_to_compute,
- int NGLOB, int* d_ibool,
- int* d_phase_ispec_inner_acoustic,
- int num_phase_ispec_acoustic,
- int d_iphase,
- int use_mesh_coloring_gpu,
- realw* d_potential_acoustic, realw* d_potential_dot_dot_acoustic,
- realw* d_xix, realw* d_xiy, realw* d_xiz,
- realw* d_etax, realw* d_etay, realw* d_etaz,
- realw* d_gammax, realw* d_gammay, realw* d_gammaz,
- realw* d_hprime_xx,
- realw* hprimewgll_xx,
- realw* wgllwgll_xy,realw* wgllwgll_xz,realw* wgllwgll_yz,
- realw* d_rhostore,
- int gravity,
- realw* minus_g,
- realw* d_kappastore,
- realw* wgll_cube){
-
int bx = blockIdx.y*gridDim.x+blockIdx.x;
int tx = threadIdx.x;
- //const int NGLL3 = NGLL3;
const int NGLL3_ALIGN = NGLL3_PADDED;
int K = (tx/NGLL2);
@@ -281,6 +330,7 @@
int active,offset;
int iglob = 0;
int working_element;
+
realw temp1l,temp2l,temp3l;
realw xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
realw dpotentialdxl,dpotentialdyl,dpotentialdzl;
@@ -301,6 +351,8 @@
__shared__ realw s_temp2[NGLL3];
__shared__ realw s_temp3[NGLL3];
+ __shared__ realw sh_hprime_xx[NGLL2];
+
// use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads,
// because we used memory padding from NGLL^3 = 125 to 128 to get coalescent memory accesses
active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0;
@@ -321,24 +373,29 @@
}
#endif
- // iglob = d_ibool[working_element*NGLL3_ALIGN + tx]-1;
iglob = d_ibool[working_element*NGLL3 + tx]-1;
-#ifdef USE_TEXTURES
- s_dummy_loc[tx] = tex1Dfetch(tex_potential_acoustic, iglob);
+#ifdef USE_TEXTURES_FIELDS
+ s_dummy_loc[tx] = texfetch_potential<FORWARD_OR_ADJOINT>(iglob);
#else
// changing iglob indexing to match fortran row changes fast style
s_dummy_loc[tx] = d_potential_acoustic[iglob];
#endif
}
+ if (tx < NGLL2) {
+#ifdef USE_TEXTURES_CONSTANTS
+ sh_hprime_xx[tx] = tex1Dfetch(d_hprime_xx_tex,tx);
+#else
+ sh_hprime_xx[tx] = d_hprime_xx[tx];
+#endif
+ }
+
// synchronize all the threads (one thread for each of the NGLL grid points of the
// current spectral element) because we need the whole element to be ready in order
// to be able to compute the matrix products along cut planes of the 3D element below
__syncthreads();
-#ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
-
if (active) {
#ifndef MANUALLY_UNROLLED_LOOPS
@@ -348,16 +405,16 @@
temp3l = 0.f;
for (l=0;l<NGLLX;l++) {
- hp1 = d_hprime_xx[l*NGLLX+I];
+ hp1 = sh_hprime_xx[l*NGLLX+I];
offset1 = K*NGLL2+J*NGLLX+l;
temp1l += s_dummy_loc[offset1]*hp1;
//assumes that hprime_xx = hprime_yy = hprime_zz
- hp2 = d_hprime_xx[l*NGLLX+J];
+ hp2 = sh_hprime_xx[l*NGLLX+J];
offset2 = K*NGLL2+l*NGLLX+I;
temp2l += s_dummy_loc[offset2]*hp2;
- hp3 = d_hprime_xx[l*NGLLX+K];
+ hp3 = sh_hprime_xx[l*NGLLX+K];
offset3 = l*NGLL2+J*NGLLX+I;
temp3l += s_dummy_loc[offset3]*hp3;
}
@@ -504,34 +561,38 @@
iglob = d_ibool[working_element*NGLL3 + tx]-1;
-#ifdef USE_TEXTURES
- d_potential_dot_dot_acoustic[iglob] = tex1Dfetch(tex_potential_dot_dot_acoustic, iglob)
- + sum_terms;
-#else
-
#ifdef USE_MESH_COLORING_GPU
// no atomic operation needed, colors don't share global points between elements
+
+#ifdef USE_TEXTURES_FIELDS
+ d_potential_dot_dot_acoustic[iglob] = texfetch_potential_dot_dot<FORWARD_OR_ADJOINT>(iglob) + sum_terms;
+#else
d_potential_dot_dot_acoustic[iglob] += sum_terms;
-#else
+#endif // USE_TEXTURES_FIELDS
+
+
+#else // MESH_COLORING
+
//mesh coloring
if( use_mesh_coloring_gpu ){
-
// no atomic operation needed, colors don't share global points between elements
+#ifdef USE_TEXTURES_FIELDS
+ d_potential_dot_dot_acoustic[iglob] = texfetch_potential_dot_dot<FORWARD_OR_ADJOINT>(iglob) + sum_terms;
+#else
d_potential_dot_dot_acoustic[iglob] += sum_terms;
+#endif // USE_TEXTURES_FIELDS
}else{
+ // for testing purposes only: w/out atomic updates
+ //d_potential_dot_dot_acoustic[iglob] += sum_terms1;
+
atomicAdd(&d_potential_dot_dot_acoustic[iglob],sum_terms);
}
-#endif
+#endif // MESH_COLORING
-#endif
}
-
-#else // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
- d_potential_dot_dot_acoustic[iglob] = 123.123f;
-#endif // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
}
@@ -539,18 +600,11 @@
void Kernel_2_acoustic(int nb_blocks_to_compute, Mesh* mp, int d_iphase,
int* d_ibool,
- realw* d_xix,
- realw* d_xiy,
- realw* d_xiz,
- realw* d_etax,
- realw* d_etay,
- realw* d_etaz,
- realw* d_gammax,
- realw* d_gammay,
- realw* d_gammaz,
+ realw* d_xix,realw* d_xiy,realw* d_xiz,
+ realw* d_etax,realw* d_etay,realw* d_etaz,
+ realw* d_gammax,realw* d_gammay,realw* d_gammaz,
realw* d_rhostore,
- realw* d_kappastore)
-{
+ realw* d_kappastore){
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("before acoustic kernel Kernel 2");
@@ -560,17 +614,15 @@
/* grid_2_x = nb_elem_color; */
/* nb_elem_color is just how many blocks we are computing now */
- int num_blocks_x = nb_blocks_to_compute;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
- int threads_2 = NGLL3_PADDED;//BLOCK_SIZE_K2;
- dim3 grid_2(num_blocks_x,num_blocks_y);
+ int blocksize = NGLL3_PADDED;
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(nb_blocks_to_compute,&num_blocks_x,&num_blocks_y);
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
+
// Cuda timing
// cudaEvent_t start, stop;
// realw time;
@@ -578,28 +630,30 @@
// cudaEventCreate(&stop);
// cudaEventRecord( start, 0 );
- Kernel_2_acoustic_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,
- mp->NGLOB_AB,
- d_ibool,
- mp->d_phase_ispec_inner_acoustic,
- mp->num_phase_ispec_acoustic,
- d_iphase,
- mp->use_mesh_coloring_gpu,
- mp->d_potential_acoustic, mp->d_potential_dot_dot_acoustic,
- d_xix, d_xiy, d_xiz,
- d_etax, d_etay, d_etaz,
- d_gammax, d_gammay, d_gammaz,
- mp->d_hprime_xx,
- mp->d_hprimewgll_xx,
- mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
- d_rhostore,
- mp->gravity,
- mp->d_minus_g,
- d_kappastore,
- mp->d_wgll_cube);
+ // forward wavefields -> FORWARD_OR_ADJOINT == 1
+ Kernel_2_acoustic_impl<1><<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
+ mp->NGLOB_AB,
+ d_ibool,
+ mp->d_phase_ispec_inner_acoustic,
+ mp->num_phase_ispec_acoustic,
+ d_iphase,
+ mp->use_mesh_coloring_gpu,
+ mp->d_potential_acoustic, mp->d_potential_dot_dot_acoustic,
+ d_xix, d_xiy, d_xiz,
+ d_etax, d_etay, d_etaz,
+ d_gammax, d_gammay, d_gammaz,
+ mp->d_hprime_xx,
+ mp->d_hprimewgll_xx,
+ mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+ d_rhostore,
+ mp->gravity,
+ mp->d_minus_g,
+ d_kappastore,
+ mp->d_wgll_cube);
if(mp->simulation_type == 3) {
- Kernel_2_acoustic_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,
+ // backward/reconstructed wavefields -> FORWARD_OR_ADJOINT == 3
+ Kernel_2_acoustic_impl<3><<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
mp->NGLOB_AB,
d_ibool,
mp->d_phase_ispec_inner_acoustic,
@@ -627,8 +681,8 @@
// cudaEventDestroy( stop );
// printf("Kernel2 Execution Time: %f ms\n",time);
- /* cudaThreadSynchronize(); */
- /* TRACE("Kernel 2 finished"); */
+ // cudaThreadSynchronize(); //
+ // TRACE("Kernel 2 finished"); //
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
//printf("Tried to start with %dx1 blocks\n",nb_blocks_to_compute);
exit_on_cuda_error("kernel Kernel_2");
@@ -643,7 +697,7 @@
extern "C"
void FC_FUNC_(compute_forces_acoustic_cuda,
- COMPUTE_FORCES_ACOUSTIC_CUDA)(long* Mesh_pointer_f,
+ COMPUTE_FORCES_ACOUSTIC_CUDA)(long* Mesh_pointer,
int* iphase,
int* nspec_outer_acoustic,
int* nspec_inner_acoustic) {
@@ -651,7 +705,7 @@
TRACE("compute_forces_acoustic_cuda");
//double start_time = get_time();
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+ Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
int num_elements;
@@ -671,7 +725,7 @@
int nb_colors,nb_blocks_to_compute;
int istart;
- int color_offset,color_offset_nonpadded;
+ int offset,offset_nonpadded;
// sets up color loop
if( *iphase == 1 ){
@@ -680,16 +734,16 @@
istart = 0;
// array offsets (acoustic elements start after elastic ones)
- color_offset = mp->nspec_elastic * NGLL3_PADDED;
- color_offset_nonpadded = mp->nspec_elastic * NGLL3;
+ offset = mp->nspec_elastic * NGLL3_PADDED;
+ offset_nonpadded = mp->nspec_elastic * NGLL3;
}else{
// inner element colors (start after outer elements)
nb_colors = mp->num_colors_outer_acoustic + mp->num_colors_inner_acoustic;
istart = mp->num_colors_outer_acoustic;
// array offsets (inner elements start after outer ones)
- color_offset = ( mp->nspec_elastic + (*nspec_outer_acoustic) ) * NGLL3_PADDED;
- color_offset_nonpadded = ( mp->nspec_elastic + (*nspec_outer_acoustic) ) * NGLL3;
+ offset = ( mp->nspec_elastic + (*nspec_outer_acoustic) ) * NGLL3_PADDED;
+ offset_nonpadded = ( mp->nspec_elastic + (*nspec_outer_acoustic) ) * NGLL3;
}
// loops over colors
@@ -698,23 +752,17 @@
nb_blocks_to_compute = mp->h_num_elem_colors_acoustic[icolor];
Kernel_2_acoustic(nb_blocks_to_compute,mp,*iphase,
- mp->d_ibool + color_offset_nonpadded,
- mp->d_xix + color_offset,
- mp->d_xiy + color_offset,
- mp->d_xiz + color_offset,
- mp->d_etax + color_offset,
- mp->d_etay + color_offset,
- mp->d_etaz + color_offset,
- mp->d_gammax + color_offset,
- mp->d_gammay + color_offset,
- mp->d_gammaz + color_offset,
- mp->d_rhostore + color_offset,
- mp->d_kappastore + color_offset_nonpadded);
+ mp->d_ibool + offset_nonpadded,
+ mp->d_xix + offset,mp->d_xiy + offset,mp->d_xiz + offset,
+ mp->d_etax + offset,mp->d_etay + offset,mp->d_etaz + offset,
+ mp->d_gammax + offset,mp->d_gammay + offset,mp->d_gammaz + offset,
+ mp->d_rhostore + offset,
+ mp->d_kappastore + offset_nonpadded);
// for padded and aligned arrays
- color_offset += nb_blocks_to_compute * NGLL3_PADDED;
+ offset += nb_blocks_to_compute * NGLL3_PADDED;
// for no-aligned arrays
- color_offset_nonpadded += nb_blocks_to_compute * NGLL3;
+ offset_nonpadded += nb_blocks_to_compute * NGLL3;
}
}else{
@@ -722,15 +770,9 @@
// no mesh coloring: uses atomic updates
Kernel_2_acoustic(num_elements, mp, *iphase,
mp->d_ibool,
- mp->d_xix,
- mp->d_xiy,
- mp->d_xiz,
- mp->d_etax,
- mp->d_etay,
- mp->d_etaz,
- mp->d_gammax,
- mp->d_gammay,
- mp->d_gammaz,
+ mp->d_xix,mp->d_xiy,mp->d_xiz,
+ mp->d_etax,mp->d_etay,mp->d_etaz,
+ mp->d_gammax,mp->d_gammay,mp->d_gammaz,
mp->d_rhostore,
mp->d_kappastore);
@@ -787,12 +829,12 @@
extern "C"
void FC_FUNC_(acoustic_enforce_free_surf_cuda,
- ACOUSTIC_ENFORCE_FREE_SURF_CUDA)(long* Mesh_pointer_f,
+ ACOUSTIC_ENFORCE_FREE_SURF_CUDA)(long* Mesh_pointer,
int* ABSORB_INSTEAD_OF_FREE_SURFACE) {
TRACE("acoustic_enforce_free_surf_cuda");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
// checks if anything to do
if( *ABSORB_INSTEAD_OF_FREE_SURFACE == 0 ){
@@ -800,35 +842,31 @@
// does not absorb free surface, thus we enforce the potential to be zero at surface
// block sizes
- int num_blocks_x = mp->num_free_surface_faces;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->num_free_surface_faces,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y,1);
dim3 threads(NGLL2,1,1);
// sets potentials to zero at free surface
- enforce_free_surface_cuda_kernel<<<grid,threads>>>(mp->d_potential_acoustic,
- mp->d_potential_dot_acoustic,
- mp->d_potential_dot_dot_acoustic,
- mp->num_free_surface_faces,
- mp->d_free_surface_ispec,
- mp->d_free_surface_ijk,
- mp->d_ibool,
- mp->d_ispec_is_acoustic);
+ enforce_free_surface_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_potential_acoustic,
+ mp->d_potential_dot_acoustic,
+ mp->d_potential_dot_dot_acoustic,
+ mp->num_free_surface_faces,
+ mp->d_free_surface_ispec,
+ mp->d_free_surface_ijk,
+ mp->d_ibool,
+ mp->d_ispec_is_acoustic);
// for backward/reconstructed potentials
if(mp->simulation_type == 3) {
- enforce_free_surface_cuda_kernel<<<grid,threads>>>(mp->d_b_potential_acoustic,
- mp->d_b_potential_dot_acoustic,
- mp->d_b_potential_dot_dot_acoustic,
- mp->num_free_surface_faces,
- mp->d_free_surface_ispec,
- mp->d_free_surface_ijk,
- mp->d_ibool,
- mp->d_ispec_is_acoustic);
-
+ enforce_free_surface_cuda_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_potential_acoustic,
+ mp->d_b_potential_dot_acoustic,
+ mp->d_b_potential_dot_dot_acoustic,
+ mp->num_free_surface_faces,
+ mp->d_free_surface_ispec,
+ mp->d_free_surface_ijk,
+ mp->d_ibool,
+ mp->d_ispec_is_acoustic);
}
}
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_viscoelastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_viscoelastic_cuda.cu 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/compute_forces_viscoelastic_cuda.cu 2013-08-20 14:13:26 UTC (rev 22718)
@@ -38,13 +38,38 @@
#ifdef USE_TEXTURES_FIELDS
-texture<realw, cudaTextureType1D, cudaReadModeElementType> d_displ_tex;
-texture<realw, cudaTextureType1D, cudaReadModeElementType> d_veloc_tex;
-texture<realw, cudaTextureType1D, cudaReadModeElementType> d_accel_tex;
+realw_texture d_displ_tex;
+realw_texture d_veloc_tex;
+realw_texture d_accel_tex;
+//backward/reconstructed
+realw_texture d_b_displ_tex;
+realw_texture d_b_veloc_tex;
+realw_texture d_b_accel_tex;
+
+//note: texture variables are implicitly static, and cannot be passed as arguments to cuda kernels;
+// thus, 1) we thus use if-statements (FORWARD_OR_ADJOINT) to determine from which texture to fetch from
+// 2) we use templates
+// since if-statements are a bit slower as the variable is only known at runtime, we use option 2)
+
+// templates definitions
+template<int FORWARD_OR_ADJOINT> __device__ float texfetch_displ(int x);
+template<int FORWARD_OR_ADJOINT> __device__ float texfetch_veloc(int x);
+template<int FORWARD_OR_ADJOINT> __device__ float texfetch_accel(int x);
+
+// templates for texture fetching
+// FORWARD_OR_ADJOINT == 1 <- forward arrays
+template<> __device__ float texfetch_displ<1>(int x) { return tex1Dfetch(d_displ_tex, x); }
+template<> __device__ float texfetch_veloc<1>(int x) { return tex1Dfetch(d_veloc_tex, x); }
+template<> __device__ float texfetch_accel<1>(int x) { return tex1Dfetch(d_accel_tex, x); }
+// FORWARD_OR_ADJOINT == 3 <- backward/reconstructed arrays
+template<> __device__ float texfetch_displ<3>(int x) { return tex1Dfetch(d_b_displ_tex, x); }
+template<> __device__ float texfetch_veloc<3>(int x) { return tex1Dfetch(d_b_veloc_tex, x); }
+template<> __device__ float texfetch_accel<3>(int x) { return tex1Dfetch(d_b_accel_tex, x); }
+
#endif
#ifdef USE_TEXTURES_CONSTANTS
-texture<realw, cudaTextureType1D, cudaReadModeElementType> d_hprime_xx_tex;
+realw_texture d_hprime_xx_tex;
#endif
@@ -60,16 +85,19 @@
int* d_ibool_interfaces_ext_mesh) {
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
- //int iinterface=0;
+ int ientry,iglob;
for( int iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
- if(id<d_nibool_interfaces_ext_mesh[iinterface]) {
- d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)] =
- d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)];
- d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+1] =
- d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+1];
- d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+2] =
- d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+2];
+ if( id < d_nibool_interfaces_ext_mesh[iinterface] ) {
+
+ // entry in interface array
+ ientry = id + max_nibool_interfaces_ext_mesh*iinterface;
+ // global index in wavefield
+ iglob = d_ibool_interfaces_ext_mesh[ientry] - 1;
+
+ d_send_accel_buffer[3*ientry] = d_accel[3*iglob];
+ d_send_accel_buffer[3*ientry + 1 ] = d_accel[3*iglob + 1];
+ d_send_accel_buffer[3*ientry + 2 ] = d_accel[3*iglob + 2];
}
}
@@ -81,29 +109,23 @@
// (elements on boundary)
extern "C"
void FC_FUNC_(transfer_boun_accel_from_device,
- TRANSFER_BOUN_ACCEL_FROM_DEVICE)(int* size, long* Mesh_pointer_f, realw* accel,
- realw* send_accel_buffer,
- int* num_interfaces_ext_mesh,
- int* max_nibool_interfaces_ext_mesh,
- int* nibool_interfaces_ext_mesh,
- int* ibool_interfaces_ext_mesh,
- int* FORWARD_OR_ADJOINT){
-TRACE("transfer_boun_accel_from_device");
+ TRANSFER_BOUN_ACCEL_FROM_DEVICE)(long* Mesh_pointer,
+ realw* accel,
+ realw* send_accel_buffer,
+ int* FORWARD_OR_ADJOINT){
+TRACE("\ttransfer_boun_accel_from_device");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
- if( *num_interfaces_ext_mesh == 0 ) return;
+ // checks if anything to do
+ if( mp->size_mpi_buffer > 0 ){
- if( mp->size_mpi_buffer > 0 ){
int blocksize = BLOCKSIZE_TRANSFER;
int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
@@ -113,25 +135,41 @@
// cudaEventCreate(&start);
// cudaEventCreate(&stop);
// cudaEventRecord( start, 0 );
+
if(*FORWARD_OR_ADJOINT == 1) {
prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,mp->d_send_accel_buffer,
mp->num_interfaces_ext_mesh,
mp->max_nibool_interfaces_ext_mesh,
mp->d_nibool_interfaces_ext_mesh,
mp->d_ibool_interfaces_ext_mesh);
+ // synchronizes
+ //synchronize_cuda();
+ // explicitly waits until previous compute stream finishes
+ // (cudaMemcpy implicitly synchronizes all other cuda operations)
+ cudaStreamSynchronize(mp->compute_stream);
+
+ // copies buffer from GPU to CPU host
+ print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer,
+ mp->size_mpi_buffer*sizeof(realw),cudaMemcpyDeviceToHost),97001);
+
}
else if(*FORWARD_OR_ADJOINT == 3) {
- prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel,mp->d_send_accel_buffer,
+ prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel,mp->d_b_send_accel_buffer,
mp->num_interfaces_ext_mesh,
mp->max_nibool_interfaces_ext_mesh,
mp->d_nibool_interfaces_ext_mesh,
mp->d_ibool_interfaces_ext_mesh);
+ // synchronizes
+ //synchronize_cuda();
+ // explicitly waits until previous compute stream finishes
+ // (cudaMemcpy implicitly synchronizes all other cuda operations)
+ cudaStreamSynchronize(mp->compute_stream);
+
+ // copies buffer from GPU to CPU host
+ print_CUDA_error_if_any(cudaMemcpy(send_accel_buffer,mp->d_b_send_accel_buffer,
+ mp->size_mpi_buffer*sizeof(realw),cudaMemcpyDeviceToHost),97002);
}
- // copies buffer from GPU to CPU host
- cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer,
- mp->size_mpi_buffer*sizeof(realw),cudaMemcpyDeviceToHost);
-
// finish timing of kernel+memcpy
// cudaEventRecord( stop, 0 );
// cudaEventSynchronize( stop );
@@ -155,7 +193,7 @@
// asynchronous transfer from device to host
- TRACE("transfer_boundary_from_device_a");
+ TRACE("\ttransfer_boundary_from_device_a");
Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
@@ -163,63 +201,88 @@
int blocksize = BLOCKSIZE_TRANSFER;
int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
prepare_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,mp->d_send_accel_buffer,
- mp->num_interfaces_ext_mesh,
- mp->max_nibool_interfaces_ext_mesh,
- mp->d_nibool_interfaces_ext_mesh,
- mp->d_ibool_interfaces_ext_mesh);
- // wait until kernel is finished before starting async memcpy
-#if CUDA_VERSION >= 4000
- cudaDeviceSynchronize();
-#else
- cudaThreadSynchronize();
-#endif
+ mp->num_interfaces_ext_mesh,
+ mp->max_nibool_interfaces_ext_mesh,
+ mp->d_nibool_interfaces_ext_mesh,
+ mp->d_ibool_interfaces_ext_mesh);
+ // waits until kernel is finished before starting async memcpy
+ //synchronize_cuda();
+ // waits until previous compute stream finishes
+ cudaStreamSynchronize(mp->compute_stream);
cudaMemcpyAsync(mp->h_send_accel_buffer,mp->d_send_accel_buffer,
- mp->size_mpi_buffer*sizeof(realw),cudaMemcpyDeviceToHost,mp->copy_stream);
+ mp->size_mpi_buffer*sizeof(realw),cudaMemcpyDeviceToHost,mp->copy_stream);
}
}
/* ----------------------------------------------------------------------------------------------- */
+extern "C"
+void FC_FUNC_(transfer_boundary_to_device_a,
+ TRANSFER_BOUNDARY_TO_DEVICE_A)(long* Mesh_pointer,
+ realw* buffer_recv_vector_ext_mesh,
+ int* num_interfaces_ext_mesh,
+ int* max_nibool_interfaces_ext_mesh) {
+
+// asynchronous transfer from host to device
+
+ TRACE("transfer_boundary_to_device_a");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+ if( mp->size_mpi_buffer > 0 ){
+ // copy on host memory
+ memcpy(mp->h_recv_accel_buffer,buffer_recv_vector_ext_mesh,mp->size_mpi_buffer*sizeof(realw));
+
+ // asynchronous copy to GPU using copy_stream
+ cudaMemcpyAsync(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
+ mp->size_mpi_buffer*sizeof(realw),cudaMemcpyHostToDevice,mp->copy_stream);
+ }
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Assembly
+
+/* ----------------------------------------------------------------------------------------------- */
+
__global__ void assemble_boundary_accel_on_device(realw* d_accel, realw* d_send_accel_buffer,
int num_interfaces_ext_mesh,
int max_nibool_interfaces_ext_mesh,
int* d_nibool_interfaces_ext_mesh,
int* d_ibool_interfaces_ext_mesh) {
- int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
//int bx = blockIdx.y*gridDim.x+blockIdx.x;
//int tx = threadIdx.x;
- //int iinterface=0;
+ int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
+ int ientry,iglob;
+
for( int iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
- if(id < d_nibool_interfaces_ext_mesh[iinterface]) {
+ if( id < d_nibool_interfaces_ext_mesh[iinterface] ) {
+ // entry in interface array
+ ientry = id + max_nibool_interfaces_ext_mesh*iinterface;
+ // global index in wavefield
+ iglob = d_ibool_interfaces_ext_mesh[ientry] - 1;
+
// for testing atomic operations against not atomic operations (0.1ms vs. 0.04 ms)
- // d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)] +=
- // d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)];
- // d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+1] +=
- // d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+1];
- // d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+2] +=
- // d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+2];
+ // d_accel[3*(iglob)] += d_send_accel_buffer[3*(ientry)];
+ // d_accel[3*(iglob)+1] += d_send_accel_buffer[3*(ientry)+1];
+ // d_accel[3*(iglob)+2] += d_send_accel_buffer[3*(ientry)+2];
-
- atomicAdd(&d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)],
- d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)]);
- atomicAdd(&d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+1],
- d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+1]);
- atomicAdd(&d_accel[3*(d_ibool_interfaces_ext_mesh[id+max_nibool_interfaces_ext_mesh*iinterface]-1)+2],
- d_send_accel_buffer[3*(id + max_nibool_interfaces_ext_mesh*iinterface)+2]);
+ atomicAdd(&d_accel[3*iglob],d_send_accel_buffer[3*ientry]);
+ atomicAdd(&d_accel[3*iglob + 1],d_send_accel_buffer[3*ientry + 1]);
+ atomicAdd(&d_accel[3*iglob + 2],d_send_accel_buffer[3*ientry + 2]);
}
}
// ! This step is done via previous function transfer_and_assemble...
@@ -231,31 +294,85 @@
// ! enddo
}
+
/* ----------------------------------------------------------------------------------------------- */
+// FORWARD_OR_ADJOINT == 1 for accel, and == 3 for b_accel
extern "C"
-void FC_FUNC_(transfer_boundary_to_device_a,
- TRANSFER_BOUNDARY_TO_DEVICE_A)(long* Mesh_pointer,
- realw* buffer_recv_vector_ext_mesh,
- int* num_interfaces_ext_mesh,
- int* max_nibool_interfaces_ext_mesh) {
+void FC_FUNC_(transfer_asmbl_accel_to_device,
+ TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer, realw* accel,
+ realw* buffer_recv_vector_ext_mesh,
+ int* num_interfaces_ext_mesh,
+ int* max_nibool_interfaces_ext_mesh,
+ int* nibool_interfaces_ext_mesh,
+ int* ibool_interfaces_ext_mesh,
+ int* FORWARD_OR_ADJOINT) {
+TRACE("\ttransfer_asmbl_accel_to_device");
-// asynchronous transfer from host to device
-
- TRACE("transfer_boundary_to_device_a");
-
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
if( mp->size_mpi_buffer > 0 ){
- // copy on host memory
- memcpy(mp->h_recv_accel_buffer,buffer_recv_vector_ext_mesh,mp->size_mpi_buffer*sizeof(realw));
- // cudaMemcpyAsync(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
- // mp->size_mpi_buffer*sizeof(realw),cudaMemcpyHostToDevice,mp->compute_stream);
- //printf("xfer to device\n");
- cudaMemcpyAsync(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
- mp->size_mpi_buffer*sizeof(realw),cudaMemcpyHostToDevice,mp->copy_stream);
+ //daniel: todo - check if this copy is only needed for adjoint simulation, otherwise it is called asynchronously?
+ if(*FORWARD_OR_ADJOINT == 1 ){
+ // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
+ cudaStreamSynchronize(mp->copy_stream);
+ }
+ else if(*FORWARD_OR_ADJOINT == 3 ){
+ // explicitly synchronizes
+ // (cudaMemcpy implicitly synchronizes all other cuda operations)
+ synchronize_cuda();
+
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_send_accel_buffer, buffer_recv_vector_ext_mesh,
+ mp->size_mpi_buffer*sizeof(realw),cudaMemcpyHostToDevice),97001);
+ }
+
+ int blocksize = BLOCKSIZE_TRANSFER;
+ int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
+
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
+
+ //double start_time = get_time();
+ // cudaEvent_t start, stop;
+ // realw time;
+ // cudaEventCreate(&start);
+ // cudaEventCreate(&stop);
+ // cudaEventRecord( start, 0 );
+
+ if(*FORWARD_OR_ADJOINT == 1) {
+ //assemble forward accel
+ assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel, mp->d_send_accel_buffer,
+ mp->num_interfaces_ext_mesh,
+ mp->max_nibool_interfaces_ext_mesh,
+ mp->d_nibool_interfaces_ext_mesh,
+ mp->d_ibool_interfaces_ext_mesh);
+ }
+ else if(*FORWARD_OR_ADJOINT == 3) {
+ //assemble adjoint accel
+ assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel, mp->d_b_send_accel_buffer,
+ mp->num_interfaces_ext_mesh,
+ mp->max_nibool_interfaces_ext_mesh,
+ mp->d_nibool_interfaces_ext_mesh,
+ mp->d_ibool_interfaces_ext_mesh);
+ }
+
+ // cudaEventRecord( stop, 0 );
+ // cudaEventSynchronize( stop );
+ // cudaEventElapsedTime( &time, start, stop );
+ // cudaEventDestroy( start );
+ // cudaEventDestroy( stop );
+ // printf("Boundary Assemble Kernel Execution Time: %f ms\n",time);
}
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ //double end_time = get_time();
+ //printf("Elapsed time: %e\n",end_time-start_time);
+ exit_on_cuda_error("transfer_asmbl_accel_to_device");
+#endif
}
/* ----------------------------------------------------------------------------------------------- */
@@ -277,13 +394,10 @@
//
// int blocksize = BLOCKSIZE_TRANSFER;
// int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
-// int num_blocks_x = size_padded/blocksize;
-// int num_blocks_y = 1;
-// while(num_blocks_x > 65535) {
-// num_blocks_x = (int) ceil(num_blocks_x*0.5f);
-// num_blocks_y = num_blocks_y*2;
-// }
//
+// int num_blocks_x, num_blocks_y;
+// get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+//
// //double start_time = get_time();
// dim3 grid(num_blocks_x,num_blocks_y);
// dim3 threads(blocksize,1,1);
@@ -323,86 +437,12 @@
//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
// //double end_time = get_time();
// //printf("Elapsed time: %e\n",end_time-start_time);
-// exit_on_cuda_error("transfer_asmbl_accel_to_device");
+// exit_on_cuda_error("assemble_accel_on_device");
//#endif
//}
-/* ----------------------------------------------------------------------------------------------- */
-// FORWARD_OR_ADJOINT == 1 for accel, and == 3 for b_accel
-extern "C"
-void FC_FUNC_(transfer_asmbl_accel_to_device,
- TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer, realw* accel,
- realw* buffer_recv_vector_ext_mesh,
- int* num_interfaces_ext_mesh,
- int* max_nibool_interfaces_ext_mesh,
- int* nibool_interfaces_ext_mesh,
- int* ibool_interfaces_ext_mesh,
- int* FORWARD_OR_ADJOINT) {
-TRACE("transfer_asmbl_accel_to_device");
- Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
-
- if( mp->size_mpi_buffer > 0 ){
-
- //daniel: todo - check if this copy is only needed for adjoint simulation, otherwise it is called asynchronously?
- if(*FORWARD_OR_ADJOINT == 1 ){
- // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
- cudaStreamSynchronize(mp->copy_stream);
- }
- else if(*FORWARD_OR_ADJOINT == 3 ){
- cudaMemcpy(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
- mp->size_mpi_buffer*sizeof(realw),cudaMemcpyHostToDevice);
- }
-
- int blocksize = BLOCKSIZE_TRANSFER;
- int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
-
- //double start_time = get_time();
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(blocksize,1,1);
- // cudaEvent_t start, stop;
- // realw time;
- // cudaEventCreate(&start);
- // cudaEventCreate(&stop);
- // cudaEventRecord( start, 0 );
- if(*FORWARD_OR_ADJOINT == 1) { //assemble forward accel
- assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel, mp->d_send_accel_buffer,
- mp->num_interfaces_ext_mesh,
- mp->max_nibool_interfaces_ext_mesh,
- mp->d_nibool_interfaces_ext_mesh,
- mp->d_ibool_interfaces_ext_mesh);
- }
- else if(*FORWARD_OR_ADJOINT == 3) { //assemble adjoint accel
- assemble_boundary_accel_on_device<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_accel, mp->d_send_accel_buffer,
- mp->num_interfaces_ext_mesh,
- mp->max_nibool_interfaces_ext_mesh,
- mp->d_nibool_interfaces_ext_mesh,
- mp->d_ibool_interfaces_ext_mesh);
- }
-
- // cudaEventRecord( stop, 0 );
- // cudaEventSynchronize( stop );
- // cudaEventElapsedTime( &time, start, stop );
- // cudaEventDestroy( start );
- // cudaEventDestroy( stop );
- // printf("Boundary Assemble Kernel Execution Time: %f ms\n",time);
- }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- //double end_time = get_time();
- //printf("Elapsed time: %e\n",end_time-start_time);
- exit_on_cuda_error("transfer_asmbl_accel_to_device");
-#endif
-}
-
-
/* ----------------------------------------------------------------------------------------------- */
// KERNEL 2
@@ -438,17 +478,10 @@
// updates stress
__device__ void compute_element_att_stress(int tx,int working_element,int NSPEC,
- realw* R_xx,
- realw* R_yy,
- realw* R_xy,
- realw* R_xz,
- realw* R_yz,
- realw* sigma_xx,
- realw* sigma_yy,
- realw* sigma_zz,
- realw* sigma_xy,
- realw* sigma_xz,
- realw* sigma_yz) {
+ realw* R_xx,realw* R_yy,realw* R_xy,
+ realw* R_xz,realw* R_yz,
+ realw* sigma_xx,realw* sigma_yy,realw* sigma_zz,
+ realw* sigma_xy,realw* sigma_xz,realw* sigma_yz) {
int i_sls,offset_sls;
realw R_xx_val,R_yy_val;
@@ -650,13 +683,18 @@
/* ----------------------------------------------------------------------------------------------- */
+/*
+
+// unused
+// original elastic kernel, please leave this code here for reference...
+
__global__ void Kernel_2_impl(int nb_blocks_to_compute,
int NGLOB,
int* d_ibool,
int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic,
int d_iphase,
int use_mesh_coloring_gpu,
- realw d_deltat,
+ realw d_deltat,
realw* d_displ,realw* d_veloc,realw* d_accel,
realw* d_xix, realw* d_xiy, realw* d_xiz,
realw* d_etax, realw* d_etay, realw* d_etaz,
@@ -676,36 +714,20 @@
realw* R_xx, realw* R_yy, realw* R_xy, realw* R_xz, realw* R_yz,
realw* alphaval,realw* betaval,realw* gammaval,
int ANISOTROPY,
- realw* d_c11store,
- realw* d_c12store,
- realw* d_c13store,
- realw* d_c14store,
- realw* d_c15store,
- realw* d_c16store,
- realw* d_c22store,
- realw* d_c23store,
- realw* d_c24store,
- realw* d_c25store,
- realw* d_c26store,
- realw* d_c33store,
- realw* d_c34store,
- realw* d_c35store,
- realw* d_c36store,
- realw* d_c44store,
- realw* d_c45store,
- realw* d_c46store,
- realw* d_c55store,
- realw* d_c56store,
- realw* d_c66store,
+ realw* d_c11store,realw* d_c12store,realw* d_c13store,
+ realw* d_c14store,realw* d_c15store,realw* d_c16store,
+ realw* d_c22store,realw* d_c23store,realw* d_c24store,
+ realw* d_c25store,realw* d_c26store,realw* d_c33store,
+ realw* d_c34store,realw* d_c35store,realw* d_c36store,
+ realw* d_c44store,realw* d_c45store,realw* d_c46store,
+ realw* d_c55store,realw* d_c56store,realw* d_c66store,
int gravity,
realw* d_minus_g,
realw* d_minus_deriv_gravity,
realw* d_rhostore,
realw* wgll_cube){
- /* int bx = blockIdx.y*blockDim.x+blockIdx.x; //possible bug in original code*/
- int bx = blockIdx.y*gridDim.x+blockIdx.x;
- /* int bx = blockIdx.x; */
+ int bx = blockIdx.y*gridDim.x + blockIdx.x;
int tx = threadIdx.x;
const int NGLL3_ALIGN = NGLL3_PADDED;
@@ -784,7 +806,6 @@
}
#endif
- // iglob = d_ibool[working_element*NGLL3_ALIGN + tx]-1;
iglob = d_ibool[working_element*NGLL3 + tx]-1;
#ifdef USE_TEXTURES_FIELDS
@@ -1097,13 +1118,13 @@
// computes deviatoric strain attenuation and/or for kernel calculations
if(COMPUTE_AND_STORE_STRAIN) {
realw templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
- /*
- epsilondev_xx[offset] = duxdxl - templ;
- epsilondev_yy[offset] = duydyl - templ;
- epsilondev_xy[offset] = 0.5f * duxdyl_plus_duydxl;
- epsilondev_xz[offset] = 0.5f * duzdxl_plus_duxdzl;
- epsilondev_yz[offset] = 0.5f * duzdyl_plus_duydzl;
- */
+
+ // epsilondev_xx[offset] = duxdxl - templ;
+ // epsilondev_yy[offset] = duydyl - templ;
+ // epsilondev_xy[offset] = 0.5f * duxdyl_plus_duydxl;
+ // epsilondev_xz[offset] = 0.5f * duzdxl_plus_duxdzl;
+ // epsilondev_yz[offset] = 0.5f * duzdyl_plus_duydzl;
+
// local storage: stresses at this current time step
epsilondev_xx_loc = duxdxl - templ;
epsilondev_yy_loc = duydyl - templ;
@@ -1415,70 +1436,1352 @@
} // kernel_2_impl()
+*/
+
/* ----------------------------------------------------------------------------------------------- */
+// note: kernel_2 is split into two kernels:
+// - a kernel without attenuation Kernel_2_noatt_impl() and
+// - a kernel including attenuation Kernel_2_att_impl()
+// this separation should help with performance
+
+
+// kernel without attenuation
+//
+// we use templates to distinguish between calls with forward or adjoint texture fields
+
+template<int FORWARD_OR_ADJOINT> __global__ void Kernel_2_noatt_impl(int nb_blocks_to_compute,
+ int NGLOB,
+ int* d_ibool,
+ int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic,
+ int d_iphase,
+ int use_mesh_coloring_gpu,
+ realw* d_displ,realw* d_veloc,realw* d_accel,
+ realw* d_xix, realw* d_xiy, realw* d_xiz,
+ realw* d_etax, realw* d_etay, realw* d_etaz,
+ realw* d_gammax, realw* d_gammay, realw* d_gammaz,
+ realw* d_hprime_xx,
+ realw* d_hprimewgll_xx,
+ realw* d_wgllwgll_xy,realw* d_wgllwgll_xz,realw* d_wgllwgll_yz,
+ realw* d_kappav, realw* d_muv,
+ int COMPUTE_AND_STORE_STRAIN,
+ realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+ realw* epsilondev_xz,realw* epsilondev_yz,
+ realw* epsilon_trace_over_3,
+ int SIMULATION_TYPE,
+ int NSPEC,
+ realw* one_minus_sum_beta,realw* factor_common,
+ realw* R_xx, realw* R_yy, realw* R_xy, realw* R_xz, realw* R_yz,
+ realw* alphaval,realw* betaval,realw* gammaval,
+ int ANISOTROPY,
+ realw* d_c11store,realw* d_c12store,realw* d_c13store,
+ realw* d_c14store,realw* d_c15store,realw* d_c16store,
+ realw* d_c22store,realw* d_c23store,realw* d_c24store,
+ realw* d_c25store,realw* d_c26store,realw* d_c33store,
+ realw* d_c34store,realw* d_c35store,realw* d_c36store,
+ realw* d_c44store,realw* d_c45store,realw* d_c46store,
+ realw* d_c55store,realw* d_c56store,realw* d_c66store,
+ int gravity,
+ realw* d_minus_g,
+ realw* d_minus_deriv_gravity,
+ realw* d_rhostore,
+ realw* wgll_cube ){
+
+// elastic compute kernel without attenuation
+// holds for: ATTENUATION = .false.
+// COMPUTE_AND_STORE_STRAIN = .true. or .false. (true for kernel simulations)
+
+ int bx = blockIdx.y*gridDim.x+blockIdx.x;
+ int tx = threadIdx.x;
+
+ const int NGLL3_ALIGN = NGLL3_PADDED;
+
+ int K = (tx/NGLL2);
+ int J = ((tx-K*NGLL2)/NGLLX);
+ int I = (tx-K*NGLL2-J*NGLLX);
+
+ int active,offset;
+ int iglob = 0;
+ int working_element;
+
+ realw tempx1l,tempx2l,tempx3l,tempy1l,tempy2l,tempy3l,tempz1l,tempz2l,tempz3l;
+ realw xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
+ realw duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl;
+ realw duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl;
+ realw duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl;
+
+ realw fac1,fac2,fac3,lambdal,mul,lambdalplus2mul,kappal;
+ realw sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz;
+ realw epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc;
+
+ realw c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66;
+ realw sum_terms1,sum_terms2,sum_terms3;
+
+ // gravity variables
+ realw sigma_yx,sigma_zx,sigma_zy;
+ realw rho_s_H1,rho_s_H2,rho_s_H3;
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+ int l;
+ realw hp1,hp2,hp3;
+#endif
+
+ __shared__ realw s_dummyx_loc[NGLL3];
+ __shared__ realw s_dummyy_loc[NGLL3];
+ __shared__ realw s_dummyz_loc[NGLL3];
+
+ __shared__ realw s_tempx1[NGLL3];
+ __shared__ realw s_tempx2[NGLL3];
+ __shared__ realw s_tempx3[NGLL3];
+
+ __shared__ realw s_tempy1[NGLL3];
+ __shared__ realw s_tempy2[NGLL3];
+ __shared__ realw s_tempy3[NGLL3];
+
+ __shared__ realw s_tempz1[NGLL3];
+ __shared__ realw s_tempz2[NGLL3];
+ __shared__ realw s_tempz3[NGLL3];
+
+ __shared__ realw sh_hprime_xx[NGLL2];
+
+// use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads,
+// because we used memory padding from NGLL^3 = 125 to 128 to get coalescent memory accesses
+ active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0;
+
+// copy from global memory to shared memory
+// each thread writes one of the NGLL^3 = 125 data points
+ if (active) {
+
+#ifdef USE_MESH_COLORING_GPU
+ working_element = bx;
+#else
+ //mesh coloring
+ if( use_mesh_coloring_gpu ){
+ working_element = bx;
+ }else{
+ // iphase-1 and working_element-1 for Fortran->C array conventions
+ working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)]-1;
+ }
+#endif
+
+ iglob = d_ibool[working_element*NGLL3 + tx]-1;
+ // debug
+ //if( iglob < 0 || iglob >= NGLOB ){ printf("wrong iglob %d\n",iglob); }
+
+#ifdef USE_TEXTURES_FIELDS
+ s_dummyx_loc[tx] = texfetch_displ<FORWARD_OR_ADJOINT>(iglob*3);
+ s_dummyy_loc[tx] = texfetch_displ<FORWARD_OR_ADJOINT>(iglob*3 + 1);
+ s_dummyz_loc[tx] = texfetch_displ<FORWARD_OR_ADJOINT>(iglob*3 + 2);
+#else
+ // changing iglob indexing to match fortran row changes fast style
+ s_dummyx_loc[tx] = d_displ[iglob*3];
+ s_dummyy_loc[tx] = d_displ[iglob*3 + 1];
+ s_dummyz_loc[tx] = d_displ[iglob*3 + 2];
+#endif
+ }
+
+ // JC JC here we will need to add GPU support for the new C-PML routines
+
+ if (tx < NGLL2) {
+#ifdef USE_TEXTURES_CONSTANTS
+ sh_hprime_xx[tx] = tex1Dfetch(d_hprime_xx_tex,tx);
+#else
+ sh_hprime_xx[tx] = d_hprime_xx[tx];
+#endif
+ }
+
+// synchronize all the threads (one thread for each of the NGLL grid points of the
+// current spectral element) because we need the whole element to be ready in order
+// to be able to compute the matrix products along cut planes of the 3D element below
+ __syncthreads();
+
+ if (active) {
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+
+ tempx1l = 0.f;
+ tempx2l = 0.f;
+ tempx3l = 0.f;
+
+ tempy1l = 0.f;
+ tempy2l = 0.f;
+ tempy3l = 0.f;
+
+ tempz1l = 0.f;
+ tempz2l = 0.f;
+ tempz3l = 0.f;
+
+ for (l=0;l<NGLLX;l++) {
+ hp1 = sh_hprime_xx[l*NGLLX+I];
+ offset = K*NGLL2+J*NGLLX+l;
+ tempx1l += s_dummyx_loc[offset]*hp1;
+ tempy1l += s_dummyy_loc[offset]*hp1;
+ tempz1l += s_dummyz_loc[offset]*hp1;
+
+ //assumes that hprime_xx = hprime_yy = hprime_zz
+ hp2 = sh_hprime_xx[l*NGLLX+J];
+ offset = K*NGLL2+l*NGLLX+I;
+ tempx2l += s_dummyx_loc[offset]*hp2;
+ tempy2l += s_dummyy_loc[offset]*hp2;
+ tempz2l += s_dummyz_loc[offset]*hp2;
+
+ hp3 = sh_hprime_xx[l*NGLLX+K];
+ offset = l*NGLL2+J*NGLLX+I;
+ tempx3l += s_dummyx_loc[offset]*hp3;
+ tempy3l += s_dummyy_loc[offset]*hp3;
+ tempz3l += s_dummyz_loc[offset]*hp3;
+ }
+
+ // JC JC here we will need to add GPU support for the new C-PML routines
+
+#else
+
+ tempx1l = s_dummyx_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyx_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyx_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyx_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyx_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+ tempy1l = s_dummyy_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyy_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyy_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyy_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyy_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+ tempz1l = s_dummyz_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyz_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyz_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyz_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyz_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+ tempx2l = s_dummyx_loc[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyx_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyx_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyx_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyx_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+ tempy2l = s_dummyy_loc[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyy_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyy_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyy_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyy_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+ tempz2l = s_dummyz_loc[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyz_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyz_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyz_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyz_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+ tempx3l = s_dummyx_loc[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyx_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyx_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyx_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyx_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+ tempy3l = s_dummyy_loc[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyy_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyy_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyy_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyy_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+ tempz3l = s_dummyz_loc[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyz_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyz_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyz_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyz_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+ // JC JC here we will need to add GPU support for the new C-PML routines
+
+#endif
+
+// compute derivatives of ux, uy and uz with respect to x, y and z
+ offset = working_element*NGLL3_ALIGN + tx;
+
+ xixl = d_xix[offset];
+ xiyl = d_xiy[offset];
+ xizl = d_xiz[offset];
+ etaxl = d_etax[offset];
+ etayl = d_etay[offset];
+ etazl = d_etaz[offset];
+ gammaxl = d_gammax[offset];
+ gammayl = d_gammay[offset];
+ gammazl = d_gammaz[offset];
+
+ duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l;
+ duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l;
+ duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l;
+
+ duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l;
+ duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l;
+ duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l;
+
+ duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l;
+ duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l;
+ duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l;
+
+ // JC JC here we will need to add GPU support for the new C-PML routines
+
+ // precompute some sums to save CPU time
+ duxdxl_plus_duydyl = duxdxl + duydyl;
+ duxdxl_plus_duzdzl = duxdxl + duzdzl;
+ duydyl_plus_duzdzl = duydyl + duzdzl;
+ duxdyl_plus_duydxl = duxdyl + duydxl;
+ duzdxl_plus_duxdzl = duzdxl + duxdzl;
+ duzdyl_plus_duydzl = duzdyl + duydzl;
+
+ // JC JC here we will need to add GPU support for the new C-PML routines
+
+ // computes deviatoric strain for kernel calculations
+ if(COMPUTE_AND_STORE_STRAIN) {
+ realw templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
+ // local storage: stresses at this current time step
+ epsilondev_xx_loc = duxdxl - templ;
+ epsilondev_yy_loc = duydyl - templ;
+ epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl;
+ epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl;
+ epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl;
+
+ if(SIMULATION_TYPE == 3) {
+ epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
+ }
+ }
+
+ // compute elements with an elastic isotropic rheology
+ kappal = d_kappav[offset];
+ mul = d_muv[offset];
+
+ // full anisotropic case, stress calculations
+ if(ANISOTROPY){
+
+ c11 = d_c11store[offset];
+ c12 = d_c12store[offset];
+ c13 = d_c13store[offset];
+ c14 = d_c14store[offset];
+ c15 = d_c15store[offset];
+ c16 = d_c16store[offset];
+ c22 = d_c22store[offset];
+ c23 = d_c23store[offset];
+ c24 = d_c24store[offset];
+ c25 = d_c25store[offset];
+ c26 = d_c26store[offset];
+ c33 = d_c33store[offset];
+ c34 = d_c34store[offset];
+ c35 = d_c35store[offset];
+ c36 = d_c36store[offset];
+ c44 = d_c44store[offset];
+ c45 = d_c45store[offset];
+ c46 = d_c46store[offset];
+ c55 = d_c55store[offset];
+ c56 = d_c56store[offset];
+ c66 = d_c66store[offset];
+
+ sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl +
+ c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl;
+ sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl +
+ c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl;
+ sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl +
+ c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl;
+ sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl +
+ c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl;
+ sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl +
+ c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl;
+ sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl +
+ c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl;
+
+ }else{
+
+ // isotropic case
+
+ lambdalplus2mul = kappal + 1.33333333333333333333f * mul; // 4./3. = 1.3333333
+ lambdal = lambdalplus2mul - 2.0f * mul;
+
+ // compute the six components of the stress tensor sigma
+ sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl;
+ sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl;
+ sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl;
+
+ sigma_xy = mul*duxdyl_plus_duydxl;
+ sigma_xz = mul*duzdxl_plus_duxdzl;
+ sigma_yz = mul*duzdyl_plus_duydzl;
+ }
+
+ jacobianl = 1.0f / (xixl*(etayl*gammazl-etazl*gammayl)-xiyl*(etaxl*gammazl-etazl*gammaxl)+xizl*(etaxl*gammayl-etayl*gammaxl));
+
+ // define symmetric components (needed for non-symmetric dot product and sigma for gravity)
+ sigma_yx = sigma_xy;
+ sigma_zx = sigma_xz;
+ sigma_zy = sigma_yz;
+
+ if( gravity ){
+ // computes non-symmetric terms for gravity
+ compute_element_gravity(tx,working_element,d_ibool,d_minus_g,d_minus_deriv_gravity,
+ d_rhostore,wgll_cube,jacobianl,
+ s_dummyx_loc,s_dummyy_loc,s_dummyz_loc,
+ &sigma_xx,&sigma_yy,&sigma_xz,&sigma_yz,
+ &rho_s_H1,&rho_s_H2,&rho_s_H3);
+ }
+
+ // form dot product with test vector, non-symmetric form
+ s_tempx1[tx] = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl);
+ s_tempy1[tx] = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl);
+ s_tempz1[tx] = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl);
+
+ s_tempx2[tx] = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl);
+ s_tempy2[tx] = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl);
+ s_tempz2[tx] = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl);
+
+ s_tempx3[tx] = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl);
+ s_tempy3[tx] = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl);
+ s_tempz3[tx] = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl);
+
+ }
+
+// synchronize all the threads (one thread for each of the NGLL grid points of the
+// current spectral element) because we need the whole element to be ready in order
+// to be able to compute the matrix products along cut planes of the 3D element below
+ __syncthreads();
+
+ // JC JC here we will need to add GPU support for the new C-PML routines
+
+ if (active) {
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+
+ tempx1l = 0.f;
+ tempy1l = 0.f;
+ tempz1l = 0.f;
+
+ tempx2l = 0.f;
+ tempy2l = 0.f;
+ tempz2l = 0.f;
+
+ tempx3l = 0.f;
+ tempy3l = 0.f;
+ tempz3l = 0.f;
+
+ for (l=0;l<NGLLX;l++) {
+ fac1 = d_hprimewgll_xx[I*NGLLX+l];
+ offset = K*NGLL2+J*NGLLX+l;
+ tempx1l += s_tempx1[offset]*fac1;
+ tempy1l += s_tempy1[offset]*fac1;
+ tempz1l += s_tempz1[offset]*fac1;
+
+ // assumes hprimewgll_xx == hprimewgll_yy == hprimewgll_zz
+ fac2 = d_hprimewgll_xx[J*NGLLX+l];
+ offset = K*NGLL2+l*NGLLX+I;
+ tempx2l += s_tempx2[offset]*fac2;
+ tempy2l += s_tempy2[offset]*fac2;
+ tempz2l += s_tempz2[offset]*fac2;
+
+ fac3 = d_hprimewgll_xx[K*NGLLX+l];
+ offset = l*NGLL2+J*NGLLX+I;
+ tempx3l += s_tempx3[offset]*fac3;
+ tempy3l += s_tempy3[offset]*fac3;
+ tempz3l += s_tempz3[offset]*fac3;
+ }
+
+#else
+
+ tempx1l = s_tempx1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+ + s_tempx1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+ + s_tempx1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+ + s_tempx1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+ + s_tempx1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+
+ tempy1l = s_tempy1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+ + s_tempy1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+ + s_tempy1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+ + s_tempy1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+ + s_tempy1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+
+ tempz1l = s_tempz1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+ + s_tempz1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+ + s_tempz1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+ + s_tempz1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+ + s_tempz1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+
+ tempx2l = s_tempx2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
+ + s_tempx2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
+ + s_tempx2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
+ + s_tempx2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
+ + s_tempx2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
+
+ tempy2l = s_tempy2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
+ + s_tempy2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
+ + s_tempy2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
+ + s_tempy2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
+ + s_tempy2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
+
+ tempz2l = s_tempz2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
+ + s_tempz2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
+ + s_tempz2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
+ + s_tempz2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
+ + s_tempz2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
+
+ tempx3l = s_tempx3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
+ + s_tempx3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
+ + s_tempx3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
+ + s_tempx3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
+ + s_tempx3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
+
+ tempy3l = s_tempy3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
+ + s_tempy3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
+ + s_tempy3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
+ + s_tempy3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
+ + s_tempy3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
+
+ tempz3l = s_tempz3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
+ + s_tempz3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
+ + s_tempz3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
+ + s_tempz3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
+ + s_tempz3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
+
+#endif
+
+ fac1 = d_wgllwgll_yz[K*NGLLX+J];
+ fac2 = d_wgllwgll_xz[K*NGLLX+I];
+ fac3 = d_wgllwgll_xy[J*NGLLX+I];
+
+ sum_terms1 = - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
+ sum_terms2 = - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
+ sum_terms3 = - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
+
+ // adds gravity term
+ if( gravity ){
+ sum_terms1 += rho_s_H1;
+ sum_terms2 += rho_s_H2;
+ sum_terms3 += rho_s_H3;
+ }
+
+#ifdef USE_MESH_COLORING_GPU
+ // no atomic operation needed, colors don't share global points between elements
+
+#ifdef USE_TEXTURES_FIELDS
+ d_accel[iglob*3] = texfetch_accel<FORWARD_OR_ADJOINT>(iglob*3) + sum_terms1;
+ d_accel[iglob*3 + 1] = texfetch_accel<FORWARD_OR_ADJOINT>(iglob*3 + 1) + sum_terms2;
+ d_accel[iglob*3 + 2] = texfetch_accel<FORWARD_OR_ADJOINT>(iglob*3 + 2) + sum_terms3;
+#else
+ d_accel[iglob*3] += sum_terms1;
+ d_accel[iglob*3 + 1] += sum_terms2;
+ d_accel[iglob*3 + 2] += sum_terms3;
+#endif // USE_TEXTURES_FIELDS
+
+ // JC JC here we will need to add GPU support for the new C-PML routines
+
+#else // MESH_COLORING
+
+ //mesh coloring
+ if( use_mesh_coloring_gpu ){
+
+ // no atomic operation needed, colors don't share global points between elements
+#ifdef USE_TEXTURES_FIELDS
+ d_accel[iglob*3] = texfetch_accel<FORWARD_OR_ADJOINT>(iglob*3) + sum_terms1;
+ d_accel[iglob*3 + 1] = texfetch_accel<FORWARD_OR_ADJOINT>(iglob*3 + 1) + sum_terms2;
+ d_accel[iglob*3 + 2] = texfetch_accel<FORWARD_OR_ADJOINT>(iglob*3 + 2) + sum_terms3;
+#else
+ d_accel[iglob*3] += sum_terms1;
+ d_accel[iglob*3 + 1] += sum_terms2;
+ d_accel[iglob*3 + 2] += sum_terms3;
+#endif // USE_TEXTURES_FIELDS
+
+ }else {
+
+ // for testing purposes only: w/out atomic updates
+ //d_accel[iglob*3] -= (0.00000001f*tempx1l + 0.00000001f*tempx2l + 0.00000001f*tempx3l);
+ //d_accel[iglob*3 + 1] -= (0.00000001f*tempy1l + 0.00000001f*tempy2l + 0.00000001f*tempy3l);
+ //d_accel[iglob*3 + 2] -= (0.00000001f*tempz1l + 0.00000001f*tempz2l + 0.00000001f*tempz3l);
+ // w/out atomic update
+ //d_accel[iglob*3] += sum_terms1;
+ //d_accel[iglob*3 + 1] += sum_terms2;
+ //d_accel[iglob*3 + 2] += sum_terms3;
+
+ atomicAdd(&d_accel[iglob*3], sum_terms1);
+ atomicAdd(&d_accel[iglob*3+1], sum_terms2);
+ atomicAdd(&d_accel[iglob*3+2], sum_terms3);
+
+ } // if(use_mesh_coloring_gpu)
+
+#endif // MESH_COLORING
+
+ // save deviatoric strain for Runge-Kutta scheme
+ if( COMPUTE_AND_STORE_STRAIN ){
+ int ijk_ispec = tx + working_element*NGLL3;
+
+ // fortran: epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+ epsilondev_xx[ijk_ispec] = epsilondev_xx_loc;
+ epsilondev_yy[ijk_ispec] = epsilondev_yy_loc;
+ epsilondev_xy[ijk_ispec] = epsilondev_xy_loc;
+ epsilondev_xz[ijk_ispec] = epsilondev_xz_loc;
+ epsilondev_yz[ijk_ispec] = epsilondev_yz_loc;
+ }
+
+ } // if(active)
+
+ // JC JC here we will need to add GPU support for the new C-PML routines
+
+} // kernel_2_noatt_impl()
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// kernel with attenuation
+//
+// we use templates to distinguish between calls with forward or adjoint texture fields
+
+template<int FORWARD_OR_ADJOINT> __global__ void Kernel_2_att_impl(int nb_blocks_to_compute,
+ int NGLOB,
+ int* d_ibool,
+ int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic,
+ int d_iphase,
+ int use_mesh_coloring_gpu,
+ realw d_deltat,
+ realw* d_displ,realw* d_veloc,realw* d_accel,
+ realw* d_xix, realw* d_xiy, realw* d_xiz,
+ realw* d_etax, realw* d_etay, realw* d_etaz,
+ realw* d_gammax, realw* d_gammay, realw* d_gammaz,
+ realw* d_hprime_xx,
+ realw* d_hprimewgll_xx,
+ realw* d_wgllwgll_xy,realw* d_wgllwgll_xz,realw* d_wgllwgll_yz,
+ realw* d_kappav, realw* d_muv,
+ realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+ realw* epsilondev_xz,realw* epsilondev_yz,
+ realw* epsilon_trace_over_3,
+ int SIMULATION_TYPE,
+ int NSPEC,
+ realw* one_minus_sum_beta,realw* factor_common,
+ realw* R_xx, realw* R_yy, realw* R_xy, realw* R_xz, realw* R_yz,
+ realw* alphaval,realw* betaval,realw* gammaval,
+ int ANISOTROPY,
+ realw* d_c11store,realw* d_c12store,realw* d_c13store,
+ realw* d_c14store,realw* d_c15store,realw* d_c16store,
+ realw* d_c22store,realw* d_c23store,realw* d_c24store,
+ realw* d_c25store,realw* d_c26store,realw* d_c33store,
+ realw* d_c34store,realw* d_c35store,realw* d_c36store,
+ realw* d_c44store,realw* d_c45store,realw* d_c46store,
+ realw* d_c55store,realw* d_c56store,realw* d_c66store,
+ int gravity,
+ realw* d_minus_g,
+ realw* d_minus_deriv_gravity,
+ realw* d_rhostore,
+ realw* wgll_cube){
+
+
+// elastic compute kernel with attenuation
+// holds for: ATTENUATION = .true.
+// COMPUTE_AND_STORE_STRAIN = .true. (always true for attenuation)
+
+ int bx = blockIdx.y*gridDim.x+blockIdx.x;
+ int tx = threadIdx.x;
+
+ const int NGLL3_ALIGN = NGLL3_PADDED;
+
+ int K = (tx/NGLL2);
+ int J = ((tx-K*NGLL2)/NGLLX);
+ int I = (tx-K*NGLL2-J*NGLLX);
+
+ int active,offset;
+ int iglob = 0;
+ int working_element;
+
+ realw tempx1l,tempx2l,tempx3l,tempy1l,tempy2l,tempy3l,tempz1l,tempz2l,tempz3l;
+ realw xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
+ realw duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl;
+ realw duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl;
+ realw duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl;
+
+ realw tempx1l_att,tempx2l_att,tempx3l_att,tempy1l_att,tempy2l_att,tempy3l_att,tempz1l_att,tempz2l_att,tempz3l_att;
+ realw duxdxl_att,duxdyl_att,duxdzl_att,duydxl_att,duydyl_att,duydzl_att,duzdxl_att,duzdyl_att,duzdzl_att;
+ realw duxdyl_plus_duydxl_att,duzdxl_plus_duxdzl_att,duzdyl_plus_duydzl_att;
+
+ realw fac1,fac2,fac3,lambdal,mul,lambdalplus2mul,kappal;
+ realw sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz;
+ realw epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc;
+
+ realw c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66;
+ realw sum_terms1,sum_terms2,sum_terms3;
+
+ // gravity variables
+ realw sigma_yx,sigma_zx,sigma_zy;
+ realw rho_s_H1,rho_s_H2,rho_s_H3;
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+ int l;
+ realw hp1,hp2,hp3;
+#endif
+
+ __shared__ realw s_dummyx_loc[NGLL3];
+ __shared__ realw s_dummyy_loc[NGLL3];
+ __shared__ realw s_dummyz_loc[NGLL3];
+
+ __shared__ realw s_dummyx_loc_att[NGLL3];
+ __shared__ realw s_dummyy_loc_att[NGLL3];
+ __shared__ realw s_dummyz_loc_att[NGLL3];
+
+ __shared__ realw s_tempx1[NGLL3];
+ __shared__ realw s_tempx2[NGLL3];
+ __shared__ realw s_tempx3[NGLL3];
+
+ __shared__ realw s_tempy1[NGLL3];
+ __shared__ realw s_tempy2[NGLL3];
+ __shared__ realw s_tempy3[NGLL3];
+
+ __shared__ realw s_tempz1[NGLL3];
+ __shared__ realw s_tempz2[NGLL3];
+ __shared__ realw s_tempz3[NGLL3];
+
+ __shared__ realw sh_hprime_xx[NGLL2];
+
+// use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads,
+// because we used memory padding from NGLL^3 = 125 to 128 to get coalescent memory accesses
+ active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0;
+
+// copy from global memory to shared memory
+// each thread writes one of the NGLL^3 = 125 data points
+ if (active) {
+
+#ifdef USE_MESH_COLORING_GPU
+ working_element = bx;
+#else
+ //mesh coloring
+ if( use_mesh_coloring_gpu ){
+ working_element = bx;
+ }else{
+ // iphase-1 and working_element-1 for Fortran->C array conventions
+ working_element = d_phase_ispec_inner_elastic[bx + num_phase_ispec_elastic*(d_iphase-1)]-1;
+ }
+#endif
+
+ iglob = d_ibool[working_element*NGLL3 + tx]-1;
+
+#ifdef USE_TEXTURES_FIELDS
+ s_dummyx_loc[tx] = texfetch_displ<FORWARD_OR_ADJOINT>(iglob*3);
+ s_dummyy_loc[tx] = texfetch_displ<FORWARD_OR_ADJOINT>(iglob*3 + 1);
+ s_dummyz_loc[tx] = texfetch_displ<FORWARD_OR_ADJOINT>(iglob*3 + 2);
+#else
+ // changing iglob indexing to match fortran row changes fast style
+ s_dummyx_loc[tx] = d_displ[iglob*3];
+ s_dummyy_loc[tx] = d_displ[iglob*3 + 1];
+ s_dummyz_loc[tx] = d_displ[iglob*3 + 2];
+#endif
+
+ // JC JC here we will need to add GPU support for the new C-PML routines
+
+ // attenuation
+ // use first order Taylor expansion of displacement for local storage of stresses
+ // at this current time step, to fix attenuation in a consistent way
+#ifdef USE_TEXTURES_FIELDS
+ s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * texfetch_veloc<FORWARD_OR_ADJOINT>(iglob*3);
+ s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * texfetch_veloc<FORWARD_OR_ADJOINT>(iglob*3 + 1);
+ s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * texfetch_veloc<FORWARD_OR_ADJOINT>(iglob*3 + 2);
+#else
+ s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * d_veloc[iglob*3];
+ s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * d_veloc[iglob*3 + 1];
+ s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * d_veloc[iglob*3 + 2];
+#endif
+ }
+
+
+ if (tx < NGLL2) {
+#ifdef USE_TEXTURES_CONSTANTS
+ sh_hprime_xx[tx] = tex1Dfetch(d_hprime_xx_tex,tx);
+#else
+ sh_hprime_xx[tx] = d_hprime_xx[tx];
+#endif
+ }
+
+// synchronize all the threads (one thread for each of the NGLL grid points of the
+// current spectral element) because we need the whole element to be ready in order
+// to be able to compute the matrix products along cut planes of the 3D element below
+ __syncthreads();
+
+ if (active) {
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+
+ tempx1l = 0.f;
+ tempx2l = 0.f;
+ tempx3l = 0.f;
+
+ tempy1l = 0.f;
+ tempy2l = 0.f;
+ tempy3l = 0.f;
+
+ tempz1l = 0.f;
+ tempz2l = 0.f;
+ tempz3l = 0.f;
+
+ for (l=0;l<NGLLX;l++) {
+ hp1 = sh_hprime_xx[l*NGLLX+I];
+ offset = K*NGLL2+J*NGLLX+l;
+ tempx1l += s_dummyx_loc[offset]*hp1;
+ tempy1l += s_dummyy_loc[offset]*hp1;
+ tempz1l += s_dummyz_loc[offset]*hp1;
+
+ //assumes that hprime_xx = hprime_yy = hprime_zz
+ hp2 = sh_hprime_xx[l*NGLLX+J];
+ offset = K*NGLL2+l*NGLLX+I;
+ tempx2l += s_dummyx_loc[offset]*hp2;
+ tempy2l += s_dummyy_loc[offset]*hp2;
+ tempz2l += s_dummyz_loc[offset]*hp2;
+
+ hp3 = sh_hprime_xx[l*NGLLX+K];
+ offset = l*NGLL2+J*NGLLX+I;
+ tempx3l += s_dummyx_loc[offset]*hp3;
+ tempy3l += s_dummyy_loc[offset]*hp3;
+ tempz3l += s_dummyz_loc[offset]*hp3;
+ }
+
+ // JC JC here we will need to add GPU support for the new C-PML routines
+
+ // attenuation
+ // temporary variables used for fixing attenuation in a consistent way
+ tempx1l_att = 0.f;
+ tempx2l_att = 0.f;
+ tempx3l_att = 0.f;
+
+ tempy1l_att = 0.f;
+ tempy2l_att = 0.f;
+ tempy3l_att = 0.f;
+
+ tempz1l_att = 0.f;
+ tempz2l_att = 0.f;
+ tempz3l_att = 0.f;
+
+ for (l=0;l<NGLLX;l++) {
+ hp1 = sh_hprime_xx[l*NGLLX+I];
+ offset = K*NGLL2+J*NGLLX+l;
+ tempx1l_att += s_dummyx_loc_att[offset]*hp1;
+ tempy1l_att += s_dummyy_loc_att[offset]*hp1;
+ tempz1l_att += s_dummyz_loc_att[offset]*hp1;
+
+ hp2 = sh_hprime_xx[l*NGLLX+J];
+ offset = K*NGLL2+l*NGLLX+I;
+ tempx2l_att += s_dummyx_loc_att[offset]*hp2;
+ tempy2l_att += s_dummyy_loc_att[offset]*hp2;
+ tempz2l_att += s_dummyz_loc_att[offset]*hp2;
+
+ hp3 = sh_hprime_xx[l*NGLLX+K];
+ offset = l*NGLL2+J*NGLLX+I;
+ tempx3l_att += s_dummyx_loc_att[offset]*hp3;
+ tempy3l_att += s_dummyy_loc_att[offset]*hp3;
+ tempz3l_att += s_dummyz_loc_att[offset]*hp3;
+ }
+
+#else
+
+ tempx1l = s_dummyx_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyx_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyx_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyx_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyx_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+ tempy1l = s_dummyy_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyy_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyy_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyy_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyy_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+ tempz1l = s_dummyz_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyz_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyz_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyz_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyz_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+ tempx2l = s_dummyx_loc[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyx_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyx_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyx_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyx_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+ tempy2l = s_dummyy_loc[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyy_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyy_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyy_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyy_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+ tempz2l = s_dummyz_loc[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyz_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyz_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyz_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyz_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+ tempx3l = s_dummyx_loc[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyx_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyx_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyx_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyx_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+ tempy3l = s_dummyy_loc[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyy_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyy_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyy_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyy_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+ tempz3l = s_dummyz_loc[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyz_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyz_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyz_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyz_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+ // JC JC here we will need to add GPU support for the new C-PML routines
+
+ // attenuation
+ // temporary variables used for fixing attenuation in a consistent way
+ tempx1l_att = s_dummyx_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyx_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyx_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyx_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyx_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+ tempy1l_att = s_dummyy_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyy_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyy_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyy_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyy_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+ tempz1l_att = s_dummyz_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyz_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyz_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyz_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyz_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+
+ tempx2l_att = s_dummyx_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyx_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyx_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyx_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyx_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+ tempy2l_att = s_dummyy_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyy_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyy_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyy_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyy_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+ tempz2l_att = s_dummyz_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyz_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyz_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyz_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyz_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+
+ tempx3l_att = s_dummyx_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyx_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyx_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyx_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyx_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+ tempy3l_att = s_dummyy_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyy_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyy_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyy_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyy_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+ tempz3l_att = s_dummyz_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyz_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyz_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyz_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyz_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
+#endif
+
+// compute derivatives of ux, uy and uz with respect to x, y and z
+ offset = working_element*NGLL3_ALIGN + tx;
+
+ xixl = d_xix[offset];
+ xiyl = d_xiy[offset];
+ xizl = d_xiz[offset];
+ etaxl = d_etax[offset];
+ etayl = d_etay[offset];
+ etazl = d_etaz[offset];
+ gammaxl = d_gammax[offset];
+ gammayl = d_gammay[offset];
+ gammazl = d_gammaz[offset];
+
+ duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l;
+ duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l;
+ duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l;
+
+ duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l;
+ duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l;
+ duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l;
+
+ duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l;
+ duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l;
+ duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l;
+
+ // JC JC here we will need to add GPU support for the new C-PML routines
+
+ // precompute some sums to save CPU time
+ duxdxl_plus_duydyl = duxdxl + duydyl;
+ duxdxl_plus_duzdzl = duxdxl + duzdzl;
+ duydyl_plus_duzdzl = duydyl + duzdzl;
+ duxdyl_plus_duydxl = duxdyl + duydxl;
+ duzdxl_plus_duxdzl = duzdxl + duxdzl;
+ duzdyl_plus_duydzl = duzdyl + duydzl;
+
+ // JC JC here we will need to add GPU support for the new C-PML routines
+
+ // attenuation
+ // temporary variables used for fixing attenuation in a consistent way
+ duxdxl_att = xixl*tempx1l_att + etaxl*tempx2l_att + gammaxl*tempx3l_att;
+ duxdyl_att = xiyl*tempx1l_att + etayl*tempx2l_att + gammayl*tempx3l_att;
+ duxdzl_att = xizl*tempx1l_att + etazl*tempx2l_att + gammazl*tempx3l_att;
+
+ duydxl_att = xixl*tempy1l_att + etaxl*tempy2l_att + gammaxl*tempy3l_att;
+ duydyl_att = xiyl*tempy1l_att + etayl*tempy2l_att + gammayl*tempy3l_att;
+ duydzl_att = xizl*tempy1l_att + etazl*tempy2l_att + gammazl*tempy3l_att;
+
+ duzdxl_att = xixl*tempz1l_att + etaxl*tempz2l_att + gammaxl*tempz3l_att;
+ duzdyl_att = xiyl*tempz1l_att + etayl*tempz2l_att + gammayl*tempz3l_att;
+ duzdzl_att = xizl*tempz1l_att + etazl*tempz2l_att + gammazl*tempz3l_att;
+
+ // precompute some sums to save CPU time
+ duxdyl_plus_duydxl_att = duxdyl_att + duydxl_att;
+ duzdxl_plus_duxdzl_att = duzdxl_att + duxdzl_att;
+ duzdyl_plus_duydzl_att = duzdyl_att + duydzl_att;
+
+ // attenuation
+ // computes deviatoric strain attenuation and/or for kernel calculations
+ realw templ = 0.33333333333333333333f * (duxdxl_att + duydyl_att + duzdzl_att); // 1./3. = 0.33333
+ // local storage: stresses at this current time step
+ epsilondev_xx_loc = duxdxl_att - templ;
+ epsilondev_yy_loc = duydyl_att - templ;
+ epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl_att;
+ epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl_att;
+ epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl_att;
+
+ if(SIMULATION_TYPE == 3) {
+ epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
+ }
+
+ // compute elements with an elastic isotropic rheology
+ kappal = d_kappav[offset];
+ mul = d_muv[offset];
+
+ // attenuation
+ // use unrelaxed parameters if attenuation
+ mul = mul * one_minus_sum_beta[tx+working_element*NGLL3]; // (i,j,k,ispec)
+
+ // full anisotropic case, stress calculations
+ if(ANISOTROPY){
+
+ c11 = d_c11store[offset];
+ c12 = d_c12store[offset];
+ c13 = d_c13store[offset];
+ c14 = d_c14store[offset];
+ c15 = d_c15store[offset];
+ c16 = d_c16store[offset];
+ c22 = d_c22store[offset];
+ c23 = d_c23store[offset];
+ c24 = d_c24store[offset];
+ c25 = d_c25store[offset];
+ c26 = d_c26store[offset];
+ c33 = d_c33store[offset];
+ c34 = d_c34store[offset];
+ c35 = d_c35store[offset];
+ c36 = d_c36store[offset];
+ c44 = d_c44store[offset];
+ c45 = d_c45store[offset];
+ c46 = d_c46store[offset];
+ c55 = d_c55store[offset];
+ c56 = d_c56store[offset];
+ c66 = d_c66store[offset];
+
+ sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl +
+ c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl;
+ sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl +
+ c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl;
+ sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl +
+ c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl;
+ sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl +
+ c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl;
+ sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl +
+ c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl;
+ sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl +
+ c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl;
+
+ }else{
+
+ // isotropic case
+
+ lambdalplus2mul = kappal + 1.33333333333333333333f * mul; // 4./3. = 1.3333333
+ lambdal = lambdalplus2mul - 2.0f * mul;
+
+ // compute the six components of the stress tensor sigma
+ sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl;
+ sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl;
+ sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl;
+
+ sigma_xy = mul*duxdyl_plus_duydxl;
+ sigma_xz = mul*duzdxl_plus_duxdzl;
+ sigma_yz = mul*duzdyl_plus_duydzl;
+ }
+
+ // attenuation
+ // subtracts memory variables if attenuation
+ compute_element_att_stress(tx,working_element,NSPEC,
+ R_xx,R_yy,R_xy,R_xz,R_yz,
+ &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_xz,&sigma_yz);
+
+ jacobianl = 1.0f / (xixl*(etayl*gammazl-etazl*gammayl)-xiyl*(etaxl*gammazl-etazl*gammaxl)+xizl*(etaxl*gammayl-etayl*gammaxl));
+
+ // define symmetric components (needed for non-symmetric dot product and sigma for gravity)
+ sigma_yx = sigma_xy;
+ sigma_zx = sigma_xz;
+ sigma_zy = sigma_yz;
+
+ if( gravity ){
+ // computes non-symmetric terms for gravity
+ compute_element_gravity(tx,working_element,d_ibool,d_minus_g,d_minus_deriv_gravity,
+ d_rhostore,wgll_cube,jacobianl,
+ s_dummyx_loc,s_dummyy_loc,s_dummyz_loc,
+ &sigma_xx,&sigma_yy,&sigma_xz,&sigma_yz,
+ &rho_s_H1,&rho_s_H2,&rho_s_H3);
+ }
+
+ // form dot product with test vector, non-symmetric form
+ s_tempx1[tx] = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl);
+ s_tempy1[tx] = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl);
+ s_tempz1[tx] = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl);
+
+ s_tempx2[tx] = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl);
+ s_tempy2[tx] = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl);
+ s_tempz2[tx] = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl);
+
+ s_tempx3[tx] = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl);
+ s_tempy3[tx] = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl);
+ s_tempz3[tx] = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl);
+
+ }
+
+// synchronize all the threads (one thread for each of the NGLL grid points of the
+// current spectral element) because we need the whole element to be ready in order
+// to be able to compute the matrix products along cut planes of the 3D element below
+ __syncthreads();
+
+ // JC JC here we will need to add GPU support for the new C-PML routines
+
+ if (active) {
+
+#ifndef MANUALLY_UNROLLED_LOOPS
+
+ tempx1l = 0.f;
+ tempy1l = 0.f;
+ tempz1l = 0.f;
+
+ tempx2l = 0.f;
+ tempy2l = 0.f;
+ tempz2l = 0.f;
+
+ tempx3l = 0.f;
+ tempy3l = 0.f;
+ tempz3l = 0.f;
+
+ for (l=0;l<NGLLX;l++) {
+
+ fac1 = d_hprimewgll_xx[I*NGLLX+l];
+ offset = K*NGLL2+J*NGLLX+l;
+ tempx1l += s_tempx1[offset]*fac1;
+ tempy1l += s_tempy1[offset]*fac1;
+ tempz1l += s_tempz1[offset]*fac1;
+
+ // assumes hprimewgll_xx == hprimewgll_yy == hprimewgll_zz
+ fac2 = d_hprimewgll_xx[J*NGLLX+l];
+ offset = K*NGLL2+l*NGLLX+I;
+ tempx2l += s_tempx2[offset]*fac2;
+ tempy2l += s_tempy2[offset]*fac2;
+ tempz2l += s_tempz2[offset]*fac2;
+
+ fac3 = d_hprimewgll_xx[K*NGLLX+l];
+ offset = l*NGLL2+J*NGLLX+I;
+ tempx3l += s_tempx3[offset]*fac3;
+ tempy3l += s_tempy3[offset]*fac3;
+ tempz3l += s_tempz3[offset]*fac3;
+
+ }
+#else
+
+ tempx1l = s_tempx1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+ + s_tempx1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+ + s_tempx1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+ + s_tempx1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+ + s_tempx1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+
+ tempy1l = s_tempy1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+ + s_tempy1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+ + s_tempy1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+ + s_tempy1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+ + s_tempy1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+
+ tempz1l = s_tempz1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+ + s_tempz1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+ + s_tempz1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+ + s_tempz1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+ + s_tempz1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+
+ tempx2l = s_tempx2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
+ + s_tempx2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
+ + s_tempx2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
+ + s_tempx2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
+ + s_tempx2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
+
+ tempy2l = s_tempy2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
+ + s_tempy2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
+ + s_tempy2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
+ + s_tempy2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
+ + s_tempy2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
+
+ tempz2l = s_tempz2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
+ + s_tempz2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
+ + s_tempz2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
+ + s_tempz2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
+ + s_tempz2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
+
+ tempx3l = s_tempx3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
+ + s_tempx3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
+ + s_tempx3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
+ + s_tempx3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
+ + s_tempx3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
+
+ tempy3l = s_tempy3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
+ + s_tempy3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
+ + s_tempy3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
+ + s_tempy3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
+ + s_tempy3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
+
+ tempz3l = s_tempz3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
+ + s_tempz3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
+ + s_tempz3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
+ + s_tempz3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
+ + s_tempz3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
+
+#endif
+
+ fac1 = d_wgllwgll_yz[K*NGLLX+J];
+ fac2 = d_wgllwgll_xz[K*NGLLX+I];
+ fac3 = d_wgllwgll_xy[J*NGLLX+I];
+
+ sum_terms1 = - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
+ sum_terms2 = - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
+ sum_terms3 = - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
+
+ // adds gravity term
+ if( gravity ){
+ sum_terms1 += rho_s_H1;
+ sum_terms2 += rho_s_H2;
+ sum_terms3 += rho_s_H3;
+ }
+
+#ifdef USE_MESH_COLORING_GPU
+ // no atomic operation needed, colors don't share global points between elements
+
+#ifdef USE_TEXTURES_FIELDS
+ d_accel[iglob*3] = texfetch_accel<FORWARD_OR_ADJOINT>(iglob*3) + sum_terms1;
+ d_accel[iglob*3 + 1] = texfetch_accel<FORWARD_OR_ADJOINT>(iglob*3 + 1) + sum_terms2;
+ d_accel[iglob*3 + 2] = texfetch_accel<FORWARD_OR_ADJOINT>(iglob*3 + 2) + sum_terms3;
+#else
+ d_accel[iglob*3] += sum_terms1;
+ d_accel[iglob*3 + 1] += sum_terms2;
+ d_accel[iglob*3 + 2] += sum_terms3;
+#endif // USE_TEXTURES_FIELDS
+
+ // JC JC here we will need to add GPU support for the new C-PML routines
+
+#else // MESH_COLORING
+
+ //mesh coloring
+ if( use_mesh_coloring_gpu ){
+
+ // no atomic operation needed, colors don't share global points between elements
+#ifdef USE_TEXTURES_FIELDS
+ d_accel[iglob*3] = texfetch_accel<FORWARD_OR_ADJOINT>(iglob*3) + sum_terms1;
+ d_accel[iglob*3 + 1] = texfetch_accel<FORWARD_OR_ADJOINT>(iglob*3 + 1) + sum_terms2;
+ d_accel[iglob*3 + 2] = texfetch_accel<FORWARD_OR_ADJOINT>(iglob*3 + 2) + sum_terms3;
+#else
+ d_accel[iglob*3] += sum_terms1;
+ d_accel[iglob*3 + 1] += sum_terms2;
+ d_accel[iglob*3 + 2] += sum_terms3;
+#endif // USE_TEXTURES_FIELDS
+
+ }
+ else {
+
+ // for testing purposes only: w/out atomic updates
+ //d_accel[iglob*3] -= (0.00000001f*tempx1l + 0.00000001f*tempx2l + 0.00000001f*tempx3l);
+ //d_accel[iglob*3 + 1] -= (0.00000001f*tempy1l + 0.00000001f*tempy2l + 0.00000001f*tempy3l);
+ //d_accel[iglob*3 + 2] -= (0.00000001f*tempz1l + 0.00000001f*tempz2l + 0.00000001f*tempz3l);
+ // w/out atomic update
+ //d_accel[iglob*3] += sum_terms1;
+ //d_accel[iglob*3 + 1] += sum_terms2;
+ //d_accel[iglob*3 + 2] += sum_terms3;
+
+ atomicAdd(&d_accel[iglob*3], sum_terms1);
+ atomicAdd(&d_accel[iglob*3+1], sum_terms2);
+ atomicAdd(&d_accel[iglob*3+2], sum_terms3);
+
+ } // if(use_mesh_coloring_gpu)
+
+#endif // MESH_COLORING
+
+ // attenuation
+ // update memory variables based upon the Runge-Kutta scheme
+ compute_element_att_memory(tx,working_element,NSPEC,
+ d_muv,
+ factor_common,alphaval,betaval,gammaval,
+ R_xx,R_yy,R_xy,R_xz,R_yz,
+ epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz,
+ epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc);
+
+ // save deviatoric strain for Runge-Kutta scheme
+ int ijk_ispec = tx + working_element*NGLL3;
+
+ // fortran: epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+ epsilondev_xx[ijk_ispec] = epsilondev_xx_loc;
+ epsilondev_yy[ijk_ispec] = epsilondev_yy_loc;
+ epsilondev_xy[ijk_ispec] = epsilondev_xy_loc;
+ epsilondev_xz[ijk_ispec] = epsilondev_xz_loc;
+ epsilondev_yz[ijk_ispec] = epsilondev_yz_loc;
+
+ } // if(active)
+
+ // JC JC here we will need to add GPU support for the new C-PML routines
+
+} // kernel_2_att_impl()
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,realw d_deltat,
int COMPUTE_AND_STORE_STRAIN,
int ATTENUATION,int ANISOTROPY,
int* d_ibool,
- realw* d_xix,
- realw* d_xiy,
- realw* d_xiz,
- realw* d_etax,
- realw* d_etay,
- realw* d_etaz,
- realw* d_gammax,
- realw* d_gammay,
- realw* d_gammaz,
+ realw* d_xix,realw* d_xiy,realw* d_xiz,
+ realw* d_etax,realw* d_etay,realw* d_etaz,
+ realw* d_gammax,realw* d_gammay,realw* d_gammaz,
realw* d_kappav,
realw* d_muv,
- realw* d_epsilondev_xx,
- realw* d_epsilondev_yy,
- realw* d_epsilondev_xy,
- realw* d_epsilondev_xz,
- realw* d_epsilondev_yz,
+ realw* d_epsilondev_xx,realw* d_epsilondev_yy,realw* d_epsilondev_xy,
+ realw* d_epsilondev_xz,realw* d_epsilondev_yz,
realw* d_epsilon_trace_over_3,
realw* d_one_minus_sum_beta,
realw* d_factor_common,
- realw* d_R_xx,
- realw* d_R_yy,
- realw* d_R_xy,
- realw* d_R_xz,
- realw* d_R_yz,
- realw* d_b_epsilondev_xx,
- realw* d_b_epsilondev_yy,
- realw* d_b_epsilondev_xy,
- realw* d_b_epsilondev_xz,
- realw* d_b_epsilondev_yz,
+ realw* d_R_xx,realw* d_R_yy,realw* d_R_xy,
+ realw* d_R_xz,realw* d_R_yz,
+ realw* d_b_epsilondev_xx,realw* d_b_epsilondev_yy,realw* d_b_epsilondev_xy,
+ realw* d_b_epsilondev_xz,realw* d_b_epsilondev_yz,
realw* d_b_epsilon_trace_over_3,
- realw* d_b_R_xx,
- realw* d_b_R_yy,
- realw* d_b_R_xy,
- realw* d_b_R_xz,
- realw* d_b_R_yz,
- realw* d_c11store,
- realw* d_c12store,
- realw* d_c13store,
- realw* d_c14store,
- realw* d_c15store,
- realw* d_c16store,
- realw* d_c22store,
- realw* d_c23store,
- realw* d_c24store,
- realw* d_c25store,
- realw* d_c26store,
- realw* d_c33store,
- realw* d_c34store,
- realw* d_c35store,
- realw* d_c36store,
- realw* d_c44store,
- realw* d_c45store,
- realw* d_c46store,
- realw* d_c55store,
- realw* d_c56store,
- realw* d_c66store,
+ realw* d_b_R_xx,realw* d_b_R_yy,realw* d_b_R_xy,
+ realw* d_b_R_xz,realw* d_b_R_yz,
+ realw* d_c11store,realw* d_c12store,realw* d_c13store,
+ realw* d_c14store,realw* d_c15store,realw* d_c16store,
+ realw* d_c22store,realw* d_c23store,realw* d_c24store,
+ realw* d_c25store,realw* d_c26store,realw* d_c33store,
+ realw* d_c34store,realw* d_c35store,realw* d_c36store,
+ realw* d_c44store,realw* d_c45store,realw* d_c46store,
+ realw* d_c55store,realw* d_c56store,realw* d_c66store,
realw* d_rhostore){
+ TRACE("\tKernel_2");
+
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("before kernel Kernel 2");
#endif
@@ -1487,14 +2790,11 @@
/* grid_2_x = nb_elem_color; */
/* nb_elem_color is just how many blocks we are computing now */
- int num_blocks_x = nb_blocks_to_compute;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int blocksize = NGLL3_PADDED;
- int blocksize = NGLL3_PADDED;
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(nb_blocks_to_compute,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
@@ -1505,123 +2805,176 @@
// cudaEventCreate(&stop);
// cudaEventRecord( start, 0 );
- Kernel_2_impl<<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
- mp->NGLOB_AB,
- d_ibool,
- mp->d_phase_ispec_inner_elastic,
- mp->num_phase_ispec_elastic,
- d_iphase,
- mp->use_mesh_coloring_gpu,
- d_deltat,
- mp->d_displ,mp->d_veloc,mp->d_accel,
- d_xix, d_xiy, d_xiz,
- d_etax, d_etay, d_etaz,
- d_gammax, d_gammay, d_gammaz,
- mp->d_hprime_xx,
- mp->d_hprimewgll_xx,
- mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
- d_kappav, d_muv,
- COMPUTE_AND_STORE_STRAIN,
- d_epsilondev_xx,
- d_epsilondev_yy,
- d_epsilondev_xy,
- d_epsilondev_xz,
- d_epsilondev_yz,
- d_epsilon_trace_over_3,
- mp->simulation_type,
- ATTENUATION,mp->NSPEC_AB,
- d_one_minus_sum_beta,
- d_factor_common,
- d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
- mp->d_alphaval,mp->d_betaval,mp->d_gammaval,
- ANISOTROPY,
- d_c11store,
- d_c12store,
- d_c13store,
- d_c14store,
- d_c15store,
- d_c16store,
- d_c22store,
- d_c23store,
- d_c24store,
- d_c25store,
- d_c26store,
- d_c33store,
- d_c34store,
- d_c35store,
- d_c36store,
- d_c44store,
- d_c45store,
- d_c46store,
- d_c55store,
- d_c56store,
- d_c66store,
- mp->gravity,
- mp->d_minus_g,
- mp->d_minus_deriv_gravity,
- d_rhostore,
- mp->d_wgll_cube);
+ if( ATTENUATION ){
+ // debug
+ //printf("Running Kernel_2 with attenuation\n");
+ // compute kernels with attenuation
+ // forward wavefields -> FORWARD_OR_ADJOINT == 1
+ Kernel_2_att_impl<1><<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
+ mp->NGLOB_AB,
+ d_ibool,
+ mp->d_phase_ispec_inner_elastic,
+ mp->num_phase_ispec_elastic,
+ d_iphase,
+ mp->use_mesh_coloring_gpu,
+ d_deltat,
+ mp->d_displ,mp->d_veloc,mp->d_accel,
+ d_xix, d_xiy, d_xiz,
+ d_etax, d_etay, d_etaz,
+ d_gammax, d_gammay, d_gammaz,
+ mp->d_hprime_xx,
+ mp->d_hprimewgll_xx,
+ mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+ d_kappav, d_muv,
+ d_epsilondev_xx,d_epsilondev_yy,d_epsilondev_xy,
+ d_epsilondev_xz,d_epsilondev_yz,
+ d_epsilon_trace_over_3,
+ mp->simulation_type,
+ mp->NSPEC_AB,
+ d_one_minus_sum_beta,
+ d_factor_common,
+ d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
+ mp->d_alphaval,mp->d_betaval,mp->d_gammaval,
+ ANISOTROPY,
+ d_c11store,d_c12store,d_c13store,
+ d_c14store,d_c15store,d_c16store,
+ d_c22store,d_c23store,d_c24store,
+ d_c25store,d_c26store,d_c33store,
+ d_c34store,d_c35store,d_c36store,
+ d_c44store,d_c45store,d_c46store,
+ d_c55store,d_c56store,d_c66store,
+ mp->gravity,
+ mp->d_minus_g,
+ mp->d_minus_deriv_gravity,
+ d_rhostore,
+ mp->d_wgll_cube);
- if(mp->simulation_type == 3) {
- Kernel_2_impl<<< grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
- mp->NGLOB_AB,
- d_ibool,
- mp->d_phase_ispec_inner_elastic,
- mp->num_phase_ispec_elastic,
- d_iphase,
- mp->use_mesh_coloring_gpu,
- d_deltat,
- mp->d_b_displ,mp->d_b_veloc,mp->d_b_accel,
- d_xix, d_xiy, d_xiz,
- d_etax, d_etay, d_etaz,
- d_gammax, d_gammay, d_gammaz,
- mp->d_hprime_xx,
- mp->d_hprimewgll_xx,
- mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
- d_kappav, d_muv,
- COMPUTE_AND_STORE_STRAIN,
- d_b_epsilondev_xx,
- d_b_epsilondev_yy,
- d_b_epsilondev_xy,
- d_b_epsilondev_xz,
- d_b_epsilondev_yz,
- d_b_epsilon_trace_over_3,
- mp->simulation_type,
- ATTENUATION,mp->NSPEC_AB,
- d_one_minus_sum_beta,
- d_factor_common,
- d_b_R_xx,d_b_R_yy,d_b_R_xy,d_b_R_xz,d_b_R_yz,
- mp->d_b_alphaval,mp->d_b_betaval,mp->d_b_gammaval,
- ANISOTROPY,
- d_c11store,
- d_c12store,
- d_c13store,
- d_c14store,
- d_c15store,
- d_c16store,
- d_c22store,
- d_c23store,
- d_c24store,
- d_c25store,
- d_c26store,
- d_c33store,
- d_c34store,
- d_c35store,
- d_c36store,
- d_c44store,
- d_c45store,
- d_c46store,
- d_c55store,
- d_c56store,
- d_c66store,
- mp->gravity,
- mp->d_minus_g,
- mp->d_minus_deriv_gravity,
- d_rhostore,
- mp->d_wgll_cube);
+ if(mp->simulation_type == 3) {
+ // backward/reconstructed wavefields -> FORWARD_OR_ADJOINT == 3
+ Kernel_2_att_impl<3><<< grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
+ mp->NGLOB_AB,
+ d_ibool,
+ mp->d_phase_ispec_inner_elastic,
+ mp->num_phase_ispec_elastic,
+ d_iphase,
+ mp->use_mesh_coloring_gpu,
+ d_deltat,
+ mp->d_b_displ,mp->d_b_veloc,mp->d_b_accel,
+ d_xix, d_xiy, d_xiz,
+ d_etax, d_etay, d_etaz,
+ d_gammax, d_gammay, d_gammaz,
+ mp->d_hprime_xx,
+ mp->d_hprimewgll_xx,
+ mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+ d_kappav, d_muv,
+ d_b_epsilondev_xx,d_b_epsilondev_yy,d_b_epsilondev_xy,
+ d_b_epsilondev_xz,d_b_epsilondev_yz,
+ d_b_epsilon_trace_over_3,
+ mp->simulation_type,
+ mp->NSPEC_AB,
+ d_one_minus_sum_beta,
+ d_factor_common,
+ d_b_R_xx,d_b_R_yy,d_b_R_xy,d_b_R_xz,d_b_R_yz,
+ mp->d_b_alphaval,mp->d_b_betaval,mp->d_b_gammaval,
+ ANISOTROPY,
+ d_c11store,d_c12store,d_c13store,
+ d_c14store,d_c15store,d_c16store,
+ d_c22store,d_c23store,d_c24store,
+ d_c25store,d_c26store,d_c33store,
+ d_c34store,d_c35store,d_c36store,
+ d_c44store,d_c45store,d_c46store,
+ d_c55store,d_c56store,d_c66store,
+ mp->gravity,
+ mp->d_minus_g,
+ mp->d_minus_deriv_gravity,
+ d_rhostore,
+ mp->d_wgll_cube);
+ }
+ }else{
+ // debug
+ //printf("Running Kernel_2 without attenuation\n");
+
+ // compute kernels without attenuation
+ // forward wavefields -> FORWARD_OR_ADJOINT == 1
+ Kernel_2_noatt_impl<1><<<grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
+ mp->NGLOB_AB,
+ d_ibool,
+ mp->d_phase_ispec_inner_elastic,mp->num_phase_ispec_elastic,
+ d_iphase,
+ mp->use_mesh_coloring_gpu,
+ mp->d_displ,mp->d_veloc,mp->d_accel,
+ d_xix, d_xiy, d_xiz,
+ d_etax, d_etay, d_etaz,
+ d_gammax, d_gammay, d_gammaz,
+ mp->d_hprime_xx,
+ mp->d_hprimewgll_xx,
+ mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+ d_kappav, d_muv,
+ COMPUTE_AND_STORE_STRAIN,
+ d_epsilondev_xx,d_epsilondev_yy,d_epsilondev_xy,
+ d_epsilondev_xz,d_epsilondev_yz,
+ d_epsilon_trace_over_3,
+ mp->simulation_type,
+ mp->NSPEC_AB,
+ d_one_minus_sum_beta,d_factor_common,
+ d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
+ mp->d_alphaval,mp->d_betaval,mp->d_gammaval,
+ ANISOTROPY,
+ d_c11store,d_c12store,d_c13store,
+ d_c14store,d_c15store,d_c16store,
+ d_c22store,d_c23store,d_c24store,
+ d_c25store,d_c26store,d_c33store,
+ d_c34store,d_c35store,d_c36store,
+ d_c44store,d_c45store,d_c46store,
+ d_c55store,d_c56store,d_c66store,
+ mp->gravity,
+ mp->d_minus_g,
+ mp->d_minus_deriv_gravity,
+ d_rhostore,
+ mp->d_wgll_cube );
+
+ // backward/reconstructed wavefield
+ if(mp->simulation_type == 3) {
+ // backward/reconstructed wavefields -> FORWARD_OR_ADJOINT == 3
+ Kernel_2_noatt_impl<3><<< grid,threads,0,mp->compute_stream>>>(nb_blocks_to_compute,
+ mp->NGLOB_AB,
+ d_ibool,
+ mp->d_phase_ispec_inner_elastic,mp->num_phase_ispec_elastic,
+ d_iphase,
+ mp->use_mesh_coloring_gpu,
+ mp->d_b_displ,mp->d_b_veloc,mp->d_b_accel,
+ d_xix, d_xiy, d_xiz,
+ d_etax, d_etay, d_etaz,
+ d_gammax, d_gammay, d_gammaz,
+ mp->d_hprime_xx,
+ mp->d_hprimewgll_xx,
+ mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+ d_kappav, d_muv,
+ COMPUTE_AND_STORE_STRAIN,
+ d_b_epsilondev_xx,d_b_epsilondev_yy,d_b_epsilondev_xy,
+ d_b_epsilondev_xz,d_b_epsilondev_yz,
+ d_b_epsilon_trace_over_3,
+ mp->simulation_type,
+ mp->NSPEC_AB,
+ d_one_minus_sum_beta,d_factor_common,
+ d_b_R_xx,d_b_R_yy,d_b_R_xy,d_b_R_xz,d_b_R_yz,
+ mp->d_b_alphaval,mp->d_b_betaval,mp->d_b_gammaval,
+ ANISOTROPY,
+ d_c11store,d_c12store,d_c13store,
+ d_c14store,d_c15store,d_c16store,
+ d_c22store,d_c23store,d_c24store,
+ d_c25store,d_c26store,d_c33store,
+ d_c34store,d_c35store,d_c36store,
+ d_c44store,d_c45store,d_c46store,
+ d_c55store,d_c56store,d_c66store,
+ mp->gravity,
+ mp->d_minus_g,
+ mp->d_minus_deriv_gravity,
+ d_rhostore,
+ mp->d_wgll_cube );
+ }
}
-
// cudaEventRecord( stop, 0 );
// cudaEventSynchronize( stop );
// cudaEventElapsedTime( &time, start, stop );
@@ -1629,10 +2982,10 @@
// cudaEventDestroy( stop );
// printf("Kernel2 Execution Time: %f ms\n",time);
- /* cudaThreadSynchronize(); */
- /* LOG("Kernel 2 finished"); */
+ // cudaThreadSynchronize(); //
+ // LOG("Kernel 2 finished"); //
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("Kernel_2_impl ");
+ exit_on_cuda_error("Kernel_2_impl");
#endif
}
@@ -1641,21 +2994,21 @@
extern "C"
void FC_FUNC_(compute_forces_viscoelastic_cuda,
- COMPUTE_FORCES_VISCOELASTIC_CUDA)(long* Mesh_pointer_f,
- int* iphase,
- realw* deltat,
- int* nspec_outer_elastic,
- int* nspec_inner_elastic,
- int* COMPUTE_AND_STORE_STRAIN,
- int* ATTENUATION,
- int* ANISOTROPY) {
+ COMPUTE_FORCES_VISCOELASTIC_CUDA)(long* Mesh_pointer,
+ int* iphase,
+ realw* deltat,
+ int* nspec_outer_elastic,
+ int* nspec_inner_elastic,
+ int* COMPUTE_AND_STORE_STRAIN,
+ int* ATTENUATION,
+ int* ANISOTROPY) {
- TRACE("compute_forces_viscoelastic_cuda");
+ TRACE("\tcompute_forces_viscoelastic_cuda");
// EPIK_TRACER("compute_forces_viscoelastic_cuda");
//printf("Running compute_forces\n");
//double start_time = get_time();
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+ Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
int num_elements;
@@ -1669,14 +3022,12 @@
// mesh coloring
if( mp->use_mesh_coloring_gpu ){
-
// note: array offsets require sorted arrays, such that e.g. ibool starts with elastic elements
// and followed by acoustic ones.
// elastic elements also start with outer than inner element ordering
-
int nb_colors,nb_blocks_to_compute;
int istart;
- int color_offset,color_offset_nonpadded,color_offset_nonpadded_att2;
+ int offset,offset_nonpadded,offset_nonpadded_att2;
// sets up color loop
if( *iphase == 1 ){
@@ -1685,18 +3036,18 @@
istart = 0;
// array offsets
- color_offset = 0;
- color_offset_nonpadded = 0;
- color_offset_nonpadded_att2 = 0;
+ offset = 0;
+ offset_nonpadded = 0;
+ offset_nonpadded_att2 = 0;
}else{
// inner elements (start after outer elements)
nb_colors = mp->num_colors_outer_elastic + mp->num_colors_inner_elastic;
istart = mp->num_colors_outer_elastic;
// array offsets
- color_offset = (*nspec_outer_elastic) * NGLL3_PADDED;
- color_offset_nonpadded = (*nspec_outer_elastic) * NGLL3;
- color_offset_nonpadded_att2 = (*nspec_outer_elastic) * NGLL3 * N_SLS;
+ offset = (*nspec_outer_elastic) * NGLL3_PADDED;
+ offset_nonpadded = (*nspec_outer_elastic) * NGLL3;
+ offset_nonpadded_att2 = (*nspec_outer_elastic) * NGLL3 * N_SLS;
}
// loops over colors
@@ -1713,145 +3064,75 @@
Kernel_2(nb_blocks_to_compute,mp,*iphase,*deltat,
*COMPUTE_AND_STORE_STRAIN,
*ATTENUATION,*ANISOTROPY,
- mp->d_ibool + color_offset_nonpadded,
- mp->d_xix + color_offset,
- mp->d_xiy + color_offset,
- mp->d_xiz + color_offset,
- mp->d_etax + color_offset,
- mp->d_etay + color_offset,
- mp->d_etaz + color_offset,
- mp->d_gammax + color_offset,
- mp->d_gammay + color_offset,
- mp->d_gammaz + color_offset,
- mp->d_kappav + color_offset,
- mp->d_muv + color_offset,
- mp->d_epsilondev_xx + color_offset_nonpadded,
- mp->d_epsilondev_yy + color_offset_nonpadded,
- mp->d_epsilondev_xy + color_offset_nonpadded,
- mp->d_epsilondev_xz + color_offset_nonpadded,
- mp->d_epsilondev_yz + color_offset_nonpadded,
- mp->d_epsilon_trace_over_3 + color_offset_nonpadded,
- mp->d_one_minus_sum_beta + color_offset_nonpadded,
- mp->d_factor_common + color_offset_nonpadded_att2,
- mp->d_R_xx + color_offset_nonpadded,
- mp->d_R_yy + color_offset_nonpadded,
- mp->d_R_xy + color_offset_nonpadded,
- mp->d_R_xz + color_offset_nonpadded,
- mp->d_R_yz + color_offset_nonpadded,
- mp->d_b_epsilondev_xx + color_offset_nonpadded,
- mp->d_b_epsilondev_yy + color_offset_nonpadded,
- mp->d_b_epsilondev_xy + color_offset_nonpadded,
- mp->d_b_epsilondev_xz + color_offset_nonpadded,
- mp->d_b_epsilondev_yz + color_offset_nonpadded,
- mp->d_b_epsilon_trace_over_3 + color_offset_nonpadded,
- mp->d_b_R_xx + color_offset_nonpadded,
- mp->d_b_R_yy + color_offset_nonpadded,
- mp->d_b_R_xy + color_offset_nonpadded,
- mp->d_b_R_xz + color_offset_nonpadded,
- mp->d_b_R_yz + color_offset_nonpadded,
- mp->d_c11store + color_offset,
- mp->d_c12store + color_offset,
- mp->d_c13store + color_offset,
- mp->d_c14store + color_offset,
- mp->d_c15store + color_offset,
- mp->d_c16store + color_offset,
- mp->d_c22store + color_offset,
- mp->d_c23store + color_offset,
- mp->d_c24store + color_offset,
- mp->d_c25store + color_offset,
- mp->d_c26store + color_offset,
- mp->d_c33store + color_offset,
- mp->d_c34store + color_offset,
- mp->d_c35store + color_offset,
- mp->d_c36store + color_offset,
- mp->d_c44store + color_offset,
- mp->d_c45store + color_offset,
- mp->d_c46store + color_offset,
- mp->d_c55store + color_offset,
- mp->d_c56store + color_offset,
- mp->d_c66store + color_offset,
- mp->d_rhostore + color_offset);
+ mp->d_ibool + offset_nonpadded,
+ mp->d_xix + offset,mp->d_xiy + offset,mp->d_xiz + offset,
+ mp->d_etax + offset,mp->d_etay + offset,mp->d_etaz + offset,
+ mp->d_gammax + offset,mp->d_gammay + offset,mp->d_gammaz + offset,
+ mp->d_kappav + offset,
+ mp->d_muv + offset,
+ mp->d_epsilondev_xx + offset_nonpadded,mp->d_epsilondev_yy + offset_nonpadded,mp->d_epsilondev_xy + offset_nonpadded,
+ mp->d_epsilondev_xz + offset_nonpadded,mp->d_epsilondev_yz + offset_nonpadded,
+ mp->d_epsilon_trace_over_3 + offset_nonpadded,
+ mp->d_one_minus_sum_beta + offset_nonpadded,
+ mp->d_factor_common + offset_nonpadded_att2,
+ mp->d_R_xx + offset_nonpadded,mp->d_R_yy + offset_nonpadded,mp->d_R_xy + offset_nonpadded,
+ mp->d_R_xz + offset_nonpadded,mp->d_R_yz + offset_nonpadded,
+ mp->d_b_epsilondev_xx + offset_nonpadded,mp->d_b_epsilondev_yy + offset_nonpadded,mp->d_b_epsilondev_xy + offset_nonpadded,
+ mp->d_b_epsilondev_xz + offset_nonpadded,mp->d_b_epsilondev_yz + offset_nonpadded,
+ mp->d_b_epsilon_trace_over_3 + offset_nonpadded,
+ mp->d_b_R_xx + offset_nonpadded,mp->d_b_R_yy + offset_nonpadded,mp->d_b_R_xy + offset_nonpadded,
+ mp->d_b_R_xz + offset_nonpadded,mp->d_b_R_yz + offset_nonpadded,
+ mp->d_c11store + offset,mp->d_c12store + offset,mp->d_c13store + offset,
+ mp->d_c14store + offset,mp->d_c15store + offset,mp->d_c16store + offset,
+ mp->d_c22store + offset,mp->d_c23store + offset,mp->d_c24store + offset,
+ mp->d_c25store + offset,mp->d_c26store + offset,mp->d_c33store + offset,
+ mp->d_c34store + offset,mp->d_c35store + offset,mp->d_c36store + offset,
+ mp->d_c44store + offset,mp->d_c45store + offset,mp->d_c46store + offset,
+ mp->d_c55store + offset,mp->d_c56store + offset,mp->d_c66store + offset,
+ mp->d_rhostore + offset);
// for padded and aligned arrays
- color_offset += nb_blocks_to_compute * NGLL3_PADDED;
+ offset += nb_blocks_to_compute * NGLL3_PADDED;
// for no-aligned arrays
- color_offset_nonpadded += nb_blocks_to_compute * NGLL3;
+ offset_nonpadded += nb_blocks_to_compute * NGLL3;
// for factor_common array
- color_offset_nonpadded_att2 += nb_blocks_to_compute * NGLL3 * N_SLS;
+ offset_nonpadded_att2 += nb_blocks_to_compute * NGLL3 * N_SLS;
- //daniel: we use the same stream, so kernels are executed one after the other
- // synchronizes in case we run on only 1 process to avoid race-conditions
- //if( mp->NPROC == 1 ){
- // // Wait until previous compute stream finishes.
- // cudaStreamSynchronize(mp->compute_stream);
- //}
+ //note: we use the same stream, so kernels are executed one after the other
+ // thus, there should be no need to synchronize in case we run on only 1 process to avoid race-conditions
}
}else{
-
// no mesh coloring: uses atomic updates
-
Kernel_2(num_elements,mp,*iphase,*deltat,
*COMPUTE_AND_STORE_STRAIN,
*ATTENUATION,*ANISOTROPY,
mp->d_ibool,
- mp->d_xix,
- mp->d_xiy,
- mp->d_xiz,
- mp->d_etax,
- mp->d_etay,
- mp->d_etaz,
- mp->d_gammax,
- mp->d_gammay,
- mp->d_gammaz,
+ mp->d_xix,mp->d_xiy,mp->d_xiz,
+ mp->d_etax,mp->d_etay,mp->d_etaz,
+ mp->d_gammax,mp->d_gammay,mp->d_gammaz,
mp->d_kappav,
mp->d_muv,
- mp->d_epsilondev_xx,
- mp->d_epsilondev_yy,
- mp->d_epsilondev_xy,
- mp->d_epsilondev_xz,
- mp->d_epsilondev_yz,
+ mp->d_epsilondev_xx,mp->d_epsilondev_yy,mp->d_epsilondev_xy,
+ mp->d_epsilondev_xz,mp->d_epsilondev_yz,
mp->d_epsilon_trace_over_3,
mp->d_one_minus_sum_beta,
mp->d_factor_common,
- mp->d_R_xx,
- mp->d_R_yy,
- mp->d_R_xy,
- mp->d_R_xz,
- mp->d_R_yz,
- mp->d_b_epsilondev_xx,
- mp->d_b_epsilondev_yy,
- mp->d_b_epsilondev_xy,
- mp->d_b_epsilondev_xz,
- mp->d_b_epsilondev_yz,
+ mp->d_R_xx,mp->d_R_yy,mp->d_R_xy,
+ mp->d_R_xz,mp->d_R_yz,
+ mp->d_b_epsilondev_xx,mp->d_b_epsilondev_yy,mp->d_b_epsilondev_xy,
+ mp->d_b_epsilondev_xz,mp->d_b_epsilondev_yz,
mp->d_b_epsilon_trace_over_3,
- mp->d_b_R_xx,
- mp->d_b_R_yy,
- mp->d_b_R_xy,
- mp->d_b_R_xz,
- mp->d_b_R_yz,
- mp->d_c11store,
- mp->d_c12store,
- mp->d_c13store,
- mp->d_c14store,
- mp->d_c15store,
- mp->d_c16store,
- mp->d_c22store,
- mp->d_c23store,
- mp->d_c24store,
- mp->d_c25store,
- mp->d_c26store,
- mp->d_c33store,
- mp->d_c34store,
- mp->d_c35store,
- mp->d_c36store,
- mp->d_c44store,
- mp->d_c45store,
- mp->d_c46store,
- mp->d_c55store,
- mp->d_c56store,
- mp->d_c66store,
+ mp->d_b_R_xx,mp->d_b_R_yy,mp->d_b_R_xy,
+ mp->d_b_R_xz,mp->d_b_R_yz,
+ mp->d_c11store,mp->d_c12store,mp->d_c13store,
+ mp->d_c14store,mp->d_c15store,mp->d_c16store,
+ mp->d_c22store,mp->d_c23store,mp->d_c24store,
+ mp->d_c25store,mp->d_c26store,mp->d_c33store,
+ mp->d_c34store,mp->d_c35store,mp->d_c36store,
+ mp->d_c44store,mp->d_c45store,mp->d_c46store,
+ mp->d_c55store,mp->d_c56store,mp->d_c66store,
mp->d_rhostore);
}
}
@@ -1860,13 +3141,13 @@
extern "C"
void FC_FUNC_(sync_copy_from_device,
- SYNC_copy_FROM_DEVICE)(long* Mesh_pointer_f,
+ SYNC_copy_FROM_DEVICE)(long* Mesh_pointer,
int* iphase,
realw* send_buffer) {
TRACE("sync_copy_from_device");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+ Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
// Wait until async-memcpy of outer elements is finished and start MPI.
if( *iphase != 2 ){ exit_on_cuda_error("sync_copy_from_device must be called for iphase == 2"); }
@@ -1877,8 +3158,7 @@
// There have been problems using the pinned-memory with MPI, so
// we copy the buffer into a non-pinned region.
- memcpy(send_buffer,mp->h_send_accel_buffer,
- mp->size_mpi_buffer*sizeof(float));
+ memcpy(send_buffer,mp->h_send_accel_buffer,mp->size_mpi_buffer*sizeof(float));
}
// memory copy is now finished, so non-blocking MPI send can proceed
}
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/compute_kernels_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/compute_kernels_cuda.cu 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/compute_kernels_cuda.cu 2013-08-20 14:13:26 UTC (rev 22718)
@@ -45,20 +45,119 @@
/* ----------------------------------------------------------------------------------------------- */
+__global__ void compute_kernels_ani_cudakernel(int* ispec_is_elastic,
+ int* ibool,
+ realw* accel,
+ realw* b_displ,
+ realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+ realw* epsilondev_xz,realw* epsilondev_yz,
+ realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
+ realw* b_epsilondev_xz,realw* b_epsilondev_yz,
+ realw* rho_kl,
+ realw deltat,
+ realw* cijkl_kl,
+ realw* epsilon_trace_over_3,
+ realw* b_epsilon_trace_over_3,
+ int NSPEC_AB) {
+
+ int ispec = blockIdx.x + blockIdx.y*gridDim.x;
+ int ijk = threadIdx.x;
+ int ijk_ispec = ijk + NGLL3*ispec;
+ int ijk21_ispec = ijk + 21*NGLL3*ispec;
+
+ realw prod[21];
+ realw eps[6];
+ realw b_eps[6];
+ realw epsdev[6];
+ realw b_epsdev[6];
+ realw eps_trace_over_3,b_eps_trace_over_3;
+ int i,j;
+
+ // handles case when there is 1 extra block (due to rectangular grid)
+ if(ispec < NSPEC_AB) {
+
+ // elastic elements only
+ if( ispec_is_elastic[ispec] ) {
+ int iglob = ibool[ijk_ispec] - 1 ;
+
+ // anisotropic kernels:
+ // density kernel
+ rho_kl[ijk_ispec] += deltat * (accel[3*iglob]*b_displ[3*iglob]+
+ accel[3*iglob+1]*b_displ[3*iglob+1]+
+ accel[3*iglob+2]*b_displ[3*iglob+2]);
+
+
+ // anisotropic kernel
+ epsdev[0] = epsilondev_xx[ijk_ispec];
+ epsdev[1] = epsilondev_yy[ijk_ispec];
+ epsdev[2] = epsilondev_xy[ijk_ispec];
+ epsdev[3] = epsilondev_xz[ijk_ispec];
+ epsdev[4] = epsilondev_yz[ijk_ispec];
+
+ b_epsdev[0] = b_epsilondev_xx[ijk_ispec];
+ b_epsdev[1] = b_epsilondev_yy[ijk_ispec];
+ b_epsdev[2] = b_epsilondev_xy[ijk_ispec];
+ b_epsdev[3] = b_epsilondev_xz[ijk_ispec];
+ b_epsdev[4] = b_epsilondev_yz[ijk_ispec];
+
+ eps_trace_over_3 = epsilon_trace_over_3[ijk_ispec];
+ b_eps_trace_over_3 = b_epsilon_trace_over_3[ijk_ispec];
+
+ //! Building of the local matrix of the strain tensor
+ //! for the adjoint field and the regular backward field
+ //!eps11 et eps22
+ eps[0] = epsdev[0] + eps_trace_over_3;
+ eps[1] = epsdev[1] + eps_trace_over_3;
+ //!eps33
+ eps[2] = -(eps[0]+eps[1])+3*eps_trace_over_3;
+ //!eps23
+ eps[3] = epsdev[4];
+ //!eps13
+ eps[4] = epsdev[3];
+ //!eps12
+ eps[5] = epsdev[2];
+
+ // backward arrays
+ b_eps[0] = b_epsdev[0] + b_eps_trace_over_3;
+ b_eps[1] = b_epsdev[1] + b_eps_trace_over_3;
+ b_eps[2] = -(b_eps[0]+b_eps[1])+3*b_eps_trace_over_3;
+ b_eps[3] = b_epsdev[4];
+ b_eps[4] = b_epsdev[3];
+ b_eps[5] = b_epsdev[2];
+
+ //! Computing the 21 strain products without assuming eps(i)*b_eps(j) = eps(j)*b_eps(i)
+ int p = 0;
+ for( i=0; i<6; i++){
+ for( j=i; j<6; j++){
+ prod[p] = eps[i] * b_eps[j];
+ if( j > i ){
+ prod[p] = prod[p] + eps[j]*b_eps[i];
+ if( j > 2 && i < 3 ){ prod[p] = prod[p]*2; }
+ }
+ if(i > 2 ){ prod[p] = prod[p]*4; }
+ p++;
+ }
+ }
+
+ // all 21 anisotropic coefficients
+ for( i=0; i<21; i++){
+ cijkl_kl[i+ijk21_ispec] += deltat * prod[i];
+ }
+
+ }
+ }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
__global__ void compute_kernels_cudakernel(int* ispec_is_elastic,
int* ibool,
realw* accel,
realw* b_displ,
- realw* epsilondev_xx,
- realw* epsilondev_yy,
- realw* epsilondev_xy,
- realw* epsilondev_xz,
- realw* epsilondev_yz,
- realw* b_epsilondev_xx,
- realw* b_epsilondev_yy,
- realw* b_epsilondev_xy,
- realw* b_epsilondev_xz,
- realw* b_epsilondev_yz,
+ realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+ realw* epsilondev_xz,realw* epsilondev_yz,
+ realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
+ realw* b_epsilondev_xz,realw* b_epsilondev_yz,
realw* rho_kl,
realw deltat,
realw* mu_kl,
@@ -68,15 +167,14 @@
int NSPEC_AB) {
int ispec = blockIdx.x + blockIdx.y*gridDim.x;
+ int ijk = threadIdx.x;
+ int ijk_ispec = ijk + NGLL3*ispec;
// handles case when there is 1 extra block (due to rectangular grid)
if(ispec < NSPEC_AB) {
// elastic elements only
if( ispec_is_elastic[ispec] ) {
-
- int ijk = threadIdx.x;
- int ijk_ispec = ijk + NGLL3*ispec;
int iglob = ibool[ijk_ispec] - 1 ;
// isotropic kernels:
@@ -109,43 +207,62 @@
void FC_FUNC_(compute_kernels_elastic_cuda,
COMPUTE_KERNELS_ELASTIC_CUDA)(long* Mesh_pointer,
realw* deltat_f) {
-TRACE("compute_kernels_elastic_cuda");
+ TRACE("compute_kernels_elastic_cuda");
+
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ
realw deltat = *deltat_f;
- int num_blocks_x = mp->NSPEC_AB;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->NSPEC_AB,&num_blocks_x,&num_blocks_y);
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
- compute_kernels_cudakernel<<<grid,threads>>>(mp->d_ispec_is_elastic,mp->d_ibool,
- mp->d_accel, mp->d_b_displ,
- mp->d_epsilondev_xx,
- mp->d_epsilondev_yy,
- mp->d_epsilondev_xy,
- mp->d_epsilondev_xz,
- mp->d_epsilondev_yz,
- mp->d_b_epsilondev_xx,
- mp->d_b_epsilondev_yy,
- mp->d_b_epsilondev_xy,
- mp->d_b_epsilondev_xz,
- mp->d_b_epsilondev_yz,
- mp->d_rho_kl,
- deltat,
- mp->d_mu_kl,
- mp->d_kappa_kl,
- mp->d_epsilon_trace_over_3,
- mp->d_b_epsilon_trace_over_3,
- mp->NSPEC_AB);
+ if( mp->anisotropic_kl ){
+ compute_kernels_ani_cudakernel<<<grid,threads>>>(mp->d_ispec_is_elastic,mp->d_ibool,
+ mp->d_accel, mp->d_b_displ,
+ mp->d_epsilondev_xx,
+ mp->d_epsilondev_yy,
+ mp->d_epsilondev_xy,
+ mp->d_epsilondev_xz,
+ mp->d_epsilondev_yz,
+ mp->d_b_epsilondev_xx,
+ mp->d_b_epsilondev_yy,
+ mp->d_b_epsilondev_xy,
+ mp->d_b_epsilondev_xz,
+ mp->d_b_epsilondev_yz,
+ mp->d_rho_kl,
+ deltat,
+ mp->d_cijkl_kl,
+ mp->d_epsilon_trace_over_3,
+ mp->d_b_epsilon_trace_over_3,
+ mp->NSPEC_AB);
+ }else{
+ compute_kernels_cudakernel<<<grid,threads>>>(mp->d_ispec_is_elastic,mp->d_ibool,
+ mp->d_accel, mp->d_b_displ,
+ mp->d_epsilondev_xx,
+ mp->d_epsilondev_yy,
+ mp->d_epsilondev_xy,
+ mp->d_epsilondev_xz,
+ mp->d_epsilondev_yz,
+ mp->d_b_epsilondev_xx,
+ mp->d_b_epsilondev_yy,
+ mp->d_b_epsilondev_xy,
+ mp->d_b_epsilondev_xz,
+ mp->d_b_epsilondev_yz,
+ mp->d_rho_kl,
+ deltat,
+ mp->d_mu_kl,
+ mp->d_kappa_kl,
+ mp->d_epsilon_trace_over_3,
+ mp->d_b_epsilon_trace_over_3,
+ mp->NSPEC_AB);
+ }
+
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("compute_kernels_elastic_cuda");
#endif
@@ -171,12 +288,13 @@
realw deltat,
int num_free_surface_faces) {
int iface = blockIdx.x + blockIdx.y*gridDim.x;
+ int igll = threadIdx.x;
+ int ipoin = igll + NGLL2*iface;
if(iface < num_free_surface_faces) {
int ispec = free_surface_ispec[iface]-1;
- int igll = threadIdx.x;
- int ipoin = igll + NGLL2*iface;
+
int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1 ;
int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
@@ -206,20 +324,18 @@
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
- cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie,
- 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice);
+ // checks if anything to do
+ if( mp->num_free_surface_faces == 0 ) return;
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->num_free_surface_faces,&num_blocks_x,&num_blocks_y);
- int num_blocks_x = mp->num_free_surface_faces;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
-
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(NGLL2,1,1);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie,
+ NDIM*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice),81000);
+
compute_kernels_strength_noise_cuda_kernel<<<grid,threads>>>(mp->d_displ,
mp->d_free_surface_ispec,
mp->d_free_surface_ijk,
@@ -250,15 +366,9 @@
realw* scalar_field,
realw* vector_field_element,
realw* d_hprime_xx,
- realw* d_xix,
- realw* d_xiy,
- realw* d_xiz,
- realw* d_etax,
- realw* d_etay,
- realw* d_etaz,
- realw* d_gammax,
- realw* d_gammay,
- realw* d_gammaz,
+ realw* d_xix,realw* d_xiy,realw* d_xiz,
+ realw* d_etax,realw* d_etay,realw* d_etaz,
+ realw* d_gammax,realw* d_gammay,realw* d_gammaz,
realw rhol,
int gravity) {
@@ -268,7 +378,6 @@
realw rho_invl;
int l,offset,offset1,offset2,offset3;
- //const int NGLLX = 5;
const int NGLL3_ALIGN = NGLL3_PADDED;
int K = (ijk/NGLL2);
@@ -334,15 +443,9 @@
realw* rhostore,
realw* kappastore,
realw* d_hprime_xx,
- realw* d_xix,
- realw* d_xiy,
- realw* d_xiz,
- realw* d_etax,
- realw* d_etay,
- realw* d_etaz,
- realw* d_gammax,
- realw* d_gammay,
- realw* d_gammaz,
+ realw* d_xix,realw* d_xiy,realw* d_xiz,
+ realw* d_etax,realw* d_etay,realw* d_etaz,
+ realw* d_gammax,realw* d_gammay,realw* d_gammaz,
realw* potential_dot_dot_acoustic,
realw* b_potential_acoustic,
realw* b_potential_dot_dot_acoustic,
@@ -353,59 +456,65 @@
int gravity) {
int ispec = blockIdx.x + blockIdx.y*gridDim.x;
+ int ijk = threadIdx.x;
+ // local and global indices
+ int ijk_ispec = ijk + NGLL3*ispec;
+ int ijk_ispec_padded = ijk + NGLL3_PADDED*ispec;
+ int iglob;
+
+ // shared memory between all threads within this block
+ __shared__ realw scalar_field_displ[NGLL3];
+ __shared__ realw scalar_field_accel[NGLL3];
+
+ int active = 0;
+
// handles case when there is 1 extra block (due to rectangular grid)
if( ispec < NSPEC_AB ){
-
// acoustic elements only
- if( ispec_is_acoustic[ispec] ) {
+ if( ispec_is_acoustic[ispec] ){
+ active = 1;
- int ijk = threadIdx.x;
-
- // local and global indices
- int ijk_ispec = ijk + NGLL3*ispec;
- int ijk_ispec_padded = ijk + NGLL3_PADDED*ispec;
- int iglob = ibool[ijk_ispec] - 1;
-
- realw accel_elm[3];
- realw b_displ_elm[3];
- realw rhol,kappal;
-
- // shared memory between all threads within this block
- __shared__ realw scalar_field_displ[NGLL3];
- __shared__ realw scalar_field_accel[NGLL3];
-
// copy field values
+ iglob = ibool[ijk_ispec] - 1;
scalar_field_displ[ijk] = b_potential_acoustic[iglob];
scalar_field_accel[ijk] = potential_dot_dot_acoustic[iglob];
- __syncthreads();
+ }
+ }
- // gets material parameter
- rhol = rhostore[ijk_ispec_padded];
+ // synchronizes threads
+ __syncthreads();
- // displacement vector from backward field
- compute_gradient_kernel(ijk,ispec,scalar_field_displ,b_displ_elm,
- d_hprime_xx,
- d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
- rhol,gravity);
+ if( active ){
+ realw accel_elm[3];
+ realw b_displ_elm[3];
+ realw rhol,kappal;
- // acceleration vector
- compute_gradient_kernel(ijk,ispec,scalar_field_accel,accel_elm,
- d_hprime_xx,
- d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
- rhol,gravity);
+ // gets material parameter
+ rhol = rhostore[ijk_ispec_padded];
- // density kernel
- rho_ac_kl[ijk_ispec] -= deltat * rhol * (accel_elm[0]*b_displ_elm[0] +
- accel_elm[1]*b_displ_elm[1] +
- accel_elm[2]*b_displ_elm[2]);
+ // displacement vector from backward field
+ compute_gradient_kernel(ijk,ispec,scalar_field_displ,b_displ_elm,
+ d_hprime_xx,
+ d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
+ rhol,gravity);
- // bulk modulus kernel
- kappal = kappastore[ijk_ispec];
- kappa_ac_kl[ijk_ispec] -= deltat / kappal * potential_dot_dot_acoustic[iglob]
- * b_potential_dot_dot_acoustic[iglob];
- }
- }
+ // acceleration vector
+ compute_gradient_kernel(ijk,ispec,scalar_field_accel,accel_elm,
+ d_hprime_xx,
+ d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
+ rhol,gravity);
+
+ // density kernel
+ rho_ac_kl[ijk_ispec] -= deltat * rhol * (accel_elm[0]*b_displ_elm[0] +
+ accel_elm[1]*b_displ_elm[1] +
+ accel_elm[2]*b_displ_elm[2]);
+
+ // bulk modulus kernel
+ kappal = kappastore[ijk_ispec];
+ kappa_ac_kl[ijk_ispec] -= deltat / kappal * potential_dot_dot_acoustic[iglob]
+ * b_potential_dot_dot_acoustic[iglob];
+ } // active
}
/* ----------------------------------------------------------------------------------------------- */
@@ -413,8 +522,7 @@
extern "C"
void FC_FUNC_(compute_kernels_acoustic_cuda,
- COMPUTE_KERNELS_ACOUSTIC_CUDA)(
- long* Mesh_pointer,
+ COMPUTE_KERNELS_ACOUSTIC_CUDA)(long* Mesh_pointer,
realw* deltat_f) {
TRACE("compute_kernels_acoustic_cuda");
@@ -424,12 +532,8 @@
int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ
realw deltat = *deltat_f;
- int num_blocks_x = mp->NSPEC_AB;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->NSPEC_AB,&num_blocks_x,&num_blocks_y);
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
@@ -439,15 +543,9 @@
mp->d_rhostore,
mp->d_kappastore,
mp->d_hprime_xx,
- mp->d_xix,
- mp->d_xiy,
- mp->d_xiz,
- mp->d_etax,
- mp->d_etay,
- mp->d_etaz,
- mp->d_gammax,
- mp->d_gammay,
- mp->d_gammaz,
+ mp->d_xix,mp->d_xiy,mp->d_xiz,
+ mp->d_etax,mp->d_etay,mp->d_etaz,
+ mp->d_gammax,mp->d_gammay,mp->d_gammaz,
mp->d_potential_dot_dot_acoustic,
mp->d_b_potential_acoustic,
mp->d_b_potential_dot_dot_acoustic,
@@ -477,15 +575,14 @@
int NSPEC_AB) {
int ispec = blockIdx.x + blockIdx.y*gridDim.x;
+ int ijk = threadIdx.x;
+ int ijk_ispec = ijk + NGLL3*ispec;
// handles case when there is 1 extra block (due to rectangular grid)
if(ispec < NSPEC_AB) {
// elastic elements only
if( ispec_is_elastic[ispec] ) {
-
- int ijk = threadIdx.x;
- int ijk_ispec = ijk + NGLL3*ispec;
int iglob = ibool[ijk_ispec] - 1 ;
// approximate hessian
@@ -504,72 +601,72 @@
realw* b_potential_dot_dot_acoustic,
realw* rhostore,
realw* d_hprime_xx,
- realw* d_xix,
- realw* d_xiy,
- realw* d_xiz,
- realw* d_etax,
- realw* d_etay,
- realw* d_etaz,
- realw* d_gammax,
- realw* d_gammay,
- realw* d_gammaz,
+ realw* d_xix,realw* d_xiy,realw* d_xiz,
+ realw* d_etax,realw* d_etay,realw* d_etaz,
+ realw* d_gammax,realw* d_gammay,realw* d_gammaz,
realw* hess_kl,
realw deltat,
int NSPEC_AB,
int gravity) {
int ispec = blockIdx.x + blockIdx.y*gridDim.x;
+ int ijk = threadIdx.x;
+ int ijk_ispec = ijk + NGLL3*ispec;
+ int ijk_ispec_padded = ijk + NGLL3_PADDED*ispec;
+ int iglob;
+ // shared memory between all threads within this block
+ __shared__ realw scalar_field_accel[NGLL3];
+ __shared__ realw scalar_field_b_accel[NGLL3];
+
+ int active = 0;
+
// handles case when there is 1 extra block (due to rectangular grid)
if(ispec < NSPEC_AB) {
// acoustic elements only
if( ispec_is_acoustic[ispec] ){
+ active = 1;
- // local and global indices
- int ijk = threadIdx.x;
- int ijk_ispec = ijk + NGLL3*ispec;
- int iglob = ibool[ijk_ispec] - 1 ;
+ // global indices
+ iglob = ibool[ijk_ispec] - 1 ;
- int ijk_ispec_padded = ijk + NGLL3_PADDED*ispec;
-
- realw accel_elm[3];
- realw b_accel_elm[3];
- realw rhol;
-
- // shared memory between all threads within this block
- __shared__ realw scalar_field_accel[NGLL3];
- __shared__ realw scalar_field_b_accel[NGLL3];
-
// copy field values
scalar_field_accel[ijk] = potential_dot_dot_acoustic[iglob];
scalar_field_b_accel[ijk] = b_potential_dot_dot_acoustic[iglob];
- __syncthreads();
+ }
+ }
- // gets material parameter
- rhol = rhostore[ijk_ispec_padded];
+ // synchronizes threads
+ __syncthreads();
- // acceleration vector
- compute_gradient_kernel(ijk,ispec,
- scalar_field_accel,accel_elm,
- d_hprime_xx,
- d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
- rhol,gravity);
+ if( active ){
+ realw accel_elm[3];
+ realw b_accel_elm[3];
+ realw rhol;
- // acceleration vector from backward field
- compute_gradient_kernel(ijk,ispec,
- scalar_field_b_accel,b_accel_elm,
- d_hprime_xx,
- d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
- rhol,gravity);
- // approximates hessian
- hess_kl[ijk_ispec] += deltat * (accel_elm[0]*b_accel_elm[0] +
- accel_elm[1]*b_accel_elm[1] +
- accel_elm[2]*b_accel_elm[2]);
+ // gets material parameter
+ rhol = rhostore[ijk_ispec_padded];
- } // ispec_is_acoustic
+ // acceleration vector
+ compute_gradient_kernel(ijk,ispec,
+ scalar_field_accel,accel_elm,
+ d_hprime_xx,
+ d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
+ rhol,gravity);
- }
+ // acceleration vector from backward field
+ compute_gradient_kernel(ijk,ispec,
+ scalar_field_b_accel,b_accel_elm,
+ d_hprime_xx,
+ d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz,
+ rhol,gravity);
+ // approximates hessian
+ hess_kl[ijk_ispec] += deltat * (accel_elm[0]*b_accel_elm[0] +
+ accel_elm[1]*b_accel_elm[1] +
+ accel_elm[2]*b_accel_elm[2]);
+
+ } // active
}
/* ----------------------------------------------------------------------------------------------- */
@@ -587,12 +684,8 @@
int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ
realw deltat = *deltat_f;
- int num_blocks_x = mp->NSPEC_AB;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->NSPEC_AB,&num_blocks_x,&num_blocks_y);
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
@@ -614,15 +707,9 @@
mp->d_b_potential_dot_dot_acoustic,
mp->d_rhostore,
mp->d_hprime_xx,
- mp->d_xix,
- mp->d_xiy,
- mp->d_xiz,
- mp->d_etax,
- mp->d_etay,
- mp->d_etaz,
- mp->d_gammax,
- mp->d_gammay,
- mp->d_gammaz,
+ mp->d_xix,mp->d_xiy,mp->d_xiz,
+ mp->d_etax,mp->d_etay,mp->d_etaz,
+ mp->d_gammax,mp->d_gammay,mp->d_gammaz,
mp->d_hess_ac_kl,
deltat,
mp->NSPEC_AB,
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/compute_stacey_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/compute_stacey_acoustic_cuda.cu 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/compute_stacey_acoustic_cuda.cu 2013-08-20 14:13:26 UTC (rev 22718)
@@ -122,16 +122,18 @@
extern "C"
void FC_FUNC_(compute_stacey_acoustic_cuda,
- COMPUTE_STACEY_ACOUSTIC_CUDA)(long* Mesh_pointer_f,
+ COMPUTE_STACEY_ACOUSTIC_CUDA)(long* Mesh_pointer,
int* phase_is_innerf,
- int* SAVE_FORWARDf,
realw* h_b_absorb_potential) {
TRACE("compute_stacey_acoustic_cuda");
//double start_time = get_time();
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+ // checks if anything to do
+ if( mp->d_num_abs_boundary_faces == 0 ) return;
+
int phase_is_inner = *phase_is_innerf;
- int SAVE_FORWARD = *SAVE_FORWARDf;
// way 1: Elapsed time: 4.385948e-03
// > NGLLSQUARE==NGLL2==25, but we handle this inside kernel
@@ -141,18 +143,14 @@
// > NGLLSQUARE==NGLL2==25, no further check inside kernel
int blocksize = NGLL2;
- int num_blocks_x = mp->d_num_abs_boundary_faces;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->d_num_abs_boundary_faces,&num_blocks_x,&num_blocks_y);
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
// adjoint simulations: reads in absorbing boundary
- if (mp->simulation_type == 3 && mp->d_num_abs_boundary_faces > 0 ){
+ if (mp->simulation_type == 3 ){
// copies array to GPU
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_potential,h_b_absorb_potential,
mp->d_b_reclen_potential,cudaMemcpyHostToDevice),7700);
@@ -170,7 +168,7 @@
mp->d_ispec_is_acoustic,
phase_is_inner,
mp->simulation_type,
- SAVE_FORWARD,
+ mp->save_forward,
mp->d_num_abs_boundary_faces,
mp->d_b_potential_dot_acoustic,
mp->d_b_potential_dot_dot_acoustic,
@@ -178,7 +176,8 @@
mp->gravity);
// adjoint simulations: stores absorbed wavefield part
- if (mp->simulation_type == 1 && SAVE_FORWARD && mp->d_num_abs_boundary_faces > 0 ){
+ if (mp->simulation_type == 1 && mp->save_forward ){
+ // (cudaMemcpy implicitly synchronizes all other cuda operations)
// copies array to CPU
print_CUDA_error_if_any(cudaMemcpy(h_b_absorb_potential,mp->d_b_absorb_potential,
mp->d_b_reclen_potential,cudaMemcpyDeviceToHost),7701);
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/compute_stacey_viscoelastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/compute_stacey_viscoelastic_cuda.cu 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/compute_stacey_viscoelastic_cuda.cu 2013-08-20 14:13:26 UTC (rev 22718)
@@ -54,7 +54,6 @@
int SIMULATION_TYPE,
int SAVE_FORWARD,
int num_abs_boundary_faces,
- realw* b_accel,
realw* b_absorb_field) {
int igll = threadIdx.x; // tx
@@ -110,12 +109,7 @@
atomicAdd(&accel[iglob*3+1],-ty*jacobianw);
atomicAdd(&accel[iglob*3+2],-tz*jacobianw);
- if(SIMULATION_TYPE == 3) {
- atomicAdd(&b_accel[iglob*3 ],-b_absorb_field[INDEX3(NDIM,NGLL2,0,igll,iface)]);
- atomicAdd(&b_accel[iglob*3+1],-b_absorb_field[INDEX3(NDIM,NGLL2,1,igll,iface)]);
- atomicAdd(&b_accel[iglob*3+2],-b_absorb_field[INDEX3(NDIM,NGLL2,2,igll,iface)]);
- }
- else if(SAVE_FORWARD && SIMULATION_TYPE == 1) {
+ if(SAVE_FORWARD && SIMULATION_TYPE == 1) {
b_absorb_field[INDEX3(NDIM,NGLL2,0,igll,iface)] = tx*jacobianw;
b_absorb_field[INDEX3(NDIM,NGLL2,1,igll,iface)] = ty*jacobianw;
b_absorb_field[INDEX3(NDIM,NGLL2,2,igll,iface)] = tz*jacobianw;
@@ -127,23 +121,63 @@
/* ----------------------------------------------------------------------------------------------- */
+__global__ void compute_stacey_elastic_sim3_kernel(int* abs_boundary_ispec,
+ int* abs_boundary_ijk,
+ int* ibool,
+ int* ispec_is_inner,
+ int* ispec_is_elastic,
+ int phase_is_inner,
+ int num_abs_boundary_faces,
+ realw* b_accel,
+ realw* b_absorb_field) {
+ int igll = threadIdx.x; // tx
+ int iface = blockIdx.x + gridDim.x*blockIdx.y; // bx
+
+ int i,j,k,iglob,ispec;
+
+ // don't compute points outside NGLLSQUARE==NGLL2==25
+ // way 2: no further check needed since blocksize = 25
+ if( iface < num_abs_boundary_faces){
+
+ //if(igll < NGLL2 && iface < num_abs_boundary_faces) {
+
+ // "-1" from index values to convert from Fortran-> C indexing
+ ispec = abs_boundary_ispec[iface]-1;
+
+ if(ispec_is_inner[ispec] == phase_is_inner && ispec_is_elastic[ispec] ) {
+
+ i = abs_boundary_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
+ j = abs_boundary_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
+ k = abs_boundary_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
+
+ iglob = ibool[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)]-1;
+
+ atomicAdd(&b_accel[iglob*3 ],-b_absorb_field[INDEX3(NDIM,NGLL2,0,igll,iface)]);
+ atomicAdd(&b_accel[iglob*3+1],-b_absorb_field[INDEX3(NDIM,NGLL2,1,igll,iface)]);
+ atomicAdd(&b_accel[iglob*3+2],-b_absorb_field[INDEX3(NDIM,NGLL2,2,igll,iface)]);
+ }
+ } // num_abs_boundary_faces
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
extern "C"
void FC_FUNC_(compute_stacey_viscoelastic_cuda,
- COMPUTE_STACEY_VISCOELASTIC_CUDA)(long* Mesh_pointer_f,
+ COMPUTE_STACEY_VISCOELASTIC_CUDA)(long* Mesh_pointer,
int* phase_is_innerf,
- int* SAVE_FORWARDf,
- realw* h_b_absorb_field) {
+ realw* b_absorb_field) {
-TRACE("compute_stacey_viscoelastic_cuda");
+ TRACE("\tcompute_stacey_viscoelastic_cuda");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
- // check
+ // checks if anything to do
if( mp->d_num_abs_boundary_faces == 0 ) return;
int phase_is_inner = *phase_is_innerf;
- int SAVE_FORWARD = *SAVE_FORWARDf;
// way 1
// > NGLLSQUARE==NGLL2==25, but we handle this inside kernel
@@ -153,19 +187,15 @@
// > NGLLSQUARE==NGLL2==25, no further check inside kernel
int blocksize = NGLL2;
- int num_blocks_x = mp->d_num_abs_boundary_faces;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->d_num_abs_boundary_faces,&num_blocks_x,&num_blocks_y);
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
- if(mp->simulation_type == 3 && mp->d_num_abs_boundary_faces > 0) {
- // The read is done in fortran
- print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field,h_b_absorb_field,
+ if(mp->simulation_type == 3 ) {
+ // reading is done in fortran routine
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field,b_absorb_field,
mp->d_b_reclen_field,cudaMemcpyHostToDevice),7700);
}
@@ -173,7 +203,7 @@
exit_on_cuda_error("between cudamemcpy and compute_stacey_elastic_kernel");
#endif
- compute_stacey_elastic_kernel<<<grid,threads>>>(mp->d_veloc,
+ compute_stacey_elastic_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_veloc,
mp->d_accel,
mp->d_abs_boundary_ispec,
mp->d_abs_boundary_ijk,
@@ -186,11 +216,23 @@
mp->d_ispec_is_elastic,
phase_is_inner,
mp->simulation_type,
- SAVE_FORWARD,
+ mp->save_forward,
mp->d_num_abs_boundary_faces,
- mp->d_b_accel,
mp->d_b_absorb_field);
+ // adjoint simulations
+ if(mp->simulation_type == 3 ){
+ compute_stacey_elastic_sim3_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_abs_boundary_ispec,
+ mp->d_abs_boundary_ijk,
+ mp->d_ibool,
+ mp->d_ispec_is_inner,
+ mp->d_ispec_is_elastic,
+ phase_is_inner,
+ mp->d_num_abs_boundary_faces,
+ mp->d_b_accel,
+ mp->d_b_absorb_field);
+ }
+
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("compute_stacey_elastic_kernel");
#endif
@@ -199,11 +241,15 @@
// if (mp->simulation_type == 1 .and. SAVE_FORWARD .and. num_abs_boundary_faces > 0 ) &
// write(IOABS,rec=it) b_reclen_field,b_absorb_field,b_reclen_field
- if(mp->simulation_type == 1 && SAVE_FORWARD && mp->d_num_abs_boundary_faces > 0 ) {
- print_CUDA_error_if_any(cudaMemcpy(h_b_absorb_field,mp->d_b_absorb_field,
+ if(mp->simulation_type == 1 && mp->save_forward ) {
+ // explicitly wait until compute stream is done
+ // (cudaMemcpy implicitly synchronizes all other cuda operations)
+ cudaStreamSynchronize(mp->compute_stream);
+
+ // copies absorb_field values to CPU
+ print_CUDA_error_if_any(cudaMemcpy(b_absorb_field,mp->d_b_absorb_field,
mp->d_b_reclen_field,cudaMemcpyDeviceToHost),7701);
- // The write is done in fortran
- // write_abs_(&fid,(char*)b_absorb_field,&b_reclen_field,&it);
+ // writing is done in fortran routine
}
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/it_update_displacement_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/it_update_displacement_cuda.cu 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/it_update_displacement_cuda.cu 2013-08-20 14:13:26 UTC (rev 22718)
@@ -52,14 +52,20 @@
realw deltat,
realw deltatsqover2,
realw deltatover2) {
+
+ // two dimensional array of blocks on grid where each block has one dimensional array of threads
+ //int tid = threadIdx.x;
+ //int bx = blockIdx.y*gridDim.x+blockIdx.x;
+ //int id = tid + bx*blockDim.x;
+
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
- /* because of block and grid sizing problems, there is a small */
- /* amount of buffer at the end of the calculation */
+ // because of block and grid sizing problems, there is a small
+ // amount of buffer at the end of the calculation
if(id < size) {
displ[id] = displ[id] + deltat*veloc[id] + deltatsqover2*accel[id];
veloc[id] = veloc[id] + deltatover2*accel[id];
- accel[id] = 0; // can do this using memset...not sure if faster
+ accel[id] = 0.0f; // can do this using memset...not sure if faster,probably not
}
}
@@ -67,8 +73,7 @@
extern "C"
void FC_FUNC_(it_update_displacement_cuda,
- IT_UPDATE_DISPLACMENT_CUDA)(long* Mesh_pointer_f,
- int* size_F,
+ IT_UPDATE_DISPLACMENT_CUDA)(long* Mesh_pointer,
realw* deltat_F,
realw* deltatsqover2_F,
realw* deltatover2_F,
@@ -76,48 +81,36 @@
realw* b_deltatsqover2_F,
realw* b_deltatover2_F) {
-TRACE("it_update_displacement_cuda");
+ TRACE("\tit_update_displacement_cuda");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+ Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
- //int i,device;
+ realw deltat = *deltat_F;
+ realw deltatsqover2 = *deltatsqover2_F;
+ realw deltatover2 = *deltatover2_F;
- int size = *size_F;
- //cublasStatus status;
+ int size = NDIM * mp->NGLOB_AB;
int blocksize = BLOCKSIZE_KERNEL1;
int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
+ // debug
+ //realw max_d,max_v,max_a;
+ //max_d = get_device_array_maximum_value(mp->d_displ, size);
+ //max_v = get_device_array_maximum_value(mp->d_veloc, size);
+ //max_a = get_device_array_maximum_value(mp->d_accel, size);
+ //printf("rank %d - max displ: %f veloc: %f accel: %f\n",mp->myrank,max_d,max_v,max_a);
-//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-// exit_on_cuda_error("Before UpdateDispVeloc_kernel");
-//#endif
-
- realw deltat = *deltat_F;
- realw deltatsqover2 = *deltatsqover2_F;
- realw deltatover2 = *deltatover2_F;
-
//launch kernel
UpdateDispVeloc_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_displ,mp->d_veloc,mp->d_accel,
- size,deltat,deltatsqover2,deltatover2);
+ size,deltat,deltatsqover2,deltatover2);
- //cudaThreadSynchronize();
-//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-// //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
-// // sync and check to catch errors from previous async operations
-// exit_on_cuda_error("UpdateDispVeloc_kernel");
-//#endif
-
// kernel for backward fields
if(mp->simulation_type == 3) {
realw b_deltat = *b_deltat_F;
@@ -125,12 +118,7 @@
realw b_deltatover2 = *b_deltatover2_F;
UpdateDispVeloc_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_displ,mp->d_b_veloc,mp->d_b_accel,
- size,b_deltat,b_deltatsqover2,b_deltatover2);
-
-//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-// //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
-// exit_on_cuda_error("after SIM_TYPE==3 UpdateDispVeloc_kernel");
-//#endif
+ size,b_deltat,b_deltatsqover2,b_deltatover2);
}
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -152,10 +140,11 @@
realw deltat,
realw deltatsqover2,
realw deltatover2) {
+
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
- /* because of block and grid sizing problems, there is a small */
- /* amount of buffer at the end of the calculation */
+ // because of block and grid sizing problems, there is a small
+ // amount of buffer at the end of the calculation
if(id < size) {
potential_acoustic[id] = potential_acoustic[id]
+ deltat*potential_dot_acoustic[id]
@@ -164,7 +153,7 @@
potential_dot_acoustic[id] = potential_dot_acoustic[id]
+ deltatover2*potential_dot_dot_acoustic[id];
- potential_dot_dot_acoustic[id] = 0;
+ potential_dot_dot_acoustic[id] = 0.0f;
}
}
@@ -172,56 +161,50 @@
extern "C"
void FC_FUNC_(it_update_displacement_ac_cuda,
- it_update_displacement_ac_cuda)(long* Mesh_pointer_f,
- int* size_F,
+ it_update_displacement_ac_cuda)(long* Mesh_pointer,
realw* deltat_F,
realw* deltatsqover2_F,
realw* deltatover2_F,
realw* b_deltat_F,
realw* b_deltatsqover2_F,
realw* b_deltatover2_F) {
-TRACE("it_update_displacement_ac_cuda");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+ TRACE("\tit_update_displacement_ac_cuda");
+ Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
- //int i,device;
- int size = *size_F;
- //cublasStatus status;
+ int size = mp->NGLOB_AB;
int blocksize = BLOCKSIZE_KERNEL1;
int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
+ //launch kernel
+ // forward wavefields
realw deltat = *deltat_F;
realw deltatsqover2 = *deltatsqover2_F;
realw deltatover2 = *deltatover2_F;
- //launch kernel
UpdatePotential_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_potential_acoustic,
- mp->d_potential_dot_acoustic,
- mp->d_potential_dot_dot_acoustic,
- size,deltat,deltatsqover2,deltatover2);
+ mp->d_potential_dot_acoustic,
+ mp->d_potential_dot_dot_acoustic,
+ size,deltat,deltatsqover2,deltatover2);
+ // backward/reconstructed wavefields
if(mp->simulation_type == 3) {
realw b_deltat = *b_deltat_F;
realw b_deltatsqover2 = *b_deltatsqover2_F;
realw b_deltatover2 = *b_deltatover2_F;
UpdatePotential_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_b_potential_acoustic,
- mp->d_b_potential_dot_acoustic,
- mp->d_b_potential_dot_dot_acoustic,
- size,b_deltat,b_deltatsqover2,b_deltatover2);
+ mp->d_b_potential_dot_acoustic,
+ mp->d_b_potential_dot_dot_acoustic,
+ size,b_deltat,b_deltatsqover2,b_deltatover2);
}
- //cudaThreadSynchronize();
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
//printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
exit_on_cuda_error("it_update_displacement_ac_cuda");
@@ -244,10 +227,11 @@
realw* rmassx,
realw* rmassy,
realw* rmassz) {
+
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
- /* because of block and grid sizing problems, there is a small */
- /* amount of buffer at the end of the calculation */
+ // because of block and grid sizing problems, there is a small
+ // amount of buffer at the end of the calculation
if(id < size) {
accel[3*id] = accel[3*id]*rmassx[id];
accel[3*id+1] = accel[3*id+1]*rmassy[id];
@@ -268,8 +252,8 @@
realw* rmassz) {
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
- /* because of block and grid sizing problems, there is a small */
- /* amount of buffer at the end of the calculation */
+ // because of block and grid sizing problems, there is a small
+ // amount of buffer at the end of the calculation
if(id < size) {
accel[3*id] = accel[3*id]*rmassx[id];
accel[3*id+1] = accel[3*id+1]*rmassy[id];
@@ -283,10 +267,11 @@
realw* accel,
int size,
realw deltatover2) {
+
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
- /* because of block and grid sizing problems, there is a small */
- /* amount of buffer at the end of the calculation */
+ // because of block and grid sizing problems, there is a small
+ // amount of buffer at the end of the calculation
if(id < size) {
veloc[3*id] = veloc[3*id] + deltatover2*accel[3*id];
veloc[3*id+1] = veloc[3*id+1] + deltatover2*accel[3*id+1];
@@ -299,65 +284,60 @@
extern "C"
void FC_FUNC_(kernel_3_a_cuda,
KERNEL_3_A_CUDA)(long* Mesh_pointer,
- int* size_F,
realw* deltatover2_F,
realw* b_deltatover2_F,
int* APPROXIMATE_OCEAN_LOAD) {
-TRACE("kernel_3_a_cuda");
- Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
+ TRACE("\tkernel_3_a_cuda");
- int size = *size_F;
+ Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
- realw deltatover2 = *deltatover2_F;
- realw b_deltatover2 = *b_deltatover2_F;
+ int size = mp->NGLOB_AB;
- int blocksize = BLOCKSIZE_KERNEL3;
- int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+ int blocksize = BLOCKSIZE_KERNEL3;
+ int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(blocksize,1,1);
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
- // check whether we can update accel and veloc, or only accel at this point
- if( *APPROXIMATE_OCEAN_LOAD == 0 ){
- // updates both, accel and veloc
- kernel_3_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_veloc,
- mp->d_accel,
- size, deltatover2,
+ // check whether we can update accel and veloc, or only accel at this point
+ if( *APPROXIMATE_OCEAN_LOAD == 0 ){
+ realw deltatover2 = *deltatover2_F;
+
+ // updates both, accel and veloc
+ kernel_3_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_veloc,
+ mp->d_accel,
+ size, deltatover2,
+ mp->d_rmassx,mp->d_rmassy,mp->d_rmassz);
+ if(mp->simulation_type == 3) {
+ realw b_deltatover2 = *b_deltatover2_F;
+ kernel_3_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_veloc,
+ mp->d_b_accel,
+ size, b_deltatover2,
mp->d_rmassx,mp->d_rmassy,mp->d_rmassz);
+ }
+ }else{
+ // updates only accel
+ kernel_3_accel_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_accel,
+ size,
+ mp->d_rmassx,
+ mp->d_rmassy,
+ mp->d_rmassz);
- if(mp->simulation_type == 3) {
- kernel_3_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_veloc,
- mp->d_b_accel,
- size, b_deltatover2,
- mp->d_rmassx,mp->d_rmassy,mp->d_rmassz);
- }
- }else{
- // updates only accel
- kernel_3_accel_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_accel,
+ if(mp->simulation_type == 3) {
+ kernel_3_accel_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_accel,
size,
mp->d_rmassx,
mp->d_rmassy,
mp->d_rmassz);
-
- if(mp->simulation_type == 3) {
- kernel_3_accel_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_accel,
- size,
- mp->d_rmassx,
- mp->d_rmassy,
- mp->d_rmassz);
- }
}
+ }
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
+ //printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
exit_on_cuda_error("after kernel 3 a");
#endif
}
@@ -367,36 +347,31 @@
extern "C"
void FC_FUNC_(kernel_3_b_cuda,
KERNEL_3_B_CUDA)(long* Mesh_pointer,
- int* size_F,
- realw* deltatover2_F,
- realw* b_deltatover2_F) {
- TRACE("kernel_3_b_cuda");
+ realw* deltatover2_F,
+ realw* b_deltatover2_F) {
+ TRACE("\tkernel_3_b_cuda");
Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
- int size = *size_F;
- realw deltatover2 = *deltatover2_F;
- realw b_deltatover2 = *b_deltatover2_F;
+ int size = mp->NGLOB_AB;
int blocksize = BLOCKSIZE_KERNEL3;
int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
+ realw deltatover2 = *deltatover2_F;
// updates only veloc at this point
kernel_3_veloc_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_veloc,
mp->d_accel,
size,deltatover2);
if(mp->simulation_type == 3) {
+ realw b_deltatover2 = *b_deltatover2_F;
kernel_3_veloc_cuda_device<<< grid, threads,0,mp->compute_stream>>>(mp->d_b_veloc,
mp->d_b_accel,
size,b_deltatover2);
@@ -421,10 +396,11 @@
__global__ void kernel_3_a_acoustic_cuda_device(realw* potential_dot_dot_acoustic,
int size,
realw* rmass_acoustic) {
+
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
- /* because of block and grid sizing problems, there is a small */
- /* amount of buffer at the end of the calculation */
+ // because of block and grid sizing problems, there is a small
+ // amount of buffer at the end of the calculation
if(id < size) {
// multiplies pressure with the inverse of the mass matrix
potential_dot_dot_acoustic[id] = potential_dot_dot_acoustic[id]*rmass_acoustic[id];
@@ -440,8 +416,8 @@
realw* rmass_acoustic) {
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
- /* because of block and grid sizing problems, there is a small */
- /* amount of buffer at the end of the calculation */
+ // because of block and grid sizing problems, there is a small
+ // amount of buffer at the end of the calculation
if(id < size) {
// Newmark time scheme: corrector term
potential_dot_acoustic[id] = potential_dot_acoustic[id] + deltatover2*potential_dot_dot_acoustic[id];
@@ -451,36 +427,34 @@
/* ----------------------------------------------------------------------------------------------- */
extern "C"
-void FC_FUNC_(kernel_3_a_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)(
- long* Mesh_pointer,
- int* size_F) {
+void FC_FUNC_(kernel_3_a_acoustic_cuda,
+ KERNEL_3_ACOUSTIC_CUDA)(long* Mesh_pointer ) {
TRACE("kernel_3_a_acoustic_cuda");
- Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
- int size = *size_F;
+ Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
- int blocksize = BLOCKSIZE_KERNEL3;
- int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(blocksize,1,1);
+ int size = mp->NGLOB_AB;
- kernel_3_a_acoustic_cuda_device<<< grid, threads>>>(mp->d_potential_dot_dot_acoustic,
+ int blocksize = BLOCKSIZE_KERNEL3;
+ int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
+
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
+
+ kernel_3_a_acoustic_cuda_device<<< grid, threads>>>(mp->d_potential_dot_dot_acoustic,
+ size,
+ mp->d_rmass_acoustic);
+
+ if(mp->simulation_type == 3) {
+ kernel_3_a_acoustic_cuda_device<<< grid, threads>>>(mp->d_b_potential_dot_dot_acoustic,
size,
mp->d_rmass_acoustic);
+ }
- if(mp->simulation_type == 3) {
- kernel_3_a_acoustic_cuda_device<<< grid, threads>>>(mp->d_b_potential_dot_dot_acoustic,
- size,
- mp->d_rmass_acoustic);
- }
-
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
//printf("checking updatedispl_kernel launch...with %dx%d blocks\n",num_blocks_x,num_blocks_y);
exit_on_cuda_error("after kernel 3 a");
@@ -490,41 +464,40 @@
/* ----------------------------------------------------------------------------------------------- */
extern "C"
-void FC_FUNC_(kernel_3_b_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)(
- long* Mesh_pointer,
- int* size_F,
- realw* deltatover2_F,
- realw* b_deltatover2_F) {
+void FC_FUNC_(kernel_3_b_acoustic_cuda,
+ KERNEL_3_ACOUSTIC_CUDA)(long* Mesh_pointer,
+ realw* deltatover2_F,
+ realw* b_deltatover2_F) {
TRACE("kernel_3_b_acoustic_cuda");
Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
- int size = *size_F;
- realw deltatover2 = *deltatover2_F;
- realw b_deltatover2 = *b_deltatover2_F;
+ int size = mp->NGLOB_AB;
int blocksize = BLOCKSIZE_KERNEL3;
int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
- int num_blocks_x = size_padded/blocksize;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(size_padded/blocksize,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
+ realw deltatover2 = *deltatover2_F;
+
kernel_3_b_acoustic_cuda_device<<< grid, threads>>>(mp->d_potential_dot_acoustic,
- mp->d_potential_dot_dot_acoustic,
- size, deltatover2,
- mp->d_rmass_acoustic);
+ mp->d_potential_dot_dot_acoustic,
+ size, deltatover2,
+ mp->d_rmass_acoustic);
if(mp->simulation_type == 3) {
+ realw b_deltatover2 = *b_deltatover2_F;
+
kernel_3_b_acoustic_cuda_device<<< grid, threads>>>(mp->d_b_potential_dot_acoustic,
- mp->d_b_potential_dot_dot_acoustic,
- size, b_deltatover2,
- mp->d_rmass_acoustic);
+ mp->d_b_potential_dot_dot_acoustic,
+ size, b_deltatover2,
+ mp->d_rmass_acoustic);
}
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/mesh_constants_cuda.h 2013-08-20 14:13:26 UTC (rev 22718)
@@ -76,14 +76,6 @@
// maximum function
#define MAX(x,y) (((x) < (y)) ? (y) : (x))
-// utility functions: defined in check_fields_cuda.cu
-double get_time();
-void get_free_memory(double* free_db, double* used_db, double* total_db);
-void print_CUDA_error_if_any(cudaError_t err, int num);
-void pause_for_debugger(int pause);
-void exit_on_cuda_error(char* kernel_name);
-void exit_on_error(char* info);
-
/* ----------------------------------------------------------------------------------------------- */
// cuda constant arrays
@@ -108,13 +100,6 @@
/* ----------------------------------------------------------------------------------------------- */
-// type of "working" variables: see also CUSTOM_REAL
-// double precision temporary variables leads to 10% performance decrease
-// in Kernel_2_impl (not very much..)
-typedef float realw;
-
-/* ----------------------------------------------------------------------------------------------- */
-
// (optional) pre-processing directive used in kernels: if defined check that it is also set in src/shared/constants.h:
// leads up to ~ 5% performance increase
//#define USE_MESH_COLORING_GPU
@@ -124,7 +109,7 @@
// Texture memory usage:
// requires CUDA version >= 4.0, see check below
// Use textures for d_displ and d_accel -- 10% performance boost
-#define USE_TEXTURES_FIELDS
+//#define USE_TEXTURES_FIELDS
// Using texture memory for the hprime-style constants is slower on
// Fermi generation hardware, but *may* be faster on Kepler
@@ -157,28 +142,59 @@
#define BLOCKSIZE_KERNEL3 128
#define BLOCKSIZE_TRANSFER 256
+// maximum grid dimension in one direction of GPU
+#define MAXIMUM_GRID_DIM 65535
+
/* ----------------------------------------------------------------------------------------------- */
// indexing
-
#define INDEX2(xsize,x,y) x + (y)*xsize
#define INDEX3(xsize,ysize,x,y,z) x + xsize*(y + ysize*z)
-//#define INDEX3(xsize,ysize,x,y,z) x + (y)*xsize + (z)*xsize*ysize
#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*(z + zsize*i))
-//#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize
#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + xsize*(y + ysize*(z + zsize*(i + isize*(j))))
-//#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize + (j)*xsize*ysize*zsize*isize
#define INDEX6(xsize,ysize,zsize,isize,jsize,x,y,z,i,j,k) x + xsize*(y + ysize*(z + zsize*(i + isize*(j + jsize*k))))
#define INDEX4_PADDED(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*z) + (i)*NGLL3_PADDED
-//#define INDEX4_PADDED(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*NGLL3_PADDED
/* ----------------------------------------------------------------------------------------------- */
+// custom type declarations
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// type of "working" variables: see also CUSTOM_REAL
+// double precision temporary variables leads to 10% performance decrease
+// in Kernel_2_impl (not very much..)
+typedef float realw;
+
+// textures
+typedef texture<float, cudaTextureType1D, cudaReadModeElementType> realw_texture;
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// utility functions: defined in check_fields_cuda.cu
+
+/* ----------------------------------------------------------------------------------------------- */
+
+double get_time();
+void get_free_memory(double* free_db, double* used_db, double* total_db);
+void print_CUDA_error_if_any(cudaError_t err, int num);
+void pause_for_debugger(int pause);
+void exit_on_cuda_error(char* kernel_name);
+void exit_on_error(char* info);
+void synchronize_cuda();
+void synchronize_mpi();
+void get_blocks_xy(int num_blocks,int* num_blocks_x,int* num_blocks_y);
+realw get_device_array_maximum_value(realw* array,int size);
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
// mesh pointer wrapper structure
/* ----------------------------------------------------------------------------------------------- */
@@ -189,11 +205,12 @@
int NSPEC_AB;
int NGLOB_AB;
+ // mpi process
int myrank;
- int NPROC;
// constants
int simulation_type;
+ int save_forward;
int use_mesh_coloring_gpu;
int absorbing_conditions;
int gravity;
@@ -229,33 +246,25 @@
realw* d_wgllwgll_xy; realw* d_wgllwgll_xz; realw* d_wgllwgll_yz;
realw* d_wgll_cube;
-#ifdef USE_TEXTURES_CONSTANTS
- const textureReference* d_hprime_xx_tex_ptr;
- realw* d_hprime_xx_tex;
-#endif
-
-
// A buffer for mpi-send/recv, which is duplicated in fortran but is
// allocated with pinned memory to facilitate asynchronus device <->
// host memory transfers
float* h_send_accel_buffer;
float* h_send_b_accel_buffer;
+
float* send_buffer;
float* h_recv_accel_buffer;
float* h_recv_b_accel_buffer;
float* recv_buffer;
+
int size_mpi_buffer;
+ int size_mpi_buffer_potential;
- // buffers and constants for the MPI-send required for async-memcpy
- // + non-blocking MPI
- //daniel: check if needed
- //float* buffer_recv_vector_ext_mesh;
+ // mpi interfaces
int num_interfaces_ext_mesh;
int max_nibool_interfaces_ext_mesh;
- //int* nibool_interfaces_ext_mesh;
- //int* my_neighbours_ext_mesh;
- //int* request_send_vector_ext_mesh;
- //int* request_recv_vector_ext_mesh;
+ int* d_nibool_interfaces_ext_mesh;
+ int* d_ibool_interfaces_ext_mesh;
// overlapped memcpy streams
cudaStream_t compute_stream;
@@ -271,13 +280,6 @@
// backward/reconstructed elastic wavefield
realw* d_b_displ; realw* d_b_veloc; realw* d_b_accel;
-#ifdef USE_TEXTURES_FIELDS
- // Texture references for fast non-coalesced scattered access
- const textureReference* d_displ_tex_ref_ptr;
- const textureReference* d_veloc_tex_ref_ptr;
- const textureReference* d_accel_tex_ref_ptr;
-#endif
-
// elastic elements
int* d_ispec_is_elastic;
@@ -296,11 +298,8 @@
// mpi buffer
realw* d_send_accel_buffer;
+ realw* d_b_send_accel_buffer;
- // interfaces
- int* d_nibool_interfaces_ext_mesh;
- int* d_ibool_interfaces_ext_mesh;
-
//used for absorbing stacey boundaries
int d_num_abs_boundary_faces;
int* d_abs_boundary_ispec;
@@ -426,9 +425,11 @@
realw* d_b_gammaval;
// sensitivity kernels
+ int anisotropic_kl;
realw* d_rho_kl;
realw* d_mu_kl;
realw* d_kappa_kl;
+ realw* d_cijkl_kl;
// noise sensitivity kernel
realw* d_Sigma_kl;
@@ -468,6 +469,7 @@
// mpi buffer
realw* d_send_potential_dot_dot_buffer;
+ realw* d_b_send_potential_dot_dot_buffer;
realw* d_b_absorb_potential;
int d_b_reclen_potential;
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/noise_tomography_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/noise_tomography_cuda.cu 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/noise_tomography_cuda.cu 2013-08-20 14:13:26 UTC (rev 22718)
@@ -102,15 +102,15 @@
// randomize displ for testing
extern "C"
-void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) {
+void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer,realw* h_displ) {
TRACE("make_displ_rand");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+ Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
// realw* displ_rnd = (realw*)malloc(mp->NGLOB_AB*3*sizeof(realw));
for(int i=0;i<mp->NGLOB_AB*3;i++) {
h_displ[i] = rand();
}
- cudaMemcpy(mp->d_displ,h_displ,mp->NGLOB_AB*3*sizeof(realw),cudaMemcpyHostToDevice);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_displ,h_displ,mp->NGLOB_AB*3*sizeof(realw),cudaMemcpyHostToDevice),44001);
}
/* ----------------------------------------------------------------------------------------------- */
@@ -145,18 +145,15 @@
extern "C"
void FC_FUNC_(transfer_surface_to_host,
- TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer_f,
+ TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer,
realw* h_noise_surface_movie) {
TRACE("transfer_surface_to_host");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
+ Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
- int num_blocks_x = mp->num_free_surface_faces;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->num_free_surface_faces,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y,1);
dim3 threads(NGLL2,1,1);
@@ -167,8 +164,8 @@
mp->d_displ,
mp->d_noise_surface_movie);
- cudaMemcpy(h_noise_surface_movie,mp->d_noise_surface_movie,
- 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyDeviceToHost);
+ print_CUDA_error_if_any(cudaMemcpy(h_noise_surface_movie,mp->d_noise_surface_movie,
+ 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyDeviceToHost),44002);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("transfer_surface_to_host");
@@ -250,25 +247,22 @@
extern "C"
void FC_FUNC_(noise_read_add_surface_movie_cu,
- NOISE_READ_ADD_SURFACE_MOVIE_CU)(long* Mesh_pointer_f,
+ NOISE_READ_ADD_SURFACE_MOVIE_CU)(long* Mesh_pointer,
realw* h_noise_surface_movie,
int* NOISE_TOMOGRAPHYf) {
-TRACE("noise_read_add_surface_movie_cu");
+ TRACE("noise_read_add_surface_movie_cu");
// EPIK_TRACER("noise_read_add_surface_movie_cu");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
int NOISE_TOMOGRAPHY = *NOISE_TOMOGRAPHYf;
- cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie,
- 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie,
+ 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice),44003);
- int num_blocks_x = mp->num_free_surface_faces;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->num_free_surface_faces,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y,1);
dim3 threads(NGLL2,1,1);
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_constants_cuda.h 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_constants_cuda.h 2013-08-20 14:13:26 UTC (rev 22718)
@@ -31,7 +31,19 @@
typedef float realw; // type of "working" variables
+// CUDA version >= 5.0 needed for new symbol addressing and texture binding
+#if CUDA_VERSION < 5000
+ #ifndef USE_OLDER_CUDA4_GPU
+ #define USE_OLDER_CUDA4_GPU
+ #endif
+#else
+ #undef USE_OLDER_CUDA4_GPU
+#endif
+#ifdef USE_OLDER_CUDA4_GPU
+#pragma message ("\nCompiling with: USE_OLDER_CUDA4_GPU enabled\n")
+#endif
+
/* ----------------------------------------------------------------------------------------------- */
// CONSTANT arrays setup
@@ -279,51 +291,4 @@
}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-/* CUDA specific things from specfem3D_kernels.cu */
-
-// older TEXTURE usage. For now just acoustic simulations. See usage
-// of USE_TEXTURES_FIELDS elsewhere in code for elastic case
-#ifdef USE_TEXTURES
-
-// declaration of textures
-texture<realw, 1, cudaReadModeElementType> tex_potential_acoustic;
-texture<realw, 1, cudaReadModeElementType> tex_potential_dot_dot_acoustic;
-
-
- void bindTexturesPotential(realw* d_potential_acoustic)
- {
- cudaError_t err;
-
- cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
-
- err = cudaBindTexture(NULL,tex_potential_acoustic, d_potential_acoustic,
- channelDescFloat, NGLOB*sizeof(realw));
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Error in bindTexturesPotential for potential_acoustic: %s\n", cudaGetErrorString(err));
- exit(1);
- }
- }
-
- void bindTexturesPotential_dot_dot(realw* d_potential_dot_dot_acoustic)
- {
- cudaError_t err;
-
- cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
-
- err = cudaBindTexture(NULL,tex_potential_dot_dot_acoustic, d_potential_dot_dot_acoustic,
- channelDescFloat, NGLOB*sizeof(realw));
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Error in bindTexturesPotential_dot_dot for potential_dot_dot_acoustic: %s\n", cudaGetErrorString(err));
- exit(1);
- }
- }
-
-#endif // USE_TEXTURES
-
-
#endif //CUDA_HEADER_H
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/prepare_mesh_constants_cuda.cu 2013-08-20 14:13:26 UTC (rev 22718)
@@ -44,13 +44,23 @@
#ifdef USE_OLDER_CUDA4_GPU
#else
#ifdef USE_TEXTURES_FIELDS
-extern texture<realw, cudaTextureType1D, cudaReadModeElementType> d_displ_tex;
-extern texture<realw, cudaTextureType1D, cudaReadModeElementType> d_veloc_tex;
-extern texture<realw, cudaTextureType1D, cudaReadModeElementType> d_accel_tex;
+ // elastic
+ extern realw_texture d_displ_tex;
+ extern realw_texture d_veloc_tex;
+ extern realw_texture d_accel_tex;
+ // backward/reconstructed
+ extern realw_texture d_b_displ_tex;
+ extern realw_texture d_b_veloc_tex;
+ extern realw_texture d_b_accel_tex;
+ // acoustic
+ extern realw_texture d_potential_tex;
+ extern realw_texture d_potential_dot_dot_tex;
+ // backward/reconstructed
+ extern realw_texture d_b_potential_tex;
+ extern realw_texture d_b_potential_dot_dot_tex;
#endif
-
#ifdef USE_TEXTURES_CONSTANTS
-extern texture<realw, cudaTextureType1D, cudaReadModeElementType> d_hprime_xx_tex;
+ extern realw_texture d_hprime_xx_tex;
#endif
#endif
@@ -64,7 +74,7 @@
// copies integer array from CPU host to GPU device
void copy_todevice_int(void** d_array_addr_ptr,int* h_array,int size){
- TRACE("copy_todevice_int");
+ TRACE(" copy_todevice_int");
// allocates memory on GPU
//
@@ -86,7 +96,7 @@
// copies integer array from CPU host to GPU device
void copy_todevice_realw(void** d_array_addr_ptr,realw* h_array,int size){
- TRACE("copy_todevice_realw");
+ TRACE(" copy_todevice_realw");
// allocates memory on GPU
print_CUDA_error_if_any(cudaMalloc((void**)d_array_addr_ptr,size*sizeof(realw)),
@@ -106,19 +116,15 @@
extern "C"
void FC_FUNC_(prepare_constants_device,
PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
- int* h_NGLLX,
- int* NSPEC_AB, int* NGLOB_AB,
+ int* h_NGLLX, int* NSPEC_AB, int* NGLOB_AB,
realw* h_xix, realw* h_xiy, realw* h_xiz,
realw* h_etax, realw* h_etay, realw* h_etaz,
realw* h_gammax, realw* h_gammay, realw* h_gammaz,
realw* h_kappav, realw* h_muv,
int* h_ibool,
- int* num_interfaces_ext_mesh,
- int* max_nibool_interfaces_ext_mesh,
- int* h_nibool_interfaces_ext_mesh,
- int* h_ibool_interfaces_ext_mesh,
- realw* h_hprime_xx,
- realw* h_hprimewgll_xx,
+ int* num_interfaces_ext_mesh, int* max_nibool_interfaces_ext_mesh,
+ int* h_nibool_interfaces_ext_mesh, int* h_ibool_interfaces_ext_mesh,
+ realw* h_hprime_xx, realw* h_hprimewgll_xx,
realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz,
int* ABSORBING_CONDITIONS,
int* h_abs_boundary_ispec, int* h_abs_boundary_ijk,
@@ -126,24 +132,18 @@
realw* h_abs_boundary_jacobian2Dw,
int* h_num_abs_boundary_faces,
int* h_ispec_is_inner,
- int* NSOURCES,
- int* nsources_local_f,
+ int* NSOURCES, int* nsources_local_f,
realw* h_sourcearrays,
- int* h_islice_selected_source,
- int* h_ispec_selected_source,
- int* h_number_receiver_global,
- int* h_ispec_selected_rec,
+ int* h_islice_selected_source, int* h_ispec_selected_source,
+ int* h_number_receiver_global, int* h_ispec_selected_rec,
int* nrec,int* nrec_local,
int* SIMULATION_TYPE,
int* USE_MESH_COLORING_GPU_f,
int* nspec_acoustic,int* nspec_elastic,
- int* my_neighbours_ext_mesh,
- int* request_send_vector_ext_mesh,
- int* request_recv_vector_ext_mesh,
- realw* buffer_recv_vector_ext_mesh
- ) {
+ int* h_myrank,
+ int* SAVE_FORWARD ) {
-TRACE("prepare_constants_device");
+ TRACE("prepare_constants_device");
// allocates mesh parameter structure
Mesh* mp = (Mesh*) malloc( sizeof(Mesh) );
@@ -155,14 +155,8 @@
exit_on_error("NGLLX must be 5 for CUDA devices");
}
- // sets number of processes
-#ifdef WITH_MPI
- int nproc;
- MPI_Comm_size(MPI_COMM_WORLD,&nproc);
- mp->NPROC = nproc;
-#else
- mp->NPROC = 1;
-#endif
+ // sets processes mpi rank
+ mp->myrank = *h_myrank;
// sets global parameters
mp->NSPEC_AB = *NSPEC_AB;
@@ -171,6 +165,7 @@
// constants
mp->simulation_type = *SIMULATION_TYPE;
mp->absorbing_conditions = *ABSORBING_CONDITIONS;
+ mp->save_forward = *SAVE_FORWARD;
// sets constant arrays
setConst_hprime_xx(h_hprime_xx,mp);
@@ -192,21 +187,21 @@
#ifdef USE_TEXTURES_CONSTANTS
{
#ifdef USE_OLDER_CUDA4_GPU
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
const textureReference* d_hprime_xx_tex_ptr;
print_CUDA_error_if_any(cudaGetTextureReference(&d_hprime_xx_tex_ptr, "d_hprime_xx_tex"), 4101);
- cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
print_CUDA_error_if_any(cudaBindTexture(0, d_hprime_xx_tex_ptr, mp->d_hprime_xx, &channelDesc, sizeof(realw)*(NGLL2)), 4001);
#else
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
- print_CUDA_error_if_any(cudaBindTexture(0, &d_hprime_xx, mp->d_hprime_xx, &channelDesc, sizeof(realw)*(NGLL2)), 4001);
+ print_CUDA_error_if_any(cudaBindTexture(0, &d_hprime_xx_tex, mp->d_hprime_xx, &channelDesc, sizeof(realw)*(NGLL2)), 4001);
#endif
}
#endif
- /* Assuming NGLLX=5. Padded is then 128 (5^3+3) */
+ // mesh
+ // Assuming NGLLX=5. Padded is then 128 (5^3+3)
int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
- // mesh
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xix, size_padded*sizeof(realw)),1001);
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiy, size_padded*sizeof(realw)),1002);
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiz, size_padded*sizeof(realw)),1003);
@@ -248,113 +243,37 @@
// global indexing
copy_todevice_int((void**)&mp->d_ibool,h_ibool,NGLL3*(mp->NSPEC_AB));
- //print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool,size_padded*sizeof(int)),1021);
- //print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool, h_ibool,
- // NGLL3*(mp->NSPEC_AB)*sizeof(int),cudaMemcpyHostToDevice),1022);
-
-
// prepare interprocess-edge exchange information
mp->num_interfaces_ext_mesh = *num_interfaces_ext_mesh;
mp->max_nibool_interfaces_ext_mesh = *max_nibool_interfaces_ext_mesh;
if( mp->num_interfaces_ext_mesh > 0 ){
-
copy_todevice_int((void**)&mp->d_nibool_interfaces_ext_mesh,h_nibool_interfaces_ext_mesh,
mp->num_interfaces_ext_mesh);
-
- //print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nibool_interfaces_ext_mesh,
- // (mp->num_interfaces_ext_mesh)*sizeof(int)),1201);
- //print_CUDA_error_if_any(cudaMemcpy(mp->d_nibool_interfaces_ext_mesh,h_nibool_interfaces_ext_mesh,
- // (mp->num_interfaces_ext_mesh)*sizeof(int),cudaMemcpyHostToDevice),1202);
-
copy_todevice_int((void**)&mp->d_ibool_interfaces_ext_mesh,h_ibool_interfaces_ext_mesh,
(mp->num_interfaces_ext_mesh)*(mp->max_nibool_interfaces_ext_mesh));
-
- //print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_interfaces_ext_mesh,
- // (mp->num_interfaces_ext_mesh)*(mp->max_nibool_interfaces_ext_mesh)*sizeof(int)),1203);
- //print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_interfaces_ext_mesh,h_ibool_interfaces_ext_mesh,
- // (mp->num_interfaces_ext_mesh)*(mp->max_nibool_interfaces_ext_mesh)*sizeof(int),
- // cudaMemcpyHostToDevice),1204);
}
- // Allocate pinned mpi-buffers.
- // MPI buffers use pinned memory allocated by cudaMallocHost, which
- // enables the use of asynchronous memory copies from host <->
- // device
- int size_mpi_buffer = 3 * (mp->num_interfaces_ext_mesh) * (mp->max_nibool_interfaces_ext_mesh);
- // send buffer
- mp->size_mpi_buffer = size_mpi_buffer;
- if( mp->size_mpi_buffer > 0 ){
- print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_accel_buffer),sizeof(float)*(mp->size_mpi_buffer)),8004);
- mp->send_buffer = (float*)malloc((mp->size_mpi_buffer)*sizeof(float));
- // adjoint
- print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_b_accel_buffer),sizeof(float)*(size_mpi_buffer)),8004);
- // mp->b_send_buffer = (float*)malloc((size_mpi_buffer)*sizeof(float));
-
- // receive buffer
- print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_recv_accel_buffer),sizeof(float)*(mp->size_mpi_buffer)),8004);
- mp->recv_buffer = (float*)malloc((mp->size_mpi_buffer)*sizeof(float));
- }
-
- //daniel: check if needed
- //mp->nibool_interfaces_ext_mesh = h_nibool_interfaces_ext_mesh;
- //mp->my_neighbours_ext_mesh = my_neighbours_ext_mesh;
- //mp->request_send_vector_ext_mesh = request_send_vector_ext_mesh;
- //mp->request_recv_vector_ext_mesh = request_recv_vector_ext_mesh;
- //mp->buffer_recv_vector_ext_mesh = buffer_recv_vector_ext_mesh;
-
// setup two streams, one for compute and one for host<->device memory copies
// compute stream
cudaStreamCreate(&mp->compute_stream);
// copy stream (needed to transfer mpi buffers)
- if( mp->size_mpi_buffer > 0 ){
+ if( mp->num_interfaces_ext_mesh * mp->max_nibool_interfaces_ext_mesh > 0 ){
cudaStreamCreate(&mp->copy_stream);
- //cudaStreamCreate(&mp->b_copy_stream);
}
// inner elements
copy_todevice_int((void**)&mp->d_ispec_is_inner,h_ispec_is_inner,mp->NSPEC_AB);
-// print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ispec_is_inner,mp->NSPEC_AB*sizeof(int)),1205);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_inner, h_ispec_is_inner,
- // mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),1206);
-
// absorbing boundaries
mp->d_num_abs_boundary_faces = *h_num_abs_boundary_faces;
if( mp->absorbing_conditions && mp->d_num_abs_boundary_faces > 0 ){
copy_todevice_int((void**)&mp->d_abs_boundary_ispec,h_abs_boundary_ispec,mp->d_num_abs_boundary_faces);
-
-// print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_ispec),
-// (mp->d_num_abs_boundary_faces)*sizeof(int)),1101);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_ispec, h_abs_boundary_ispec,
-// (mp->d_num_abs_boundary_faces)*sizeof(int),
-// cudaMemcpyHostToDevice),1102);
-
copy_todevice_int((void**)&mp->d_abs_boundary_ijk,h_abs_boundary_ijk,
3*NGLL2*(mp->d_num_abs_boundary_faces));
-
-// print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_ijk),
-// 3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(int)),1103);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_ijk, h_abs_boundary_ijk,
-// 3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(int),
-// cudaMemcpyHostToDevice),1104);
-
copy_todevice_realw((void**)&mp->d_abs_boundary_normal,h_abs_boundary_normal,
NDIM*NGLL2*(mp->d_num_abs_boundary_faces));
-
-// print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_normal),
-// 3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw)),1105);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_normal, h_abs_boundary_normal,
-// 3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw),
-// cudaMemcpyHostToDevice),1106);
-
copy_todevice_realw((void**)&mp->d_abs_boundary_jacobian2Dw,h_abs_boundary_jacobian2Dw,
NGLL2*(mp->d_num_abs_boundary_faces));
-
-// print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_jacobian2Dw),
-// NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw)),1107);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_jacobian2Dw, h_abs_boundary_jacobian2Dw,
-// NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw),
- // cudaMemcpyHostToDevice),1108);
}
// sources
@@ -415,19 +334,15 @@
extern "C"
void FC_FUNC_(prepare_fields_acoustic_device,
- PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f,
- realw* rmass_acoustic,
- realw* rhostore,
- realw* kappastore,
- int* num_phase_ispec_acoustic,
- int* phase_ispec_inner_acoustic,
+ PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer,
+ realw* rmass_acoustic, realw* rhostore, realw* kappastore,
+ int* num_phase_ispec_acoustic, int* phase_ispec_inner_acoustic,
int* ispec_is_acoustic,
int* NOISE_TOMOGRAPHY,
int* num_free_surface_faces,
int* free_surface_ispec,
int* free_surface_ijk,
- int* b_reclen_potential,
- realw* b_absorb_potential,
+ int* b_reclen_potential, realw* b_absorb_potential,
int* ELASTIC_SIMULATION,
int* num_coupling_ac_el_faces,
int* coupling_ac_el_ispec,
@@ -440,31 +355,50 @@
TRACE("prepare_fields_acoustic_device");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f);
- /* Assuming NGLLX==5. Padded is then 128 (5^3+3) */
- int size_padded = NGLL3_PADDED * mp->NSPEC_AB;
-// int size_nonpadded = NGLL3 * mp->NSPEC_AB;
- int size_glob = mp->NGLOB_AB;
+ Mesh* mp = (Mesh*)(*Mesh_pointer);
// allocates arrays on device (GPU)
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_acoustic),sizeof(realw)*size_glob),2001);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_acoustic),sizeof(realw)*size_glob),2002);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_dot_acoustic),sizeof(realw)*size_glob),2003);
+ int size = mp->NGLOB_AB;
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_acoustic),sizeof(realw)*size),2001);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_acoustic),sizeof(realw)*size),2002);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_dot_acoustic),sizeof(realw)*size),2003);
+ // initializes values to zero
+ //print_CUDA_error_if_any(cudaMemset(mp->d_potential_acoustic,0,sizeof(realw)*size),2007);
+ //print_CUDA_error_if_any(cudaMemset(mp->d_potential_dot_acoustic,0,sizeof(realw)*size),2007);
+ //print_CUDA_error_if_any(cudaMemset(mp->d_potential_dot_dot_acoustic,0,sizeof(realw)*size),2007);
+ #ifdef USE_TEXTURES_FIELDS
+ {
+ #ifdef USE_OLDER_CUDA4_GPU
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+ const textureReference* d_potential_tex_ref_ptr;
+ print_CUDA_error_if_any(cudaGetTextureReference(&d_potential_tex_ref_ptr, "d_potential_tex"), 2001);
+ print_CUDA_error_if_any(cudaBindTexture(0, d_potential_tex_ref_ptr, mp->d_potential_acoustic, &channelDesc, sizeof(realw)*size), 2001);
+
+ const textureReference* d_potential_dot_dot_tex_ref_ptr;
+ print_CUDA_error_if_any(cudaGetTextureReference(&d_potential_dot_dot_tex_ref_ptr, "d_potential_dot_dot_tex"), 2003);
+ print_CUDA_error_if_any(cudaBindTexture(0, d_potential_dot_dot_tex_ref_ptr, mp->d_potential_dot_dot_acoustic, &channelDesc, sizeof(realw)*size), 2003);
+ #else
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+ print_CUDA_error_if_any(cudaBindTexture(0, &d_potential_tex, mp->d_potential_acoustic, &channelDesc, sizeof(realw)*size), 2001);
+ print_CUDA_error_if_any(cudaBindTexture(0, &d_potential_dot_dot_tex, mp->d_potential_dot_dot_acoustic, &channelDesc, sizeof(realw)*size), 2003);
+ #endif
+ }
+ #endif
+
// mpi buffer
- if( mp->num_interfaces_ext_mesh > 0 ){
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_potential_dot_dot_buffer),
- (mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),2004);
+ mp->size_mpi_buffer_potential = (mp->num_interfaces_ext_mesh) * (mp->max_nibool_interfaces_ext_mesh);
+ if( mp->size_mpi_buffer_potential > 0 ){
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_potential_dot_dot_buffer),mp->size_mpi_buffer_potential *sizeof(realw)),2004);
}
// mass matrix
copy_todevice_realw((void**)&mp->d_rmass_acoustic,rmass_acoustic,mp->NGLOB_AB);
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_acoustic),sizeof(realw)*size_glob),2005);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_acoustic,rmass_acoustic,
-// sizeof(realw)*size_glob,cudaMemcpyHostToDevice),2100);
-
+ // density
// padded array
+ // Assuming NGLLX==5. Padded is then 128 (5^3+3)
+ int size_padded = NGLL3_PADDED * mp->NSPEC_AB;
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rhostore),size_padded*sizeof(realw)),2006);
// transfer constant element data with padding
for(int i=0; i < mp->NSPEC_AB; i++) {
@@ -475,64 +409,34 @@
// non-padded array
copy_todevice_realw((void**)&mp->d_kappastore,kappastore,NGLL3*mp->NSPEC_AB);
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappastore),size_nonpadded*sizeof(realw)),2007);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_kappastore,kappastore,
-// NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),2105);
-
// phase elements
mp->num_phase_ispec_acoustic = *num_phase_ispec_acoustic;
-
copy_todevice_int((void**)&mp->d_phase_ispec_inner_acoustic,phase_ispec_inner_acoustic,
2*mp->num_phase_ispec_acoustic);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_acoustic),
-// mp->num_phase_ispec_acoustic*2*sizeof(int)),2008);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_acoustic,phase_ispec_inner_acoustic,
-// mp->num_phase_ispec_acoustic*2*sizeof(int),cudaMemcpyHostToDevice),2101);
-
copy_todevice_int((void**)&mp->d_ispec_is_acoustic,ispec_is_acoustic,mp->NSPEC_AB);
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_acoustic),
-// mp->NSPEC_AB*sizeof(int)),2009);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_acoustic,ispec_is_acoustic,
-// mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),2102);
-
// free surface
if( *NOISE_TOMOGRAPHY == 0 ){
// allocate surface arrays
mp->num_free_surface_faces = *num_free_surface_faces;
if( mp->num_free_surface_faces > 0 ){
-
copy_todevice_int((void**)&mp->d_free_surface_ispec,free_surface_ispec,mp->num_free_surface_faces);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec),
-// mp->num_free_surface_faces*sizeof(int)),2201);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec,
-// mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),2203);
-
-
copy_todevice_int((void**)&mp->d_free_surface_ijk,free_surface_ijk,
3*NGLL2*mp->num_free_surface_faces);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk),
-// 3*NGLL2*mp->num_free_surface_faces*sizeof(int)),2202);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
-// 3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),2204);
}
}
// absorbing boundaries
- if( mp->absorbing_conditions ){
- mp->d_b_reclen_potential = *b_reclen_potential;
-
- copy_todevice_realw((void**)&mp->d_b_absorb_potential,b_absorb_potential,mp->d_b_reclen_potential);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_potential),mp->d_b_reclen_potential),2301);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_potential,b_absorb_potential,
-// mp->d_b_reclen_potential,cudaMemcpyHostToDevice),2302);
+ if( mp->absorbing_conditions && mp->d_num_abs_boundary_faces > 0 ){
+ // absorb_field array used for file i/o
+ if(mp->simulation_type == 3 || ( mp->simulation_type == 1 && mp->save_forward )){
+ // note: b_reclen_potential is record length in bytes ( CUSTOM_REAL * NGLLSQUARE * num_abs_boundary_faces )
+ mp->d_b_reclen_potential = *b_reclen_potential;
+ print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_b_absorb_potential,mp->d_b_reclen_potential),2201);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_potential,b_absorb_potential,mp->d_b_reclen_potential,cudaMemcpyHostToDevice),2202);
+ }
}
-
// for seismograms
if( mp->nrec_local > 0 ){
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_potential),
@@ -542,40 +446,14 @@
if( mp->h_station_seismo_potential == NULL) exit_on_error("error allocating h_station_seismo_potential");
}
-
// coupling with elastic parts
if( *ELASTIC_SIMULATION && *num_coupling_ac_el_faces > 0 ){
-
copy_todevice_int((void**)&mp->d_coupling_ac_el_ispec,coupling_ac_el_ispec,(*num_coupling_ac_el_faces));
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_ispec),
-// (*num_coupling_ac_el_faces)*sizeof(int)),2601);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_ispec,coupling_ac_el_ispec,
-// (*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),2602);
-
copy_todevice_int((void**)&mp->d_coupling_ac_el_ijk,coupling_ac_el_ijk,3*NGLL2*(*num_coupling_ac_el_faces));
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_ijk),
-// 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int)),2603);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_ijk,coupling_ac_el_ijk,
-// 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),2604);
-
copy_todevice_realw((void**)&mp->d_coupling_ac_el_normal,coupling_ac_el_normal,
3*NGLL2*(*num_coupling_ac_el_faces));
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_normal),
-// 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),2605);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_normal,coupling_ac_el_normal,
-// 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),2606);
-
copy_todevice_realw((void**)&mp->d_coupling_ac_el_jacobian2Dw,coupling_ac_el_jacobian2Dw,
NGLL2*(*num_coupling_ac_el_faces));
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_jacobian2Dw),
-// NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),2607);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_jacobian2Dw,coupling_ac_el_jacobian2Dw,
-// NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),2608);
-
}
// mesh coloring
@@ -595,41 +473,65 @@
extern "C"
void FC_FUNC_(prepare_fields_acoustic_adj_dev,
- PREPARE_FIELDS_ACOUSTIC_ADJ_DEV)(long* Mesh_pointer_f,
+ PREPARE_FIELDS_ACOUSTIC_ADJ_DEV)(long* Mesh_pointer,
int* APPROXIMATE_HESS_KL) {
TRACE("prepare_fields_acoustic_adj_dev");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+ Mesh* mp = (Mesh*)(*Mesh_pointer);
- int size_glob = mp->NGLOB_AB;
-
// kernel simulations
if( mp->simulation_type != 3 ) return;
// allocates backward/reconstructed arrays on device (GPU)
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_acoustic),sizeof(realw)*size_glob),3014);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_acoustic),sizeof(realw)*size_glob),3015);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_dot_acoustic),sizeof(realw)*size_glob),3016);
+ int size = mp->NGLOB_AB;
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_acoustic),sizeof(realw)*size),3014);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_acoustic),sizeof(realw)*size),3015);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_dot_acoustic),sizeof(realw)*size),3016);
+ // initializes values to zero
+ //print_CUDA_error_if_any(cudaMemset(mp->d_b_potential_acoustic,0,sizeof(realw)*size),3007);
+ //print_CUDA_error_if_any(cudaMemset(mp->d_b_potential_dot_acoustic,0,sizeof(realw)*size),3007);
+ //print_CUDA_error_if_any(cudaMemset(mp->d_b_potential_dot_dot_acoustic,0,sizeof(realw)*size),3007);
- // allocates kernels
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),3017);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),3018);
+ #ifdef USE_TEXTURES_FIELDS
+ {
+ #ifdef USE_OLDER_CUDA4_GPU
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+ const textureReference* d_b_potential_tex_ref_ptr;
+ print_CUDA_error_if_any(cudaGetTextureReference(&d_b_potential_tex_ref_ptr, "d_b_potential_tex"), 3001);
+ print_CUDA_error_if_any(cudaBindTexture(0, d_b_potential_tex_ref_ptr, mp->d_b_potential_acoustic, &channelDesc, sizeof(realw)*size), 3001);
+ const textureReference* d_b_potential_dot_dot_tex_ref_ptr;
+ print_CUDA_error_if_any(cudaGetTextureReference(&d_b_potential_dot_dot_tex_ref_ptr, "d_b_potential_dot_dot_tex"),3003);
+ print_CUDA_error_if_any(cudaBindTexture(0, d_b_potential_dot_dot_tex_ref_ptr, mp->d_b_potential_dot_dot_acoustic, &channelDesc, sizeof(realw)*size), 3003);
+ #else
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+ print_CUDA_error_if_any(cudaBindTexture(0, &d_b_potential_tex, mp->d_b_potential_acoustic, &channelDesc, sizeof(realw)*size), 3001);
+ print_CUDA_error_if_any(cudaBindTexture(0, &d_b_potential_dot_dot_tex, mp->d_b_potential_dot_dot_acoustic, &channelDesc, sizeof(realw)*size), 3003);
+ #endif
+ }
+ #endif
+
+ // allocates kernels
+ size = NGLL3*mp->NSPEC_AB;
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_ac_kl),size*sizeof(realw)),3017);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_ac_kl),size*sizeof(realw)),3018);
// initializes kernel values to zero
- print_CUDA_error_if_any(cudaMemset(mp->d_rho_ac_kl,0,
- NGLL3*mp->NSPEC_AB*sizeof(realw)),3019);
- print_CUDA_error_if_any(cudaMemset(mp->d_kappa_ac_kl,0,
- NGLL3*mp->NSPEC_AB*sizeof(realw)),3020);
+ print_CUDA_error_if_any(cudaMemset(mp->d_rho_ac_kl,0,size*sizeof(realw)),3019);
+ print_CUDA_error_if_any(cudaMemset(mp->d_kappa_ac_kl,0,size*sizeof(realw)),3020);
// preconditioner
if( *APPROXIMATE_HESS_KL ){
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),3030);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_ac_kl),size*sizeof(realw)),3030);
// initializes with zeros
- print_CUDA_error_if_any(cudaMemset(mp->d_hess_ac_kl,0,
- NGLL3*mp->NSPEC_AB*sizeof(realw)),3031);
+ print_CUDA_error_if_any(cudaMemset(mp->d_hess_ac_kl,0,size*sizeof(realw)),3031);
}
+ // mpi buffer
+ if( mp->size_mpi_buffer_potential > 0 ){
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_send_potential_dot_dot_buffer),mp->size_mpi_buffer_potential*sizeof(realw)),3014);
+ }
+
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("prepare_fields_acoustic_adj_dev");
#endif
@@ -644,19 +546,13 @@
extern "C"
void FC_FUNC_(prepare_fields_elastic_device,
- PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer_f,
- int* size,
- realw* rmassx,
- realw* rmassy,
- realw* rmassz,
- realw* rho_vp,
- realw* rho_vs,
+ PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer,
+ realw* rmassx, realw* rmassy, realw* rmassz,
+ realw* rho_vp, realw* rho_vs,
int* num_phase_ispec_elastic,
int* phase_ispec_inner_elastic,
int* ispec_is_elastic,
- realw* h_b_absorb_field,
- int* h_b_reclen_field,
- int* SAVE_FORWARD,
+ realw* b_absorb_field, int* b_reclen_field,
int* COMPUTE_AND_STORE_STRAIN,
realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
realw* epsilondev_xz,realw* epsilondev_yz,
@@ -677,309 +573,214 @@
int* num_colors_inner_elastic,
int* num_elem_colors_elastic,
int* ANISOTROPY,
- realw *c11store,
- realw *c12store,
- realw *c13store,
- realw *c14store,
- realw *c15store,
- realw *c16store,
- realw *c22store,
- realw *c23store,
- realw *c24store,
- realw *c25store,
- realw *c26store,
- realw *c33store,
- realw *c34store,
- realw *c35store,
- realw *c36store,
- realw *c44store,
- realw *c45store,
- realw *c46store,
- realw *c55store,
- realw *c56store,
- realw *c66store){
+ realw *c11store,realw *c12store,realw *c13store,
+ realw *c14store,realw *c15store,realw *c16store,
+ realw *c22store,realw *c23store,realw *c24store,
+ realw *c25store,realw *c26store,realw *c33store,
+ realw *c34store,realw *c35store,realw *c36store,
+ realw *c44store,realw *c45store,realw *c46store,
+ realw *c55store,realw *c56store,realw *c66store ){
-TRACE("prepare_fields_elastic_device");
+ TRACE("prepare_fields_elastic_device");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f);
- /* Assuming NGLLX==5. Padded is then 128 (5^3+3) */
- int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
-// int size_nonpadded = NGLL3 * (mp->NSPEC_AB);
+ Mesh* mp = (Mesh*)(*Mesh_pointer);
+ int size;
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ),sizeof(realw)*(*size)),4001);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(realw)*(*size)),4002);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel),sizeof(realw)*(*size)),4003);
+ // debug
+ //printf("prepare_fields_elastic_device: rank %d - wavefield setup\n",mp->myrank);
+ //synchronize_mpi();
+ // elastic wavefields
+ size = NDIM * mp->NGLOB_AB;
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ),sizeof(realw)*size),4001);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(realw)*size),4002);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel),sizeof(realw)*size),4003);
+ // initializes values to zero
+ //print_CUDA_error_if_any(cudaMemset(mp->d_displ,0,sizeof(realw)*size),4007);
+ //print_CUDA_error_if_any(cudaMemset(mp->d_veloc,0,sizeof(realw)*size),4007);
+ //print_CUDA_error_if_any(cudaMemset(mp->d_accel,0,sizeof(realw)*size),4007);
+
#ifdef USE_TEXTURES_FIELDS
{
#ifdef USE_OLDER_CUDA4_GPU
- print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_displ_tex_ref_ptr, "d_displ_tex"), 4001);
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
- print_CUDA_error_if_any(cudaBindTexture(0, mp->d_displ_tex_ref_ptr, mp->d_displ, &channelDesc, sizeof(realw)*(*size)), 4001);
- #else
- cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
- print_CUDA_error_if_any(cudaBindTexture(0, &d_displ_tex, mp->d_displ, &channelDesc, sizeof(realw)*(*size)), 4001);
- #endif
- }
+ const textureReference* d_displ_tex_ref_ptr;
+ print_CUDA_error_if_any(cudaGetTextureReference(&d_displ_tex_ref_ptr, "d_displ_tex"), 4001);
+ print_CUDA_error_if_any(cudaBindTexture(0, d_displ_tex_ref_ptr, mp->d_displ, &channelDesc, sizeof(realw)*size), 4001);
- {
- #ifdef USE_OLDER_CUDA4_GPU
- print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_veloc_tex_ref_ptr, "d_veloc_tex"), 4002);
- cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
- print_CUDA_error_if_any(cudaBindTexture(0, mp->d_veloc_tex_ref_ptr, mp->d_veloc, &channelDesc, sizeof(realw)*(*size)), 4002);
- #else
- cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
- print_CUDA_error_if_any(cudaBindTexture(0, &d_veloc_tex, mp->d_veloc, &channelDesc, sizeof(realw)*(*size)), 4002);
- #endif
- }
+ const textureReference* d_veloc_tex_ref_ptr;
+ print_CUDA_error_if_any(cudaGetTextureReference(&d_veloc_tex_ref_ptr, "d_veloc_tex"), 4002);
+ print_CUDA_error_if_any(cudaBindTexture(0, d_veloc_tex_ref_ptr, mp->d_veloc, &channelDesc, sizeof(realw)*size), 4002);
- {
- #ifdef USE_OLDER_CUDA4_GPU
- print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_accel_tex_ref_ptr, "d_accel_tex"), 4003);
- cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
- print_CUDA_error_if_any(cudaBindTexture(0, mp->d_accel_tex_ref_ptr, mp->d_accel, &channelDesc, sizeof(realw)*(*size)), 4003);
+ const textureReference* d_accel_tex_ref_ptr;
+ print_CUDA_error_if_any(cudaGetTextureReference(&d_accel_tex_ref_ptr, "d_accel_tex"), 4003);
+ print_CUDA_error_if_any(cudaBindTexture(0, d_accel_tex_ref_ptr, mp->d_accel, &channelDesc, sizeof(realw)*size), 4003);
#else
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
- print_CUDA_error_if_any(cudaBindTexture(0, &d_accel_tex, mp->d_accel, &channelDesc, sizeof(realw)*(*size)), 4003);
+ print_CUDA_error_if_any(cudaBindTexture(0, &d_displ_tex, mp->d_displ, &channelDesc, sizeof(realw)*size), 4001);
+ print_CUDA_error_if_any(cudaBindTexture(0, &d_veloc_tex, mp->d_veloc, &channelDesc, sizeof(realw)*size), 4002);
+ print_CUDA_error_if_any(cudaBindTexture(0, &d_accel_tex, mp->d_accel, &channelDesc, sizeof(realw)*size), 4003);
#endif
}
#endif
- // mpi buffer
+ // debug
+ //synchronize_mpi();
+
+ // MPI buffer
+ mp->size_mpi_buffer = NDIM * (mp->num_interfaces_ext_mesh) * (mp->max_nibool_interfaces_ext_mesh);
if( mp->size_mpi_buffer > 0 ){
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),
- mp->size_mpi_buffer*sizeof(realw)),4004);
+ // note: Allocate pinned mpi-buffers.
+ // MPI buffers use pinned memory allocated by cudaMallocHost, which
+ // enables the use of asynchronous memory copies from host <-> device
+ // send buffer
+ print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_accel_buffer),sizeof(float)*(mp->size_mpi_buffer)),8004);
+ //mp->send_buffer = (float*)malloc((mp->size_mpi_buffer)*sizeof(float));
+ // adjoint
+ //print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_send_b_accel_buffer),sizeof(float)*(mp->size_mpi_buffer)),8004);
+ // mp->b_send_buffer = (float*)malloc((size_mpi_buffer)*sizeof(float));
+ // receive buffer
+ print_CUDA_error_if_any(cudaMallocHost((void**)&(mp->h_recv_accel_buffer),sizeof(float)*(mp->size_mpi_buffer)),8004);
+ mp->recv_buffer = (float*)malloc((mp->size_mpi_buffer)*sizeof(float));
+
+ // non-pinned buffer
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),mp->size_mpi_buffer*sizeof(realw)),4004);
+ // adjoint
+ if( mp->simulation_type == 3 ){
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_send_accel_buffer),mp->size_mpi_buffer*sizeof(realw)),4004);
+ }
}
+ // debug
+ //printf("prepare_fields_elastic_device: rank %d - mass matrix\n",mp->myrank);
+ //synchronize_mpi();
+
// mass matrix
copy_todevice_realw((void**)&mp->d_rmassx,rmassx,mp->NGLOB_AB);
copy_todevice_realw((void**)&mp->d_rmassy,rmassy,mp->NGLOB_AB);
copy_todevice_realw((void**)&mp->d_rmassz,rmassz,mp->NGLOB_AB);
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass),sizeof(realw)*mp->NGLOB_AB),4005);
-// // transfer element data
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass,rmass,
-// sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),4010);
-
-
// element indices
copy_todevice_int((void**)&mp->d_ispec_is_elastic,ispec_is_elastic,mp->NSPEC_AB);
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_elastic),mp->NSPEC_AB*sizeof(int)),4009);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_elastic,ispec_is_elastic,
-// mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),4012);
-
// phase elements
mp->num_phase_ispec_elastic = *num_phase_ispec_elastic;
copy_todevice_int((void**)&mp->d_phase_ispec_inner_elastic,phase_ispec_inner_elastic,2*mp->num_phase_ispec_elastic);
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_elastic),
-// mp->num_phase_ispec_elastic*2*sizeof(int)),4008);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_elastic,phase_ispec_inner_elastic,
-// mp->num_phase_ispec_elastic*2*sizeof(int),cudaMemcpyHostToDevice),4011);
+ // debug
+ //synchronize_mpi();
// for seismograms
if( mp->nrec_local > 0 ){
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_field),
- 3*NGLL3*(mp->nrec_local)*sizeof(realw)),4015);
+ // debug
+ //printf("prepare_fields_elastic_device: rank %d - seismogram setup\n",mp->myrank);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_field),3*NGLL3*(mp->nrec_local)*sizeof(realw)),4015);
+
mp->h_station_seismo_field = (realw*) malloc( 3*NGLL3*(mp->nrec_local)*sizeof(realw) );
if( mp->h_station_seismo_field == NULL) exit_on_error("h_station_seismo_field not allocated \n");
}
+ // debug
+ //synchronize_mpi();
+
// absorbing conditions
if( mp->absorbing_conditions && mp->d_num_abs_boundary_faces > 0){
+
+ // debug
+ //printf("prepare_fields_elastic_device: rank %d - absorbing boundary setup\n",mp->myrank);
+
// non-padded arrays
+ // rho_vp, rho_vs non-padded; they are needed for stacey boundary condition
copy_todevice_realw((void**)&mp->d_rho_vp,rho_vp,NGLL3*mp->NSPEC_AB);
copy_todevice_realw((void**)&mp->d_rho_vs,rho_vs,NGLL3*mp->NSPEC_AB);
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vp),size_nonpadded*sizeof(realw)),4006);
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vs),size_nonpadded*sizeof(realw)),4007);
-
- // rho_vp, rho_vs non-padded; they are needed for stacey boundary condition
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vp, rho_vp,
-// NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4013);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vs, rho_vs,
-// NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4014);
-
// absorb_field array used for file i/o
- if(mp->simulation_type == 3 || ( mp->simulation_type == 1 && *SAVE_FORWARD )){
- mp->d_b_reclen_field = *h_b_reclen_field;
+ if(mp->simulation_type == 3 || ( mp->simulation_type == 1 && mp->save_forward )){
+ // note: b_reclen_field is length in bytes already (CUSTOM_REAL * NDIM * NGLLSQUARE * num_abs_boundary_faces )
+ mp->d_b_reclen_field = *b_reclen_field;
- copy_todevice_realw((void**)&mp->d_b_absorb_field,h_b_absorb_field,mp->d_b_reclen_field);
+ // debug
+ //printf("prepare_fields_elastic_device: rank %d - absorbing boundary i/o %d\n",mp->myrank,mp->d_b_reclen_field);
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_field),
-// mp->d_b_reclen_field),4016);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field, h_b_absorb_field,
-// mp->d_b_reclen_field,cudaMemcpyHostToDevice),4017);
+ print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_b_absorb_field,mp->d_b_reclen_field),4101);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field,b_absorb_field,mp->d_b_reclen_field,cudaMemcpyHostToDevice),4102);
+
}
}
+ // debug
+ //synchronize_mpi();
+
// strains used for attenuation and kernel simulations
if( *COMPUTE_AND_STORE_STRAIN ){
- // strains
- int epsilondev_size = NGLL3*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
+ // debug
+ //printf("prepare_fields_elastic_device: rank %d - strain setup\n",mp->myrank);
+ //synchronize_mpi();
- copy_todevice_realw((void**)&mp->d_epsilondev_xx,epsilondev_xx,epsilondev_size);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xx,
-// epsilondev_size*sizeof(realw)),4301);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xx,epsilondev_xx,epsilondev_size*sizeof(realw),
-// cudaMemcpyHostToDevice),4302);
-
- copy_todevice_realw((void**)&mp->d_epsilondev_yy,epsilondev_yy,epsilondev_size);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yy,
-// epsilondev_size*sizeof(realw)),4302);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yy,epsilondev_yy,epsilondev_size*sizeof(realw),
-// cudaMemcpyHostToDevice),4303);
-
- copy_todevice_realw((void**)&mp->d_epsilondev_xy,epsilondev_xy,epsilondev_size);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xy,
-// epsilondev_size*sizeof(realw)),4304);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xy,epsilondev_xy,epsilondev_size*sizeof(realw),
-// cudaMemcpyHostToDevice),4305);
-
- copy_todevice_realw((void**)&mp->d_epsilondev_xz,epsilondev_xz,epsilondev_size);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xz,
-// epsilondev_size*sizeof(realw)),4306);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xz,epsilondev_xz,epsilondev_size*sizeof(realw),
-// cudaMemcpyHostToDevice),4307);
-
- copy_todevice_realw((void**)&mp->d_epsilondev_yz,epsilondev_yz,epsilondev_size);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yz,
-// epsilondev_size*sizeof(realw)),4308);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yz,epsilondev_yz,epsilondev_size*sizeof(realw),
-// cudaMemcpyHostToDevice),4309);
-
+ // strains
+ size = NGLL3 * mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
+ copy_todevice_realw((void**)&mp->d_epsilondev_xx,epsilondev_xx,size);
+ copy_todevice_realw((void**)&mp->d_epsilondev_yy,epsilondev_yy,size);
+ copy_todevice_realw((void**)&mp->d_epsilondev_xy,epsilondev_xy,size);
+ copy_todevice_realw((void**)&mp->d_epsilondev_xz,epsilondev_xz,size);
+ copy_todevice_realw((void**)&mp->d_epsilondev_yz,epsilondev_yz,size);
}
// attenuation memory variables
if( *ATTENUATION ){
- // memory arrays
- copy_todevice_realw((void**)&mp->d_R_xx,R_xx,(*R_size));
+ // debug
+ //printf("prepare_fields_elastic_device: rank %d - attenuation setup\n",mp->myrank);
+ //synchronize_mpi();
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xx),
-// (*R_size)*sizeof(realw)),4401);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xx,R_xx,(*R_size)*sizeof(realw),
-// cudaMemcpyHostToDevice),4402);
-
- copy_todevice_realw((void**)&mp->d_R_yy,R_yy,(*R_size));
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yy),
-// (*R_size)*sizeof(realw)),4403);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yy,R_yy,(*R_size)*sizeof(realw),
-// cudaMemcpyHostToDevice),4404);
-
- copy_todevice_realw((void**)&mp->d_R_xy,R_xy,(*R_size));
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xy),
-// (*R_size)*sizeof(realw)),4405);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xy,R_xy,(*R_size)*sizeof(realw),
-// cudaMemcpyHostToDevice),4406);
-
- copy_todevice_realw((void**)&mp->d_R_xz,R_xz,(*R_size));
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xz),
-// (*R_size)*sizeof(realw)),4407);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xz,R_xz,(*R_size)*sizeof(realw),
-// cudaMemcpyHostToDevice),4408);
-
- copy_todevice_realw((void**)&mp->d_R_yz,R_yz,(*R_size));
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yz),
-// (*R_size)*sizeof(realw)),4409);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yz,R_yz,(*R_size)*sizeof(realw),
-// cudaMemcpyHostToDevice),4410);
-
+ // memory arrays
+ size = *R_size;
+ copy_todevice_realw((void**)&mp->d_R_xx,R_xx,size);
+ copy_todevice_realw((void**)&mp->d_R_yy,R_yy,size);
+ copy_todevice_realw((void**)&mp->d_R_xy,R_xy,size);
+ copy_todevice_realw((void**)&mp->d_R_xz,R_xz,size);
+ copy_todevice_realw((void**)&mp->d_R_yz,R_yz,size);
// attenuation factors
copy_todevice_realw((void**)&mp->d_one_minus_sum_beta,one_minus_sum_beta,NGLL3*mp->NSPEC_AB);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_one_minus_sum_beta),
-// NGLL3*mp->NSPEC_AB*sizeof(realw)),4430);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_one_minus_sum_beta ,one_minus_sum_beta,
-// NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4431);
-
copy_todevice_realw((void**)&mp->d_factor_common,factor_common,N_SLS*NGLL3*mp->NSPEC_AB);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_factor_common),
-// N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw)),4432);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_factor_common ,factor_common,
-// N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4433);
-
// alpha,beta,gamma factors
copy_todevice_realw((void**)&mp->d_alphaval,alphaval,N_SLS);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_alphaval),
-// N_SLS*sizeof(realw)),4434);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_alphaval ,alphaval,
-// N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4435);
-
copy_todevice_realw((void**)&mp->d_betaval,betaval,N_SLS);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_betaval),
-// N_SLS*sizeof(realw)),4436);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_betaval ,betaval,
-// N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4437);
-
copy_todevice_realw((void**)&mp->d_gammaval,gammaval,N_SLS);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_gammaval),
-// N_SLS*sizeof(realw)),4438);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaval ,gammaval,
-// N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4439);
-
}
// anisotropy
if( *ANISOTROPY ){
+ // debug
+ //printf("prepare_fields_elastic_device: rank %d - attenuation setup\n",mp->myrank);
+ //synchronize_mpi();
+
+ // Assuming NGLLX==5. Padded is then 128 (5^3+3)
+ int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
+
// allocates memory on GPU
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c11store),
- size_padded*sizeof(realw)),4700);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c12store),
- size_padded*sizeof(realw)),4701);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c13store),
- size_padded*sizeof(realw)),4702);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c14store),
- size_padded*sizeof(realw)),4703);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c15store),
- size_padded*sizeof(realw)),4704);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c16store),
- size_padded*sizeof(realw)),4705);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c22store),
- size_padded*sizeof(realw)),4706);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c23store),
- size_padded*sizeof(realw)),4707);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c24store),
- size_padded*sizeof(realw)),4708);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c25store),
- size_padded*sizeof(realw)),4709);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c26store),
- size_padded*sizeof(realw)),4710);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c33store),
- size_padded*sizeof(realw)),4711);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c34store),
- size_padded*sizeof(realw)),4712);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c35store),
- size_padded*sizeof(realw)),4713);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c36store),
- size_padded*sizeof(realw)),4714);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c44store),
- size_padded*sizeof(realw)),4715);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c45store),
- size_padded*sizeof(realw)),4716);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c46store),
- size_padded*sizeof(realw)),4717);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c55store),
- size_padded*sizeof(realw)),4718);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c56store),
- size_padded*sizeof(realw)),4719);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c66store),
- size_padded*sizeof(realw)),4720);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c11store),size_padded*sizeof(realw)),4700);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c12store),size_padded*sizeof(realw)),4701);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c13store),size_padded*sizeof(realw)),4702);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c14store),size_padded*sizeof(realw)),4703);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c15store),size_padded*sizeof(realw)),4704);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c16store),size_padded*sizeof(realw)),4705);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c22store),size_padded*sizeof(realw)),4706);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c23store),size_padded*sizeof(realw)),4707);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c24store),size_padded*sizeof(realw)),4708);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c25store),size_padded*sizeof(realw)),4709);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c26store),size_padded*sizeof(realw)),4710);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c33store),size_padded*sizeof(realw)),4711);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c34store),size_padded*sizeof(realw)),4712);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c35store),size_padded*sizeof(realw)),4713);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c36store),size_padded*sizeof(realw)),4714);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c44store),size_padded*sizeof(realw)),4715);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c45store),size_padded*sizeof(realw)),4716);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c46store),size_padded*sizeof(realw)),4717);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c55store),size_padded*sizeof(realw)),4718);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c56store),size_padded*sizeof(realw)),4719);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c66store),size_padded*sizeof(realw)),4720);
// transfer constant element data with padding
for(int i=0;i < mp->NSPEC_AB;i++) {
@@ -1030,45 +831,26 @@
// ocean load approximation
if( *APPROXIMATE_OCEAN_LOAD ){
+ // debug
+ //printf("prepare_fields_elastic_device: rank %d - ocean load setup\n",mp->myrank);
+ //synchronize_mpi();
+
// oceans needs a free surface
mp->num_free_surface_faces = *num_free_surface_faces;
if( mp->num_free_surface_faces > 0 ){
// mass matrix
copy_todevice_realw((void**)&mp->d_rmass_ocean_load,rmass_ocean_load,mp->NGLOB_AB);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_ocean_load),
-// sizeof(realw)*mp->NGLOB_AB),4501);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_ocean_load,rmass_ocean_load,
-// sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),4502);
// surface normal
copy_todevice_realw((void**)&mp->d_free_surface_normal,free_surface_normal,
3*NGLL2*(mp->num_free_surface_faces));
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_normal),
-// 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw)),4503);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_normal,free_surface_normal,
-// 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice),4504);
-
// temporary global array: used to synchronize updates on global accel array
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_updated_dof_ocean_load),
sizeof(int)*mp->NGLOB_AB),4505);
if( *NOISE_TOMOGRAPHY == 0 && *ACOUSTIC_SIMULATION == 0 ){
-
copy_todevice_int((void**)&mp->d_free_surface_ispec,free_surface_ispec,mp->num_free_surface_faces);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec),
-// mp->num_free_surface_faces*sizeof(int)),4601);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec,
-// mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4603);
-
copy_todevice_int((void**)&mp->d_free_surface_ijk,free_surface_ijk,
3*NGLL2*mp->num_free_surface_faces);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk),
-// 3*NGLL2*mp->num_free_surface_faces*sizeof(int)),4602);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
-// 3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4604);
}
}
}
@@ -1082,6 +864,10 @@
// JC JC here we will need to add GPU support for the new C-PML routines
+ // debug
+ //printf("prepare_fields_elastic_device: rank %d - done\n",mp->myrank);
+ //synchronize_mpi();
+
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("prepare_fields_elastic_device");
#endif
@@ -1091,8 +877,8 @@
extern "C"
void FC_FUNC_(prepare_fields_elastic_adj_dev,
- PREPARE_FIELDS_ELASTIC_ADJ_DEV)(long* Mesh_pointer_f,
- int* size,
+ PREPARE_FIELDS_ELASTIC_ADJ_DEV)(long* Mesh_pointer,
+ int* size_f,
int* COMPUTE_AND_STORE_STRAIN,
realw* epsilon_trace_over_3,
realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
@@ -1102,147 +888,144 @@
int* R_size,
realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
realw* b_alphaval,realw* b_betaval,realw* b_gammaval,
+ int* ANISOTROPIC_KL,
int* APPROXIMATE_HESS_KL){
TRACE("prepare_fields_elastic_adj_dev");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+ Mesh* mp = (Mesh*)(*Mesh_pointer);
+ int size;
// checks if kernel simulation
if( mp->simulation_type != 3 ) return;
// kernel simulations
+ // debug
+ //printf("prepare_fields_elastic_adj_dev: rank %d - kernel setup\n",mp->myrank);
+ //synchronize_mpi();
+
+ // backward/reconstructed wavefields
// allocates backward/reconstructed arrays on device (GPU)
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ),sizeof(realw)*(*size)),5201);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc),sizeof(realw)*(*size)),5202);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel),sizeof(realw)*(*size)),5203);
+ size = *size_f;
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ),sizeof(realw)*size),5201);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc),sizeof(realw)*size),5202);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel),sizeof(realw)*size),5203);
+ // initializes values to zero
+ //print_CUDA_error_if_any(cudaMemset(mp->d_b_displ,0,sizeof(realw)*size),5207);
+ //print_CUDA_error_if_any(cudaMemset(mp->d_b_veloc,0,sizeof(realw)*size),5207);
+ //print_CUDA_error_if_any(cudaMemset(mp->d_b_accel,0,sizeof(realw)*size),5207);
- // allocates kernels
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5204);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_mu_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5205);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5206);
+ #ifdef USE_TEXTURES_FIELDS
+ {
+ #ifdef USE_OLDER_CUDA4_GPU
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+ const textureReference* d_b_displ_tex_ref_ptr;
+ print_CUDA_error_if_any(cudaGetTextureReference(&d_b_displ_tex_ref_ptr, "d_b_displ_tex"), 4001);
+ print_CUDA_error_if_any(cudaBindTexture(0, d_b_displ_tex_ref_ptr, mp->d_b_displ, &channelDesc, sizeof(realw)*size), 4001);
+ const textureReference* d_b_veloc_tex_ref_ptr;
+ print_CUDA_error_if_any(cudaGetTextureReference(&d_b_veloc_tex_ref_ptr, "d_b_veloc_tex"), 4002);
+ print_CUDA_error_if_any(cudaBindTexture(0, d_b_veloc_tex_ref_ptr, mp->d_b_veloc, &channelDesc, sizeof(realw)*size), 4002);
+
+ const textureReference* d_b_accel_tex_ref_ptr;
+ print_CUDA_error_if_any(cudaGetTextureReference(&d_b_accel_tex_ref_ptr, "d_b_accel_tex"), 4003);
+ print_CUDA_error_if_any(cudaBindTexture(0, d_b_accel_tex_ref_ptr, mp->d_b_accel, &channelDesc, sizeof(realw)*size), 4003);
+ #else
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
+ print_CUDA_error_if_any(cudaBindTexture(0, &d_b_displ_tex, mp->d_b_displ, &channelDesc, sizeof(realw)*size), 4001);
+ print_CUDA_error_if_any(cudaBindTexture(0, &d_b_veloc_tex, mp->d_b_veloc, &channelDesc, sizeof(realw)*size), 4002);
+ print_CUDA_error_if_any(cudaBindTexture(0, &d_b_accel_tex, mp->d_b_accel, &channelDesc, sizeof(realw)*size), 4003);
+ #endif
+ }
+ #endif
+
+
+ // anisotropic kernel flag
+ mp->anisotropic_kl = *ANISOTROPIC_KL;
+
+ // anisotropic/isotropic kernels
+ // debug
+ //printf("prepare_fields_elastic_adj_dev: rank %d - anisotropic/isotropic kernels\n",mp->myrank);
+ //synchronize_mpi();
+
+ // allocates kernels
+ size = NGLL3 * mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
+ // density kernel
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_kl),size*sizeof(realw)),5204);
// initializes kernel values to zero
- print_CUDA_error_if_any(cudaMemset(mp->d_rho_kl,0,
- NGLL3*mp->NSPEC_AB*sizeof(realw)),5207);
- print_CUDA_error_if_any(cudaMemset(mp->d_mu_kl,0,
- NGLL3*mp->NSPEC_AB*sizeof(realw)),5208);
- print_CUDA_error_if_any(cudaMemset(mp->d_kappa_kl,0,
- NGLL3*mp->NSPEC_AB*sizeof(realw)),5209);
+ print_CUDA_error_if_any(cudaMemset(mp->d_rho_kl,0,size*sizeof(realw)),5214);
+ if( mp->anisotropic_kl ){
+ // anisotropic kernels
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_cijkl_kl),21*size*sizeof(realw)),5205);
+ print_CUDA_error_if_any(cudaMemset(mp->d_cijkl_kl,0,21*size*sizeof(realw)),5215);
+
+ }else{
+ // isotropic kernels
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_mu_kl),size*sizeof(realw)),5206);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_kl),size*sizeof(realw)),5207);
+ print_CUDA_error_if_any(cudaMemset(mp->d_mu_kl,0,size*sizeof(realw)),5216);
+ print_CUDA_error_if_any(cudaMemset(mp->d_kappa_kl,0,size*sizeof(realw)),5217);
+ }
+
// strains used for attenuation and kernel simulations
if( *COMPUTE_AND_STORE_STRAIN ){
// strains
- int epsilondev_size = NGLL3*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
+ // debug
+ //printf("prepare_fields_elastic_adj_dev: rank %d - strains\n",mp->myrank);
+ //synchronize_mpi();
+ size = NGLL3 * mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
+
// solid pressure
- copy_todevice_realw((void**)&mp->d_epsilon_trace_over_3,epsilon_trace_over_3,NGLL3*mp->NSPEC_AB);
+ copy_todevice_realw((void**)&mp->d_epsilon_trace_over_3,epsilon_trace_over_3,size);
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_epsilon_trace_over_3),
-// NGLL3*mp->NSPEC_AB*sizeof(realw)),5310);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilon_trace_over_3,epsilon_trace_over_3,
-// NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),5311);
// backward solid pressure
+ copy_todevice_realw((void**)&mp->d_b_epsilon_trace_over_3,b_epsilon_trace_over_3,size);
- copy_todevice_realw((void**)&mp->d_b_epsilon_trace_over_3,b_epsilon_trace_over_3,NGLL3*mp->NSPEC_AB);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilon_trace_over_3),
-// NGLL3*mp->NSPEC_AB*sizeof(realw)),5312);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilon_trace_over_3 ,b_epsilon_trace_over_3,
-// NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),5313);
-
// prepares backward strains
-
- copy_todevice_realw((void**)&mp->d_b_epsilondev_xx,b_epsilondev_xx,epsilondev_size);
- copy_todevice_realw((void**)&mp->d_b_epsilondev_yy,b_epsilondev_yy,epsilondev_size);
- copy_todevice_realw((void**)&mp->d_b_epsilondev_xy,b_epsilondev_xy,epsilondev_size);
- copy_todevice_realw((void**)&mp->d_b_epsilondev_xz,b_epsilondev_xz,epsilondev_size);
- copy_todevice_realw((void**)&mp->d_b_epsilondev_yz,b_epsilondev_yz,epsilondev_size);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xx),
-// epsilondev_size*sizeof(realw)),5321);
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yy),
-// epsilondev_size*sizeof(realw)),5322);
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xy),
-// epsilondev_size*sizeof(realw)),5323);
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xz),
-// epsilondev_size*sizeof(realw)),5324);
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yz),
-// epsilondev_size*sizeof(realw)),5325);
-
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx,
-// epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5326);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy,
-// epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5327);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy,
-// epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5328);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz,
-// epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5329);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz,
-// epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),5330);
+ copy_todevice_realw((void**)&mp->d_b_epsilondev_xx,b_epsilondev_xx,size);
+ copy_todevice_realw((void**)&mp->d_b_epsilondev_yy,b_epsilondev_yy,size);
+ copy_todevice_realw((void**)&mp->d_b_epsilondev_xy,b_epsilondev_xy,size);
+ copy_todevice_realw((void**)&mp->d_b_epsilondev_xz,b_epsilondev_xz,size);
+ copy_todevice_realw((void**)&mp->d_b_epsilondev_yz,b_epsilondev_yz,size);
}
// attenuation memory variables
if( *ATTENUATION ){
- copy_todevice_realw((void**)&mp->d_b_R_xx,b_R_xx,(*R_size));
- copy_todevice_realw((void**)&mp->d_b_R_yy,b_R_yy,(*R_size));
- copy_todevice_realw((void**)&mp->d_b_R_xy,b_R_xy,(*R_size));
- copy_todevice_realw((void**)&mp->d_b_R_xz,b_R_xz,(*R_size));
- copy_todevice_realw((void**)&mp->d_b_R_yz,b_R_yz,(*R_size));
+ // debug
+ //printf("prepare_fields_elastic_adj_dev: rank %d - attenuation\n",mp->myrank);
+ //synchronize_mpi();
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xx),
-// (*R_size)*sizeof(realw)),5421);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xx,b_R_xx,(*R_size)*sizeof(realw),
-// cudaMemcpyHostToDevice),5422);
+ size = *R_size;
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yy),
-// (*R_size)*sizeof(realw)),5423);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yy,b_R_yy,(*R_size)*sizeof(realw),
-// cudaMemcpyHostToDevice),5424);
+ copy_todevice_realw((void**)&mp->d_b_R_xx,b_R_xx,size);
+ copy_todevice_realw((void**)&mp->d_b_R_yy,b_R_yy,size);
+ copy_todevice_realw((void**)&mp->d_b_R_xy,b_R_xy,size);
+ copy_todevice_realw((void**)&mp->d_b_R_xz,b_R_xz,size);
+ copy_todevice_realw((void**)&mp->d_b_R_yz,b_R_yz,size);
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xy),
-// (*R_size)*sizeof(realw)),5425);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xy,b_R_xy,(*R_size)*sizeof(realw),
-// cudaMemcpyHostToDevice),5426);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xz),
-// (*R_size)*sizeof(realw)),5427);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xz,b_R_xz,(*R_size)*sizeof(realw),
-// cudaMemcpyHostToDevice),5428);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yz),
-// (*R_size)*sizeof(realw)),5429);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yz,b_R_yz,(*R_size)*sizeof(realw),
-// cudaMemcpyHostToDevice),5420);
-
// alpha,beta,gamma factors for backward fields
copy_todevice_realw((void**)&mp->d_b_alphaval,b_alphaval,N_SLS);
copy_todevice_realw((void**)&mp->d_b_betaval,b_betaval,N_SLS);
copy_todevice_realw((void**)&mp->d_b_gammaval,b_gammaval,N_SLS);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_alphaval),
-// N_SLS*sizeof(realw)),5434);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_alphaval ,b_alphaval,
-// N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5435);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_betaval),
-// N_SLS*sizeof(realw)),5436);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_betaval ,b_betaval,
-// N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5437);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_gammaval),
-// N_SLS*sizeof(realw)),5438);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_b_gammaval ,b_gammaval,
-// N_SLS*sizeof(realw),cudaMemcpyHostToDevice),5439);
}
+ // approximate hessian kernel
if( *APPROXIMATE_HESS_KL ){
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_el_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),5450);
- // initializes with zeros
- print_CUDA_error_if_any(cudaMemset(mp->d_hess_el_kl,0,
- NGLL3*mp->NSPEC_AB*sizeof(realw)),5451);
+ // debug
+ //printf("prepare_fields_elastic_adj_dev: rank %d - hessian kernel\n",mp->myrank);
+ //synchronize_mpi();
+
+ size = NGLL3 * mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_el_kl),size*sizeof(realw)),5450);
+ print_CUDA_error_if_any(cudaMemset(mp->d_hess_el_kl,0,size*sizeof(realw)),5451);
}
+ // debug
+ //printf("prepare_fields_elastic_adj_dev: rank %d - done\n",mp->myrank);
+ //synchronize_mpi();
+
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("prepare_fields_elastic_adj_dev");
#endif
@@ -1256,17 +1039,15 @@
extern "C"
void FC_FUNC_(prepare_sim2_or_3_const_device,
- PREPARE_SIM2_OR_3_CONST_DEVICE)(
- long* Mesh_pointer_f,
+ PREPARE_SIM2_OR_3_CONST_DEVICE)(long* Mesh_pointer,
int* islice_selected_rec,
int* islice_selected_rec_size,
int* nadj_rec_local,
- int* nrec,
- int* myrank) {
+ int* nrec) {
TRACE("prepare_sim2_or_3_const_device");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+ Mesh* mp = (Mesh*)(*Mesh_pointer);
// adjoint source arrays
mp->nadj_rec_local = *nadj_rec_local;
@@ -1286,12 +1067,14 @@
int irec_local = 0;
for(int irec = 0; irec < *nrec; irec++) {
- if(*myrank == islice_selected_rec[irec]) {
+ if(mp->myrank == islice_selected_rec[irec]) {
irec_local++;
h_pre_computed_irec[irec_local-1] = irec;
}
}
+ // checks if all local receivers have been found
if( irec_local != mp->nadj_rec_local ) exit_on_error("prepare_sim2_or_3_const_device: irec_local not equal\n");
+
// copies values onto GPU
print_CUDA_error_if_any(cudaMemcpy(mp->d_pre_computed_irec,h_pre_computed_irec,
(mp->nadj_rec_local)*sizeof(int),cudaMemcpyHostToDevice),6010);
@@ -1300,7 +1083,6 @@
// temporary array to prepare extracted source array values
mp->h_adj_sourcearrays_slice = (realw*) malloc( (mp->nadj_rec_local)*3*NGLL3*sizeof(realw) );
if( mp->h_adj_sourcearrays_slice == NULL ) exit_on_error("h_adj_sourcearrays_slice not allocated\n");
-
}
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -1317,7 +1099,7 @@
extern "C"
void FC_FUNC_(prepare_fields_noise_device,
- PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f,
+ PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer,
int* NSPEC_AB, int* NGLOB_AB,
int* free_surface_ispec,
int* free_surface_ijk,
@@ -1325,34 +1107,21 @@
int* NOISE_TOMOGRAPHY,
int* NSTEP,
realw* noise_sourcearray,
- realw* normal_x_noise,
- realw* normal_y_noise,
- realw* normal_z_noise,
+ realw* normal_x_noise, realw* normal_y_noise, realw* normal_z_noise,
realw* mask_noise,
realw* free_surface_jacobian2Dw) {
TRACE("prepare_fields_noise_device");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+ Mesh* mp = (Mesh*)(*Mesh_pointer);
// free surface
mp->num_free_surface_faces = *num_free_surface_faces;
copy_todevice_int((void**)&mp->d_free_surface_ispec,free_surface_ispec,mp->num_free_surface_faces);
-
-// print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ispec,
-// mp->num_free_surface_faces*sizeof(int)),7001);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec, free_surface_ispec,
-// mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),7002);
-
copy_todevice_int((void**)&mp->d_free_surface_ijk,free_surface_ijk,
3*NGLL2*mp->num_free_surface_faces);
-// print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ijk,
-// 3*NGLL2*mp->num_free_surface_faces*sizeof(int)),7003);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
-// 3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),7004);
-
// alloc storage for the surface buffer to be copied
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_noise_surface_movie,
3*NGLL2*mp->num_free_surface_faces*sizeof(realw)),7005);
@@ -1361,11 +1130,6 @@
if( *NOISE_TOMOGRAPHY == 1 ){
copy_todevice_realw((void**)&mp->d_noise_sourcearray,noise_sourcearray,
3*NGLL3*(*NSTEP));
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_noise_sourcearray,
-// 3*NGLL3*(*NSTEP)*sizeof(realw)),7101);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_noise_sourcearray, noise_sourcearray,
-// 3*NGLL3*(*NSTEP)*sizeof(realw),cudaMemcpyHostToDevice),7102);
}
// prepares noise directions
@@ -1375,42 +1139,15 @@
copy_todevice_realw((void**)&mp->d_normal_x_noise,normal_x_noise,nface_size);
copy_todevice_realw((void**)&mp->d_normal_y_noise,normal_y_noise,nface_size);
copy_todevice_realw((void**)&mp->d_normal_z_noise,normal_z_noise,nface_size);
-
copy_todevice_realw((void**)&mp->d_mask_noise,mask_noise,nface_size);
copy_todevice_realw((void**)&mp->d_free_surface_jacobian2Dw,free_surface_jacobian2Dw,nface_size);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_x_noise,
-// nface_size*sizeof(realw)),7301);
-// print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_y_noise,
-// nface_size*sizeof(realw)),7302);
-// print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_z_noise,
-// nface_size*sizeof(realw)),7303);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_mask_noise,
-// nface_size*sizeof(realw)),7304);
-// print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_free_surface_jacobian2Dw,
-// nface_size*sizeof(realw)),7305);
- // transfers data onto GPU
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_x_noise, normal_x_noise,
-// nface_size*sizeof(realw),cudaMemcpyHostToDevice),7306);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_y_noise, normal_y_noise,
-// nface_size*sizeof(realw),cudaMemcpyHostToDevice),7307);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_z_noise, normal_z_noise,
-// nface_size*sizeof(realw),cudaMemcpyHostToDevice),7308);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_mask_noise, mask_noise,
-// nface_size*sizeof(realw),cudaMemcpyHostToDevice),7309);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_jacobian2Dw, free_surface_jacobian2Dw,
-// nface_size*sizeof(realw),cudaMemcpyHostToDevice),7310);
}
// prepares noise strength kernel
if( *NOISE_TOMOGRAPHY == 3 ){
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_Sigma_kl),
- NGLL3*(mp->NSPEC_AB)*sizeof(realw)),7401);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_Sigma_kl),NGLL3*(mp->NSPEC_AB)*sizeof(realw)),7401);
// initializes kernel values to zero
- print_CUDA_error_if_any(cudaMemset(mp->d_Sigma_kl,0,
- NGLL3*mp->NSPEC_AB*sizeof(realw)),7403);
-
+ print_CUDA_error_if_any(cudaMemset(mp->d_Sigma_kl,0,NGLL3*mp->NSPEC_AB*sizeof(realw)),7403);
}
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -1427,7 +1164,7 @@
extern "C"
void FC_FUNC_(prepare_fields_gravity_device,
- PREPARE_FIELDS_gravity_DEVICE)(long* Mesh_pointer_f,
+ PREPARE_FIELDS_gravity_DEVICE)(long* Mesh_pointer,
int* GRAVITY,
realw* minus_deriv_gravity,
realw* minus_g,
@@ -1437,7 +1174,7 @@
TRACE("prepare_fields_gravity_device");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+ Mesh* mp = (Mesh*)(*Mesh_pointer);
setConst_wgll_cube(h_wgll_cube,mp);
@@ -1445,21 +1182,10 @@
if( mp->gravity ){
copy_todevice_realw((void**)&mp->d_minus_deriv_gravity,minus_deriv_gravity,mp->NGLOB_AB);
-
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_minus_deriv_gravity),
-// (mp->NGLOB_AB)*sizeof(realw)),8000);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_minus_deriv_gravity, minus_deriv_gravity,
-// (mp->NGLOB_AB)*sizeof(realw),cudaMemcpyHostToDevice),8001);
-
copy_todevice_realw((void**)&mp->d_minus_g,minus_g,mp->NGLOB_AB);
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_minus_g),
-// (mp->NGLOB_AB)*sizeof(realw)),8002);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_minus_g, minus_g,
-// (mp->NGLOB_AB)*sizeof(realw),cudaMemcpyHostToDevice),8003);
-
-
if( *ACOUSTIC_SIMULATION == 0 ){
+ // density
// rhostore not allocated yet
int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
// padded array
@@ -1472,10 +1198,16 @@
}
}
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("prepare_fields_gravity_device");
+#endif
}
/* ----------------------------------------------------------------------------------------------- */
+// unused yet...
+
+/*
extern "C"
void FC_FUNC_(prepare_seismogram_fields,
PREPARE_SEISMOGRAM_FIELDS)(long* Mesh_pointer,int* nrec_local, double* nu, double* hxir, double* hetar, double* hgammar) {
@@ -1501,6 +1233,7 @@
cudaMallocHost((void**)&mp->h_seismograms_v_it,3**nrec_local*sizeof(realw));
cudaMallocHost((void**)&mp->h_seismograms_a_it,3**nrec_local*sizeof(realw));
}
+*/
/* ----------------------------------------------------------------------------------------------- */
@@ -1510,8 +1243,7 @@
extern "C"
void FC_FUNC_(prepare_cleanup_device,
- PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f,
- int* SAVE_FORWARD,
+ PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer,
int* ACOUSTIC_SIMULATION,
int* ELASTIC_SIMULATION,
int* ABSORBING_CONDITIONS,
@@ -1525,7 +1257,7 @@
TRACE("prepare_cleanup_device");
// frees allocated memory arrays
- Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+ Mesh* mp = (Mesh*)(*Mesh_pointer);
// frees memory on GPU
// mesh
@@ -1610,7 +1342,9 @@
cudaFree(mp->d_displ);
cudaFree(mp->d_veloc);
cudaFree(mp->d_accel);
+
cudaFree(mp->d_send_accel_buffer);
+ if( mp->simulation_type == 3) cudaFree(mp->d_b_send_accel_buffer);
cudaFree(mp->d_rmassx);
cudaFree(mp->d_rmassy);
@@ -1628,7 +1362,7 @@
cudaFree(mp->d_rho_vp);
cudaFree(mp->d_rho_vs);
- if(mp->simulation_type == 3 || ( mp->simulation_type == 1 && *SAVE_FORWARD ))
+ if(mp->simulation_type == 3 || ( mp->simulation_type == 1 && mp->save_forward ))
cudaFree(mp->d_b_absorb_field);
}
@@ -1637,8 +1371,12 @@
cudaFree(mp->d_b_veloc);
cudaFree(mp->d_b_accel);
cudaFree(mp->d_rho_kl);
- cudaFree(mp->d_mu_kl);
- cudaFree(mp->d_kappa_kl);
+ if( mp->anisotropic_kl ){
+ cudaFree(mp->d_cijkl_kl);
+ }else{
+ cudaFree(mp->d_mu_kl);
+ cudaFree(mp->d_kappa_kl);
+ }
if( *APPROXIMATE_HESS_KL ) cudaFree(mp->d_hess_el_kl);
}
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/specfem3D_gpu_cuda_method_stubs.c 2013-08-20 14:13:26 UTC (rev 22718)
@@ -1,4 +1,4 @@
-/*
+/*
!=====================================================================
!
! S p e c f e m 3 D V e r s i o n 2 . 1
@@ -33,76 +33,74 @@
typedef float realw;
+
-
//
// src/cuda/check_fields_cuda.cu
//
-void FC_FUNC_(pause_for_debug,PAUSE_FOR_DEBUG)() {}
+void FC_FUNC_(pause_for_debug,PAUSE_FOR_DEBUG)() {}
void FC_FUNC_(output_free_device_memory,
- OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {}
+ OUTPUT_FREE_DEVICE_MEMORY)(int* myrank_f) {}
void FC_FUNC_(get_free_device_memory,
- get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {}
+ get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {}
+void FC_FUNC_(get_norm_acoustic_from_device,
+ GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,long* Mesh_pointer,int* sim_type) {}
+
+void FC_FUNC_(get_norm_elastic_from_device,
+ GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
+ long* Mesh_pointer,
+ int* type) {}
+
+void FC_FUNC_(get_max_accel,
+ GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {}
+
void FC_FUNC_(check_max_norm_displ_gpu,
- CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {}
+ CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer,int* announceID) {}
void FC_FUNC_(check_max_norm_vector,
- CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {}
+ CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {}
void FC_FUNC_(check_max_norm_displ,
- CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {}
+ CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {}
void FC_FUNC_(check_max_norm_b_displ_gpu,
- CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {}
+ CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer,int* announceID) {}
void FC_FUNC_(check_max_norm_b_accel_gpu,
- CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {}
+ CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer,int* announceID) {}
void FC_FUNC_(check_max_norm_b_veloc_gpu,
- CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {}
+ CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer,int* announceID) {}
void FC_FUNC_(check_max_norm_b_displ,
- CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {}
+ CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {}
void FC_FUNC_(check_max_norm_b_accel,
- CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {}
+ CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {}
void FC_FUNC_(check_error_vectors,
- CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {}
+ CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {}
-void FC_FUNC_(get_max_accel,
- GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {}
-void FC_FUNC_(get_norm_acoustic_from_device,
- GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
- long* Mesh_pointer_f) {}
-
-void FC_FUNC_(get_norm_elastic_from_device,
- GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
- long* Mesh_pointer_f) {}
-
-
//
// src/cuda/compute_add_sources_acoustic_cuda.cu
//
void FC_FUNC_(compute_add_sources_ac_cuda,
- COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f,
- int* phase_is_innerf,
- int* NSOURCESf,
- double* h_stf_pre_compute,
- int* myrankf) {}
+ COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer,
+ int* phase_is_innerf,
+ int* NSOURCESf,
+ double* h_stf_pre_compute) {}
void FC_FUNC_(compute_add_sources_ac_s3_cuda,
- COMPUTE_ADD_SOURCES_AC_s3_CUDA)(long* Mesh_pointer_f,
- int* phase_is_innerf,
- int* NSOURCESf,
- double* h_stf_pre_compute,
- int* myrankf) {}
+ COMPUTE_ADD_SOURCES_AC_s3_CUDA)(long* Mesh_pointer,
+ int* phase_is_innerf,
+ int* NSOURCESf,
+ double* h_stf_pre_compute) {}
void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer,
@@ -111,12 +109,11 @@
int* h_ispec_is_inner,
int* h_ispec_is_acoustic,
int* h_ispec_selected_rec,
- int* myrank,
int* nrec,
int* time_index,
int* h_islice_selected_rec,
int* nadj_rec_local,
- int* NTSTEP_BETWEEN_READ_ADJSRC) {}
+ int* NTSTEP_BETWEEN_READ_ADJSRC) {}
//
@@ -124,25 +121,22 @@
//
void FC_FUNC_(compute_add_sources_el_cuda,
- COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f,
- int* phase_is_innerf,
- int* NSOURCESf,
- double* h_stf_pre_compute,
- int* myrankf) {}
+ COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer,
+ double* h_stf_pre_compute,
+ int* h_NSOURCES,
+ int* h_phase_is_inner) {}
void FC_FUNC_(compute_add_sources_el_s3_cuda,
COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer,
double* h_stf_pre_compute,
- int* NSOURCESf,
- int* phase_is_inner,
- int* myrank) {}
+ int* h_NSOURCES,
+ int* h_phase_is_inner) {}
void FC_FUNC_(add_source_master_rec_noise_cu,
- ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer_f,
- int* myrank_f,
- int* it_f,
- int* irec_master_noise_f,
- int* islice_selected_rec) {}
+ ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer,
+ int* it_f,
+ int* irec_master_noise_f,
+ int* islice_selected_rec) {}
void FC_FUNC_(add_sources_el_sim_type_2_or_3,
ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
@@ -151,12 +145,11 @@
int* h_ispec_is_inner,
int* h_ispec_is_elastic,
int* h_ispec_selected_rec,
- int* myrank,
int* nrec,
int* time_index,
int* h_islice_selected_rec,
int* nadj_rec_local,
- int* NTSTEP_BETWEEN_READ_ADJSRC) {}
+ int* NTSTEP_BETWEEN_READ_ADJSRC) {}
//
@@ -164,17 +157,17 @@
//
void FC_FUNC_(compute_coupling_ac_el_cuda,
- COMPUTE_COUPLING_AC_EL_CUDA)(long* Mesh_pointer_f,
+ COMPUTE_COUPLING_AC_EL_CUDA)(long* Mesh_pointer,
int* phase_is_innerf,
- int* num_coupling_ac_el_facesf) {}
+ int* num_coupling_ac_el_facesf) {}
void FC_FUNC_(compute_coupling_el_ac_cuda,
- COMPUTE_COUPLING_EL_AC_CUDA)(long* Mesh_pointer_f,
+ COMPUTE_COUPLING_EL_AC_CUDA)(long* Mesh_pointer,
int* phase_is_innerf,
- int* num_coupling_ac_el_facesf) {}
+ int* num_coupling_ac_el_facesf) {}
void FC_FUNC_(compute_coupling_ocean_cuda,
- COMPUTE_COUPLING_OCEAN_CUDA)(long* Mesh_pointer_f) {}
+ COMPUTE_COUPLING_OCEAN_CUDA)(long* Mesh_pointer) {}
//
@@ -182,37 +175,26 @@
//
void FC_FUNC_(transfer_boun_pot_from_device,
- TRANSFER_BOUN_POT_FROM_DEVICE)(
- int* size,
- long* Mesh_pointer_f,
- realw* potential_dot_dot_acoustic,
- realw* send_potential_dot_dot_buffer,
- int* num_interfaces_ext_mesh,
- int* max_nibool_interfaces_ext_mesh,
- int* nibool_interfaces_ext_mesh,
- int* ibool_interfaces_ext_mesh,
- int* FORWARD_OR_ADJOINT){}
+ TRANSFER_BOUN_POT_FROM_DEVICE)(long* Mesh_pointer,
+ realw* potential_dot_dot_acoustic,
+ realw* send_potential_dot_dot_buffer,
+ int* FORWARD_OR_ADJOINT){}
void FC_FUNC_(transfer_asmbl_pot_to_device,
- TRANSFER_ASMBL_POT_TO_DEVICE)(
- long* Mesh_pointer,
- realw* potential_dot_dot_acoustic,
- realw* buffer_recv_scalar_ext_mesh,
- int* num_interfaces_ext_mesh,
- int* max_nibool_interfaces_ext_mesh,
- int* nibool_interfaces_ext_mesh,
- int* ibool_interfaces_ext_mesh,
- int* FORWARD_OR_ADJOINT) {}
+ TRANSFER_ASMBL_POT_TO_DEVICE)(long* Mesh_pointer,
+ realw* potential_dot_dot_acoustic,
+ realw* buffer_recv_scalar_ext_mesh,
+ int* FORWARD_OR_ADJOINT) {}
void FC_FUNC_(compute_forces_acoustic_cuda,
- COMPUTE_FORCES_ACOUSTIC_CUDA)(long* Mesh_pointer_f,
+ COMPUTE_FORCES_ACOUSTIC_CUDA)(long* Mesh_pointer,
int* iphase,
int* nspec_outer_acoustic,
- int* nspec_inner_acoustic) {}
+ int* nspec_inner_acoustic) {}
void FC_FUNC_(acoustic_enforce_free_surf_cuda,
- ACOUSTIC_ENFORCE_FREE_SURF_CUDA)(long* Mesh_pointer_f,
- int* ABSORB_INSTEAD_OF_FREE_SURFACE) {}
+ ACOUSTIC_ENFORCE_FREE_SURF_CUDA)(long* Mesh_pointer,
+ int* ABSORB_INSTEAD_OF_FREE_SURFACE) {}
//
@@ -220,33 +202,21 @@
//
void FC_FUNC_(transfer_boun_accel_from_device,
- TRANSFER_BOUN_ACCEL_FROM_DEVICE)(int* size, long* Mesh_pointer_f, realw* accel,
- realw* send_accel_buffer,
- int* num_interfaces_ext_mesh,
- int* max_nibool_interfaces_ext_mesh,
- int* nibool_interfaces_ext_mesh,
- int* ibool_interfaces_ext_mesh,
- int* FORWARD_OR_ADJOINT){}
+ TRANSFER_BOUN_ACCEL_FROM_DEVICE)(long* Mesh_pointer,
+ realw* accel,
+ realw* send_accel_buffer,
+ int* FORWARD_OR_ADJOINT){}
void FC_FUNC_(transfer_boundary_from_device_a,
TRANSFER_BOUNDARY_FROM_DEVICE_A)(long* Mesh_pointer,
- int* nspec_outer_elastic) {}
+ int* nspec_outer_elastic) {}
void FC_FUNC_(transfer_boundary_to_device_a,
TRANSFER_BOUNDARY_TO_DEVICE_A)(long* Mesh_pointer,
realw* buffer_recv_vector_ext_mesh,
int* num_interfaces_ext_mesh,
- int* max_nibool_interfaces_ext_mesh) {}
+ int* max_nibool_interfaces_ext_mesh) {}
-//void FC_FUNC_(assemble_accel_on_device,
-// ASSEMBLE_ACCEL_on_DEVICE)(long* Mesh_pointer, realw* accel,
-// realw* buffer_recv_vector_ext_mesh,
-// int* num_interfaces_ext_mesh,
-// int* max_nibool_interfaces_ext_mesh,
-// int* nibool_interfaces_ext_mesh,
-// int* ibool_interfaces_ext_mesh,
-// int* FORWARD_OR_ADJOINT) {}
-
void FC_FUNC_(transfer_asmbl_accel_to_device,
TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer, realw* accel,
realw* buffer_recv_vector_ext_mesh,
@@ -254,21 +224,31 @@
int* max_nibool_interfaces_ext_mesh,
int* nibool_interfaces_ext_mesh,
int* ibool_interfaces_ext_mesh,
- int* FORWARD_OR_ADJOINT) {}
+ int* FORWARD_OR_ADJOINT) {}
+//void FC_FUNC_(assemble_accel_on_device,
+// ASSEMBLE_ACCEL_on_DEVICE)(long* Mesh_pointer, realw* accel,
+// realw* buffer_recv_vector_ext_mesh,
+// int* num_interfaces_ext_mesh,
+// int* max_nibool_interfaces_ext_mesh,
+// int* nibool_interfaces_ext_mesh,
+// int* ibool_interfaces_ext_mesh,
+// int* FORWARD_OR_ADJOINT) {}
+
void FC_FUNC_(compute_forces_viscoelastic_cuda,
- COMPUTE_FORCES_VISCOELASTIC_CUDA)(long* Mesh_pointer_f,
- int* iphase,
- int* nspec_outer_elastic,
- int* nspec_inner_elastic,
- int* COMPUTE_AND_STORE_STRAIN,
- int* ATTENUATION,
- int* ANISOTROPY) {}
+ COMPUTE_FORCES_VISCOELASTIC_CUDA)(long* Mesh_pointer,
+ int* iphase,
+ realw* deltat,
+ int* nspec_outer_elastic,
+ int* nspec_inner_elastic,
+ int* COMPUTE_AND_STORE_STRAIN,
+ int* ATTENUATION,
+ int* ANISOTROPY) {}
void FC_FUNC_(sync_copy_from_device,
- SYNC_copy_FROM_DEVICE)(long* Mesh_pointer_f,
+ SYNC_copy_FROM_DEVICE)(long* Mesh_pointer,
int* iphase,
- realw* send_buffer) {}
+ realw* send_buffer) {}
//
@@ -277,23 +257,22 @@
void FC_FUNC_(compute_kernels_elastic_cuda,
COMPUTE_KERNELS_ELASTIC_CUDA)(long* Mesh_pointer,
- realw* deltat_f) {}
+ realw* deltat_f) {}
void FC_FUNC_(compute_kernels_strgth_noise_cu,
COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer,
realw* h_noise_surface_movie,
- realw* deltat) {}
+ realw* deltat) {}
void FC_FUNC_(compute_kernels_acoustic_cuda,
- COMPUTE_KERNELS_ACOUSTIC_CUDA)(
- long* Mesh_pointer,
- realw* deltat_f) {}
+ COMPUTE_KERNELS_ACOUSTIC_CUDA)(long* Mesh_pointer,
+ realw* deltat_f) {}
void FC_FUNC_(compute_kernels_hess_cuda,
COMPUTE_KERNELS_HESS_CUDA)(long* Mesh_pointer,
realw* deltat_f,
int* ELASTIC_SIMULATION,
- int* ACOUSTIC_SIMULATION) {}
+ int* ACOUSTIC_SIMULATION) {}
//
@@ -301,10 +280,9 @@
//
void FC_FUNC_(compute_stacey_acoustic_cuda,
- COMPUTE_STACEY_ACOUSTIC_CUDA)(long* Mesh_pointer_f,
+ COMPUTE_STACEY_ACOUSTIC_CUDA)(long* Mesh_pointer,
int* phase_is_innerf,
- int* SAVE_FORWARDf,
- realw* h_b_absorb_potential) {}
+ realw* h_b_absorb_potential) {}
//
@@ -312,10 +290,9 @@
//
void FC_FUNC_(compute_stacey_viscoelastic_cuda,
- COMPUTE_STACEY_VISCOELASTIC_CUDA)(long* Mesh_pointer_f,
+ COMPUTE_STACEY_VISCOELASTIC_CUDA)(long* Mesh_pointer,
int* phase_is_innerf,
- int* SAVE_FORWARDf,
- realw* h_b_absorb_field) {}
+ realw* b_absorb_field) {}
//
@@ -323,10 +300,10 @@
//
void FC_FUNC_(initialize_cuda_device,
- INITIALIZE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) {
+ INITIALIZE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) {
fprintf(stderr,"ERROR: GPU_MODE enabled without GPU/CUDA Support. To enable GPU support, reconfigure with --with-cuda flag.\n");
exit(1);
-}
+}
//
@@ -334,71 +311,65 @@
//
void FC_FUNC_(it_update_displacement_cuda,
- IT_UPDATE_DISPLACMENT_CUDA)(long* Mesh_pointer_f,
- int* size_F,
+ IT_UPDATE_DISPLACMENT_CUDA)(long* Mesh_pointer,
realw* deltat_F,
realw* deltatsqover2_F,
realw* deltatover2_F,
realw* b_deltat_F,
realw* b_deltatsqover2_F,
- realw* b_deltatover2_F) {}
+ realw* b_deltatover2_F) {}
void FC_FUNC_(it_update_displacement_ac_cuda,
- it_update_displacement_ac_cuda)(long* Mesh_pointer_f,
- int* size_F,
+ it_update_displacement_ac_cuda)(long* Mesh_pointer,
realw* deltat_F,
realw* deltatsqover2_F,
realw* deltatover2_F,
realw* b_deltat_F,
realw* b_deltatsqover2_F,
- realw* b_deltatover2_F) {}
+ realw* b_deltatover2_F) {}
void FC_FUNC_(kernel_3_a_cuda,
KERNEL_3_A_CUDA)(long* Mesh_pointer,
- int* size_F,
realw* deltatover2_F,
realw* b_deltatover2_F,
- int* APPROXIMATE_OCEAN_LOAD) {}
+ int* APPROXIMATE_OCEAN_LOAD) {}
void FC_FUNC_(kernel_3_b_cuda,
KERNEL_3_B_CUDA)(long* Mesh_pointer,
- int* size_F,
- realw* deltatover2_F,
- realw* b_deltatover2_F) {}
+ realw* deltatover2_F,
+ realw* b_deltatover2_F) {}
-void FC_FUNC_(kernel_3_a_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)(
- long* Mesh_pointer,
- int* size_F) {}
+void FC_FUNC_(kernel_3_a_acoustic_cuda,
+ KERNEL_3_ACOUSTIC_CUDA)(long* Mesh_pointer ) {}
-void FC_FUNC_(kernel_3_b_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)(
- long* Mesh_pointer,
- int* size_F,
- realw* deltatover2_F,
- realw* b_deltatover2_F) {}
+void FC_FUNC_(kernel_3_b_acoustic_cuda,
+ KERNEL_3_ACOUSTIC_CUDA)(long* Mesh_pointer,
+ realw* deltatover2_F,
+ realw* b_deltatover2_F) {}
//
// src/cuda/noise_tomography_cuda.cu
//
-void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){}
+void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){}
-void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id) {}
+void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id) {}
-void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) {}
+void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) {}
-void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val) {}
+void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val) {}
-void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) {}
+void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer,realw* h_displ) {}
void FC_FUNC_(transfer_surface_to_host,
- TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer_f,
- realw* h_noise_surface_movie) {}
+ TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer,
+ realw* h_noise_surface_movie) {}
void FC_FUNC_(noise_read_add_surface_movie_cu,
- NOISE_READ_ADD_SURFACE_MOVIE_CU)(long* Mesh_pointer_f,
+ NOISE_READ_ADD_SURFACE_MOVIE_CU)(long* Mesh_pointer,
realw* h_noise_surface_movie,
- int* NOISE_TOMOGRAPHYf) {}
+ int* NOISE_TOMOGRAPHYf) {}
//
@@ -407,19 +378,15 @@
void FC_FUNC_(prepare_constants_device,
PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
- int* h_NGLLX,
- int* NSPEC_AB, int* NGLOB_AB,
+ int* h_NGLLX, int* NSPEC_AB, int* NGLOB_AB,
realw* h_xix, realw* h_xiy, realw* h_xiz,
realw* h_etax, realw* h_etay, realw* h_etaz,
realw* h_gammax, realw* h_gammay, realw* h_gammaz,
realw* h_kappav, realw* h_muv,
int* h_ibool,
- int* num_interfaces_ext_mesh,
- int* max_nibool_interfaces_ext_mesh,
- int* h_nibool_interfaces_ext_mesh,
- int* h_ibool_interfaces_ext_mesh,
- realw* h_hprime_xx,
- realw* h_hprimewgll_xx,
+ int* num_interfaces_ext_mesh, int* max_nibool_interfaces_ext_mesh,
+ int* h_nibool_interfaces_ext_mesh, int* h_ibool_interfaces_ext_mesh,
+ realw* h_hprime_xx, realw* h_hprimewgll_xx,
realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz,
int* ABSORBING_CONDITIONS,
int* h_abs_boundary_ispec, int* h_abs_boundary_ijk,
@@ -427,37 +394,27 @@
realw* h_abs_boundary_jacobian2Dw,
int* h_num_abs_boundary_faces,
int* h_ispec_is_inner,
- int* NSOURCES,
- int* nsources_local_f,
+ int* NSOURCES, int* nsources_local_f,
realw* h_sourcearrays,
- int* h_islice_selected_source,
- int* h_ispec_selected_source,
- int* h_number_receiver_global,
- int* h_ispec_selected_rec,
+ int* h_islice_selected_source, int* h_ispec_selected_source,
+ int* h_number_receiver_global, int* h_ispec_selected_rec,
int* nrec,int* nrec_local,
int* SIMULATION_TYPE,
int* USE_MESH_COLORING_GPU_f,
int* nspec_acoustic,int* nspec_elastic,
- int* my_neighbours_ext_mesh,
- int* request_send_vector_ext_mesh,
- int* request_recv_vector_ext_mesh,
- realw* buffer_recv_vector_ext_mesh
- ) {}
+ int* h_myrank,
+ int* SAVE_FORWARD ) {}
void FC_FUNC_(prepare_fields_acoustic_device,
- PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f,
- realw* rmass_acoustic,
- realw* rhostore,
- realw* kappastore,
- int* num_phase_ispec_acoustic,
- int* phase_ispec_inner_acoustic,
+ PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer,
+ realw* rmass_acoustic, realw* rhostore, realw* kappastore,
+ int* num_phase_ispec_acoustic, int* phase_ispec_inner_acoustic,
int* ispec_is_acoustic,
int* NOISE_TOMOGRAPHY,
int* num_free_surface_faces,
int* free_surface_ispec,
int* free_surface_ijk,
- int* b_reclen_potential,
- realw* b_absorb_potential,
+ int* b_reclen_potential, realw* b_absorb_potential,
int* ELASTIC_SIMULATION,
int* num_coupling_ac_el_faces,
int* coupling_ac_el_ispec,
@@ -466,26 +423,20 @@
realw* coupling_ac_el_jacobian2Dw,
int* num_colors_outer_acoustic,
int* num_colors_inner_acoustic,
- int* num_elem_colors_acoustic) {}
+ int* num_elem_colors_acoustic) {}
void FC_FUNC_(prepare_fields_acoustic_adj_dev,
- PREPARE_FIELDS_ACOUSTIC_ADJ_DEV)(long* Mesh_pointer_f,
- int* APPROXIMATE_HESS_KL) {}
+ PREPARE_FIELDS_ACOUSTIC_ADJ_DEV)(long* Mesh_pointer,
+ int* APPROXIMATE_HESS_KL) {}
void FC_FUNC_(prepare_fields_elastic_device,
- PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer_f,
- int* size,
- realw* rmassx,
- realw* rmassy,
- realw* rmassz,
- realw* rho_vp,
- realw* rho_vs,
+ PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer,
+ realw* rmassx, realw* rmassy, realw* rmassz,
+ realw* rho_vp, realw* rho_vs,
int* num_phase_ispec_elastic,
int* phase_ispec_inner_elastic,
int* ispec_is_elastic,
- realw* h_b_absorb_field,
- int* h_b_reclen_field,
- int* SAVE_FORWARD,
+ realw* b_absorb_field, int* b_reclen_field,
int* COMPUTE_AND_STORE_STRAIN,
realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
realw* epsilondev_xz,realw* epsilondev_yz,
@@ -506,31 +457,17 @@
int* num_colors_inner_elastic,
int* num_elem_colors_elastic,
int* ANISOTROPY,
- realw *c11store,
- realw *c12store,
- realw *c13store,
- realw *c14store,
- realw *c15store,
- realw *c16store,
- realw *c22store,
- realw *c23store,
- realw *c24store,
- realw *c25store,
- realw *c26store,
- realw *c33store,
- realw *c34store,
- realw *c35store,
- realw *c36store,
- realw *c44store,
- realw *c45store,
- realw *c46store,
- realw *c55store,
- realw *c56store,
- realw *c66store){}
+ realw *c11store,realw *c12store,realw *c13store,
+ realw *c14store,realw *c15store,realw *c16store,
+ realw *c22store,realw *c23store,realw *c24store,
+ realw *c25store,realw *c26store,realw *c33store,
+ realw *c34store,realw *c35store,realw *c36store,
+ realw *c44store,realw *c45store,realw *c46store,
+ realw *c55store,realw *c56store,realw *c66store ){}
void FC_FUNC_(prepare_fields_elastic_adj_dev,
- PREPARE_FIELDS_ELASTIC_ADJ_DEV)(long* Mesh_pointer_f,
- int* size,
+ PREPARE_FIELDS_ELASTIC_ADJ_DEV)(long* Mesh_pointer,
+ int* size_f,
int* COMPUTE_AND_STORE_STRAIN,
realw* epsilon_trace_over_3,
realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
@@ -540,19 +477,18 @@
int* R_size,
realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
realw* b_alphaval,realw* b_betaval,realw* b_gammaval,
- int* APPROXIMATE_HESS_KL){}
+ int* ANISOTROPIC_KL,
+ int* APPROXIMATE_HESS_KL){}
void FC_FUNC_(prepare_sim2_or_3_const_device,
- PREPARE_SIM2_OR_3_CONST_DEVICE)(
- long* Mesh_pointer_f,
+ PREPARE_SIM2_OR_3_CONST_DEVICE)(long* Mesh_pointer,
int* islice_selected_rec,
int* islice_selected_rec_size,
int* nadj_rec_local,
- int* nrec,
- int* myrank) {}
+ int* nrec) {}
void FC_FUNC_(prepare_fields_noise_device,
- PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f,
+ PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer,
int* NSPEC_AB, int* NGLOB_AB,
int* free_surface_ispec,
int* free_surface_ijk,
@@ -560,27 +496,24 @@
int* NOISE_TOMOGRAPHY,
int* NSTEP,
realw* noise_sourcearray,
- realw* normal_x_noise,
- realw* normal_y_noise,
- realw* normal_z_noise,
+ realw* normal_x_noise, realw* normal_y_noise, realw* normal_z_noise,
realw* mask_noise,
- realw* free_surface_jacobian2Dw) {}
+ realw* free_surface_jacobian2Dw) {}
void FC_FUNC_(prepare_fields_gravity_device,
- PREPARE_FIELDS_gravity_DEVICE)(long* Mesh_pointer_f,
+ PREPARE_FIELDS_gravity_DEVICE)(long* Mesh_pointer,
int* GRAVITY,
realw* minus_deriv_gravity,
realw* minus_g,
realw* h_wgll_cube,
int* ACOUSTIC_SIMULATION,
- realw* rhostore) {}
+ realw* rhostore) {}
void FC_FUNC_(prepare_seismogram_fields,
- PREPARE_SEISMOGRAM_FIELDS)(long* Mesh_pointer,int* nrec_local, double* nu, double* hxir, double* hetar, double* hgammar) {}
+ PREPARE_SEISMOGRAM_FIELDS)(long* Mesh_pointer,int* nrec_local, double* nu, double* hxir, double* hetar, double* hgammar) {}
void FC_FUNC_(prepare_cleanup_device,
- PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f,
- int* SAVE_FORWARD,
+ PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer,
int* ACOUSTIC_SIMULATION,
int* ELASTIC_SIMULATION,
int* ABSORBING_CONDITIONS,
@@ -589,7 +522,7 @@
int* ATTENUATION,
int* ANISOTROPY,
int* APPROXIMATE_OCEAN_LOAD,
- int* APPROXIMATE_HESS_KL) {}
+ int* APPROXIMATE_HESS_KL) {}
//
@@ -597,153 +530,133 @@
//
void FC_FUNC_(transfer_fields_el_to_device,
- TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+ TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer) {}
void FC_FUNC_(transfer_fields_el_from_device,
- TRANSFER_FIELDS_EL_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+ TRANSFER_FIELDS_EL_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer) {}
void FC_FUNC_(transfer_b_fields_to_device,
TRANSFER_B_FIELDS_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
- long* Mesh_pointer_f) {}
+ long* Mesh_pointer) {}
void FC_FUNC_(transfer_b_fields_from_device,
- TRANSFER_B_FIELDS_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,long* Mesh_pointer_f) {}
+ TRANSFER_B_FIELDS_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,long* Mesh_pointer) {}
void FC_FUNC_(transfer_accel_to_device,
- TRNASFER_ACCEL_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+ TRNASFER_ACCEL_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer) {}
void FC_FUNC_(transfer_accel_from_device,
- TRANSFER_ACCEL_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+ TRANSFER_ACCEL_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer) {}
void FC_FUNC_(transfer_b_accel_from_device,
- TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {}
+ TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer) {}
void FC_FUNC_(transfer_sigma_from_device,
- TRANSFER_SIGMA_FROM_DEVICE)(int* size, realw* sigma_kl,long* Mesh_pointer_f) {}
+ TRANSFER_SIGMA_FROM_DEVICE)(int* size, realw* sigma_kl,long* Mesh_pointer) {}
void FC_FUNC_(transfer_b_displ_from_device,
- TRANSFER_B_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {}
+ TRANSFER_B_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer) {}
void FC_FUNC_(transfer_displ_from_device,
- TRANSFER_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {}
+ TRANSFER_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer) {}
-void FC_FUNC_(transfer_compute_kernel_answers_from_device,
- TRANSFER_COMPUTE_KERNEL_ANSWERS_FROM_DEVICE)(long* Mesh_pointer,
- realw* rho_kl,int* size_rho,
- realw* mu_kl, int* size_mu,
- realw* kappa_kl, int* size_kappa) {}
-
-void FC_FUNC_(transfer_compute_kernel_fields_from_device,
- TRANSFER_COMPUTE_KERNEL_FIELDS_FROM_DEVICE)(long* Mesh_pointer,
- realw* accel, int* size_accel,
- realw* b_displ, int* size_b_displ,
- realw* epsilondev_xx,
- realw* epsilondev_yy,
- realw* epsilondev_xy,
- realw* epsilondev_xz,
- realw* epsilondev_yz,
- int* size_epsilondev,
- realw* b_epsilondev_xx,
- realw* b_epsilondev_yy,
- realw* b_epsilondev_xy,
- realw* b_epsilondev_xz,
- realw* b_epsilondev_yz,
- int* size_b_epsilondev,
- realw* rho_kl,int* size_rho,
- realw* mu_kl, int* size_mu,
- realw* kappa_kl, int* size_kappa,
- realw* epsilon_trace_over_3,
- realw* b_epsilon_trace_over_3,
- int* size_epsilon_trace_over_3) {}
-
void FC_FUNC_(transfer_b_fields_att_to_device,
TRANSFER_B_FIELDS_ATT_TO_DEVICE)(long* Mesh_pointer,
- realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
- int* size_R,
- realw* b_epsilondev_xx,
- realw* b_epsilondev_yy,
- realw* b_epsilondev_xy,
- realw* b_epsilondev_xz,
- realw* b_epsilondev_yz,
- int* size_epsilondev) {}
+ realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,
+ realw* b_R_xz,realw* b_R_yz,
+ int* size_R,
+ realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
+ realw* b_epsilondev_xz,realw* b_epsilondev_yz,
+ int* size_epsilondev) {}
void FC_FUNC_(transfer_fields_att_from_device,
TRANSFER_FIELDS_ATT_FROM_DEVICE)(long* Mesh_pointer,
realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
int* size_R,
- realw* epsilondev_xx,
- realw* epsilondev_yy,
- realw* epsilondev_xy,
- realw* epsilondev_xz,
- realw* epsilondev_yz,
- int* size_epsilondev) {}
+ realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+ realw* epsilondev_xz,realw* epsilondev_yz,
+ int* size_epsilondev) {}
void FC_FUNC_(transfer_kernels_el_to_host,
TRANSFER_KERNELS_EL_TO_HOST)(long* Mesh_pointer,
- realw* h_rho_kl,
- realw* h_mu_kl,
- realw* h_kappa_kl,
- int* NSPEC_AB) {}
+ realw* h_rho_kl,
+ realw* h_mu_kl,
+ realw* h_kappa_kl,
+ realw* h_cijkl_kl,
+ int* NSPEC_AB) {}
void FC_FUNC_(transfer_kernels_noise_to_host,
TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer,
- realw* h_Sigma_kl,
- int* NSPEC_AB) {}
+ realw* h_Sigma_kl,
+ int* NSPEC_AB) {}
void FC_FUNC_(transfer_fields_ac_to_device,
- TRANSFER_FIELDS_AC_TO_DEVICE)(
- int* size,
- realw* potential_acoustic,
- realw* potential_dot_acoustic,
- realw* potential_dot_dot_acoustic,
- long* Mesh_pointer_f) {}
+ TRANSFER_FIELDS_AC_TO_DEVICE)(int* size,
+ realw* potential_acoustic,
+ realw* potential_dot_acoustic,
+ realw* potential_dot_dot_acoustic,
+ long* Mesh_pointer) {}
void FC_FUNC_(transfer_b_fields_ac_to_device,
- TRANSFER_B_FIELDS_AC_TO_DEVICE)(
- int* size,
- realw* b_potential_acoustic,
- realw* b_potential_dot_acoustic,
- realw* b_potential_dot_dot_acoustic,
- long* Mesh_pointer_f) {}
+ TRANSFER_B_FIELDS_AC_TO_DEVICE)(int* size,
+ realw* b_potential_acoustic,
+ realw* b_potential_dot_acoustic,
+ realw* b_potential_dot_dot_acoustic,
+ long* Mesh_pointer) {}
void FC_FUNC_(transfer_fields_ac_from_device,
TRANSFER_FIELDS_AC_FROM_DEVICE)(int* size,
realw* potential_acoustic,
realw* potential_dot_acoustic,
realw* potential_dot_dot_acoustic,
- long* Mesh_pointer_f) {}
+ long* Mesh_pointer) {}
void FC_FUNC_(transfer_b_fields_ac_from_device,
- TRANSFER_B_FIELDS_AC_FROM_DEVICE)(
- int* size,
- realw* b_potential_acoustic,
- realw* b_potential_dot_acoustic,
- realw* b_potential_dot_dot_acoustic,
- long* Mesh_pointer_f) {}
+ TRANSFER_B_FIELDS_AC_FROM_DEVICE)(int* size,
+ realw* b_potential_acoustic,
+ realw* b_potential_dot_acoustic,
+ realw* b_potential_dot_dot_acoustic,
+ long* Mesh_pointer) {}
void FC_FUNC_(transfer_dot_dot_from_device,
- TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, realw* potential_dot_dot_acoustic,long* Mesh_pointer_f) {}
+ TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, realw* potential_dot_dot_acoustic,long* Mesh_pointer) {}
void FC_FUNC_(transfer_b_dot_dot_from_device,
- TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, realw* b_potential_dot_dot_acoustic,long* Mesh_pointer_f) {}
+ TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, realw* b_potential_dot_dot_acoustic,long* Mesh_pointer) {}
void FC_FUNC_(transfer_kernels_ac_to_host,
- TRANSFER_KERNELS_AC_TO_HOST)(long* Mesh_pointer,
- realw* h_rho_ac_kl,
- realw* h_kappa_ac_kl,
- int* NSPEC_AB) {}
+ TRANSFER_KERNELS_AC_TO_HOST)(long* Mesh_pointer,realw* h_rho_ac_kl,realw* h_kappa_ac_kl,int* NSPEC_AB) {}
void FC_FUNC_(transfer_kernels_hess_el_tohost,
- TRANSFER_KERNELS_HESS_EL_TOHOST)(long* Mesh_pointer,
- realw* h_hess_kl,
- int* NSPEC_AB) {}
+ TRANSFER_KERNELS_HESS_EL_TOHOST)(long* Mesh_pointer,realw* h_hess_kl,int* NSPEC_AB) {}
void FC_FUNC_(transfer_kernels_hess_ac_tohost,
- TRANSFER_KERNELS_HESS_AC_TOHOST)(long* Mesh_pointer,
- realw* h_hess_ac_kl,
- int* NSPEC_AB) {}
+ TRANSFER_KERNELS_HESS_AC_TOHOST)(long* Mesh_pointer,realw* h_hess_ac_kl,int* NSPEC_AB) {}
+void FC_FUNC_(transfer_compute_kernel_answers_from_device,
+ TRANSFER_COMPUTE_KERNEL_ANSWERS_FROM_DEVICE)(long* Mesh_pointer,
+ realw* rho_kl,int* size_rho,
+ realw* mu_kl, int* size_mu,
+ realw* kappa_kl, int* size_kappa) {}
+void FC_FUNC_(transfer_compute_kernel_fields_from_device,
+ TRANSFER_COMPUTE_KERNEL_FIELDS_FROM_DEVICE)(long* Mesh_pointer,
+ realw* accel, int* size_accel,
+ realw* b_displ, int* size_b_displ,
+ realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+ realw* epsilondev_xz,realw* epsilondev_yz,
+ int* size_epsilondev,
+ realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
+ realw* b_epsilondev_xz,realw* b_epsilondev_yz,
+ int* size_b_epsilondev,
+ realw* rho_kl,int* size_rho,
+ realw* mu_kl, int* size_mu,
+ realw* kappa_kl, int* size_kappa,
+ realw* epsilon_trace_over_3,
+ realw* b_epsilon_trace_over_3,
+ int* size_epsilon_trace_over_3) {}
+
+
//
// src/cuda/write_seismograms_cuda.cu
//
@@ -754,18 +667,17 @@
realw* seismograms_d,
realw* seismograms_v,
realw* seismograms_a,
- int* it) {}
+ int* it) {}
void FC_FUNC_(transfer_station_el_from_device,
TRANSFER_STATION_EL_FROM_DEVICE)(realw* displ,realw* veloc,realw* accel,
realw* b_displ, realw* b_veloc, realw* b_accel,
long* Mesh_pointer_f,int* number_receiver_global,
int* ispec_selected_rec,int* ispec_selected_source,
- int* ibool) {}
+ int* ibool) {}
void FC_FUNC_(transfer_station_ac_from_device,
- TRANSFER_STATION_AC_FROM_DEVICE)(
- realw* potential_acoustic,
+ TRANSFER_STATION_AC_FROM_DEVICE)(realw* potential_acoustic,
realw* potential_dot_acoustic,
realw* potential_dot_dot_acoustic,
realw* b_potential_acoustic,
@@ -775,5 +687,5 @@
int* number_receiver_global,
int* ispec_selected_rec,
int* ispec_selected_source,
- int* ibool) {}
+ int* ibool) {}
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/transfer_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/transfer_fields_cuda.cu 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/transfer_fields_cuda.cu 2013-08-20 14:13:26 UTC (rev 22718)
@@ -53,11 +53,11 @@
extern "C"
void FC_FUNC_(transfer_fields_el_to_device,
- TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {
+ TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer) {
-TRACE("transfer_fields_el_to_device_");
+ TRACE("transfer_fields_el_to_device");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
print_CUDA_error_if_any(cudaMemcpy(mp->d_displ,displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40003);
print_CUDA_error_if_any(cudaMemcpy(mp->d_veloc,veloc,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40004);
@@ -69,11 +69,11 @@
extern "C"
void FC_FUNC_(transfer_fields_el_from_device,
- TRANSFER_FIELDS_EL_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {
+ TRANSFER_FIELDS_EL_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer) {
- TRACE("transfer_fields_el_from_device_");
+ TRACE("transfer_fields_el_from_device");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40006);
print_CUDA_error_if_any(cudaMemcpy(veloc,mp->d_veloc,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40007);
@@ -86,30 +86,31 @@
extern "C"
void FC_FUNC_(transfer_b_fields_to_device,
TRANSFER_B_FIELDS_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
- long* Mesh_pointer_f) {
+ long* Mesh_pointer) {
- TRACE("transfer_b_fields_to_device_");
+ TRACE("transfer_b_fields_to_device");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- cudaMemcpy(mp->d_b_displ,b_displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_veloc,b_veloc,sizeof(realw)*(*size),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_accel,b_accel,sizeof(realw)*(*size),cudaMemcpyHostToDevice);
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_displ,b_displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice),41006);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_veloc,b_veloc,sizeof(realw)*(*size),cudaMemcpyHostToDevice),41007);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_accel,b_accel,sizeof(realw)*(*size),cudaMemcpyHostToDevice),41008);
+
}
/* ----------------------------------------------------------------------------------------------- */
extern "C"
void FC_FUNC_(transfer_b_fields_from_device,
- TRANSFER_B_FIELDS_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,long* Mesh_pointer_f) {
+ TRANSFER_B_FIELDS_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,long* Mesh_pointer) {
-TRACE("transfer_b_fields_from_device_");
+ TRACE("transfer_b_fields_from_device");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
- cudaMemcpy(b_displ,mp->d_b_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost);
- cudaMemcpy(b_veloc,mp->d_b_veloc,sizeof(realw)*(*size),cudaMemcpyDeviceToHost);
- cudaMemcpy(b_accel,mp->d_b_accel,sizeof(realw)*(*size),cudaMemcpyDeviceToHost);
+ print_CUDA_error_if_any(cudaMemcpy(b_displ,mp->d_b_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),42006);
+ print_CUDA_error_if_any(cudaMemcpy(b_veloc,mp->d_b_veloc,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),42007);
+ print_CUDA_error_if_any(cudaMemcpy(b_accel,mp->d_b_accel,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),42008);
}
@@ -118,11 +119,11 @@
extern "C"
void FC_FUNC_(transfer_accel_to_device,
- TRNASFER_ACCEL_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {
+ TRNASFER_ACCEL_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer) {
-TRACE("transfer_accel_to_device");
+ TRACE("transfer_accel_to_device");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
print_CUDA_error_if_any(cudaMemcpy(mp->d_accel,accel,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40016);
@@ -132,11 +133,11 @@
extern "C"
void FC_FUNC_(transfer_accel_from_device,
- TRANSFER_ACCEL_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {
+ TRANSFER_ACCEL_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer) {
-TRACE("transfer_accel_from_device");
+ TRACE("transfer_accel_from_device");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40026);
@@ -146,11 +147,11 @@
extern "C"
void FC_FUNC_(transfer_b_accel_from_device,
- TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {
+ TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer) {
-TRACE("transfer_b_accel_from_device");
+ TRACE("transfer_b_accel_from_device");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
print_CUDA_error_if_any(cudaMemcpy(b_accel,mp->d_b_accel,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40036);
@@ -160,11 +161,11 @@
extern "C"
void FC_FUNC_(transfer_sigma_from_device,
- TRANSFER_SIGMA_FROM_DEVICE)(int* size, realw* sigma_kl,long* Mesh_pointer_f) {
+ TRANSFER_SIGMA_FROM_DEVICE)(int* size, realw* sigma_kl,long* Mesh_pointer) {
-TRACE("transfer_sigma_from_device");
+ TRACE("transfer_sigma_from_device");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
print_CUDA_error_if_any(cudaMemcpy(sigma_kl,mp->d_Sigma_kl,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40046);
@@ -174,11 +175,11 @@
extern "C"
void FC_FUNC_(transfer_b_displ_from_device,
- TRANSFER_B_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {
+ TRANSFER_B_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer) {
-TRACE("transfer_b_displ_from_device");
+ TRACE("transfer_b_displ_from_device");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_b_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40056);
@@ -188,120 +189,47 @@
extern "C"
void FC_FUNC_(transfer_displ_from_device,
- TRANSFER_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {
+ TRANSFER_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer) {
-TRACE("transfer_displ_from_device");
+ TRACE("transfer_displ_from_device");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40066);
}
/* ----------------------------------------------------------------------------------------------- */
-/*
-extern "C"
-void FC_FUNC_(transfer_compute_kernel_answers_from_device,
- TRANSFER_COMPUTE_KERNEL_ANSWERS_FROM_DEVICE)(long* Mesh_pointer,
- realw* rho_kl,int* size_rho,
- realw* mu_kl, int* size_mu,
- realw* kappa_kl, int* size_kappa) {
-TRACE("transfer_compute_kernel_answers_from_device");
- Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
- cudaMemcpy(rho_kl,mp->d_rho_kl,*size_rho*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(mu_kl,mp->d_mu_kl,*size_mu*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(kappa_kl,mp->d_kappa_kl,*size_kappa*sizeof(realw),cudaMemcpyDeviceToHost);
-
-}
-*/
-
-/* ----------------------------------------------------------------------------------------------- */
-/*
-extern "C"
-void FC_FUNC_(transfer_compute_kernel_fields_from_device,
- TRANSFER_COMPUTE_KERNEL_FIELDS_FROM_DEVICE)(long* Mesh_pointer,
- realw* accel, int* size_accel,
- realw* b_displ, int* size_b_displ,
- realw* epsilondev_xx,
- realw* epsilondev_yy,
- realw* epsilondev_xy,
- realw* epsilondev_xz,
- realw* epsilondev_yz,
- int* size_epsilondev,
- realw* b_epsilondev_xx,
- realw* b_epsilondev_yy,
- realw* b_epsilondev_xy,
- realw* b_epsilondev_xz,
- realw* b_epsilondev_yz,
- int* size_b_epsilondev,
- realw* rho_kl,int* size_rho,
- realw* mu_kl, int* size_mu,
- realw* kappa_kl, int* size_kappa,
- realw* epsilon_trace_over_3,
- realw* b_epsilon_trace_over_3,
- int* size_epsilon_trace_over_3) {
-TRACE("transfer_compute_kernel_fields_from_device");
-
- Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
- cudaMemcpy(accel,mp->d_accel,*size_accel*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(b_displ,mp->d_b_displ,*size_b_displ*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(b_epsilondev_xx,mp->d_b_epsilondev_xx,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(b_epsilondev_yy,mp->d_b_epsilondev_yy,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(b_epsilondev_xy,mp->d_b_epsilondev_xy,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(b_epsilondev_xz,mp->d_b_epsilondev_xz,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(b_epsilondev_yz,mp->d_b_epsilondev_yz,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(rho_kl,mp->d_rho_kl,*size_rho*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(mu_kl,mp->d_mu_kl,*size_mu*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(kappa_kl,mp->d_kappa_kl,*size_kappa*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilon_trace_over_3,mp->d_epsilon_trace_over_3,*size_epsilon_trace_over_3*sizeof(realw),
- cudaMemcpyDeviceToHost);
- cudaMemcpy(b_epsilon_trace_over_3,mp->d_b_epsilon_trace_over_3,*size_epsilon_trace_over_3*sizeof(realw),
- cudaMemcpyDeviceToHost);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("after transfer_compute_kernel_fields_from_device");
-#endif
-}
-*/
-
-/* ----------------------------------------------------------------------------------------------- */
-
// attenuation fields
extern "C"
void FC_FUNC_(transfer_b_fields_att_to_device,
TRANSFER_B_FIELDS_ATT_TO_DEVICE)(long* Mesh_pointer,
- realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
- int* size_R,
- realw* b_epsilondev_xx,
- realw* b_epsilondev_yy,
- realw* b_epsilondev_xy,
- realw* b_epsilondev_xz,
- realw* b_epsilondev_yz,
- int* size_epsilondev) {
+ realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,
+ realw* b_R_xz,realw* b_R_yz,
+ int* size_R,
+ realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
+ realw* b_epsilondev_xz,realw* b_epsilondev_yz,
+ int* size_epsilondev) {
+
TRACE("transfer_b_fields_att_to_device");
+
//get mesh pointer out of fortran integer container
Mesh* mp = (Mesh*)(*Mesh_pointer);
- cudaMemcpy(mp->d_b_R_xx,b_R_xx,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_R_yy,b_R_yy,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_R_xy,b_R_xy,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_R_xz,b_R_xz,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_R_yz,b_R_yz,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xx,b_R_xx,*size_R*sizeof(realw),cudaMemcpyHostToDevice),43011);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yy,b_R_yy,*size_R*sizeof(realw),cudaMemcpyHostToDevice),43012);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xy,b_R_xy,*size_R*sizeof(realw),cudaMemcpyHostToDevice),43013);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xz,b_R_xz,*size_R*sizeof(realw),cudaMemcpyHostToDevice),43014);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yz,b_R_yz,*size_R*sizeof(realw),cudaMemcpyHostToDevice),43015);
- cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice),43016);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice),43017);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice),43018);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice),43019);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice),43020);
-
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("after transfer_b_fields_att_to_device");
#endif
@@ -316,27 +244,25 @@
TRANSFER_FIELDS_ATT_FROM_DEVICE)(long* Mesh_pointer,
realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
int* size_R,
- realw* epsilondev_xx,
- realw* epsilondev_yy,
- realw* epsilondev_xy,
- realw* epsilondev_xz,
- realw* epsilondev_yz,
+ realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+ realw* epsilondev_xz,realw* epsilondev_yz,
int* size_epsilondev) {
TRACE("transfer_fields_att_from_device");
+
//get mesh pointer out of fortran integer container
Mesh* mp = (Mesh*)(*Mesh_pointer);
- cudaMemcpy(R_xx,mp->d_R_xx,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(R_yy,mp->d_R_yy,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(R_xy,mp->d_R_xy,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(R_xz,mp->d_R_xz,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(R_yz,mp->d_R_yz,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
+ print_CUDA_error_if_any(cudaMemcpy(R_xx,mp->d_R_xx,*size_R*sizeof(realw),cudaMemcpyDeviceToHost),43021);
+ print_CUDA_error_if_any(cudaMemcpy(R_yy,mp->d_R_yy,*size_R*sizeof(realw),cudaMemcpyDeviceToHost),43022);
+ print_CUDA_error_if_any(cudaMemcpy(R_xy,mp->d_R_xy,*size_R*sizeof(realw),cudaMemcpyDeviceToHost),43023);
+ print_CUDA_error_if_any(cudaMemcpy(R_xz,mp->d_R_xz,*size_R*sizeof(realw),cudaMemcpyDeviceToHost),43024);
+ print_CUDA_error_if_any(cudaMemcpy(R_yz,mp->d_R_yz,*size_R*sizeof(realw),cudaMemcpyDeviceToHost),43025);
- cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+ print_CUDA_error_if_any(cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost),43026);
+ print_CUDA_error_if_any(cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost),43027);
+ print_CUDA_error_if_any(cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost),43028);
+ print_CUDA_error_if_any(cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost),43029);
+ print_CUDA_error_if_any(cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost),43030);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -351,21 +277,27 @@
extern "C"
void FC_FUNC_(transfer_kernels_el_to_host,
TRANSFER_KERNELS_EL_TO_HOST)(long* Mesh_pointer,
- realw* h_rho_kl,
- realw* h_mu_kl,
- realw* h_kappa_kl,
- int* NSPEC_AB) {
-TRACE("transfer_kernels_el_to_host");
+ realw* h_rho_kl,
+ realw* h_mu_kl,
+ realw* h_kappa_kl,
+ realw* h_cijkl_kl,
+ int* NSPEC_AB) {
+ TRACE("transfer_kernels_el_to_host");
+
//get mesh pointer out of fortran integer container
Mesh* mp = (Mesh*)(*Mesh_pointer);
print_CUDA_error_if_any(cudaMemcpy(h_rho_kl,mp->d_rho_kl,*NSPEC_AB*NGLL3*sizeof(realw),
cudaMemcpyDeviceToHost),40101);
- print_CUDA_error_if_any(cudaMemcpy(h_mu_kl,mp->d_mu_kl,*NSPEC_AB*NGLL3*sizeof(realw),
- cudaMemcpyDeviceToHost),40102);
- print_CUDA_error_if_any(cudaMemcpy(h_kappa_kl,mp->d_kappa_kl,*NSPEC_AB*NGLL3*sizeof(realw),
- cudaMemcpyDeviceToHost),40103);
-
+ if( mp->anisotropic_kl ){
+ print_CUDA_error_if_any(cudaMemcpy(h_cijkl_kl,mp->d_cijkl_kl,*NSPEC_AB*21*NGLL3*sizeof(realw),
+ cudaMemcpyDeviceToHost),40102);
+ }else{
+ print_CUDA_error_if_any(cudaMemcpy(h_mu_kl,mp->d_mu_kl,*NSPEC_AB*NGLL3*sizeof(realw),
+ cudaMemcpyDeviceToHost),40102);
+ print_CUDA_error_if_any(cudaMemcpy(h_kappa_kl,mp->d_kappa_kl,*NSPEC_AB*NGLL3*sizeof(realw),
+ cudaMemcpyDeviceToHost),40103);
+ }
}
/* ----------------------------------------------------------------------------------------------- */
@@ -377,11 +309,12 @@
extern "C"
void FC_FUNC_(transfer_kernels_noise_to_host,
TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer,
- realw* h_Sigma_kl,
- int* NSPEC_AB) {
-TRACE("transfer_kernels_noise_to_host");
+ realw* h_Sigma_kl,
+ int* NSPEC_AB) {
+ TRACE("transfer_kernels_noise_to_host");
- Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+ //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer);
print_CUDA_error_if_any(cudaMemcpy(h_Sigma_kl,mp->d_Sigma_kl,NGLL3*(*NSPEC_AB)*sizeof(realw),
cudaMemcpyDeviceToHost),40201);
@@ -397,16 +330,17 @@
extern "C"
void FC_FUNC_(transfer_fields_ac_to_device,
- TRANSFER_FIELDS_AC_TO_DEVICE)(
- int* size,
- realw* potential_acoustic,
- realw* potential_dot_acoustic,
- realw* potential_dot_dot_acoustic,
- long* Mesh_pointer_f) {
-TRACE("transfer_fields_ac_to_device");
+ TRANSFER_FIELDS_AC_TO_DEVICE)(int* size,
+ realw* potential_acoustic,
+ realw* potential_dot_acoustic,
+ realw* potential_dot_dot_acoustic,
+ long* Mesh_pointer) {
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ TRACE("transfer_fields_ac_to_device");
+ //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer);
+
print_CUDA_error_if_any(cudaMemcpy(mp->d_potential_acoustic,potential_acoustic,
sizeof(realw)*(*size),cudaMemcpyHostToDevice),50110);
print_CUDA_error_if_any(cudaMemcpy(mp->d_potential_dot_acoustic,potential_dot_acoustic,
@@ -423,16 +357,17 @@
extern "C"
void FC_FUNC_(transfer_b_fields_ac_to_device,
- TRANSFER_B_FIELDS_AC_TO_DEVICE)(
- int* size,
- realw* b_potential_acoustic,
- realw* b_potential_dot_acoustic,
- realw* b_potential_dot_dot_acoustic,
- long* Mesh_pointer_f) {
-TRACE("transfer_b_fields_ac_to_device");
+ TRANSFER_B_FIELDS_AC_TO_DEVICE)(int* size,
+ realw* b_potential_acoustic,
+ realw* b_potential_dot_acoustic,
+ realw* b_potential_dot_dot_acoustic,
+ long* Mesh_pointer) {
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ TRACE("transfer_b_fields_ac_to_device");
+ //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer);
+
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_potential_acoustic,b_potential_acoustic,
sizeof(realw)*(*size),cudaMemcpyHostToDevice),51110);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_potential_dot_acoustic,b_potential_dot_acoustic,
@@ -454,10 +389,11 @@
realw* potential_acoustic,
realw* potential_dot_acoustic,
realw* potential_dot_dot_acoustic,
- long* Mesh_pointer_f) {
-TRACE("transfer_fields_ac_from_device");
+ long* Mesh_pointer) {
+ TRACE("transfer_fields_ac_from_device");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer);
print_CUDA_error_if_any(cudaMemcpy(potential_acoustic,mp->d_potential_acoustic,
sizeof(realw)*(*size),cudaMemcpyDeviceToHost),52111);
@@ -475,15 +411,15 @@
extern "C"
void FC_FUNC_(transfer_b_fields_ac_from_device,
- TRANSFER_B_FIELDS_AC_FROM_DEVICE)(
- int* size,
- realw* b_potential_acoustic,
- realw* b_potential_dot_acoustic,
- realw* b_potential_dot_dot_acoustic,
- long* Mesh_pointer_f) {
-TRACE("transfer_b_fields_ac_from_device");
+ TRANSFER_B_FIELDS_AC_FROM_DEVICE)(int* size,
+ realw* b_potential_acoustic,
+ realw* b_potential_dot_acoustic,
+ realw* b_potential_dot_dot_acoustic,
+ long* Mesh_pointer) {
+ TRACE("transfer_b_fields_ac_from_device");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer);
print_CUDA_error_if_any(cudaMemcpy(b_potential_acoustic,mp->d_b_potential_acoustic,
sizeof(realw)*(*size),cudaMemcpyDeviceToHost),53111);
@@ -501,11 +437,12 @@
extern "C"
void FC_FUNC_(transfer_dot_dot_from_device,
- TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, realw* potential_dot_dot_acoustic,long* Mesh_pointer_f) {
+ TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, realw* potential_dot_dot_acoustic,long* Mesh_pointer) {
TRACE("transfer_dot_dot_from_device");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer);
print_CUDA_error_if_any(cudaMemcpy(potential_dot_dot_acoustic,mp->d_potential_dot_dot_acoustic,
sizeof(realw)*(*size),cudaMemcpyDeviceToHost),50041);
@@ -516,11 +453,12 @@
extern "C"
void FC_FUNC_(transfer_b_dot_dot_from_device,
- TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, realw* b_potential_dot_dot_acoustic,long* Mesh_pointer_f) {
+ TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, realw* b_potential_dot_dot_acoustic,long* Mesh_pointer) {
TRACE("transfer_b_dot_dot_from_device");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+ //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer);
print_CUDA_error_if_any(cudaMemcpy(b_potential_dot_dot_acoustic,mp->d_b_potential_dot_dot_acoustic,
sizeof(realw)*(*size),cudaMemcpyDeviceToHost),50042);
@@ -532,15 +470,13 @@
extern "C"
void FC_FUNC_(transfer_kernels_ac_to_host,
- TRANSFER_KERNELS_AC_TO_HOST)(long* Mesh_pointer,
- realw* h_rho_ac_kl,
- realw* h_kappa_ac_kl,
- int* NSPEC_AB) {
+ TRANSFER_KERNELS_AC_TO_HOST)(long* Mesh_pointer,realw* h_rho_ac_kl,realw* h_kappa_ac_kl,int* NSPEC_AB) {
TRACE("transfer_kernels_ac_to_host");
//get mesh pointer out of fortran integer container
Mesh* mp = (Mesh*)(*Mesh_pointer);
+
int size = *NSPEC_AB*NGLL3;
// copies kernel values over to CPU host
@@ -558,13 +494,13 @@
extern "C"
void FC_FUNC_(transfer_kernels_hess_el_tohost,
- TRANSFER_KERNELS_HESS_EL_TOHOST)(long* Mesh_pointer,
- realw* h_hess_kl,
- int* NSPEC_AB) {
-TRACE("transfer_kernels_hess_el_tohost");
+ TRANSFER_KERNELS_HESS_EL_TOHOST)(long* Mesh_pointer,realw* h_hess_kl,int* NSPEC_AB) {
- Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+ TRACE("transfer_kernels_hess_el_tohost");
+ //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer);
+
print_CUDA_error_if_any(cudaMemcpy(h_hess_kl,mp->d_hess_el_kl,NGLL3*(*NSPEC_AB)*sizeof(realw),
cudaMemcpyDeviceToHost),70201);
}
@@ -573,15 +509,87 @@
extern "C"
void FC_FUNC_(transfer_kernels_hess_ac_tohost,
- TRANSFER_KERNELS_HESS_AC_TOHOST)(long* Mesh_pointer,
- realw* h_hess_ac_kl,
- int* NSPEC_AB) {
+ TRANSFER_KERNELS_HESS_AC_TOHOST)(long* Mesh_pointer,realw* h_hess_ac_kl,int* NSPEC_AB) {
+
TRACE("transfer_kernels_hess_ac_tohost");
- Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+ //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer);
print_CUDA_error_if_any(cudaMemcpy(h_hess_ac_kl,mp->d_hess_ac_kl,NGLL3*(*NSPEC_AB)*sizeof(realw),
cudaMemcpyDeviceToHost),70202);
}
+// unused...
+/* ----------------------------------------------------------------------------------------------- */
+/*
+extern "C"
+void FC_FUNC_(transfer_compute_kernel_answers_from_device,
+ TRANSFER_COMPUTE_KERNEL_ANSWERS_FROM_DEVICE)(long* Mesh_pointer,
+ realw* rho_kl,int* size_rho,
+ realw* mu_kl, int* size_mu,
+ realw* kappa_kl, int* size_kappa) {
+TRACE("transfer_compute_kernel_answers_from_device");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+ cudaMemcpy(rho_kl,mp->d_rho_kl,*size_rho*sizeof(realw),cudaMemcpyDeviceToHost);
+ if( ! mp->anisotropic_kl ){
+ cudaMemcpy(mu_kl,mp->d_mu_kl,*size_mu*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(kappa_kl,mp->d_kappa_kl,*size_kappa*sizeof(realw),cudaMemcpyDeviceToHost);
+ }
+}
+*/
+
+/* ----------------------------------------------------------------------------------------------- */
+/*
+extern "C"
+void FC_FUNC_(transfer_compute_kernel_fields_from_device,
+ TRANSFER_COMPUTE_KERNEL_FIELDS_FROM_DEVICE)(long* Mesh_pointer,
+ realw* accel, int* size_accel,
+ realw* b_displ, int* size_b_displ,
+ realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+ realw* epsilondev_xz,realw* epsilondev_yz,
+ int* size_epsilondev,
+ realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
+ realw* b_epsilondev_xz,realw* b_epsilondev_yz,
+ int* size_b_epsilondev,
+ realw* rho_kl,int* size_rho,
+ realw* mu_kl, int* size_mu,
+ realw* kappa_kl, int* size_kappa,
+ realw* epsilon_trace_over_3,
+ realw* b_epsilon_trace_over_3,
+ int* size_epsilon_trace_over_3) {
+TRACE("transfer_compute_kernel_fields_from_device");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+ cudaMemcpy(accel,mp->d_accel,*size_accel*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(b_displ,mp->d_b_displ,*size_b_displ*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(b_epsilondev_xx,mp->d_b_epsilondev_xx,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(b_epsilondev_yy,mp->d_b_epsilondev_yy,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(b_epsilondev_xy,mp->d_b_epsilondev_xy,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(b_epsilondev_xz,mp->d_b_epsilondev_xz,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(b_epsilondev_yz,mp->d_b_epsilondev_yz,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(rho_kl,mp->d_rho_kl,*size_rho*sizeof(realw),cudaMemcpyDeviceToHost);
+
+ if( ! mp->anisotropic_kl ){
+ cudaMemcpy(mu_kl,mp->d_mu_kl,*size_mu*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(kappa_kl,mp->d_kappa_kl,*size_kappa*sizeof(realw),cudaMemcpyDeviceToHost);
+ }
+
+ cudaMemcpy(epsilon_trace_over_3,mp->d_epsilon_trace_over_3,*size_epsilon_trace_over_3*sizeof(realw),
+ cudaMemcpyDeviceToHost);
+ cudaMemcpy(b_epsilon_trace_over_3,mp->d_b_epsilon_trace_over_3,*size_epsilon_trace_over_3*sizeof(realw),
+ cudaMemcpyDeviceToHost);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("after transfer_compute_kernel_fields_from_device");
+#endif
+}
+*/
+
Modified: seismo/3D/SPECFEM3D/trunk/src/cuda/write_seismograms_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/cuda/write_seismograms_cuda.cu 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/cuda/write_seismograms_cuda.cu 2013-08-20 14:13:26 UTC (rev 22718)
@@ -43,60 +43,44 @@
/* ----------------------------------------------------------------------------------------------- */
+//fortran code snippet...
/*
- ! gets global number of that receiver
- irec = number_receiver_global(irec_local)
+ ! gets global number of that receiver
+ irec = number_receiver_global(irec_local)
- ! gets local receiver interpolators
- ! (1-D Lagrange interpolators)
- hxir(:) = hxir_store(irec_local,:)
- hetar(:) = hetar_store(irec_local,:)
- hgammar(:) = hgammar_store(irec_local,:)
-
+ ! gets local receiver interpolators
+ ! (1-D Lagrange interpolators)
+ hxir(:) = hxir_store(irec_local,:)
+ hetar(:) = hetar_store(irec_local,:)
+ hgammar(:) = hgammar_store(irec_local,:)
*/
/* ----------------------------------------------------------------------------------------------- */
-// Initially sets the blocks_x to be the num_blocks, and adds rows as
-// needed. If an additional row is added, the row length is cut in
-// half. If the block count is odd, there will be 1 too many blocks,
-// which must be managed at runtime with an if statement.
-void get_blocks_xy(int num_blocks,int* num_blocks_x,int* num_blocks_y) {
- *num_blocks_x = num_blocks;
- *num_blocks_y = 1;
- while(*num_blocks_x > 65535) {
- *num_blocks_x = (int) ceil(*num_blocks_x*0.5f);
- *num_blocks_y = *num_blocks_y*2;
- }
- return;
-}
+// unused...
+/*
+__device__ double my_atomicAdd(double* address, double val) {
-/* ----------------------------------------------------------------------------------------------- */
-
-__device__ double atomicAdd(double* address, double val)
-{
- unsigned long long int* address_as_ull =
- (unsigned long long int*)address;
+ unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;
- do {
- assumed = old;
-old = atomicCAS(address_as_ull, assumed,
- __double_as_longlong(val +
- __longlong_as_double(assumed)));
+ do{
+ assumed = old;
+ old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
} while (assumed != old);
return __longlong_as_double(old);
}
+*/
/* ----------------------------------------------------------------------------------------------- */
__global__ void compute_interpolated_dva_plus_seismogram(int nrec_local,
- realw* displ, realw* veloc, realw* accel,
- int* ibool,
- double* hxir, double* hetar, double* hgammar,
- realw* seismograms_d, realw* seismograms_v, realw* seismograms_a,
- double* nu,
- int* number_receiver_global,
- int* ispec_selected_rec) {
+ realw* displ, realw* veloc, realw* accel,
+ int* ibool,
+ double* hxir, double* hetar, double* hgammar,
+ realw* seismograms_d, realw* seismograms_v, realw* seismograms_a,
+ double* nu,
+ int* number_receiver_global,
+ int* ispec_selected_rec) {
int irec_local = blockIdx.x + blockIdx.y*gridDim.x;
int i = threadIdx.x;
int j = threadIdx.y;
@@ -202,12 +186,13 @@
// transfers seismograms from device to host
- TRACE("transfer_seismograms_el_from_d");
+ TRACE("\ttransfer_seismograms_el_from_d");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
int num_blocks_x, num_blocks_y;
get_blocks_xy(*nrec_local,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(5,5,5);
@@ -222,22 +207,23 @@
// cudaEventRecord( start, 0 );
compute_interpolated_dva_plus_seismogram<<<grid,threads,0,mp->compute_stream>>>(*nrec_local,
- mp->d_displ,mp->d_veloc,mp->d_accel,
- mp->d_ibool,
- mp->d_hxir, mp->d_hetar, mp->d_hgammar,
- mp->d_seismograms_d,
- mp->d_seismograms_v,
- mp->d_seismograms_a,
- mp->d_nu,
- mp->d_number_receiver_global,
- mp->d_ispec_selected_rec
- );
+ mp->d_displ,mp->d_veloc,mp->d_accel,
+ mp->d_ibool,
+ mp->d_hxir, mp->d_hetar, mp->d_hgammar,
+ mp->d_seismograms_d,
+ mp->d_seismograms_v,
+ mp->d_seismograms_a,
+ mp->d_nu,
+ mp->d_number_receiver_global,
+ mp->d_ispec_selected_rec
+ );
// cudaMemcpy(h_debug,d_debug,125*sizeof(double),cudaMemcpyDeviceToHost);
- cudaMemcpy(mp->h_seismograms_d_it,mp->d_seismograms_d,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost);
- cudaMemcpy(mp->h_seismograms_v_it,mp->d_seismograms_v,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost);
- cudaMemcpy(mp->h_seismograms_a_it,mp->d_seismograms_a,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost);
+ // (cudaMemcpy implicitly synchronizes all other cuda operations)
+ print_CUDA_error_if_any(cudaMemcpy(mp->h_seismograms_d_it,mp->d_seismograms_d,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost),72001);
+ print_CUDA_error_if_any(cudaMemcpy(mp->h_seismograms_v_it,mp->d_seismograms_v,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost),72002);
+ print_CUDA_error_if_any(cudaMemcpy(mp->h_seismograms_a_it,mp->d_seismograms_a,sizeof(realw)*3* *nrec_local,cudaMemcpyDeviceToHost),72003);
// cudaEventRecord( stop, 0 );
// cudaEventSynchronize( stop );
@@ -286,37 +272,35 @@
/* ----------------------------------------------------------------------------------------------- */
void transfer_field_from_device(Mesh* mp, realw* d_field,realw* h_field,
- int* number_receiver_global,
- int* d_ispec_selected,
- int* h_ispec_selected,
- int* ibool) {
+ int* number_receiver_global,
+ int* d_ispec_selected,
+ int* h_ispec_selected,
+ int* ibool) {
-TRACE("transfer_field_from_device");
+TRACE("\ttransfer_field_from_device");
// checks if anything to do
if( mp->nrec_local == 0 ) return;
int blocksize = NGLL3;
- int num_blocks_x = mp->nrec_local;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->nrec_local,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
// prepare field transfer array on device
transfer_stations_fields_from_device_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_number_receiver_global,
- d_ispec_selected,
- mp->d_ibool,
- mp->d_station_seismo_field,
- d_field,
- mp->nrec_local);
+ d_ispec_selected,
+ mp->d_ibool,
+ mp->d_station_seismo_field,
+ d_field,
+ mp->nrec_local);
- cudaMemcpy(mp->h_station_seismo_field,mp->d_station_seismo_field,
- (3*NGLL3)*(mp->nrec_local)*sizeof(realw),cudaMemcpyDeviceToHost);
+ // (cudaMemcpy implicitly synchronizes all other cuda operations)
+ print_CUDA_error_if_any(cudaMemcpy(mp->h_station_seismo_field,mp->d_station_seismo_field,
+ (3*NGLL3)*(mp->nrec_local)*sizeof(realw),cudaMemcpyDeviceToHost),71001);
int irec_local;
for(irec_local=0;irec_local<mp->nrec_local;irec_local++) {
@@ -422,13 +406,10 @@
// sets up kernel dimensions
int blocksize = NGLL3;
- int num_blocks_x = mp->nrec_local;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = (int) ceil(num_blocks_x*0.5f);
- num_blocks_y = num_blocks_y*2;
- }
+ int num_blocks_x, num_blocks_y;
+ get_blocks_xy(mp->nrec_local,&num_blocks_x,&num_blocks_y);
+
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
@@ -444,6 +425,7 @@
exit_on_cuda_error("transfer_field_acoustic_from_device kernel");
#endif
+ // (cudaMemcpy implicitly synchronizes all other cuda operations)
print_CUDA_error_if_any(cudaMemcpy(mp->h_station_seismo_potential,mp->d_station_seismo_potential,
mp->nrec_local*NGLL3*sizeof(realw),cudaMemcpyDeviceToHost),55000);
@@ -474,8 +456,7 @@
extern "C"
void FC_FUNC_(transfer_station_ac_from_device,
- TRANSFER_STATION_AC_FROM_DEVICE)(
- realw* potential_acoustic,
+ TRANSFER_STATION_AC_FROM_DEVICE)(realw* potential_acoustic,
realw* potential_dot_acoustic,
realw* potential_dot_dot_acoustic,
realw* b_potential_acoustic,
Modified: seismo/3D/SPECFEM3D/trunk/src/generate_databases/get_MPI.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/generate_databases/get_MPI.f90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/generate_databases/get_MPI.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -215,12 +215,12 @@
! adds contributions from different partitions to flag arrays
! integer arrays
- call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob,test_flag, &
+ call assemble_MPI_scalar_i_blocking(NPROC,nglob,test_flag, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_dummy,&
my_neighbours_ext_mesh)
! custom_real arrays
- call assemble_MPI_scalar_ext_mesh(NPROC,nglob,test_flag_cr, &
+ call assemble_MPI_scalar_blocking(NPROC,nglob,test_flag_cr, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_dummy, &
my_neighbours_ext_mesh)
Modified: seismo/3D/SPECFEM3D/trunk/src/generate_databases/get_coupling_surfaces.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/generate_databases/get_coupling_surfaces.f90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/generate_databases/get_coupling_surfaces.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -122,7 +122,7 @@
enddo
! sums acoustic flags
if( ACOUSTIC_SIMULATION ) then
- call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,acoustic_flag, &
+ call assemble_MPI_scalar_i_blocking(NPROC,nglob_dummy,acoustic_flag, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
my_neighbours_ext_mesh)
@@ -130,7 +130,7 @@
! sums elastic flags
if( ELASTIC_SIMULATION ) then
- call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,elastic_flag, &
+ call assemble_MPI_scalar_i_blocking(NPROC,nglob_dummy,elastic_flag, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
my_neighbours_ext_mesh)
@@ -138,7 +138,7 @@
! sums poroelastic flags
if( POROELASTIC_SIMULATION ) then
- call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob_dummy,poroelastic_flag, &
+ call assemble_MPI_scalar_i_blocking(NPROC,nglob_dummy,poroelastic_flag, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh_dummy,&
my_neighbours_ext_mesh)
Modified: seismo/3D/SPECFEM3D/trunk/src/generate_databases/memory_eval.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/generate_databases/memory_eval.f90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/generate_databases/memory_eval.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -32,8 +32,8 @@
subroutine memory_eval(NSPEC_AB,NGLOB_AB,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh,&
APPROXIMATE_OCEAN_LOAD,memory_size)
- use generate_databases_par, only: PML_CONDITIONS,nspec_cpml
- use create_regions_mesh_ext_par,only: NSPEC_ANISO,ispec_is_acoustic,ispec_is_elastic
+ use generate_databases_par, only: PML_CONDITIONS,nspec_cpml
+ use create_regions_mesh_ext_par,only: NSPEC_ANISO,ispec_is_acoustic,ispec_is_elastic,ispec_is_poroelastic
implicit none
@@ -46,7 +46,7 @@
! output
double precision, intent(out) :: memory_size
! local parameters
- logical :: ACOUSTIC_SIMULATION,ELASTIC_SIMULATION
+ logical :: ACOUSTIC_SIMULATION,ELASTIC_SIMULATION,POROELASTIC_SIMULATION
memory_size = 0.d0
@@ -159,6 +159,17 @@
endif
endif
+ ! elastic arrays
+ call any_all_l( ANY(ispec_is_poroelastic), POROELASTIC_SIMULATION )
+ if( POROELASTIC_SIMULATION ) then
+ ! displs_poroelastic,..
+ memory_size = memory_size + 6.d0*dble(NDIM)*NGLOB_AB*dble(CUSTOM_REAL)
+ ! rmass_solid_poroelastic,..
+ memory_size = memory_size + 2.d0*NGLOB_AB*dble(CUSTOM_REAL)
+ ! rhoarraystore,..
+ memory_size = memory_size + 17.d0*dble(NGLLX)*dble(NGLLY)*dble(NGLLZ)*NSPEC_AB*dble(CUSTOM_REAL)
+ endif
+
! skipping boundary surfaces
! skipping free surfaces
! skipping acoustic-elastic coupling surfaces
@@ -171,14 +182,26 @@
memory_size = memory_size + max_nibool_interfaces_ext_mesh*num_interfaces_ext_mesh*dble(SIZE_INTEGER)
! MPI communications
- ! buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh
- memory_size = memory_size + 2.d0*dble(NDIM)*max_nibool_interfaces_ext_mesh*num_interfaces_ext_mesh*dble(CUSTOM_REAL)
+ if( ACOUSTIC_SIMULATION ) then
+ ! buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh
+ memory_size = memory_size + 2.d0*max_nibool_interfaces_ext_mesh*num_interfaces_ext_mesh*dble(CUSTOM_REAL)
+ ! request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh
+ memory_size = memory_size + 2.d0*num_interfaces_ext_mesh*dble(SIZE_INTEGER)
+ endif
- ! buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh
- memory_size = memory_size + 2.d0*max_nibool_interfaces_ext_mesh*num_interfaces_ext_mesh*dble(CUSTOM_REAL)
+ if( ELASTIC_SIMULATION ) then
+ ! buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh
+ memory_size = memory_size + 2.d0*dble(NDIM)*max_nibool_interfaces_ext_mesh*num_interfaces_ext_mesh*dble(CUSTOM_REAL)
+ ! request_send_vector_ext_mesh,request_recv_vector_ext_mesh
+ memory_size = memory_size + 2.d0*num_interfaces_ext_mesh*dble(SIZE_INTEGER)
+ endif
- ! request_send_vector_ext_mesh,request_recv_vector_ext_mesh,request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh
- memory_size = memory_size + 4.d0*num_interfaces_ext_mesh*dble(SIZE_INTEGER)
+ if( POROELASTIC_SIMULATION ) then
+ ! buffer_send_vector_ext_mesh_s,..
+ memory_size = memory_size + 4.d0*dble(NDIM)*max_nibool_interfaces_ext_mesh*num_interfaces_ext_mesh*dble(CUSTOM_REAL)
+ ! request_send_vector_ext_mesh_s,..
+ memory_size = memory_size + 4.d0*num_interfaces_ext_mesh*dble(SIZE_INTEGER)
+ endif
! ispec_is_inner
memory_size = memory_size + NSPEC_AB*dble(SIZE_LOGICAL)
Modified: seismo/3D/SPECFEM3D/trunk/src/shared/assemble_MPI_scalar.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/shared/assemble_MPI_scalar.f90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/shared/assemble_MPI_scalar.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -28,17 +28,12 @@
!---- assemble the contributions between slices using non-blocking MPI
!----
- subroutine assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,array_val, &
+ subroutine assemble_MPI_scalar_blocking(NPROC,NGLOB_AB,array_val, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
my_neighbours_ext_mesh)
-! subroutine assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,array_val, &
-! buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh, &
-! num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
-! nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
-! my_neighbours_ext_mesh, &
-! request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh)
+! assembles scalar field in a blocking way, returns only after values have been assembled
implicit none
@@ -54,16 +49,12 @@
integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh,my_neighbours_ext_mesh
integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
-! real(kind=CUSTOM_REAL), dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
-! buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh
-! integer, dimension(num_interfaces_ext_mesh) :: request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh
+ real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: buffer_send_scalar
+ real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: buffer_recv_scalar
+ integer, dimension(:), allocatable :: request_send_scalar
+ integer, dimension(:), allocatable :: request_recv_scalar
- real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: buffer_send_scalar_ext_mesh
- real(kind=CUSTOM_REAL), dimension(:,:), allocatable :: buffer_recv_scalar_ext_mesh
- integer, dimension(:), allocatable :: request_send_scalar_ext_mesh
- integer, dimension(:), allocatable :: request_recv_scalar_ext_mesh
-
integer ipoin,iinterface,ier
! here we have to assemble all the contributions between partitions using MPI
@@ -71,72 +62,70 @@
! assemble only if more than one partition
if(NPROC > 1) then
- allocate(buffer_send_scalar_ext_mesh(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
- if( ier /= 0 ) stop 'error allocating array buffer_send_scalar_ext_mesh'
- allocate(buffer_recv_scalar_ext_mesh(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
- if( ier /= 0 ) stop 'error allocating array buffer_recv_scalar_ext_mesh'
- allocate(request_send_scalar_ext_mesh(num_interfaces_ext_mesh),stat=ier)
- if( ier /= 0 ) stop 'error allocating array request_send_scalar_ext_mesh'
- allocate(request_recv_scalar_ext_mesh(num_interfaces_ext_mesh),stat=ier)
- if( ier /= 0 ) stop 'error allocating array request_recv_scalar_ext_mesh'
+ allocate(buffer_send_scalar(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
+ if( ier /= 0 ) stop 'error allocating array buffer_send_scalar'
+ allocate(buffer_recv_scalar(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
+ if( ier /= 0 ) stop 'error allocating array buffer_recv_scalar'
+ allocate(request_send_scalar(num_interfaces_ext_mesh),stat=ier)
+ if( ier /= 0 ) stop 'error allocating array request_send_scalar'
+ allocate(request_recv_scalar(num_interfaces_ext_mesh),stat=ier)
+ if( ier /= 0 ) stop 'error allocating array request_recv_scalar'
! partition border copy into the buffer
do iinterface = 1, num_interfaces_ext_mesh
do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
- buffer_send_scalar_ext_mesh(ipoin,iinterface) = array_val(ibool_interfaces_ext_mesh(ipoin,iinterface))
+ buffer_send_scalar(ipoin,iinterface) = array_val(ibool_interfaces_ext_mesh(ipoin,iinterface))
enddo
enddo
! send messages
do iinterface = 1, num_interfaces_ext_mesh
! non-blocking synchronous send request
- call isend_cr(buffer_send_scalar_ext_mesh(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
- nibool_interfaces_ext_mesh(iinterface), &
- my_neighbours_ext_mesh(iinterface), &
- itag, &
- request_send_scalar_ext_mesh(iinterface) &
- )
+ call isend_cr(buffer_send_scalar(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
+ nibool_interfaces_ext_mesh(iinterface), &
+ my_neighbours_ext_mesh(iinterface), &
+ itag, &
+ request_send_scalar(iinterface) )
! receive request
- call irecv_cr(buffer_recv_scalar_ext_mesh(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
- nibool_interfaces_ext_mesh(iinterface), &
- my_neighbours_ext_mesh(iinterface), &
- itag, &
- request_recv_scalar_ext_mesh(iinterface) &
- )
+ call irecv_cr(buffer_recv_scalar(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
+ nibool_interfaces_ext_mesh(iinterface), &
+ my_neighbours_ext_mesh(iinterface), &
+ itag, &
+ request_recv_scalar(iinterface) )
enddo
! wait for communications completion (recv)
do iinterface = 1, num_interfaces_ext_mesh
- call wait_req(request_recv_scalar_ext_mesh(iinterface))
+ call wait_req(request_recv_scalar(iinterface))
enddo
! adding contributions of neighbours
do iinterface = 1, num_interfaces_ext_mesh
do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
array_val(ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
- array_val(ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_scalar_ext_mesh(ipoin,iinterface)
+ array_val(ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_scalar(ipoin,iinterface)
enddo
enddo
! wait for communications completion (send)
do iinterface = 1, num_interfaces_ext_mesh
- call wait_req(request_send_scalar_ext_mesh(iinterface))
+ call wait_req(request_send_scalar(iinterface))
enddo
- deallocate(buffer_send_scalar_ext_mesh)
- deallocate(buffer_recv_scalar_ext_mesh)
- deallocate(request_send_scalar_ext_mesh)
- deallocate(request_recv_scalar_ext_mesh)
+ deallocate(buffer_send_scalar)
+ deallocate(buffer_recv_scalar)
+ deallocate(request_send_scalar)
+ deallocate(request_recv_scalar)
endif
- end subroutine assemble_MPI_scalar_ext_mesh
+ end subroutine assemble_MPI_scalar_blocking
!
!----
!
- subroutine assemble_MPI_scalar_i_ext_mesh(NPROC,NGLOB_AB,array_val, &
+ subroutine assemble_MPI_scalar_i_blocking(NPROC,NGLOB_AB,array_val, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
my_neighbours_ext_mesh)
@@ -155,10 +144,10 @@
integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh,my_neighbours_ext_mesh
integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
- integer, dimension(:,:), allocatable :: buffer_send_scalar_ext_mesh
- integer, dimension(:,:), allocatable :: buffer_recv_scalar_ext_mesh
- integer, dimension(:), allocatable :: request_send_scalar_ext_mesh
- integer, dimension(:), allocatable :: request_recv_scalar_ext_mesh
+ integer, dimension(:,:), allocatable :: buffer_send_scalar
+ integer, dimension(:,:), allocatable :: buffer_recv_scalar
+ integer, dimension(:), allocatable :: request_send_scalar
+ integer, dimension(:), allocatable :: request_recv_scalar
integer :: ipoin,iinterface,ier
@@ -167,74 +156,70 @@
! assemble only if more than one partition
if(NPROC > 1) then
- allocate(buffer_send_scalar_ext_mesh(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
- if( ier /= 0 ) stop 'error allocating array buffer_send_scalar_ext_mesh'
- allocate(buffer_recv_scalar_ext_mesh(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
- if( ier /= 0 ) stop 'error allocating array buffer_recv_scalar_ext_mesh'
- allocate(request_send_scalar_ext_mesh(num_interfaces_ext_mesh),stat=ier)
- if( ier /= 0 ) stop 'error allocating array request_send_scalar_ext_mesh'
- allocate(request_recv_scalar_ext_mesh(num_interfaces_ext_mesh),stat=ier)
- if( ier /= 0 ) stop 'error allocating array request_recv_scalar_ext_mesh'
+ allocate(buffer_send_scalar(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
+ if( ier /= 0 ) stop 'error allocating array buffer_send_scalar'
+ allocate(buffer_recv_scalar(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
+ if( ier /= 0 ) stop 'error allocating array buffer_recv_scalar'
+ allocate(request_send_scalar(num_interfaces_ext_mesh),stat=ier)
+ if( ier /= 0 ) stop 'error allocating array request_send_scalar'
+ allocate(request_recv_scalar(num_interfaces_ext_mesh),stat=ier)
+ if( ier /= 0 ) stop 'error allocating array request_recv_scalar'
! partition border copy into the buffer
do iinterface = 1, num_interfaces_ext_mesh
do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
- buffer_send_scalar_ext_mesh(ipoin,iinterface) = &
- array_val(ibool_interfaces_ext_mesh(ipoin,iinterface))
+ buffer_send_scalar(ipoin,iinterface) = array_val(ibool_interfaces_ext_mesh(ipoin,iinterface))
enddo
enddo
! send messages
do iinterface = 1, num_interfaces_ext_mesh
! non-blocking synchronous send request
- call isend_i(buffer_send_scalar_ext_mesh(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
- nibool_interfaces_ext_mesh(iinterface), &
- my_neighbours_ext_mesh(iinterface), &
- itag, &
- request_send_scalar_ext_mesh(iinterface) &
- )
+ call isend_i(buffer_send_scalar(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
+ nibool_interfaces_ext_mesh(iinterface), &
+ my_neighbours_ext_mesh(iinterface), &
+ itag, &
+ request_send_scalar(iinterface) )
! receive request
- call irecv_i(buffer_recv_scalar_ext_mesh(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
- nibool_interfaces_ext_mesh(iinterface), &
- my_neighbours_ext_mesh(iinterface), &
- itag, &
- request_recv_scalar_ext_mesh(iinterface) &
- )
+ call irecv_i(buffer_recv_scalar(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
+ nibool_interfaces_ext_mesh(iinterface), &
+ my_neighbours_ext_mesh(iinterface), &
+ itag, &
+ request_recv_scalar(iinterface) )
enddo
! wait for communications completion
do iinterface = 1, num_interfaces_ext_mesh
- call wait_req(request_recv_scalar_ext_mesh(iinterface))
+ call wait_req(request_recv_scalar(iinterface))
enddo
! adding contributions of neighbours
do iinterface = 1, num_interfaces_ext_mesh
do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
array_val(ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
- array_val(ibool_interfaces_ext_mesh(ipoin,iinterface)) &
- + buffer_recv_scalar_ext_mesh(ipoin,iinterface)
+ array_val(ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_scalar(ipoin,iinterface)
enddo
enddo
! wait for communications completion (send)
do iinterface = 1, num_interfaces_ext_mesh
- call wait_req(request_send_scalar_ext_mesh(iinterface))
+ call wait_req(request_send_scalar(iinterface))
enddo
- deallocate(buffer_send_scalar_ext_mesh)
- deallocate(buffer_recv_scalar_ext_mesh)
- deallocate(request_send_scalar_ext_mesh)
- deallocate(request_recv_scalar_ext_mesh)
+ deallocate(buffer_send_scalar)
+ deallocate(buffer_recv_scalar)
+ deallocate(request_send_scalar)
+ deallocate(request_recv_scalar)
endif
- end subroutine assemble_MPI_scalar_i_ext_mesh
+ end subroutine assemble_MPI_scalar_i_blocking
!
!----
!
- subroutine assemble_MPI_scalar_ext_mesh_s(NPROC,NGLOB_AB,array_val, &
+ subroutine assemble_MPI_scalar_async_send(NPROC,NGLOB_AB,array_val, &
buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
@@ -296,13 +281,13 @@
endif
- end subroutine assemble_MPI_scalar_ext_mesh_s
+ end subroutine assemble_MPI_scalar_async_send
!
!----
!
- subroutine assemble_MPI_scalar_ext_mesh_w(NPROC,NGLOB_AB,array_val, &
+ subroutine assemble_MPI_scalar_async_recv(NPROC,NGLOB_AB,array_val, &
buffer_recv_scalar_ext_mesh,num_interfaces_ext_mesh, &
max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
@@ -354,5 +339,5 @@
endif
- end subroutine assemble_MPI_scalar_ext_mesh_w
+ end subroutine assemble_MPI_scalar_async_recv
Modified: seismo/3D/SPECFEM3D/trunk/src/shared/constants.h.in
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/shared/constants.h.in 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/shared/constants.h.in 2013-08-20 14:13:26 UTC (rev 22718)
@@ -115,7 +115,7 @@
! to plot total energy curves, for instance to monitor how CPML absorbing layers behave;
! should be turned OFF in most cases
- logical, parameter :: output_energy = .false. ! .true.
+ logical, parameter :: OUTPUT_ENERGY = .false. ! .true.
integer, parameter :: IOUT_ENERGY = 937 ! file number for the energy file
integer, parameter :: NTSTEP_BETWEEN_OUTPUT_ENERGY = 10 ! how often we compute energy (which is expensive to compute)
Modified: seismo/3D/SPECFEM3D/trunk/src/shared/detect_surface.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/shared/detect_surface.f90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/shared/detect_surface.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -88,7 +88,7 @@
enddo
! adds contributions from different partitions to valence_external_mesh
- call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob,valence_external_mesh, &
+ call assemble_MPI_scalar_i_blocking(NPROC,nglob,valence_external_mesh, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
my_neighbours_ext_mesh)
@@ -335,7 +335,7 @@
enddo
! adds contributions from different partitions to valence_external_mesh
- call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob,valence_external_mesh, &
+ call assemble_MPI_scalar_i_blocking(NPROC,nglob,valence_external_mesh, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
my_neighbours_ext_mesh)
@@ -738,7 +738,7 @@
enddo
! adds contributions from different partitions to valence_external_mesh
- call assemble_MPI_scalar_i_ext_mesh(NPROC,nglob,valence_external_mesh, &
+ call assemble_MPI_scalar_i_blocking(NPROC,nglob,valence_external_mesh, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
my_neighbours_ext_mesh)
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/Makefile.in
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/Makefile.in 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/Makefile.in 2013-08-20 14:13:26 UTC (rev 22718)
@@ -45,14 +45,30 @@
@COND_CUDA_TRUE at NVCC = nvcc
@COND_CUDA_FALSE at NVCC = @CC@
+# GPU architecture
- at COND_CUDA_TRUE@NVCC_FLAGS_BASE = $(CUDA_INC) $(MPI_INC) $(COND_MPI_CPPFLAGS)
- at COND_CUDA_TRUE@@COND_CUDA5_TRUE at NVCC_FLAGS = $(NVCC_FLAGS_BASE) -dc -DCUDA -gencode=arch=compute_35,code=sm_35
- at COND_CUDA_TRUE@@COND_CUDA5_FALSE at NVCC_FLAGS = $(NVCC_FLAGS_BASE) -DCUDA -DUSE_OLDER_CUDA4_GPU -gencode=arch=compute_20,code=sm_20
+# CUDA architecture / code version
+# Fermi: -gencode=arch=compute_10,code=sm_10 not supported
+# Tesla (default): -gencode=arch=compute_20,code=sm_20
+# Geforce GT 650m: -gencode=arch=compute_30,code=sm_30
+# Kepler (cuda5) : -gencode=arch=compute_35,code=sm_35
+GENCODE_20 = -gencode=arch=compute_20,code=\"sm_20,compute_20\"
+GENCODE_30 = -gencode=arch=compute_30,code=\"sm_30,compute_30\"
+GENCODE_35 = -gencode=arch=compute_35,code=\"sm_35,compute_35\"
+# CUDA version 5.x
+ at COND_CUDA_TRUE@@COND_CUDA5_TRUE at GENCODE = $(GENCODE_35)
+# CUDA version 4.x
+ at COND_CUDA_TRUE@@COND_CUDA5_FALSE at GENCODE = $(GENCODE_20)
+
+# CUDA flags and linking
+ at COND_CUDA_TRUE@NVCC_FLAGS_BASE = $(CUDA_INC) $(MPI_INC) $(COND_MPI_CPPFLAGS)
+ at COND_CUDA_TRUE@@COND_CUDA5_TRUE at NVCC_FLAGS = $(NVCC_FLAGS_BASE) -dc -DCUDA $(GENCODE)
+ at COND_CUDA_TRUE@@COND_CUDA5_FALSE at NVCC_FLAGS = $(NVCC_FLAGS_BASE) -DCUDA -DUSE_OLDER_CUDA4_GPU $(GENCODE)
+
@COND_CUDA_TRUE@@COND_CUDA5_TRUE at NVCCLINK_BASE = $(NVCC) $(CUDA_INC) $(MPI_INC) $(COND_MPI_CPPFLAGS) -DCUDA
- at COND_CUDA_TRUE@@COND_CUDA5_TRUE at NVCCLINK = $(NVCCLINK_BASE) -dlink -gencode=arch=compute_35,code=sm_35
- at COND_CUDA_TRUE@@COND_CUDA5_FALSE at NVCCLINK = $(NVCCLINK_BASE) -DUSE_OLDER_CUDA4_GPU -gencode=arch=compute_20,code=sm_20
+ at COND_CUDA_TRUE@@COND_CUDA5_TRUE at NVCCLINK = $(NVCCLINK_BASE) -dlink $(GENCODE)
+ at COND_CUDA_TRUE@@COND_CUDA5_FALSE at NVCCLINK = $(NVCCLINK_BASE) -DUSE_OLDER_CUDA4_GPU $(GENCODE)
@COND_CUDA_FALSE at NVCC_FLAGS = $(MPI_INC) $(COND_MPI_CPPFLAGS)
@COND_CUDA_FALSE at NVCCLINK = $(NVCC) $(NVCC_FLAGS)
@@ -100,6 +116,8 @@
ARFLAGS = cru
RANLIB = ranlib
+#######################################
+
libspecfem_a_OBJECTS = \
$O/assemble_MPI_scalar.shared.o \
$O/check_mesh_resolution.shared.o \
@@ -172,10 +190,11 @@
$O/specfem3D_par.o \
$O/pml_par.o \
$O/assemble_MPI_vector.o \
+ $O/check_stability.o \
$O/fault_solver_common.o \
$O/fault_solver_dynamic.o \
$O/fault_solver_kinematic.o \
- $O/gravity_perturbation.o \
+ $O/gravity_perturbation.o \
$O/compute_add_sources_acoustic.o \
$O/compute_add_sources_viscoelastic.o \
$O/compute_add_sources_poroelastic.o \
@@ -201,6 +220,7 @@
$O/compute_stacey_acoustic.o \
$O/compute_stacey_viscoelastic.o \
$O/compute_stacey_poroelastic.o \
+ $O/compute_total_energy.o \
$O/create_color_image.o \
$O/detect_mesh_surfaces.o \
$O/finalize_simulation.o \
@@ -220,6 +240,7 @@
$O/setup_movie_meshes.o \
$O/setup_sources_receivers.o \
$O/specfem3D.o \
+ $O/update_displacement_scheme.o \
$O/write_movie_output.o \
$O/write_output_ASCII.o \
$O/write_output_SU.o \
@@ -271,6 +292,8 @@
@COND_PYRE_FALSE@@COND_CUDA_TRUE at XSPECFEM_OBJECTS = $(SOLVER_ARRAY_OBJECTS) $(LIBSPECFEM) $(CUDA_OBJECTS)
@COND_PYRE_FALSE@@COND_CUDA_FALSE at XSPECFEM_OBJECTS = $(SOLVER_ARRAY_OBJECTS) $(LIBSPECFEM) $(CUDA_STUBS)
+#######################################
+
####
#### targets
####
@@ -292,6 +315,7 @@
specfem3D: xspecfem3D
+#######################################
####
#### rules for executables
@@ -343,6 +367,9 @@
xcreate_movie_shakemap_AVS_DX_GMT xcombine_vol_data xcombine_surf_data \
xsmooth_vol_data xmodel_update xsum_kernels
+
+#######################################
+
###
### rule for the archive library
###
@@ -352,6 +379,8 @@
$(AR) $(ARFLAGS) $L/libspecfem.a $(libspecfem_a_OBJECTS)
$(RANLIB) $L/libspecfem.a
+#######################################
+
####
#### rule to build each .o file below
####
@@ -372,6 +401,7 @@
$O/%.shared.o: ${SHARED}%.F90 $(SHARED)constants.h
${FCCOMPILE_CHECK} -c -o $@ $<
+#######################################
###
### OpenMP compilation
@@ -380,12 +410,20 @@
${FCCOMPILE_CHECK} -c -o $@ $<
+#######################################
+
###
### CUDA compilation
###
$O/%.cuda.o: ${CUDAD}%.cu ../../config.h $(CUDAD)mesh_constants_cuda.h $(CUDAD)prepare_constants_cuda.h
$(NVCC) -c $< -o $@ $(NVCC_FLAGS)
+$O/%.cudacc.o: ${CUDAD}%.c ../../config.h
+ ${CC} -c $(CFLAGS) $(MPI_INC) -o $@ ${CUDAD}$< -I../../
+
+
+#######################################
+
###
### C compilation
###
@@ -395,9 +433,8 @@
$O/%.cc.o: ${SHARED}%.c ../../config.h
${CC} -c $(CFLAGS) $(MPI_INC) -o $@ ${SHARED}$< -I../../
-$O/%.cudacc.o: ${CUDAD}%.c ../../config.h
- ${CC} -c $(CFLAGS) $(MPI_INC) -o $@ ${CUDAD}$< -I../../
+#######################################
###
### MPI compilation without optimization
@@ -409,19 +446,24 @@
$O/serial.o: $(SHARED)constants.h $(SHARED)serial.f90
${FCCOMPILE_CHECK} -c -o $O/serial.o $(SHARED)serial.f90
-$O/smooth_vol_data.o: $(SHARED)constants.h $(SHARED)smooth_vol_data.f90
- ${MPIFCCOMPILE_CHECK} -c -o $O/smooth_vol_data.o $(SHARED)smooth_vol_data.f90
+#######################################
##
## kernel summation
##
+$O/smooth_vol_data.o: $(SHARED)constants.h $(SHARED)smooth_vol_data.f90
+ ${MPIFCCOMPILE_CHECK} -c -o $O/smooth_vol_data.o $(SHARED)smooth_vol_data.f90
+
$O/sum_kernels.o: $(SHARED)constants.h $(SHARED)sum_kernels.f90
${MPIFCCOMPILE_CHECK} -c -o $O/sum_kernels.o $(SHARED)sum_kernels.f90
+
+#######################################
+
##
## model update
##
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/assemble_MPI_vector.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/assemble_MPI_vector.f90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/assemble_MPI_vector.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -28,11 +28,13 @@
!---- assemble the contributions between slices using non-blocking MPI
!----
- subroutine assemble_MPI_vector_ext_mesh(NPROC,NGLOB_AB,array_val, &
+ subroutine assemble_MPI_vector_blocking(NPROC,NGLOB_AB,array_val, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
my_neighbours_ext_mesh)
+! assembles vector field in blocking way, only returns after values have been received and assembled
+
implicit none
include "constants.h"
@@ -50,15 +52,12 @@
! local parameters
! send/receive temporary buffers
- !real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
- ! buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh
- real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: buffer_send_vector_ext_mesh
- real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: buffer_recv_vector_ext_mesh
+ real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: buffer_send_vector
+ real(kind=CUSTOM_REAL), dimension(:,:,:), allocatable :: buffer_recv_vector
! requests
- !integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
- integer, dimension(:), allocatable :: request_send_vector_ext_mesh
- integer, dimension(:), allocatable :: request_recv_vector_ext_mesh
+ integer, dimension(:), allocatable :: request_send_vector
+ integer, dimension(:), allocatable :: request_recv_vector
integer ipoin,iinterface,ier
@@ -68,72 +67,68 @@
! assemble only if more than one partition
if(NPROC > 1) then
- allocate(buffer_send_vector_ext_mesh(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
- if( ier /= 0 ) stop 'error allocating array buffer_send_vector_ext_mesh'
- allocate(buffer_recv_vector_ext_mesh(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
- if( ier /= 0 ) stop 'error allocating array buffer_recv_vector_ext_mesh'
- allocate(request_send_vector_ext_mesh(num_interfaces_ext_mesh),stat=ier)
- if( ier /= 0 ) stop 'error allocating array request_send_vector_ext_mesh'
- allocate(request_recv_vector_ext_mesh(num_interfaces_ext_mesh),stat=ier)
- if( ier /= 0 ) stop 'error allocating array request_recv_vector_ext_mesh'
+ allocate(buffer_send_vector(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
+ if( ier /= 0 ) stop 'error allocating array buffer_send_vector'
+ allocate(buffer_recv_vector(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
+ if( ier /= 0 ) stop 'error allocating array buffer_recv_vector'
+ allocate(request_send_vector(num_interfaces_ext_mesh),stat=ier)
+ if( ier /= 0 ) stop 'error allocating array request_send_vector'
+ allocate(request_recv_vector(num_interfaces_ext_mesh),stat=ier)
+ if( ier /= 0 ) stop 'error allocating array request_recv_vector'
! partition border copy into the buffer
do iinterface = 1, num_interfaces_ext_mesh
do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
- buffer_send_vector_ext_mesh(:,ipoin,iinterface) = &
- array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface))
+ buffer_send_vector(:,ipoin,iinterface) = array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface))
enddo
enddo
! send messages
do iinterface = 1, num_interfaces_ext_mesh
- call isend_cr(buffer_send_vector_ext_mesh(1,1,iinterface), &
- NDIM*nibool_interfaces_ext_mesh(iinterface), &
- my_neighbours_ext_mesh(iinterface), &
- itag, &
- request_send_vector_ext_mesh(iinterface) &
- )
- call irecv_cr(buffer_recv_vector_ext_mesh(1,1,iinterface), &
- NDIM*nibool_interfaces_ext_mesh(iinterface), &
- my_neighbours_ext_mesh(iinterface), &
- itag, &
- request_recv_vector_ext_mesh(iinterface) &
- )
+ call isend_cr(buffer_send_vector(1,1,iinterface), &
+ NDIM*nibool_interfaces_ext_mesh(iinterface), &
+ my_neighbours_ext_mesh(iinterface), &
+ itag, &
+ request_send_vector(iinterface) )
+ call irecv_cr(buffer_recv_vector(1,1,iinterface), &
+ NDIM*nibool_interfaces_ext_mesh(iinterface), &
+ my_neighbours_ext_mesh(iinterface), &
+ itag, &
+ request_recv_vector(iinterface) )
enddo
! wait for communications completion (recv)
do iinterface = 1, num_interfaces_ext_mesh
- call wait_req(request_recv_vector_ext_mesh(iinterface))
+ call wait_req(request_recv_vector(iinterface))
enddo
! adding contributions of neighbours
do iinterface = 1, num_interfaces_ext_mesh
do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
- array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) &
- + buffer_recv_vector_ext_mesh(:,ipoin,iinterface)
+ array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_vector(:,ipoin,iinterface)
enddo
enddo
! wait for communications completion (send)
do iinterface = 1, num_interfaces_ext_mesh
- call wait_req(request_send_vector_ext_mesh(iinterface))
+ call wait_req(request_send_vector(iinterface))
enddo
- deallocate(buffer_send_vector_ext_mesh)
- deallocate(buffer_recv_vector_ext_mesh)
- deallocate(request_send_vector_ext_mesh)
- deallocate(request_recv_vector_ext_mesh)
+ deallocate(buffer_send_vector)
+ deallocate(buffer_recv_vector)
+ deallocate(request_send_vector)
+ deallocate(request_recv_vector)
endif
- end subroutine assemble_MPI_vector_ext_mesh
+ end subroutine assemble_MPI_vector_blocking
!
!-------------------------------------------------------------------------------------------------
!
- subroutine assemble_MPI_vector_ext_mesh_s(NPROC,NGLOB_AB,array_val, &
+ subroutine assemble_MPI_vector_async_send(NPROC,NGLOB_AB,array_val, &
buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
@@ -194,19 +189,95 @@
endif
- end subroutine assemble_MPI_vector_ext_mesh_s
+ end subroutine assemble_MPI_vector_async_send
!
!-------------------------------------------------------------------------------------------------
!
- subroutine assemble_MPI_vector_ext_mesh_w(NPROC,NGLOB_AB,array_val, &
- buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
- nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
- request_send_vector_ext_mesh,request_recv_vector_ext_mesh)
+! unused, new ordered routine is used now...
+!
+! subroutine assemble_MPI_vector_async_recv(NPROC,NGLOB_AB,array_val, &
+! buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh, &
+! max_nibool_interfaces_ext_mesh, &
+! nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+! request_send_vector_ext_mesh,request_recv_vector_ext_mesh)
+!
+!! waits for data to receive and assembles
+!
+! implicit none
+!
+! include "constants.h"
+!
+! integer :: NPROC
+! integer :: NGLOB_AB
+!
+!! array to assemble
+! real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: array_val
+!
+! integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
+!
+! real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
+! buffer_recv_vector_ext_mesh
+!
+! integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
+! integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
+! integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
+!
+! integer ipoin,iinterface
+!
+!! here we have to assemble all the contributions between partitions using MPI
+!
+!! assemble only if more than one partition
+! if(NPROC > 1) then
+!
+!! wait for communications completion (recv)
+! do iinterface = 1, num_interfaces_ext_mesh
+! call wait_req(request_recv_vector_ext_mesh(iinterface))
+! enddo
+!
+!! adding contributions of neighbours
+! do iinterface = 1, num_interfaces_ext_mesh
+! do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
+! array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
+! array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) + buffer_recv_vector_ext_mesh(:,ipoin,iinterface)
+! enddo
+! enddo
+!
+!! wait for communications completion (send)
+! do iinterface = 1, num_interfaces_ext_mesh
+! call wait_req(request_send_vector_ext_mesh(iinterface))
+! enddo
+!
+! endif
+!
+! end subroutine assemble_MPI_vector_async_recv
+!
+!
+!
+!-------------------------------------------------------------------------------------------------
+!
+
+ subroutine assemble_MPI_vector_async_w_ord(NPROC,NGLOB_AB,array_val, &
+ buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh, &
+ max_nibool_interfaces_ext_mesh, &
+ nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+ request_send_vector_ext_mesh,request_recv_vector_ext_mesh, &
+ my_neighbours_ext_mesh,myrank)
+
! waits for data to receive and assembles
+! The goal of this version is to avoid different round-off errors in different processors.
+! The contribution of each processor is added following the order of its rank.
+! This guarantees that the sums are done in the same order on all processors.
+!
+! NOTE: this version assumes that the interfaces are ordered by increasing rank of the neighbour.
+! That is currently done so in subroutine write_interfaces_database in decompose_mesh_SCOTCH/part_decompose_mesh_SCOTCH.f90
+! A safety test could be added here.
+!
+! October 2012 - Surendra Somala and Jean-Paul Ampuero - Caltech Seismolab
+
implicit none
include "constants.h"
@@ -217,7 +288,7 @@
! array to assemble
real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: array_val
- integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
+ integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh,myrank
real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
buffer_recv_vector_ext_mesh
@@ -225,38 +296,68 @@
integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
+ integer, dimension(num_interfaces_ext_mesh) :: my_neighbours_ext_mesh
- integer ipoin,iinterface
+ real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: mybuffer
+ integer :: ipoin,iinterface,iglob
+ logical :: need_add_my_contrib
! here we have to assemble all the contributions between partitions using MPI
! assemble only if more than one partition
- if(NPROC > 1) then
+ if (NPROC == 1) return
+! move interface values of array_val to local buffers
+ do iinterface = 1, num_interfaces_ext_mesh
+ do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
+ iglob = ibool_interfaces_ext_mesh(ipoin,iinterface)
+ mybuffer(:,ipoin,iinterface) = array_val(:,iglob)
+ ! set them to zero right away to avoid counting it more than once during assembly:
+ ! buffers of higher rank get zeros on nodes shared with current buffer
+ array_val(:,iglob) = 0._CUSTOM_REAL
+ enddo
+ enddo
+
! wait for communications completion (recv)
do iinterface = 1, num_interfaces_ext_mesh
call wait_req(request_recv_vector_ext_mesh(iinterface))
enddo
-! adding contributions of neighbours
+! adding all contributions in order of processor rank
+ need_add_my_contrib = .true.
do iinterface = 1, num_interfaces_ext_mesh
+ if (need_add_my_contrib .and. myrank < my_neighbours_ext_mesh(iinterface)) call add_my_contrib()
do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
- array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) = &
- array_val(:,ibool_interfaces_ext_mesh(ipoin,iinterface)) &
- + buffer_recv_vector_ext_mesh(:,ipoin,iinterface)
+ iglob = ibool_interfaces_ext_mesh(ipoin,iinterface)
+ array_val(:,iglob) = array_val(:,iglob) + buffer_recv_vector_ext_mesh(:,ipoin,iinterface)
enddo
enddo
+ if (need_add_my_contrib) call add_my_contrib()
! wait for communications completion (send)
do iinterface = 1, num_interfaces_ext_mesh
call wait_req(request_send_vector_ext_mesh(iinterface))
enddo
- endif
+ contains
- end subroutine assemble_MPI_vector_ext_mesh_w
+ subroutine add_my_contrib()
+ integer :: my_iinterface,my_ipoin
+ do my_iinterface = 1, num_interfaces_ext_mesh
+ do my_ipoin = 1, nibool_interfaces_ext_mesh(my_iinterface)
+ iglob = ibool_interfaces_ext_mesh(my_ipoin,my_iinterface)
+ array_val(:,iglob) = array_val(:,iglob) + mybuffer(:,my_ipoin,my_iinterface)
+ enddo
+ enddo
+ need_add_my_contrib = .false.
+
+ end subroutine add_my_contrib
+
+ end subroutine assemble_MPI_vector_async_w_ord
+
+
!
!--------------------------------------------------------------------------------------------------
!
@@ -436,9 +537,6 @@
integer, dimension(num_interfaces_ext_mesh) :: request_recv_vector_ext_mesh
! local parameters
- !integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
- !integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
- !integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh
integer :: iinterface
! here we have to assemble all the contributions between partitions using MPI
@@ -572,17 +670,15 @@
! send messages
do iinterface = 1, num_interfaces_ext_mesh
call isend_cr(buffer_send_vector_ext_mesh(1,1,iinterface), &
- NDIM*nibool_interfaces_ext_mesh(iinterface), &
- my_neighbours_ext_mesh(iinterface), &
- itag, &
- request_send_vector_ext_mesh(iinterface) &
- )
+ NDIM*nibool_interfaces_ext_mesh(iinterface), &
+ my_neighbours_ext_mesh(iinterface), &
+ itag, &
+ request_send_vector_ext_mesh(iinterface) )
call irecv_cr(buffer_recv_vector_ext_mesh(1,1,iinterface), &
- NDIM*nibool_interfaces_ext_mesh(iinterface), &
- my_neighbours_ext_mesh(iinterface), &
- itag, &
- request_recv_vector_ext_mesh(iinterface) &
- )
+ NDIM*nibool_interfaces_ext_mesh(iinterface), &
+ my_neighbours_ext_mesh(iinterface), &
+ itag, &
+ request_recv_vector_ext_mesh(iinterface) )
enddo
endif
@@ -610,11 +706,11 @@
integer :: NPROC
integer :: NGLOB_AB
integer(kind=8) :: Mesh_pointer
-! array to assemble
+
+ ! array to assemble
real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: array_val
integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh
-
real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
buffer_recv_vector_ext_mesh
@@ -638,10 +734,11 @@
enddo
! adding contributions of neighbours
- call transfer_asmbl_accel_to_device(Mesh_pointer, array_val, buffer_recv_vector_ext_mesh, &
- num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, &
- nibool_interfaces_ext_mesh,&
- ibool_interfaces_ext_mesh,FORWARD_OR_ADJOINT)
+ call transfer_asmbl_accel_to_device(Mesh_pointer, array_val, &
+ buffer_recv_vector_ext_mesh, &
+ num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, &
+ nibool_interfaces_ext_mesh,&
+ ibool_interfaces_ext_mesh,FORWARD_OR_ADJOINT)
! This step is done via previous function transfer_and_assemble...
! do iinterface = 1, num_interfaces_ext_mesh
@@ -666,11 +763,11 @@
!
subroutine assemble_MPI_scalar_send_cuda(NPROC, &
- buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh, &
- num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
- nibool_interfaces_ext_mesh, &
- my_neighbours_ext_mesh, &
- request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh)
+ buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh, &
+ num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+ nibool_interfaces_ext_mesh, &
+ my_neighbours_ext_mesh, &
+ request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh)
! non-blocking MPI send
@@ -700,21 +797,24 @@
! send messages
do iinterface = 1, num_interfaces_ext_mesh
+ ! note: passing arguments:
+ ! **array**(1:nibool_interfaces_ext_mesh(iinterface),iinterface)
+ ! might lead to additional copy memory operations for certain compilers (slows down performance);
+ ! to avoid this in fortran, one might just pass the pointer to the first array value:
+ ! **array**(1,iinterface)
+
! non-blocking synchronous send request
call isend_cr(buffer_send_scalar_ext_mesh(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
- nibool_interfaces_ext_mesh(iinterface), &
- my_neighbours_ext_mesh(iinterface), &
- itag, &
- request_send_scalar_ext_mesh(iinterface) &
- )
+ nibool_interfaces_ext_mesh(iinterface), &
+ my_neighbours_ext_mesh(iinterface), &
+ itag, &
+ request_send_scalar_ext_mesh(iinterface) )
! receive request
call irecv_cr(buffer_recv_scalar_ext_mesh(1:nibool_interfaces_ext_mesh(iinterface),iinterface), &
- nibool_interfaces_ext_mesh(iinterface), &
- my_neighbours_ext_mesh(iinterface), &
- itag, &
- request_recv_scalar_ext_mesh(iinterface) &
- )
-
+ nibool_interfaces_ext_mesh(iinterface), &
+ my_neighbours_ext_mesh(iinterface), &
+ itag, &
+ request_recv_scalar_ext_mesh(iinterface) )
enddo
endif
@@ -756,9 +856,9 @@
integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
integer, dimension(num_interfaces_ext_mesh) :: request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh
- integer FORWARD_OR_ADJOINT
+ integer :: FORWARD_OR_ADJOINT
- integer iinterface ! ipoin
+ integer :: iinterface ! ipoin
! assemble only if more than one partition
if(NPROC > 1) then
@@ -769,9 +869,7 @@
enddo
! adding contributions of neighbours
- call transfer_asmbl_pot_to_device(Mesh_pointer, array_val, buffer_recv_scalar_ext_mesh, &
- num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, nibool_interfaces_ext_mesh,&
- ibool_interfaces_ext_mesh,FORWARD_OR_ADJOINT)
+ call transfer_asmbl_pot_to_device(Mesh_pointer,array_val,buffer_recv_scalar_ext_mesh,FORWARD_OR_ADJOINT)
! note: adding contributions of neighbours has been done just above for cuda
!do iinterface = 1, num_interfaces_ext_mesh
@@ -791,109 +889,3 @@
end subroutine assemble_MPI_scalar_write_cuda
-
-!
-!-------------------------------------------------------------------------------------------------
-!
-
- subroutine assemble_MPI_vector_ext_mesh_w_ordered(NPROC,NGLOB_AB,array_val, &
- buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
- nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
- request_send_vector_ext_mesh,request_recv_vector_ext_mesh,my_neighbours_ext_mesh,myrank)
-
-! waits for data to receive and assembles
-
-! The goal of this version is to avoid different round-off errors in different processors.
-! The contribution of each processor is added following the order of its rank.
-! This guarantees that the sums are done in the same order on all processors.
-!
-! NOTE: this version assumes that the interfaces are ordered by increasing rank of the neighbour.
-! That is currently done so in subroutine write_interfaces_database in decompose_mesh_SCOTCH/part_decompose_mesh_SCOTCH.f90
-! A safety test could be added here.
-!
-! October 2012 - Surendra Somala and Jean-Paul Ampuero - Caltech Seismolab
-
- implicit none
-
- include "constants.h"
-
- integer :: NPROC
- integer :: NGLOB_AB
-
-! array to assemble
- real(kind=CUSTOM_REAL), dimension(NDIM,NGLOB_AB) :: array_val
-
- integer :: num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh,myrank
-
- real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: &
- buffer_recv_vector_ext_mesh
-
- integer, dimension(num_interfaces_ext_mesh) :: nibool_interfaces_ext_mesh
- integer, dimension(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: ibool_interfaces_ext_mesh
- integer, dimension(num_interfaces_ext_mesh) :: request_send_vector_ext_mesh,request_recv_vector_ext_mesh
- integer, dimension(num_interfaces_ext_mesh) :: my_neighbours_ext_mesh
-
- real(kind=CUSTOM_REAL), dimension(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh) :: mybuffer
- integer :: ipoin,iinterface,iglob
- logical :: need_add_my_contrib
-
-! here we have to assemble all the contributions between partitions using MPI
-
-! assemble only if more than one partition
- if (NPROC == 1) return
-
-! move interface values of array_val to local buffers
- do iinterface = 1, num_interfaces_ext_mesh
- do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
- iglob = ibool_interfaces_ext_mesh(ipoin,iinterface)
- mybuffer(:,ipoin,iinterface) = array_val(:,iglob)
- ! set them to zero right away to avoid counting it more than once during assembly:
- ! buffers of higher rank get zeros on nodes shared with current buffer
- array_val(:,iglob) = 0._CUSTOM_REAL
- enddo
- enddo
-
-! wait for communications completion (recv)
- do iinterface = 1, num_interfaces_ext_mesh
- call wait_req(request_recv_vector_ext_mesh(iinterface))
- enddo
-
-! adding all contributions in order of processor rank
- need_add_my_contrib = .true.
- do iinterface = 1, num_interfaces_ext_mesh
- if (need_add_my_contrib .and. myrank < my_neighbours_ext_mesh(iinterface)) call add_my_contrib()
- do ipoin = 1, nibool_interfaces_ext_mesh(iinterface)
- iglob = ibool_interfaces_ext_mesh(ipoin,iinterface)
- array_val(:,iglob) = array_val(:,iglob) + buffer_recv_vector_ext_mesh(:,ipoin,iinterface)
- enddo
- enddo
- if (need_add_my_contrib) call add_my_contrib()
-
-! wait for communications completion (send)
- do iinterface = 1, num_interfaces_ext_mesh
- call wait_req(request_send_vector_ext_mesh(iinterface))
- enddo
-
- contains
-
- subroutine add_my_contrib()
-
- integer :: my_iinterface,my_ipoin
-
- do my_iinterface = 1, num_interfaces_ext_mesh
- do my_ipoin = 1, nibool_interfaces_ext_mesh(my_iinterface)
- iglob = ibool_interfaces_ext_mesh(my_ipoin,my_iinterface)
- array_val(:,iglob) = array_val(:,iglob) + mybuffer(:,my_ipoin,my_iinterface)
- enddo
- enddo
- need_add_my_contrib = .false.
-
- end subroutine add_my_contrib
-
- end subroutine assemble_MPI_vector_ext_mesh_w_ordered
-
-
-!
-!-------------------------------------------------------------------------------------------------
-!
-
Added: seismo/3D/SPECFEM3D/trunk/src/specfem3D/check_stability.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/check_stability.f90 (rev 0)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/check_stability.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -0,0 +1,290 @@
+!=====================================================================
+!
+! S p e c f e m 3 D V e r s i o n 2 . 1
+! ---------------------------------------
+!
+! Main authors: Dimitri Komatitsch and Jeroen Tromp
+! Princeton University, USA and CNRS / INRIA / University of Pau
+! (c) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau
+! July 2012
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+!
+! United States and French Government Sponsorship Acknowledged.
+
+ subroutine check_stability()
+
+! computes the maximum of the norm of the displacement
+! in all the slices using an MPI reduction
+! and output timestamp file to check that simulation is running fine
+
+ use specfem_par
+ use specfem_par_elastic
+ use specfem_par_poroelastic
+ use specfem_par_acoustic
+
+ implicit none
+
+ double precision :: tCPU,t_remain,t_total
+ integer :: ihours,iminutes,iseconds,int_tCPU, &
+ ihours_remain,iminutes_remain,iseconds_remain,int_t_remain, &
+ ihours_total,iminutes_total,iseconds_total,int_t_total
+
+ ! maximum of the norm of the displacement
+ real(kind=CUSTOM_REAL) Usolidnorm,Usolidnorm_all ! elastic
+ real(kind=CUSTOM_REAL) Usolidnormp,Usolidnormp_all ! acoustic
+ real(kind=CUSTOM_REAL) Usolidnorms,Usolidnorms_all ! solid poroelastic
+ real(kind=CUSTOM_REAL) Usolidnormw,Usolidnormw_all ! fluid (w.r.t.s) poroelastic
+
+ ! norm of the backward displacement
+ real(kind=CUSTOM_REAL) b_Usolidnorm, b_Usolidnorm_all
+ real(kind=CUSTOM_REAL) b_Usolidnormp, b_Usolidnormp_all
+ real(kind=CUSTOM_REAL) b_Usolidnorms, b_Usolidnorms_all
+ real(kind=CUSTOM_REAL) b_Usolidnormw, b_Usolidnormw_all
+
+ ! initializes
+ Usolidnorm_all = 0.0_CUSTOM_REAL
+ Usolidnormp_all = 0.0_CUSTOM_REAL
+ Usolidnorms_all = 0.0_CUSTOM_REAL
+ Usolidnormw_all = 0.0_CUSTOM_REAL
+
+ ! compute maximum of norm of displacement in each slice
+ if( ELASTIC_SIMULATION ) then
+ if( GPU_MODE) then
+ ! way 2: just get maximum of field from GPU
+ call get_norm_elastic_from_device(Usolidnorm,Mesh_pointer,1)
+ else
+ Usolidnorm = maxval(sqrt(displ(1,:)**2 + displ(2,:)**2 + displ(3,:)**2))
+ endif
+
+ ! check stability of the code, exit if unstable
+ ! negative values can occur with some compilers when the unstable value is greater
+ ! than the greatest possible floating-point number of the machine
+ !if(Usolidnorm > STABILITY_THRESHOLD .or. Usolidnorm < 0.0_CUSTOM_REAL) &
+ ! call exit_MPI(myrank,'single forward simulation became unstable and blew up')
+
+ ! compute the maximum of the maxima for all the slices using an MPI reduction
+ call max_all_cr(Usolidnorm,Usolidnorm_all)
+ endif
+
+ if( ACOUSTIC_SIMULATION ) then
+ if(GPU_MODE) then
+ ! way 2: just get maximum of field from GPU
+ call get_norm_acoustic_from_device(Usolidnormp,Mesh_pointer,1)
+ else
+ Usolidnormp = maxval(abs(potential_dot_dot_acoustic(:)))
+ endif
+
+ ! compute the maximum of the maxima for all the slices using an MPI reduction
+ call max_all_cr(Usolidnormp,Usolidnormp_all)
+ endif
+
+ if( POROELASTIC_SIMULATION ) then
+ Usolidnorms = maxval(sqrt(displs_poroelastic(1,:)**2 + displs_poroelastic(2,:)**2 + &
+ displs_poroelastic(3,:)**2))
+ Usolidnormw = maxval(sqrt(displw_poroelastic(1,:)**2 + displw_poroelastic(2,:)**2 + &
+ displw_poroelastic(3,:)**2))
+
+ ! compute the maximum of the maxima for all the slices using an MPI reduction
+ call max_all_cr(Usolidnorms,Usolidnorms_all)
+ call max_all_cr(Usolidnormw,Usolidnormw_all)
+ endif
+
+
+ ! adjoint simulations
+ if( SIMULATION_TYPE == 3 ) then
+ ! initializes backward field norms
+ b_Usolidnorm_all = 0.0_CUSTOM_REAL
+ b_Usolidnormp_all = 0.0_CUSTOM_REAL
+ b_Usolidnorms_all = 0.0_CUSTOM_REAL
+ b_Usolidnormw_all = 0.0_CUSTOM_REAL
+
+ if( ELASTIC_SIMULATION ) then
+ ! way 2
+ if(GPU_MODE) then
+ call get_norm_elastic_from_device(b_Usolidnorm,Mesh_pointer,3)
+ else
+ b_Usolidnorm = maxval(sqrt(b_displ(1,:)**2 + b_displ(2,:)**2 + b_displ(3,:)**2))
+ endif
+ ! compute max of all slices
+ call max_all_cr(b_Usolidnorm,b_Usolidnorm_all)
+ endif
+ if( ACOUSTIC_SIMULATION ) then
+ ! way 2
+ if(GPU_MODE) then
+ call get_norm_acoustic_from_device(b_Usolidnormp,Mesh_pointer,3)
+ else
+ b_Usolidnormp = maxval(abs(b_potential_dot_dot_acoustic(:)))
+ endif
+ ! compute max of all slices
+ call max_all_cr(b_Usolidnormp,b_Usolidnormp_all)
+ endif
+ if( POROELASTIC_SIMULATION ) then
+ b_Usolidnorms = maxval(sqrt(b_displs_poroelastic(1,:)**2 + b_displs_poroelastic(2,:)**2 + &
+ b_displs_poroelastic(3,:)**2))
+ b_Usolidnormw = maxval(sqrt(b_displw_poroelastic(1,:)**2 + b_displw_poroelastic(2,:)**2 + &
+ b_displw_poroelastic(3,:)**2))
+ ! compute max of all slices
+ call max_all_cr(b_Usolidnorms,b_Usolidnorms_all)
+ call max_all_cr(b_Usolidnormw,b_Usolidnormw_all)
+ endif
+ ! check stability of the code, exit if unstable
+ ! negative values can occur with some compilers when the unstable value is greater
+ ! than the greatest possible floating-point number of the machine
+ !if(b_Usolidnorm > STABILITY_THRESHOLD .or. b_Usolidnorm < 0.0_CUSTOM_REAL) &
+ ! call exit_MPI(myrank,'single backward simulation became unstable and blew up')
+ endif
+
+ ! user output
+ if(myrank == 0) then
+
+ write(IMAIN,*) 'Time step # ',it
+ write(IMAIN,*) 'Time: ',sngl((it-1)*DT-t0),' seconds'
+
+ ! elapsed time since beginning of the simulation
+ tCPU = wtime() - time_start
+ int_tCPU = int(tCPU)
+ ihours = int_tCPU / 3600
+ iminutes = (int_tCPU - 3600*ihours) / 60
+ iseconds = int_tCPU - 3600*ihours - 60*iminutes
+ write(IMAIN,*) 'Elapsed time in seconds = ',tCPU
+ write(IMAIN,"(' Elapsed time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds
+ write(IMAIN,*) 'Mean elapsed time per time step in seconds = ',sngl(tCPU/dble(it))
+
+ if( ELASTIC_SIMULATION ) &
+ write(IMAIN,*) 'Max norm displacement vector U in all slices (m) = ',Usolidnorm_all
+
+ if( ACOUSTIC_SIMULATION ) &
+ write(IMAIN,*) 'Max norm pressure P in all slices (Pa) = ',Usolidnormp_all
+
+ if( POROELASTIC_SIMULATION ) then
+ write(IMAIN,*) 'Max norm displacement vector Us in all slices (m) = ',Usolidnorms_all
+ write(IMAIN,*) 'Max norm displacement vector W in all slices (m) = ',Usolidnormw_all
+ endif
+
+ ! adjoint simulations
+ if (SIMULATION_TYPE == 3) then
+ if( ELASTIC_SIMULATION ) &
+ write(IMAIN,*) 'Max norm displacement vector U (backward) in all slices (m) = ',b_Usolidnorm_all
+ if( ACOUSTIC_SIMULATION ) &
+ write(IMAIN,*) 'Max norm pressure P (backward) in all slices (Pa) = ',b_Usolidnormp_all
+ if( POROELASTIC_SIMULATION ) then
+ write(IMAIN,*) 'Max norm displacement vector Us (backward) in all slices (m) = ',b_Usolidnorms_all
+ write(IMAIN,*) 'Max norm displacement vector W (backward) in all slices (m) = ',b_Usolidnormw_all
+ endif
+ endif
+
+ ! compute estimated remaining simulation time
+ t_remain = (NSTEP - it) * (tCPU/dble(it))
+ int_t_remain = int(t_remain)
+ ihours_remain = int_t_remain / 3600
+ iminutes_remain = (int_t_remain - 3600*ihours_remain) / 60
+ iseconds_remain = int_t_remain - 3600*ihours_remain - 60*iminutes_remain
+ write(IMAIN,*) 'Time steps done = ',it,' out of ',NSTEP
+ write(IMAIN,*) 'Time steps remaining = ',NSTEP - it
+ write(IMAIN,*) 'Estimated remaining time in seconds = ',sngl(t_remain)
+ write(IMAIN,"(' Estimated remaining time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") &
+ ihours_remain,iminutes_remain,iseconds_remain
+
+ ! compute estimated total simulation time
+ t_total = t_remain + tCPU
+ int_t_total = int(t_total)
+ ihours_total = int_t_total / 3600
+ iminutes_total = (int_t_total - 3600*ihours_total) / 60
+ iseconds_total = int_t_total - 3600*ihours_total - 60*iminutes_total
+ write(IMAIN,*) 'Estimated total run time in seconds = ',sngl(t_total)
+ write(IMAIN,"(' Estimated total run time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") &
+ ihours_total,iminutes_total,iseconds_total
+ write(IMAIN,*) 'We have done ',sngl(100.d0*dble(it)/dble(NSTEP)),'% of that'
+
+ if(it < 100) then
+ write(IMAIN,*) '************************************************************'
+ write(IMAIN,*) '**** BEWARE: the above time estimates are not reliable'
+ write(IMAIN,*) '**** because fewer than 100 iterations have been performed'
+ write(IMAIN,*) '************************************************************'
+ endif
+ write(IMAIN,*)
+
+ ! flushes file buffer for main output file (IMAIN)
+ call flush_IMAIN()
+
+ ! write time stamp file to give information about progression of simulation
+ write(outputname,"('/timestamp',i6.6)") it
+ open(unit=IOUT,file=trim(OUTPUT_FILES)//outputname,status='unknown')
+ write(IOUT,*) 'Time step # ',it
+ write(IOUT,*) 'Time: ',sngl((it-1)*DT-t0),' seconds'
+ write(IOUT,*) 'Elapsed time in seconds = ',tCPU
+ write(IOUT,"(' Elapsed time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds
+ write(IOUT,*) 'Mean elapsed time per time step in seconds = ',tCPU/dble(it)
+
+ if( ELASTIC_SIMULATION ) &
+ write(IOUT,*) 'Max norm displacement vector U in all slices (m) = ',Usolidnorm_all
+
+ if( ACOUSTIC_SIMULATION ) &
+ write(IOUT,*) 'Max norm pressure P in all slices (Pa) = ',Usolidnormp_all
+
+ if( POROELASTIC_SIMULATION ) then
+ write(IOUT,*) 'Max norm displacement vector Us in all slices (m) = ',Usolidnorms_all
+ write(IOUT,*) 'Max norm displacement vector W in all slices (m) = ',Usolidnormw_all
+ endif
+
+ ! adjoint simulations
+ if (SIMULATION_TYPE == 3) then
+ if( ELASTIC_SIMULATION ) &
+ write(IOUT,*) 'Max norm displacement vector U (backward) in all slices (m) = ',b_Usolidnorm_all
+ if( ACOUSTIC_SIMULATION ) &
+ write(IOUT,*) 'Max norm pressure P (backward) in all slices (Pa) = ',b_Usolidnormp_all
+ if( POROELASTIC_SIMULATION ) then
+ write(IOUT,*) 'Max norm displacement vector Us (backward) in all slices (m) = ',b_Usolidnorms_all
+ write(IOUT,*) 'Max norm displacement vector W (backward) in all slices (m) = ',b_Usolidnormw_all
+ endif
+ endif
+
+ ! estimation
+ write(IOUT,*) 'Time steps done = ',it,' out of ',NSTEP
+ write(IOUT,*) 'Time steps remaining = ',NSTEP - it
+ write(IOUT,*) 'Estimated remaining time in seconds = ',t_remain
+ write(IOUT,"(' Estimated remaining time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") &
+ ihours_remain,iminutes_remain,iseconds_remain
+ write(IOUT,*) 'Estimated total run time in seconds = ',t_total
+ write(IOUT,"(' Estimated total run time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") &
+ ihours_total,iminutes_total,iseconds_total
+ write(IOUT,*) 'We have done ',sngl(100.d0*dble(it)/dble(NSTEP)),'% of that'
+ close(IOUT)
+
+ ! check stability of the code, exit if unstable
+ ! negative values can occur with some compilers when the unstable value is greater
+ ! than the greatest possible floating-point number of the machine
+ if(Usolidnorm_all > STABILITY_THRESHOLD .or. Usolidnorm_all < 0.0_CUSTOM_REAL &
+ .or. Usolidnormp_all > STABILITY_THRESHOLD .or. Usolidnormp_all < 0.0_CUSTOM_REAL &
+ .or. Usolidnorms_all > STABILITY_THRESHOLD .or. Usolidnorms_all < 0.0_CUSTOM_REAL &
+ .or. Usolidnormw_all > STABILITY_THRESHOLD .or. Usolidnormw_all < 0.0_CUSTOM_REAL) &
+ call exit_MPI(myrank,'forward simulation became unstable and blew up')
+
+ ! adjoint simulations
+ if( SIMULATION_TYPE == 3 ) then
+ if( b_Usolidnorm_all > STABILITY_THRESHOLD .or. b_Usolidnorm_all < 0.0_CUSTOM_REAL &
+ .or. b_Usolidnormp_all > STABILITY_THRESHOLD .or. b_Usolidnormp_all < 0.0_CUSTOM_REAL &
+ .or. b_Usolidnorms_all > STABILITY_THRESHOLD .or. b_Usolidnorms_all < 0.0_CUSTOM_REAL &
+ .or. b_Usolidnormw_all > STABILITY_THRESHOLD .or. b_Usolidnormw_all < 0.0_CUSTOM_REAL ) &
+ call exit_MPI(myrank,'backward simulation became unstable and blew up')
+ endif
+
+ endif ! myrank
+
+ end subroutine check_stability
+
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_acoustic.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_acoustic.f90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_acoustic.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -265,9 +265,10 @@
! reads in **sta**.**net**.**LH**.adj files
adj_source_file = trim(station_name(irec))//'.'//trim(network_name(irec))
call compute_arrays_adjoint_source(myrank,adj_source_file, &
- xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
- adj_sourcearray, xigll,yigll,zigll, &
- it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+ xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+ adj_sourcearray, xigll,yigll,zigll, &
+ it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+
do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC
adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:)
enddo
@@ -416,6 +417,9 @@
! adjoint sources in SU format
integer,parameter :: nheader=240 ! 240 bytes
+ ! checks if anything to do
+ if( SIMULATION_TYPE /= 3 ) return
+
! plotting source time function
if(PRINT_SOURCE_TIME_FUNCTION .and. .not. phase_is_inner ) then
! initializes total
@@ -451,7 +455,7 @@
! thus indexing is NSTEP - it , instead of NSTEP - it - 1
! adjoint simulations
- if (SIMULATION_TYPE == 3 .and. nsources_local > 0) then
+ if( nsources_local > 0 ) then
! adds acoustic sources
do isource = 1,NSOURCES
@@ -663,8 +667,7 @@
! only implements SIMTYPE=1 and NOISE_TOM=0
! write(*,*) "fortran dt = ", dt
! change dt -> DT
- call compute_add_sources_ac_cuda(Mesh_pointer, phase_is_inner, &
- NSOURCES, stf_pre_compute, myrank)
+ call compute_add_sources_ac_cuda(Mesh_pointer,phase_is_inner,NSOURCES,stf_pre_compute)
endif
endif
@@ -730,9 +733,10 @@
! reads in **sta**.**net**.**LH**.adj files
adj_source_file = trim(station_name(irec))//'.'//trim(network_name(irec))
call compute_arrays_adjoint_source(myrank,adj_source_file, &
- xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
- adj_sourcearray, xigll,yigll,zigll, &
- it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+ xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+ adj_sourcearray, xigll,yigll,zigll, &
+ it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+
do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC
adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:)
enddo
@@ -783,7 +787,8 @@
! on GPU
call add_sources_ac_sim_2_or_3_cuda(Mesh_pointer,adj_sourcearrays,phase_is_inner, &
ispec_is_inner,ispec_is_acoustic, &
- ispec_selected_rec,myrank,nrec, &
+ ispec_selected_rec, &
+ nrec, &
NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), &
islice_selected_rec,nadj_rec_local, &
NTSTEP_BETWEEN_READ_ADJSRC)
@@ -821,8 +826,7 @@
enddo
stf_used_total = stf_used_total + sum(stf_pre_compute(:))
! only implements SIMTYPE=3
- call compute_add_sources_ac_s3_cuda(Mesh_pointer, phase_is_inner, &
- NSOURCES,stf_pre_compute, myrank)
+ call compute_add_sources_ac_s3_cuda(Mesh_pointer,phase_is_inner,NSOURCES,stf_pre_compute)
endif
endif
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_poroelastic.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_poroelastic.f90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_poroelastic.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -274,9 +274,10 @@
! reads in **sta**.**net**.**LH**.adj files
adj_source_file = trim(station_name(irec))//'.'//trim(network_name(irec))
call compute_arrays_adjoint_source(myrank,adj_source_file, &
- xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
- adj_sourcearray, xigll,yigll,zigll, &
- it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+ xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+ adj_sourcearray, xigll,yigll,zigll, &
+ it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+
do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC
adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:)
enddo
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_viscoelastic.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_viscoelastic.f90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_add_sources_viscoelastic.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -102,6 +102,9 @@
!equivalence (i2head,i4head,r4head) ! share the same 240-byte-memory
double precision :: hxir(NGLLX),hpxir(NGLLX),hetar(NGLLY),hpetar(NGLLY),hgammar(NGLLZ),hpgammar(NGLLZ)
+ ! some old tests (currently unstable; do not remove them though, we might fix this one day)
+ if (OLD_TEST_TO_FIX_ONE_DAY) return
+
! plotting source time function
if(PRINT_SOURCE_TIME_FUNCTION .and. .not. phase_is_inner ) then
! initializes total
@@ -250,9 +253,10 @@
! reads in **sta**.**net**.**LH**.adj files
adj_source_file = trim(station_name(irec))//'.'//trim(network_name(irec))
call compute_arrays_adjoint_source(myrank,adj_source_file, &
- xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
- adj_sourcearray, xigll,yigll,zigll, &
- it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+ xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+ adj_sourcearray, xigll,yigll,zigll, &
+ it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+
do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC
adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:)
enddo
@@ -355,37 +359,38 @@
endif
! for noise simulations
- ! we have two loops indicated by phase_is_inner ("inner elements/points" or "boundary elements/points")
- ! here, we only add those noise sources once, when we are calculating for boudanry points (phase_is_inner==.false.),
- ! because boundary points are claculated first!
- if( .not. phase_is_inner ) then
- if ( NOISE_TOMOGRAPHY == 1 ) then
- ! the first step of noise tomography is to use |S(\omega)|^2 as a point force source at one of the receivers.
- ! hence, instead of a moment tensor 'sourcearrays', a 'noise_sourcearray' for a point force is needed.
- ! furthermore, the CMTSOLUTION needs to be zero, i.e., no earthquakes.
- ! now this must be manually set in DATA/CMTSOLUTION, by USERS.
- call add_source_master_rec_noise(myrank,nrec, &
- NSTEP,accel,noise_sourcearray, &
- ibool,islice_selected_rec,ispec_selected_rec, &
- it,irec_master_noise, &
- NSPEC_AB,NGLOB_AB)
- else if ( NOISE_TOMOGRAPHY == 2 ) then
- ! second step of noise tomography, i.e., read the surface movie saved at every timestep
- ! use the movie to drive the ensemble forward wavefield
- call noise_read_add_surface_movie(NGLLSQUARE*num_free_surface_faces,accel, &
- normal_x_noise,normal_y_noise,normal_z_noise,mask_noise, &
- ibool,noise_surface_movie,NSTEP-it+1,NSPEC_AB,NGLOB_AB, &
- num_free_surface_faces,free_surface_ispec,free_surface_ijk, &
- free_surface_jacobian2Dw)
- ! be careful, since ensemble forward sources are reversals of generating wavefield "eta"
- ! hence the "NSTEP-it+1", i.e., start reading from the last timestep
- ! note the ensemble forward sources are generally distributed on the surface of the earth
- ! that's to say, the ensemble forward source is kind of a surface force density, not a body force density
- ! therefore, we must add it here, before applying the inverse of mass matrix
+ if( NOISE_TOMOGRAPHY > 0 ) then
+ ! we have two loops indicated by phase_is_inner ("inner elements/points" or "boundary elements/points")
+ ! here, we only add those noise sources once, when we are calculating for boudanry points (phase_is_inner==.false.),
+ ! because boundary points are claculated first!
+ if( .not. phase_is_inner ) then
+ if ( NOISE_TOMOGRAPHY == 1 ) then
+ ! the first step of noise tomography is to use |S(\omega)|^2 as a point force source at one of the receivers.
+ ! hence, instead of a moment tensor 'sourcearrays', a 'noise_sourcearray' for a point force is needed.
+ ! furthermore, the CMTSOLUTION needs to be zero, i.e., no earthquakes.
+ ! now this must be manually set in DATA/CMTSOLUTION, by USERS.
+ call add_source_master_rec_noise(myrank,nrec, &
+ NSTEP,accel,noise_sourcearray, &
+ ibool,islice_selected_rec,ispec_selected_rec, &
+ it,irec_master_noise, &
+ NSPEC_AB,NGLOB_AB)
+ else if ( NOISE_TOMOGRAPHY == 2 ) then
+ ! second step of noise tomography, i.e., read the surface movie saved at every timestep
+ ! use the movie to drive the ensemble forward wavefield
+ call noise_read_add_surface_movie(NGLLSQUARE*num_free_surface_faces,accel, &
+ normal_x_noise,normal_y_noise,normal_z_noise,mask_noise, &
+ ibool,noise_surface_movie,NSTEP-it+1,NSPEC_AB,NGLOB_AB, &
+ num_free_surface_faces,free_surface_ispec,free_surface_ijk, &
+ free_surface_jacobian2Dw)
+ ! be careful, since ensemble forward sources are reversals of generating wavefield "eta"
+ ! hence the "NSTEP-it+1", i.e., start reading from the last timestep
+ ! note the ensemble forward sources are generally distributed on the surface of the earth
+ ! that's to say, the ensemble forward source is kind of a surface force density, not a body force density
+ ! therefore, we must add it here, before applying the inverse of mass matrix
+ endif
endif
endif
-
end subroutine compute_add_sources_viscoelastic
!
!=====================================================================
@@ -443,6 +448,9 @@
! adjoint sources in SU format
integer,parameter :: nheader=240 ! 240 bytes
+ ! some old tests (currently unstable; do not remove them though, we might fix this one day)
+ if (OLD_TEST_TO_FIX_ONE_DAY) return
+
! plotting source time function
if(PRINT_SOURCE_TIME_FUNCTION .and. .not. phase_is_inner ) then
! initializes total
@@ -566,24 +574,25 @@
endif
! for noise simulations
- ! we have two loops indicated by phase_is_inner ("inner elements/points" or "boundary elements/points")
- ! here, we only add those noise sources once, when we are calculating for boudanry points (phase_is_inner==.false.),
- ! because boundary points are claculated first!
- if( .not. phase_is_inner ) then
- if ( NOISE_TOMOGRAPHY == 3 ) then
- ! third step of noise tomography, i.e., read the surface movie saved at every timestep
- ! use the movie to reconstruct the ensemble forward wavefield
- ! the ensemble adjoint wavefield is done as usual
- ! note instead of "NSTEP-it+1", now we us "it", since reconstruction is a reversal of reversal
- call noise_read_add_surface_movie(NGLLSQUARE*num_free_surface_faces,b_accel, &
- normal_x_noise,normal_y_noise,normal_z_noise,mask_noise, &
- ibool,noise_surface_movie,it,NSPEC_AB,NGLOB_AB, &
- num_free_surface_faces,free_surface_ispec,free_surface_ijk, &
- free_surface_jacobian2Dw)
+ if( NOISE_TOMOGRAPHY > 0 ) then
+ ! we have two loops indicated by phase_is_inner ("inner elements/points" or "boundary elements/points")
+ ! here, we only add those noise sources once, when we are calculating for boudanry points (phase_is_inner==.false.),
+ ! because boundary points are claculated first!
+ if( .not. phase_is_inner ) then
+ if ( NOISE_TOMOGRAPHY == 3 ) then
+ ! third step of noise tomography, i.e., read the surface movie saved at every timestep
+ ! use the movie to reconstruct the ensemble forward wavefield
+ ! the ensemble adjoint wavefield is done as usual
+ ! note instead of "NSTEP-it+1", now we us "it", since reconstruction is a reversal of reversal
+ call noise_read_add_surface_movie(NGLLSQUARE*num_free_surface_faces,b_accel, &
+ normal_x_noise,normal_y_noise,normal_z_noise,mask_noise, &
+ ibool,noise_surface_movie,it,NSPEC_AB,NGLOB_AB, &
+ num_free_surface_faces,free_surface_ispec,free_surface_ijk, &
+ free_surface_jacobian2Dw)
+ endif
endif
endif
-
end subroutine compute_add_sources_viscoelastic_bpwf
!
@@ -596,7 +605,8 @@
ispec_is_elastic,SIMULATION_TYPE,NSTEP, &
nrec,islice_selected_rec,ispec_selected_rec, &
nadj_rec_local,adj_sourcearrays, &
- NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY,Mesh_pointer)
+ NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY, &
+ Mesh_pointer)
use specfem_par,only: PRINT_SOURCE_TIME_FUNCTION,stf_used_total, &
xigll,yigll,zigll,xi_receiver,eta_receiver,gamma_receiver,&
@@ -659,6 +669,9 @@
!equivalence (i2head,i4head,r4head) ! share the same 240-byte-memory
double precision :: hxir(NGLLX),hpxir(NGLLX),hetar(NGLLY),hpetar(NGLLY),hgammar(NGLLZ),hpgammar(NGLLZ)
+ ! some old tests (currently unstable; do not remove them though, we might fix this one day)
+ if (OLD_TEST_TO_FIX_ONE_DAY) return
+
! plotting source time function
if(PRINT_SOURCE_TIME_FUNCTION .and. .not. phase_is_inner ) then
! initializes total
@@ -667,7 +680,6 @@
! forward simulations
if (SIMULATION_TYPE == 1 .and. NOISE_TOMOGRAPHY == 0 .and. nsources_local > 0) then
-
if( NSOURCES > 0 ) then
do isource = 1,NSOURCES
! precomputes source time function factor
@@ -692,8 +704,7 @@
! only implements SIMTYPE=1 and NOISE_TOM=0
! write(*,*) "fortran dt = ", dt
! change dt -> DT
- call compute_add_sources_el_cuda(Mesh_pointer, phase_is_inner, &
- NSOURCES, stf_pre_compute, myrank)
+ call compute_add_sources_el_cuda(Mesh_pointer,stf_pre_compute,NSOURCES,phase_is_inner)
endif
endif ! forward
@@ -723,7 +734,6 @@
! adjoint simulations
if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then
-
! adds adjoint source in this partitions
if( nadj_rec_local > 0 ) then
@@ -758,9 +768,10 @@
! reads in **sta**.**net**.**LH**.adj files
adj_source_file = trim(station_name(irec))//'.'//trim(network_name(irec))
call compute_arrays_adjoint_source(myrank,adj_source_file, &
- xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
- adj_sourcearray, xigll,yigll,zigll, &
- it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+ xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+ adj_sourcearray, xigll,yigll,zigll, &
+ it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+
do itime = 1,NTSTEP_BETWEEN_READ_ADJSRC
adj_sourcearrays(irec_local,itime,:,:,:,:) = adj_sourcearray(itime,:,:,:,:)
enddo
@@ -822,7 +833,8 @@
if( it < NSTEP ) then
call add_sources_el_sim_type_2_or_3(Mesh_pointer,adj_sourcearrays,phase_is_inner, &
ispec_is_inner,ispec_is_elastic, &
- ispec_selected_rec,myrank,nrec, &
+ ispec_selected_rec, &
+ nrec, &
NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), &
islice_selected_rec,nadj_rec_local, &
NTSTEP_BETWEEN_READ_ADJSRC)
@@ -836,33 +848,30 @@
! adjoint simulations
if (SIMULATION_TYPE == 3 .and. NOISE_TOMOGRAPHY == 0 .and. nsources_local > 0) then
-
- if( NSOURCES > 0 ) then
- do isource = 1,NSOURCES
- ! precomputes source time function factors
- if(USE_FORCE_POINT_SOURCE) then
- if( USE_RICKER_TIME_FUNCTION ) then
- stf_pre_compute(isource) = &
- comp_source_time_function_rickr(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur(isource))
- else
- stf_pre_compute(isource) = &
- comp_source_time_function_gauss(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur_tiny(isource))
- endif
- else
- if( USE_RICKER_TIME_FUNCTION ) then
- stf_pre_compute(isource) = &
- comp_source_time_function_rickr(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur(isource))
- else
- stf_pre_compute(isource) = &
- comp_source_time_function(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur_gaussian(isource))
- endif
- endif
- enddo
- ! only implements SIMTYPE=3
- call compute_add_sources_el_s3_cuda(Mesh_pointer,stf_pre_compute, &
- NSOURCES,phase_is_inner,myrank)
-
- endif
+ if( NSOURCES > 0 ) then
+ do isource = 1,NSOURCES
+ ! precomputes source time function factors
+ if(USE_FORCE_POINT_SOURCE) then
+ if( USE_RICKER_TIME_FUNCTION ) then
+ stf_pre_compute(isource) = &
+ comp_source_time_function_rickr(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur(isource))
+ else
+ stf_pre_compute(isource) = &
+ comp_source_time_function_gauss(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur_tiny(isource))
+ endif
+ else
+ if( USE_RICKER_TIME_FUNCTION ) then
+ stf_pre_compute(isource) = &
+ comp_source_time_function_rickr(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur(isource))
+ else
+ stf_pre_compute(isource) = &
+ comp_source_time_function(dble(NSTEP-it)*DT-t0-tshift_src(isource),hdur_gaussian(isource))
+ endif
+ endif
+ enddo
+ ! only implements SIMTYPE=3
+ call compute_add_sources_el_s3_cuda(Mesh_pointer,stf_pre_compute,NSOURCES,phase_is_inner)
+ endif
endif ! adjoint
! master prints out source time function to file
@@ -873,35 +882,36 @@
endif
! for noise simulations
- ! we have two loops indicated by phase_is_inner ("inner elements/points" or "boundary elements/points")
- ! here, we only add those noise sources once, when we are calculating for boudanry points (phase_is_inner==.false.),
- ! because boundary points are claculated first!
- if( .not. phase_is_inner ) then
- if ( NOISE_TOMOGRAPHY == 1 ) then
- ! the first step of noise tomography is to use |S(\omega)|^2 as a point force source at one of the receivers.
- ! hence, instead of a moment tensor 'sourcearrays', a 'noise_sourcearray' for a point force is needed.
- ! furthermore, the CMTSOLUTION needs to be zero, i.e., no earthquakes.
- ! now this must be manually set in DATA/CMTSOLUTION, by USERS.
- call add_source_master_rec_noise_cu(Mesh_pointer, myrank, it, irec_master_noise, islice_selected_rec)
- else if ( NOISE_TOMOGRAPHY == 2 ) then
- ! second step of noise tomography, i.e., read the surface movie saved at every timestep
- ! use the movie to drive the ensemble forward wavefield
- call noise_read_add_surface_movie_GPU(noise_surface_movie,NSTEP-it+1,num_free_surface_faces, &
- Mesh_pointer,NOISE_TOMOGRAPHY)
- ! be careful, since ensemble forward sources are reversals of generating wavefield "eta"
- ! hence the "NSTEP-it+1", i.e., start reading from the last timestep
- ! note the ensemble forward sources are generally distributed on the surface of the earth
- ! that's to say, the ensemble forward source is kind of a surface force density, not a body force density
- ! therefore, we must add it here, before applying the inverse of mass matrix
- else if ( NOISE_TOMOGRAPHY == 3 ) then
- ! third step of noise tomography, i.e., read the surface movie saved at every timestep
- ! use the movie to reconstruct the ensemble forward wavefield
- ! the ensemble adjoint wavefield is done as usual
- ! note instead of "NSTEP-it+1", now we us "it", since reconstruction is a reversal of reversal
- call noise_read_add_surface_movie_GPU(noise_surface_movie,it,num_free_surface_faces, &
- Mesh_pointer,NOISE_TOMOGRAPHY)
+ if( NOISE_TOMOGRAPHY > 0 ) then
+ ! we have two loops indicated by phase_is_inner ("inner elements/points" or "boundary elements/points")
+ ! here, we only add those noise sources once, when we are calculating for boudanry points (phase_is_inner==.false.),
+ ! because boundary points are claculated first!
+ if( .not. phase_is_inner ) then
+ if ( NOISE_TOMOGRAPHY == 1 ) then
+ ! the first step of noise tomography is to use |S(\omega)|^2 as a point force source at one of the receivers.
+ ! hence, instead of a moment tensor 'sourcearrays', a 'noise_sourcearray' for a point force is needed.
+ ! furthermore, the CMTSOLUTION needs to be zero, i.e., no earthquakes.
+ ! now this must be manually set in DATA/CMTSOLUTION, by USERS.
+ call add_source_master_rec_noise_cu(Mesh_pointer,it,irec_master_noise,islice_selected_rec)
+ else if ( NOISE_TOMOGRAPHY == 2 ) then
+ ! second step of noise tomography, i.e., read the surface movie saved at every timestep
+ ! use the movie to drive the ensemble forward wavefield
+ call noise_read_add_surface_movie_GPU(noise_surface_movie,NSTEP-it+1,num_free_surface_faces, &
+ Mesh_pointer,NOISE_TOMOGRAPHY)
+ ! be careful, since ensemble forward sources are reversals of generating wavefield "eta"
+ ! hence the "NSTEP-it+1", i.e., start reading from the last timestep
+ ! note the ensemble forward sources are generally distributed on the surface of the earth
+ ! that's to say, the ensemble forward source is kind of a surface force density, not a body force density
+ ! therefore, we must add it here, before applying the inverse of mass matrix
+ else if ( NOISE_TOMOGRAPHY == 3 ) then
+ ! third step of noise tomography, i.e., read the surface movie saved at every timestep
+ ! use the movie to reconstruct the ensemble forward wavefield
+ ! the ensemble adjoint wavefield is done as usual
+ ! note instead of "NSTEP-it+1", now we us "it", since reconstruction is a reversal of reversal
+ call noise_read_add_surface_movie_GPU(noise_surface_movie,it,num_free_surface_faces, &
+ Mesh_pointer,NOISE_TOMOGRAPHY)
+ endif
endif
endif
-
end subroutine compute_add_sources_viscoelastic_GPU
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_arrays_source.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_arrays_source.f90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_arrays_source.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -106,9 +106,9 @@
!=============================================================================
subroutine compute_arrays_adjoint_source(myrank, adj_source_file, &
- xi_receiver,eta_receiver,gamma_receiver, adj_sourcearray, &
- xigll,yigll,zigll, &
- it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
+ xi_receiver,eta_receiver,gamma_receiver, adj_sourcearray, &
+ xigll,yigll,zigll, &
+ it_sub_adj,NSTEP,NTSTEP_BETWEEN_READ_ADJSRC)
implicit none
@@ -132,7 +132,7 @@
double precision :: hxir(NGLLX), hpxir(NGLLX), hetar(NGLLY), hpetar(NGLLY), &
hgammar(NGLLZ), hpgammar(NGLLZ)
- real(kind=CUSTOM_REAL) :: adj_src(NTSTEP_BETWEEN_READ_ADJSRC,NDIM)
+ real(kind=CUSTOM_REAL), dimension(NTSTEP_BETWEEN_READ_ADJSRC,NDIM) :: adj_src
integer icomp, itime, i, j, k, ios, it_start, it_end
double precision :: junk
@@ -149,8 +149,7 @@
it_start = NSTEP - it_sub_adj*NTSTEP_BETWEEN_READ_ADJSRC + 1
it_end = it_start + NTSTEP_BETWEEN_READ_ADJSRC - 1
- !adj_sourcearray(:,:,:,:,:) = 0.
- adj_src = 0._CUSTOM_REAL
+ adj_src(:,:) = 0._CUSTOM_REAL
! loops over components
do icomp = 1, NDIM
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_viscoelastic_ac.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_viscoelastic_ac.f90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_coupling_viscoelastic_ac.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -295,6 +295,9 @@
! adjoint locals
real(kind=CUSTOM_REAL) :: b_force_normal_comp
+ ! checks if anything to do
+ if (SIMULATION_TYPE /= 3) return
+
! initialize the updates
updated_dof_ocean_load(:) = .false.
@@ -323,18 +326,16 @@
! we use the total force which includes the Coriolis term above
! adjoint simulations
- if (SIMULATION_TYPE == 3) then
- b_force_normal_comp = b_accel(1,iglob)*nx / rmassx(iglob) &
- + b_accel(2,iglob)*ny / rmassy(iglob) &
- + b_accel(3,iglob)*nz / rmassz(iglob)
+ b_force_normal_comp = b_accel(1,iglob)*nx / rmassx(iglob) &
+ + b_accel(2,iglob)*ny / rmassy(iglob) &
+ + b_accel(3,iglob)*nz / rmassz(iglob)
- b_accel(1,iglob) = b_accel(1,iglob) &
- + (rmass_ocean_load(iglob) - rmassx(iglob)) * b_force_normal_comp * nx
- b_accel(2,iglob) = b_accel(2,iglob) &
- + (rmass_ocean_load(iglob) - rmassy(iglob)) * b_force_normal_comp * ny
- b_accel(3,iglob) = b_accel(3,iglob) &
- + (rmass_ocean_load(iglob) - rmassz(iglob)) * b_force_normal_comp * nz
- endif !adjoint
+ b_accel(1,iglob) = b_accel(1,iglob) &
+ + (rmass_ocean_load(iglob) - rmassx(iglob)) * b_force_normal_comp * nx
+ b_accel(2,iglob) = b_accel(2,iglob) &
+ + (rmass_ocean_load(iglob) - rmassy(iglob)) * b_force_normal_comp * ny
+ b_accel(3,iglob) = b_accel(3,iglob) &
+ + (rmass_ocean_load(iglob) - rmassz(iglob)) * b_force_normal_comp * nz
! done with this point
updated_dof_ocean_load(iglob) = .true.
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_acoustic_calling_routine.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_acoustic_calling_routine.f90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_acoustic_calling_routine.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -186,7 +186,7 @@
! assemble all the contributions between slices using MPI
if( phase_is_inner .eqv. .false. ) then
! sends potential_dot_dot_acoustic values to corresponding MPI interface neighbors (non-blocking)
- call assemble_MPI_scalar_ext_mesh_s(NPROC,NGLOB_AB,potential_dot_dot_acoustic, &
+ call assemble_MPI_scalar_async_send(NPROC,NGLOB_AB,potential_dot_dot_acoustic, &
buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
@@ -195,7 +195,7 @@
else
! waits for send/receive requests to be completed and assembles values
- call assemble_MPI_scalar_ext_mesh_w(NPROC,NGLOB_AB,potential_dot_dot_acoustic, &
+ call assemble_MPI_scalar_async_recv(NPROC,NGLOB_AB,potential_dot_dot_acoustic, &
buffer_recv_scalar_ext_mesh,num_interfaces_ext_mesh,&
max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
@@ -336,12 +336,15 @@
integer:: iphase
logical:: phase_is_inner
+ ! checks
+ if( SIMULATION_TYPE /= 3 ) &
+ call exit_MPI(myrank,'error calling compute_forces_acoustic_bpwf() with wrong SIMULATION_TYPE')
+
! adjoint simulations
- if( SIMULATION_TYPE == 3 ) &
- call acoustic_enforce_free_surface(NSPEC_AB,NGLOB_ADJOINT,STACEY_INSTEAD_OF_FREE_SURFACE, &
- b_potential_acoustic,b_potential_dot_acoustic,b_potential_dot_dot_acoustic, &
- ibool,free_surface_ijk,free_surface_ispec, &
- num_free_surface_faces,ispec_is_acoustic)
+ call acoustic_enforce_free_surface(NSPEC_AB,NGLOB_ADJOINT,STACEY_INSTEAD_OF_FREE_SURFACE, &
+ b_potential_acoustic,b_potential_dot_acoustic,b_potential_dot_dot_acoustic, &
+ ibool,free_surface_ijk,free_surface_ispec, &
+ num_free_surface_faces,ispec_is_acoustic)
! distinguishes two runs: for elements on MPI interfaces, and elements within the partitions
do iphase=1,2
@@ -354,29 +357,27 @@
endif
! adjoint simulations
- if( SIMULATION_TYPE == 3 ) then
- if(USE_DEVILLE_PRODUCTS) then
- ! uses Deville (2002) optimizations
- call compute_forces_acoustic_Dev(iphase,NSPEC_ADJOINT,NGLOB_ADJOINT, &
- b_potential_acoustic,b_potential_dot_dot_acoustic, &
- xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
- hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
- wgllwgll_xy_3D,wgllwgll_xz_3D,wgllwgll_yz_3D, &
- rhostore,jacobian,ibool, &
- num_phase_ispec_acoustic,nspec_inner_acoustic,nspec_outer_acoustic,&
- phase_ispec_inner_acoustic)
- else
- call compute_forces_acoustic_noDev(iphase,NSPEC_ADJOINT,NGLOB_ADJOINT, &
- b_potential_acoustic,b_potential_dot_acoustic,b_potential_dot_dot_acoustic, &
- xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
- hprime_xx,hprime_yy,hprime_zz, &
- hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
- wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
- rhostore,jacobian,ibool,deltat, &
- num_phase_ispec_acoustic,nspec_inner_acoustic,nspec_outer_acoustic,&
- phase_ispec_inner_acoustic,ELASTIC_SIMULATION,&
- .true.,potential_dot_dot_acoustic_interface)
- endif
+ if(USE_DEVILLE_PRODUCTS) then
+ ! uses Deville (2002) optimizations
+ call compute_forces_acoustic_Dev(iphase,NSPEC_ADJOINT,NGLOB_ADJOINT, &
+ b_potential_acoustic,b_potential_dot_dot_acoustic, &
+ xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+ hprime_xx,hprime_xxT,hprimewgll_xx,hprimewgll_xxT, &
+ wgllwgll_xy_3D,wgllwgll_xz_3D,wgllwgll_yz_3D, &
+ rhostore,jacobian,ibool, &
+ num_phase_ispec_acoustic,nspec_inner_acoustic,nspec_outer_acoustic,&
+ phase_ispec_inner_acoustic)
+ else
+ call compute_forces_acoustic_noDev(iphase,NSPEC_ADJOINT,NGLOB_ADJOINT, &
+ b_potential_acoustic,b_potential_dot_acoustic,b_potential_dot_dot_acoustic, &
+ xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+ hprime_xx,hprime_yy,hprime_zz, &
+ hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
+ wgllwgll_xy,wgllwgll_xz,wgllwgll_yz, &
+ rhostore,jacobian,ibool,deltat, &
+ num_phase_ispec_acoustic,nspec_inner_acoustic,nspec_outer_acoustic,&
+ phase_ispec_inner_acoustic,ELASTIC_SIMULATION,&
+ .true.,potential_dot_dot_acoustic_interface)
endif
! ! Stacey absorbing boundary conditions
@@ -394,25 +395,23 @@
if(ELASTIC_SIMULATION ) then
if( num_coupling_ac_el_faces > 0 ) then
! adjoint/kernel simulations
- if( SIMULATION_TYPE == 3 ) &
- call compute_coupling_acoustic_el(NSPEC_ADJOINT,NGLOB_ADJOINT, &
- ibool,b_displ,b_potential_dot_dot_acoustic, &
- num_coupling_ac_el_faces, &
- coupling_ac_el_ispec,coupling_ac_el_ijk, &
- coupling_ac_el_normal, &
- coupling_ac_el_jacobian2Dw, &
- ispec_is_inner,phase_is_inner,&
- PML_CONDITIONS,spec_to_CPML,is_CPML,&
- potential_dot_dot_acoustic_interface,veloc,rmemory_coupling_ac_el_displ,&
- SIMULATION_TYPE,.true.,accel_interface)
+ call compute_coupling_acoustic_el(NSPEC_ADJOINT,NGLOB_ADJOINT, &
+ ibool,b_displ,b_potential_dot_dot_acoustic, &
+ num_coupling_ac_el_faces, &
+ coupling_ac_el_ispec,coupling_ac_el_ijk, &
+ coupling_ac_el_normal, &
+ coupling_ac_el_jacobian2Dw, &
+ ispec_is_inner,phase_is_inner,&
+ PML_CONDITIONS,spec_to_CPML,is_CPML,&
+ potential_dot_dot_acoustic_interface,veloc,rmemory_coupling_ac_el_displ,&
+ SIMULATION_TYPE,.true.,accel_interface)
endif
endif
! poroelastic coupling
if(POROELASTIC_SIMULATION ) then
if( num_coupling_ac_po_faces > 0 ) then
- if( SIMULATION_TYPE == 3 ) &
- stop 'not implemented yet'
+ stop 'coupling acoustic-poroelastic domains not implemented yet...'
endif
endif
@@ -429,32 +428,27 @@
if( phase_is_inner .eqv. .false. ) then
! sends b_potential_dot_dot_acoustic values to corresponding MPI interface neighbors (non-blocking)
! adjoint simulations
- if( SIMULATION_TYPE == 3 ) then
- call assemble_MPI_scalar_ext_mesh_s(NPROC,NGLOB_ADJOINT,b_potential_dot_dot_acoustic, &
- b_buffer_send_scalar_ext_mesh,b_buffer_recv_scalar_ext_mesh, &
- num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
- nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
- my_neighbours_ext_mesh, &
- b_request_send_scalar_ext_mesh,b_request_recv_scalar_ext_mesh)
- endif
+ call assemble_MPI_scalar_async_send(NPROC,NGLOB_ADJOINT,b_potential_dot_dot_acoustic, &
+ b_buffer_send_scalar_ext_mesh,b_buffer_recv_scalar_ext_mesh, &
+ num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+ nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
+ my_neighbours_ext_mesh, &
+ b_request_send_scalar_ext_mesh,b_request_recv_scalar_ext_mesh)
else
! adjoint simulations
- if( SIMULATION_TYPE == 3 ) then
- call assemble_MPI_scalar_ext_mesh_w(NPROC,NGLOB_ADJOINT,b_potential_dot_dot_acoustic, &
- b_buffer_recv_scalar_ext_mesh,num_interfaces_ext_mesh,&
- max_nibool_interfaces_ext_mesh, &
- nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
- b_request_send_scalar_ext_mesh,b_request_recv_scalar_ext_mesh)
- endif
+ call assemble_MPI_scalar_async_recv(NPROC,NGLOB_ADJOINT,b_potential_dot_dot_acoustic, &
+ b_buffer_recv_scalar_ext_mesh,num_interfaces_ext_mesh,&
+ max_nibool_interfaces_ext_mesh, &
+ nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+ b_request_send_scalar_ext_mesh,b_request_recv_scalar_ext_mesh)
endif !phase_is_inner
enddo
! divides pressure with mass matrix
! adjoint simulations
- if (SIMULATION_TYPE == 3) &
- b_potential_dot_dot_acoustic(:) = b_potential_dot_dot_acoustic(:) * rmass_acoustic(:)
+ b_potential_dot_dot_acoustic(:) = b_potential_dot_dot_acoustic(:) * rmass_acoustic(:)
! update velocity
! note: Newmark finite-difference time scheme with acoustic domains:
@@ -471,18 +465,17 @@
!
! corrector:
! updates the chi_dot term which requires chi_dot_dot(t+delta)
+
! corrector
! adjoint simulations
- if (SIMULATION_TYPE == 3) &
- b_potential_dot_acoustic(:) = b_potential_dot_acoustic(:) + b_deltatover2*b_potential_dot_dot_acoustic(:)
+ b_potential_dot_acoustic(:) = b_potential_dot_acoustic(:) + b_deltatover2*b_potential_dot_dot_acoustic(:)
! enforces free surface (zeroes potentials at free surface)
! adjoint simulations
- if (SIMULATION_TYPE == 3) &
- call acoustic_enforce_free_surface(NSPEC_AB,NGLOB_ADJOINT,STACEY_INSTEAD_OF_FREE_SURFACE, &
- b_potential_acoustic,b_potential_dot_acoustic,b_potential_dot_dot_acoustic, &
- ibool,free_surface_ijk,free_surface_ispec, &
- num_free_surface_faces,ispec_is_acoustic)
+ call acoustic_enforce_free_surface(NSPEC_AB,NGLOB_ADJOINT,STACEY_INSTEAD_OF_FREE_SURFACE, &
+ b_potential_acoustic,b_potential_dot_acoustic,b_potential_dot_dot_acoustic, &
+ ibool,free_surface_ijk,free_surface_ispec, &
+ num_free_surface_faces,ispec_is_acoustic)
end subroutine compute_forces_acoustic_bpwf
!
@@ -568,13 +561,9 @@
! assemble all the contributions between slices using MPI
if( phase_is_inner .eqv. .false. ) then
! sends potential_dot_dot_acoustic values to corresponding MPI interface neighbors (non-blocking)
- call transfer_boun_pot_from_device(NGLOB_AB, Mesh_pointer, &
+ call transfer_boun_pot_from_device(Mesh_pointer, &
potential_dot_dot_acoustic, &
buffer_send_scalar_ext_mesh, &
- num_interfaces_ext_mesh, &
- max_nibool_interfaces_ext_mesh, &
- nibool_interfaces_ext_mesh, &
- ibool_interfaces_ext_mesh, &
1) ! <-- 1 == fwd accel
call assemble_MPI_scalar_send_cuda(NPROC, &
buffer_send_scalar_ext_mesh,buffer_recv_scalar_ext_mesh, &
@@ -585,13 +574,9 @@
! adjoint simulations
if( SIMULATION_TYPE == 3 ) then
- call transfer_boun_pot_from_device(NGLOB_AB, Mesh_pointer, &
+ call transfer_boun_pot_from_device(Mesh_pointer, &
b_potential_dot_dot_acoustic, &
b_buffer_send_scalar_ext_mesh,&
- num_interfaces_ext_mesh, &
- max_nibool_interfaces_ext_mesh, &
- nibool_interfaces_ext_mesh, &
- ibool_interfaces_ext_mesh, &
3) ! <-- 3 == adjoint b_accel
call assemble_MPI_scalar_send_cuda(NPROC, &
@@ -608,8 +593,8 @@
! waits for send/receive requests to be completed and assembles values
call assemble_MPI_scalar_write_cuda(NPROC,NGLOB_AB,potential_dot_dot_acoustic, &
Mesh_pointer,&
- buffer_recv_scalar_ext_mesh,num_interfaces_ext_mesh,&
- max_nibool_interfaces_ext_mesh, &
+ buffer_recv_scalar_ext_mesh, &
+ num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh, &
1)
@@ -618,10 +603,10 @@
if( SIMULATION_TYPE == 3 ) then
call assemble_MPI_scalar_write_cuda(NPROC,NGLOB_AB,b_potential_dot_dot_acoustic, &
Mesh_pointer, &
- b_buffer_recv_scalar_ext_mesh,num_interfaces_ext_mesh, &
- max_nibool_interfaces_ext_mesh, &
+ b_buffer_recv_scalar_ext_mesh, &
+ num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
- b_request_send_scalar_ext_mesh,request_recv_scalar_ext_mesh, &
+ b_request_send_scalar_ext_mesh,b_request_recv_scalar_ext_mesh, &
3)
endif
endif !phase_is_inner
@@ -629,7 +614,7 @@
enddo
! divides pressure with mass matrix
- call kernel_3_a_acoustic_cuda(Mesh_pointer,NGLOB_AB)
+ call kernel_3_a_acoustic_cuda(Mesh_pointer)
! update velocity
! note: Newmark finite-difference time scheme with acoustic domains:
@@ -646,7 +631,7 @@
!
! corrector:
! updates the chi_dot term which requires chi_dot_dot(t+delta)
- call kernel_3_b_acoustic_cuda(Mesh_pointer,NGLOB_AB,deltatover2,b_deltatover2)
+ call kernel_3_b_acoustic_cuda(Mesh_pointer,deltatover2,b_deltatover2)
! enforces free surface (zeroes potentials at free surface)
call acoustic_enforce_free_surf_cuda(Mesh_pointer,STACEY_INSTEAD_OF_FREE_SURFACE)
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_poroelastic_calling_routine.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_poroelastic_calling_routine.f90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_poroelastic_calling_routine.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -124,7 +124,7 @@
else
! on GPU
-stop 'GPU for poroelastic simulation not implemented'
+ call exit_MPI(myrank,'GPU for poroelastic simulation not implemented')
endif ! GPU_MODE
! adds poroelastic absorbing boundary terms to accelerations (type Stacey conditions)
@@ -260,12 +260,6 @@
b_request_send_vector_ext_meshw,b_request_recv_vector_ext_meshw)
endif !adjoint
- !! DK DK May 2009: removed this because now each slice of a CUBIT + SCOTCH mesh
- !! DK DK May 2009: has a different number of spectral elements and therefore
- !! DK DK May 2009: only the general non-blocking MPI routines assemble_MPI_vector_ext_mesh_s
- !! DK DK May 2009: and assemble_MPI_vector_ext_mesh_w above can be used.
- !! DK DK May 2009: For adjoint runs below (SIMULATION_TYPE == 3) they should be used as well.
-
endif
enddo
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_forces_viscoelastic_calling_routine.F90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -52,14 +52,15 @@
phase_is_inner = .true.
endif
- ! elastic term
+! elastic term
if(USE_DEVILLE_PRODUCTS) then
! uses Deville (2002) optimizations
call compute_forces_viscoelastic_Dev_sim1(iphase)
else
! no optimizations used
- call compute_forces_viscoelastic_noDev(iphase,NSPEC_AB,NGLOB_AB,displ,veloc,accel, &
+ call compute_forces_viscoelastic_noDev(iphase,NSPEC_AB,NGLOB_AB, &
+ displ,veloc,accel, &
xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
hprime_xx,hprime_yy,hprime_zz, &
hprimewgll_xx,hprimewgll_yy,hprimewgll_zz, &
@@ -84,7 +85,8 @@
dsdx_top,dsdx_bot, &
ispec2D_moho_top,ispec2D_moho_bot, &
num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic, &
- phase_ispec_inner_elastic,.false.,accel_interface,ACOUSTIC_SIMULATION)
+ phase_ispec_inner_elastic,.false., &
+ accel_interface,ACOUSTIC_SIMULATION)
endif
@@ -117,7 +119,8 @@
coupling_ac_el_jacobian2Dw, &
ispec_is_inner,phase_is_inner,&
PML_CONDITIONS,is_CPML,potential_dot_dot_acoustic_interface,&
- SIMULATION_TYPE,.false.,accel_interface,&
+ SIMULATION_TYPE,.false., &
+ accel_interface,&
rmemory_coupling_el_ac_potential,spec_to_CPML,&
potential_acoustic,potential_dot_acoustic)
@@ -133,7 +136,8 @@
coupling_ac_el_jacobian2Dw, &
ispec_is_inner,phase_is_inner,&
PML_CONDITIONS,is_CPML,potential_dot_dot_acoustic_interface,&
- SIMULATION_TYPE,.false.,accel_interface,&
+ SIMULATION_TYPE,.false., &
+ accel_interface,&
rmemory_coupling_el_ac_potential,spec_to_CPML,&
potential_acoustic,potential_dot_acoustic)
@@ -167,7 +171,7 @@
endif
! adds source term (single-force/moment-tensor solution)
- if (.not. OLD_TEST_TO_FIX_ONE_DAY) call compute_add_sources_viscoelastic( NSPEC_AB,NGLOB_AB,accel, &
+ call compute_add_sources_viscoelastic( NSPEC_AB,NGLOB_AB,accel, &
ibool,ispec_is_inner,phase_is_inner, &
NSOURCES,myrank,it,islice_selected_source,ispec_selected_source,&
hdur,hdur_gaussian,tshift_src,dt,t0,sourcearrays, &
@@ -179,7 +183,7 @@
! assemble all the contributions between slices using MPI
if( phase_is_inner .eqv. .false. ) then
! sends accel values to corresponding MPI interface neighbors
- call assemble_MPI_vector_ext_mesh_s(NPROC,NGLOB_AB,accel, &
+ call assemble_MPI_vector_async_send(NPROC,NGLOB_AB,accel, &
buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
@@ -188,7 +192,7 @@
else
! waits for send/receive requests to be completed and assembles values
- call assemble_MPI_vector_ext_mesh_w_ordered(NPROC,NGLOB_AB,accel, &
+ call assemble_MPI_vector_async_w_ord(NPROC,NGLOB_AB,accel, &
buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,&
max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
@@ -201,7 +205,6 @@
!Percy , Fault boundary term B*tau is added to the assembled forces
! which at this point are stored in the array 'accel'
if (SIMULATION_TYPE_DYN) call bc_dynflt_set3d_all(accel,veloc,displ)
-
if (SIMULATION_TYPE_KIN) call bc_kinflt_set_all(accel,veloc,displ)
! multiplies with inverse of mass matrix (note: rmass has been inverted already)
@@ -229,37 +232,36 @@
endif
! C-PML boundary
- if(PML_CONDITIONS)then
- do iface=1,num_abs_boundary_faces
- ispec = abs_boundary_ispec(iface)
+ if( PML_CONDITIONS ) then
+ do iface=1,num_abs_boundary_faces
+ ispec = abs_boundary_ispec(iface)
!!! It is better to move this into do iphase=1,2 loop
!!! if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
- if( ispec_is_elastic(ispec) .and. is_CPML(ispec)) then
- ! reference gll points on boundary face
- do igll = 1,NGLLSQUARE
+ if( ispec_is_elastic(ispec) .and. is_CPML(ispec)) then
+ ! reference gll points on boundary face
+ do igll = 1,NGLLSQUARE
+ ! gets local indices for GLL point
+ i = abs_boundary_ijk(1,igll,iface)
+ j = abs_boundary_ijk(2,igll,iface)
+ k = abs_boundary_ijk(3,igll,iface)
- ! gets local indices for GLL point
- i = abs_boundary_ijk(1,igll,iface)
- j = abs_boundary_ijk(2,igll,iface)
- k = abs_boundary_ijk(3,igll,iface)
+ iglob=ibool(i,j,k,ispec)
- iglob=ibool(i,j,k,ispec)
+ accel(:,iglob) = 0.0
+ veloc(:,iglob) = 0.0
+ displ(:,iglob) = 0.0
- accel(:,iglob) = 0.0
- veloc(:,iglob) = 0.0
- displ(:,iglob) = 0.0
+ if(SIMULATION_TYPE ==3)then
+ if(ACOUSTIC_SIMULATION)then
+ accel_interface(:,iglob) = 0.0
+ endif
+ endif
- if(SIMULATION_TYPE ==3)then
- if(ACOUSTIC_SIMULATION)then
- accel_interface(:,iglob) = 0.0
- endif
- endif
-
- enddo
- endif ! ispec_is_elastic
+ enddo
+ endif ! ispec_is_elastic
!!! endif
- enddo
- endif
+ enddo
+ endif
! updates velocities
! Newmark finite-difference time scheme with elastic domains:
@@ -279,9 +281,9 @@
! updates the velocity term which requires a(t+delta)
veloc(:,:) = veloc(:,:) + deltatover2*accel(:,:)
- if(PML_CONDITIONS)then
- if(SIMULATION_TYPE == 1 .and. SAVE_FORWARD)then
- if(nglob_interface_PML_elastic > 0)then
+ if( PML_CONDITIONS ) then
+ if( SIMULATION_TYPE == 1 .and. SAVE_FORWARD ) then
+ if( nglob_interface_PML_elastic > 0 ) then
call save_field_on_pml_interface(displ,veloc,accel,nglob_interface_PML_elastic,&
b_PML_field,b_reclen_PML_field)
endif
@@ -289,10 +291,12 @@
endif
end subroutine compute_forces_viscoelastic
+
!
!=====================================================================
+!
-! elastic solver for back
+! elastic solver for backward/reconstructed wavefields
subroutine compute_forces_viscoelastic_bpwf()
@@ -309,7 +313,11 @@
integer:: iphase
logical:: phase_is_inner
-! distinguishes two runs: for points on MPI interfaces, and points within the partitions
+ ! checks
+ if( SIMULATION_TYPE /= 3 ) &
+ call exit_MPI(myrank,'error calling compute_forces_viscoelastic_bpwf() with wrong SIMULATION_TYPE')
+
+ ! distinguishes two runs: for points on MPI interfaces, and points within the partitions
do iphase=1,2
!first for points on MPI interfaces
@@ -322,15 +330,13 @@
! elastic term
if(USE_DEVILLE_PRODUCTS) then
-
! adjoint simulations: backward/reconstructed wavefield
- if( SIMULATION_TYPE == 3 ) call compute_forces_viscoelastic_Dev_sim3(iphase)
+ call compute_forces_viscoelastic_Dev_sim3(iphase)
else
! no optimizations used
! adjoint simulations: backward/reconstructed wavefield
- if( SIMULATION_TYPE == 3 ) &
- call compute_forces_viscoelastic_noDev(iphase,NSPEC_AB,NGLOB_AB, &
+ call compute_forces_viscoelastic_noDev(iphase,NSPEC_AB,NGLOB_AB, &
b_displ,b_veloc,b_accel, &
xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
hprime_xx,hprime_yy,hprime_zz, &
@@ -356,8 +362,8 @@
b_dsdx_top,b_dsdx_bot, &
ispec2D_moho_top,ispec2D_moho_bot, &
num_phase_ispec_elastic,nspec_inner_elastic,nspec_outer_elastic, &
- phase_ispec_inner_elastic,.true.,accel_interface,ACOUSTIC_SIMULATION)
-
+ phase_ispec_inner_elastic,.true., &
+ accel_interface,ACOUSTIC_SIMULATION)
endif
@@ -377,8 +383,7 @@
if( ACOUSTIC_SIMULATION ) then
if( num_coupling_ac_el_faces > 0 ) then
! adjoint simulations
- if( SIMULATION_TYPE == 3 ) &
- call compute_coupling_viscoelastic_ac(NSPEC_ADJOINT,NGLOB_ADJOINT, &
+ call compute_coupling_viscoelastic_ac(NSPEC_ADJOINT,NGLOB_ADJOINT, &
ibool,b_accel,b_potential_dot_dot_acoustic, &
num_coupling_ac_el_faces, &
coupling_ac_el_ispec,coupling_ac_el_ijk, &
@@ -386,7 +391,8 @@
coupling_ac_el_jacobian2Dw, &
ispec_is_inner,phase_is_inner,&
PML_CONDITIONS,is_CPML,potential_dot_dot_acoustic_interface,&
- SIMULATION_TYPE,.true.,accel_interface,&
+ SIMULATION_TYPE,.true., &
+ accel_interface,&
rmemory_coupling_el_ac_potential,spec_to_CPML,&
potential_acoustic,potential_dot_acoustic)
@@ -400,7 +406,7 @@
endif
! adds source term (single-force/moment-tensor solution)
- if (.not. OLD_TEST_TO_FIX_ONE_DAY) call compute_add_sources_viscoelastic_bpwf( NSPEC_AB,NGLOB_AB, &
+ call compute_add_sources_viscoelastic_bpwf( NSPEC_AB,NGLOB_AB, &
ibool,ispec_is_inner,phase_is_inner, &
NSOURCES,myrank,it,islice_selected_source,ispec_selected_source,&
hdur,hdur_gaussian,tshift_src,dt,t0,sourcearrays, &
@@ -411,45 +417,39 @@
if( phase_is_inner .eqv. .false. ) then
! sends accel values to corresponding MPI interface neighbors
! adjoint simulations
- if( SIMULATION_TYPE == 3 ) then
- call assemble_MPI_vector_ext_mesh_s(NPROC,NGLOB_ADJOINT,b_accel, &
+ call assemble_MPI_vector_async_send(NPROC,NGLOB_ADJOINT,b_accel, &
b_buffer_send_vector_ext_mesh,b_buffer_recv_vector_ext_mesh, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
my_neighbours_ext_mesh, &
b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh)
- endif !adjoint
-
else
! waits for send/receive requests to be completed and assembles values
! adjoint simulations
- if( SIMULATION_TYPE == 3 ) then
- call assemble_MPI_vector_ext_mesh_w_ordered(NPROC,NGLOB_ADJOINT,b_accel, &
+ call assemble_MPI_vector_async_w_ord(NPROC,NGLOB_ADJOINT,b_accel, &
b_buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,&
max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh, &
my_neighbours_ext_mesh,myrank)
- endif !adjoint
-
endif
enddo
! multiplies with inverse of mass matrix (note: rmass has been inverted already)
! adjoint simulations
- if (SIMULATION_TYPE == 3) then
- b_accel(1,:) = b_accel(1,:)*rmassx(:)
- b_accel(2,:) = b_accel(2,:)*rmassy(:)
- b_accel(3,:) = b_accel(3,:)*rmassz(:)
- endif !adjoint
+ b_accel(1,:) = b_accel(1,:)*rmassx(:)
+ b_accel(2,:) = b_accel(2,:)*rmassy(:)
+ b_accel(3,:) = b_accel(3,:)*rmassz(:)
! updates acceleration with ocean load term
if(APPROXIMATE_OCEAN_LOAD) then
call compute_coupling_ocean_bpwf(NSPEC_AB,NGLOB_AB, &
- ibool,rmassx,rmassy,rmassz,rmass_ocean_load, &
+ ibool,rmassx,rmassy,rmassz, &
+ rmass_ocean_load, &
free_surface_normal,free_surface_ijk,free_surface_ispec, &
- num_free_surface_faces,SIMULATION_TYPE, &
+ num_free_surface_faces, &
+ SIMULATION_TYPE, &
NGLOB_ADJOINT,b_accel)
endif
@@ -471,9 +471,10 @@
! corrector:
! updates the velocity term which requires a(t+delta)
! adjoint simulations
- if (SIMULATION_TYPE == 3) b_veloc(:,:) = b_veloc(:,:) + b_deltatover2*b_accel(:,:)
+ b_veloc(:,:) = b_veloc(:,:) + b_deltatover2*b_accel(:,:)
end subroutine compute_forces_viscoelastic_bpwf
+
!
!-------------------------------------------------------------------------------------------------
!
@@ -488,9 +489,7 @@
! forward simulations
use specfem_par
- use specfem_par_acoustic
use specfem_par_elastic
- use specfem_par_poroelastic
implicit none
@@ -583,9 +582,7 @@
! uses backward/reconstructed displacement and acceleration arrays
use specfem_par
- use specfem_par_acoustic
use specfem_par_elastic
- use specfem_par_poroelastic
implicit none
@@ -650,7 +647,11 @@
integer:: iphase
logical:: phase_is_inner
-! distinguishes two runs: for points on MPI interfaces, and points within the partitions
+ ! check
+ if( PML_CONDITIONS ) &
+ call exit_MPI(myrank,'PML conditions not yet implemented for routine compute_forces_viscoelastic_GPU()')
+
+ ! distinguishes two runs: for points on MPI interfaces, and points within the partitions
do iphase=1,2
!first for points on MPI interfaces
@@ -660,21 +661,20 @@
phase_is_inner = .true.
endif
-! elastic term
- ! contains both forward SIM_TYPE==1 and backward SIM_TYPE==3 simulations
- call compute_forces_viscoelastic_cuda(Mesh_pointer, iphase, deltat, &
- nspec_outer_elastic, &
- nspec_inner_elastic, &
- COMPUTE_AND_STORE_STRAIN,ATTENUATION,ANISOTROPY)
+ ! elastic term
+ ! contains both forward SIM_TYPE==1 and backward SIM_TYPE==3 simulations
+ call compute_forces_viscoelastic_cuda(Mesh_pointer, iphase, deltat, &
+ nspec_outer_elastic, &
+ nspec_inner_elastic, &
+ COMPUTE_AND_STORE_STRAIN,ATTENUATION,ANISOTROPY)
- if(phase_is_inner .eqv. .true.) then
- ! while Inner elements compute "Kernel_2", we wait for MPI to
- ! finish and transfer the boundary terms to the device
- ! asynchronously
-
+ ! while inner elements compute "Kernel_2", we wait for MPI to
+ ! finish and transfer the boundary terms to the device asynchronously
+ if(phase_is_inner .eqv. .true.) then
!daniel: todo - this avoids calling the fortran vector send from CUDA routine
! wait for asynchronous copy to finish
call sync_copy_from_device(Mesh_pointer,iphase,buffer_send_vector_ext_mesh)
+
! sends mpi buffers
call assemble_MPI_vector_send_cuda(NPROC, &
buffer_send_vector_ext_mesh,buffer_recv_vector_ext_mesh, &
@@ -687,28 +687,35 @@
call transfer_boundary_to_device(NPROC,Mesh_pointer,buffer_recv_vector_ext_mesh, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
request_recv_vector_ext_mesh)
- endif ! inner elements
+ endif ! inner elements
-! adds elastic absorbing boundary term to acceleration (Stacey conditions)
+ ! adds elastic absorbing boundary term to acceleration (Stacey conditions)
if( STACEY_ABSORBING_CONDITIONS ) then
- call compute_stacey_viscoelastic_GPU(phase_is_inner,num_abs_boundary_faces, &
- SIMULATION_TYPE,SAVE_FORWARD,NSTEP,it, &
- b_num_abs_boundary_faces,b_reclen_field,b_absorb_field, &
- Mesh_pointer,it_dsm,Veloc_dsm_boundary,Tract_dsm_boundary)
+ call compute_stacey_viscoelastic_GPU(phase_is_inner,num_abs_boundary_faces, &
+ SIMULATION_TYPE,SAVE_FORWARD,NSTEP,it, &
+ b_num_abs_boundary_faces,b_reclen_field,b_absorb_field, &
+ Mesh_pointer, &
+ it_dsm,Veloc_dsm_boundary,Tract_dsm_boundary)
endif
-
-! acoustic coupling
+ ! acoustic coupling
if( ACOUSTIC_SIMULATION ) then
if( num_coupling_ac_el_faces > 0 ) then
call compute_coupling_el_ac_cuda(Mesh_pointer,phase_is_inner, &
num_coupling_ac_el_faces)
- endif ! num_coupling_ac_el_faces
+ endif
endif
+ ! poroelastic coupling
+ if( POROELASTIC_SIMULATION ) then
+ ! note:
+ ! these routines are not implemented as CUDA kernels, we just transfer the fields
+ ! from the GPU to the CPU and vice versa
-! poroelastic coupling
- if( POROELASTIC_SIMULATION ) then
+ ! transfers displacement & acceleration to the CPU
+ call transfer_displ_from_device(NDIM*NGLOB_AB,displ, Mesh_pointer)
+ call transfer_accel_from_device(NDIM*NGLOB_AB,accel, Mesh_pointer)
+
call compute_coupling_viscoelastic_po(NSPEC_AB,NGLOB_AB,ibool,&
displs_poroelastic,displw_poroelastic,&
xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
@@ -728,94 +735,105 @@
coupling_el_po_normal, &
coupling_el_po_jacobian2Dw, &
ispec_is_inner,phase_is_inner)
+
+ ! transfers acceleration back to GPU
+ call transfer_accel_to_device(NDIM*NGLOB_AB,accel, Mesh_pointer)
endif
-! adds source term (single-force/moment-tensor solution)
- if (.not. OLD_TEST_TO_FIX_ONE_DAY) call compute_add_sources_viscoelastic_GPU(NSPEC_AB, &
+ ! adds source term (single-force/moment-tensor solution)
+ call compute_add_sources_viscoelastic_GPU(NSPEC_AB, &
ispec_is_inner,phase_is_inner,NSOURCES,myrank,it,&
hdur,hdur_gaussian,tshift_src,dt,t0, &
ispec_is_elastic,SIMULATION_TYPE,NSTEP, &
nrec,islice_selected_rec,ispec_selected_rec, &
nadj_rec_local,adj_sourcearrays, &
- NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY,Mesh_pointer)
+ NTSTEP_BETWEEN_READ_ADJSRC,NOISE_TOMOGRAPHY, &
+ Mesh_pointer)
! assemble all the contributions between slices using MPI
if( phase_is_inner .eqv. .false. ) then
- ! sends accel values to corresponding MPI interface neighbors
- ! GPU_MODE == 1
- ! transfers boundary region to host asynchronously. The
- ! MPI-send is done from within compute_forces_viscoelastic_cuda,
- ! once the inner element kernels are launched, and the
- ! memcpy has finished. see compute_forces_viscoelastic_cuda:1655
- call transfer_boundary_from_device_a(Mesh_pointer,nspec_outer_elastic)
+ ! sends accel values to corresponding MPI interface neighbors
- ! adjoint simulations
- if( SIMULATION_TYPE == 3 ) then
- call transfer_boun_accel_from_device(NGLOB_AB*NDIM, Mesh_pointer, b_accel,&
- b_buffer_send_vector_ext_mesh,&
- num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh,&
- nibool_interfaces_ext_mesh, ibool_interfaces_ext_mesh,3) ! <-- 3 == adjoint b_accel
- call assemble_MPI_vector_send_cuda(NPROC, &
- b_buffer_send_vector_ext_mesh,b_buffer_recv_vector_ext_mesh, &
- num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
- nibool_interfaces_ext_mesh,&
- my_neighbours_ext_mesh, &
- b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh)
- endif !adjoint
+ ! transfers boundary region to host asynchronously. The
+ ! MPI-send is done from within compute_forces_viscoelastic_cuda,
+ ! once the inner element kernels are launched, and the
+ ! memcpy has finished. see compute_forces_viscoelastic_cuda: ~ line 1655
+ call transfer_boundary_from_device_a(Mesh_pointer,nspec_outer_elastic)
+ ! adjoint simulations
+ if( SIMULATION_TYPE == 3 ) then
+ call transfer_boun_accel_from_device(Mesh_pointer, b_accel,&
+ b_buffer_send_vector_ext_mesh,&
+ 3) ! <-- 3 == adjoint b_accel
+ call assemble_MPI_vector_send_cuda(NPROC, &
+ b_buffer_send_vector_ext_mesh,b_buffer_recv_vector_ext_mesh, &
+ num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+ nibool_interfaces_ext_mesh,&
+ my_neighbours_ext_mesh, &
+ b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh)
+ endif !adjoint
+
else
! waits for send/receive requests to be completed and assembles values
- ! GPU_MODE == 1
call assemble_MPI_vector_write_cuda(NPROC,NGLOB_AB,accel, Mesh_pointer,&
buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,&
max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
- request_send_vector_ext_mesh,request_recv_vector_ext_mesh,1)
+ request_send_vector_ext_mesh,request_recv_vector_ext_mesh, &
+ 1)
! adjoint simulations
if( SIMULATION_TYPE == 3 ) then
- call assemble_MPI_vector_write_cuda(NPROC,NGLOB_AB,b_accel, Mesh_pointer,&
+ call assemble_MPI_vector_write_cuda(NPROC,NGLOB_AB,b_accel, Mesh_pointer,&
b_buffer_recv_vector_ext_mesh,num_interfaces_ext_mesh,&
max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
- b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh,3)
+ b_request_send_vector_ext_mesh,b_request_recv_vector_ext_mesh, &
+ 3)
endif !adjoint
-
endif
- enddo
+ enddo
!Percy , Fault boundary term B*tau is added to the assembled forces
! which at this point are stored in the array 'accel'
- if (SIMULATION_TYPE_DYN) call bc_dynflt_set3d_all(accel,veloc,displ)
+ if (SIMULATION_TYPE_DYN .or. SIMULATION_TYPE_KIN ) then
+ ! transfers wavefields to the CPU
+ call transfer_fields_el_from_device(NDIM*NGLOB_AB,displ,veloc,accel, Mesh_pointer)
- if (SIMULATION_TYPE_KIN) call bc_kinflt_set_all(accel,veloc,displ)
+ ! adds dynamic source
+ if (SIMULATION_TYPE_DYN) call bc_dynflt_set3d_all(accel,veloc,displ)
+ if (SIMULATION_TYPE_KIN) call bc_kinflt_set_all(accel,veloc,displ)
+ ! transfers acceleration back to GPU
+ call transfer_accel_to_device(NDIM*NGLOB_AB,accel, Mesh_pointer)
+ endif
+
! multiplies with inverse of mass matrix (note: rmass has been inverted already)
- call kernel_3_a_cuda(Mesh_pointer, NGLOB_AB, deltatover2,b_deltatover2,APPROXIMATE_OCEAN_LOAD)
+ call kernel_3_a_cuda(Mesh_pointer,deltatover2,b_deltatover2,APPROXIMATE_OCEAN_LOAD)
! updates acceleration with ocean load term
if(APPROXIMATE_OCEAN_LOAD) then
call compute_coupling_ocean_cuda(Mesh_pointer)
+
+ ! updates velocities
+ ! Newmark finite-difference time scheme with elastic domains:
+ ! (see e.g. Hughes, 1987; Chaljub et al., 2003)
+ !
+ ! u(t+delta_t) = u(t) + delta_t v(t) + 1/2 delta_t**2 a(t)
+ ! v(t+delta_t) = v(t) + 1/2 delta_t a(t) + 1/2 delta_t a(t+delta_t)
+ ! a(t+delta_t) = 1/M_elastic ( -K_elastic u(t+delta) + B_elastic chi_dot_dot(t+delta_t) + f( t+delta_t) )
+ !
+ ! where
+ ! u, v, a are displacement,velocity & acceleration
+ ! M is mass matrix, K stiffness matrix and B boundary term for acoustic/elastic domains
+ ! f denotes a source term (acoustic/elastic)
+ ! chi_dot_dot is acoustic (fluid) potential ( dotted twice with respect to time)
+ !
+ ! corrector:
+ ! updates the velocity term which requires a(t+delta)
+ ! GPU_MODE: this is handled in 'kernel_3' at the same time as accel*rmass
+ call kernel_3_b_cuda(Mesh_pointer,deltatover2,b_deltatover2)
endif
-! updates velocities
-! Newmark finite-difference time scheme with elastic domains:
-! (see e.g. Hughes, 1987; Chaljub et al., 2003)
-!
-! u(t+delta_t) = u(t) + delta_t v(t) + 1/2 delta_t**2 a(t)
-! v(t+delta_t) = v(t) + 1/2 delta_t a(t) + 1/2 delta_t a(t+delta_t)
-! a(t+delta_t) = 1/M_elastic ( -K_elastic u(t+delta) + B_elastic chi_dot_dot(t+delta_t) + f( t+delta_t) )
-!
-! where
-! u, v, a are displacement,velocity & acceleration
-! M is mass matrix, K stiffness matrix and B boundary term for acoustic/elastic domains
-! f denotes a source term (acoustic/elastic)
-! chi_dot_dot is acoustic (fluid) potential ( dotted twice with respect to time)
-!
-! corrector:
-! updates the velocity term which requires a(t+delta)
-! GPU_MODE: this is handled in 'kernel_3' at the same time as accel*rmass
- if( APPROXIMATE_OCEAN_LOAD ) call kernel_3_b_cuda(Mesh_pointer, NGLOB_AB, deltatover2,b_deltatover2)
-
end subroutine compute_forces_viscoelastic_GPU
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_acoustic.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_acoustic.f90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_acoustic.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -114,7 +114,7 @@
enddo ! num_abs_boundary_faces
! adjoint simulations: stores absorbed wavefield part
- if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. num_abs_boundary_faces > 0 ) then
+ if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD ) then
! writes out absorbing boundary value only when second phase is running
if( phase_is_inner .eqv. .true. ) then
! uses fortran routine
@@ -172,7 +172,7 @@
if( num_abs_boundary_faces == 0 ) return
! adjoint simulations:
- if (SIMULATION_TYPE == 3 .and. num_abs_boundary_faces > 0) then
+ if( SIMULATION_TYPE == 3 ) then
! reads in absorbing boundary array when first phase is running
if( phase_is_inner .eqv. .false. ) then
! note: the index NSTEP-it+1 is valid if b_displ is read in after the Newmark scheme
@@ -252,7 +252,7 @@
if( num_abs_boundary_faces == 0 ) return
! adjoint simulations:
- if (SIMULATION_TYPE == 3 .and. num_abs_boundary_faces > 0) then
+ if( SIMULATION_TYPE == 3 ) then
! reads in absorbing boundary array when first phase is running
if( phase_is_inner .eqv. .false. ) then
! note: the index NSTEP-it+1 is valid if b_displ is read in after the Newmark scheme
@@ -266,12 +266,10 @@
endif !adjoint
! absorbs absorbing-boundary surface using Sommerfeld condition (vanishing field in the outer-space)
- if( num_abs_boundary_faces > 0 ) &
- call compute_stacey_acoustic_cuda(Mesh_pointer, phase_is_inner, &
- SAVE_FORWARD,b_absorb_potential)
+ call compute_stacey_acoustic_cuda(Mesh_pointer, phase_is_inner,b_absorb_potential)
! adjoint simulations: stores absorbed wavefield part
- if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. num_abs_boundary_faces > 0 ) then
+ if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD ) then
! writes out absorbing boundary value only when second phase is running
if( phase_is_inner .eqv. .true. ) then
! uses fortran routine
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_poroelastic.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_poroelastic.f90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_poroelastic.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -88,7 +88,7 @@
if( num_abs_boundary_faces == 0 ) return
! adjoint simulations:
- if (SIMULATION_TYPE == 3 .and. num_abs_boundary_faces > 0) then
+ if( SIMULATION_TYPE == 3 ) then
! reads in absorbing boundary array when first phase is running
if( phase_is_inner .eqv. .false. ) then
! note: the index NSTEP-it+1 is valid if b_displ is read in after the Newmark scheme
@@ -191,7 +191,7 @@
enddo
! adjoint simulations: stores absorbed wavefield part
- if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. num_abs_boundary_faces > 0 ) then
+ if( SIMULATION_TYPE == 1 .and. SAVE_FORWARD ) then
! writes out absorbing boundary value only when second phase is running
if( phase_is_inner .eqv. .true. ) then
! uses fortran routine
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_viscoelastic.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_viscoelastic.f90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_stacey_viscoelastic.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -101,20 +101,6 @@
! checks if anything to do
if( num_abs_boundary_faces == 0 ) return
-! adjoint simulations:
- if (SIMULATION_TYPE == 3 .and. num_abs_boundary_faces > 0) then
- ! reads in absorbing boundary array when first phase is running
- if( phase_is_inner .eqv. .false. ) then
- ! note: the index NSTEP-it+1 is valid if b_displ is read in after the Newmark scheme
- ! uses fortran routine
- !read(IOABS,rec=NSTEP-it+1) reclen1,b_absorb_field,reclen2
- !if (reclen1 /= b_reclen_field .or. reclen1 /= reclen2) &
- ! call exit_mpi(0,'Error reading absorbing contribution b_absorb_field')
- ! uses c routine for faster reading
- call read_abs(0,b_absorb_field,b_reclen_field,NSTEP-it+1)
- endif
- endif !adjoint
-
! absorbs absorbing-boundary surface using Stacey condition (Clayton & Enquist)
do iface=1,num_abs_boundary_faces
@@ -182,7 +168,7 @@
enddo
! adjoint simulations: stores absorbed wavefield part
- if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. num_abs_boundary_faces > 0 ) then
+ if( SIMULATION_TYPE == 1 .and. SAVE_FORWARD ) then
! writes out absorbing boundary value only when second phase is running
if( phase_is_inner .eqv. .true. ) then
! uses fortran routine
@@ -214,6 +200,8 @@
NSTEP,it,NGLOB_ADJOINT,b_accel, &
b_num_abs_boundary_faces,b_reclen_field,b_absorb_field)
+ use specfem_par,only: myrank
+
implicit none
include "constants.h"
@@ -244,51 +232,48 @@
! local parameters
integer :: ispec,iglob,i,j,k,iface,igll
+ ! checks
+ if (SIMULATION_TYPE /= 3 ) &
+ call exit_MPI(myrank,'error calling routine compute_stacey_viscoelastic_bpwf() with wrong SIMULATION_TYPE')
+
! checks if anything to do
if( num_abs_boundary_faces == 0 ) return
-! adjoint simulations:
- if (SIMULATION_TYPE == 3 .and. num_abs_boundary_faces > 0) then
- ! reads in absorbing boundary array when first phase is running
- if( phase_is_inner .eqv. .false. ) then
- ! note: the index NSTEP-it+1 is valid if b_displ is read in after the Newmark scheme
- ! uses fortran routine
- !read(IOABS,rec=NSTEP-it+1) reclen1,b_absorb_field,reclen2
- !if (reclen1 /= b_reclen_field .or. reclen1 /= reclen2) &
- ! call exit_mpi(0,'Error reading absorbing contribution b_absorb_field')
- ! uses c routine for faster reading
- call read_abs(0,b_absorb_field,b_reclen_field,NSTEP-it+1)
- endif
- endif !adjoint
+ ! adjoint simulations:
+ ! reads in absorbing boundary array when first phase is running
+ if( phase_is_inner .eqv. .false. ) then
+ ! note: the index NSTEP-it+1 is valid if b_displ is read in after the Newmark scheme
+ ! uses fortran routine
+ !read(IOABS,rec=NSTEP-it+1) reclen1,b_absorb_field,reclen2
+ !if (reclen1 /= b_reclen_field .or. reclen1 /= reclen2) &
+ ! call exit_mpi(0,'Error reading absorbing contribution b_absorb_field')
+ ! uses c routine for faster reading
+ call read_abs(0,b_absorb_field,b_reclen_field,NSTEP-it+1)
+ endif
! absorbs absorbing-boundary surface using Stacey condition (Clayton & Enquist)
do iface=1,num_abs_boundary_faces
- ispec = abs_boundary_ispec(iface)
+ ispec = abs_boundary_ispec(iface)
- if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
+ if (ispec_is_inner(ispec) .eqv. phase_is_inner) then
- if( ispec_is_elastic(ispec) ) then
+ if( ispec_is_elastic(ispec) ) then
+ ! reference gll points on boundary face
+ do igll = 1,NGLLSQUARE
+ ! gets local indices for GLL point
+ i = abs_boundary_ijk(1,igll,iface)
+ j = abs_boundary_ijk(2,igll,iface)
+ k = abs_boundary_ijk(3,igll,iface)
- ! reference gll points on boundary face
- do igll = 1,NGLLSQUARE
+ ! gets velocity
+ iglob=ibool(i,j,k,ispec)
- ! gets local indices for GLL point
- i = abs_boundary_ijk(1,igll,iface)
- j = abs_boundary_ijk(2,igll,iface)
- k = abs_boundary_ijk(3,igll,iface)
-
- ! gets velocity
- iglob=ibool(i,j,k,ispec)
-
- ! adjoint simulations
- if (SIMULATION_TYPE == 3) then
- b_accel(:,iglob) = b_accel(:,iglob) - b_absorb_field(:,igll,iface)
- endif !adjoint
-
- enddo
- endif ! ispec_is_elastic
- endif ! ispec_is_inner
+ ! adjoint simulations
+ b_accel(:,iglob) = b_accel(:,iglob) - b_absorb_field(:,igll,iface)
+ enddo
+ endif ! ispec_is_elastic
+ endif ! ispec_is_inner
enddo
end subroutine compute_stacey_viscoelastic_bpwf
@@ -385,7 +370,7 @@
if( num_abs_boundary_faces == 0 ) return
! adjoint simulations:
- if (SIMULATION_TYPE == 3 .and. num_abs_boundary_faces > 0) then
+ if( SIMULATION_TYPE == 3 ) then
! reads in absorbing boundary array when first phase is running
if( phase_is_inner .eqv. .false. ) then
! note: the index NSTEP-it+1 is valid if b_displ is read in after the Newmark scheme
@@ -398,12 +383,10 @@
endif
endif !adjoint
- if( num_abs_boundary_faces > 0 ) &
- call compute_stacey_viscoelastic_cuda(Mesh_pointer,phase_is_inner, &
- SAVE_FORWARD,b_absorb_field)
+ call compute_stacey_viscoelastic_cuda(Mesh_pointer,phase_is_inner,b_absorb_field)
! adjoint simulations: stores absorbed wavefield part
- if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. num_abs_boundary_faces > 0 ) then
+ if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD ) then
! writes out absorbing boundary value only when second phase is running
if( phase_is_inner .eqv. .true. ) then
! uses fortran routine
Added: seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_total_energy.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_total_energy.f90 (rev 0)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/compute_total_energy.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -0,0 +1,325 @@
+!=====================================================================
+!
+! S p e c f e m 3 D V e r s i o n 2 . 1
+! ---------------------------------------
+!
+! Main authors: Dimitri Komatitsch and Jeroen Tromp
+! Princeton University, USA and CNRS / INRIA / University of Pau
+! (c) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau
+! July 2012
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+
+ subroutine compute_total_energy()
+
+! computes kinetic, potential and total energy
+! in all the slices using an MPI reduction
+! and output that to an energy file
+
+ use specfem_par
+ use specfem_par_elastic
+ use specfem_par_acoustic
+ use pml_par
+
+ implicit none
+
+! local variables
+ integer :: i,j,k,l,ispec,iglob
+
+ real(kind=CUSTOM_REAL) :: xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
+ real(kind=CUSTOM_REAL) :: duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
+
+ real(kind=CUSTOM_REAL) :: duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
+ real(kind=CUSTOM_REAL) :: duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
+
+ real(kind=CUSTOM_REAL) :: epsilon_xx,epsilon_yy,epsilon_zz,epsilon_xy,epsilon_xz,epsilon_yz,epsilon_yx,epsilon_zx,epsilon_zy
+ real(kind=CUSTOM_REAL) :: sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
+ real(kind=CUSTOM_REAL) :: vx,vy,vz,pressure
+
+ real(kind=CUSTOM_REAL) :: hp1,hp2,hp3
+
+ real(kind=CUSTOM_REAL) :: lambdal,mul,lambdalplus2mul,rhol,cpl
+ real(kind=CUSTOM_REAL) :: kappal
+
+ real(kind=CUSTOM_REAL) :: integration_weight
+ double precision :: kinetic_energy,potential_energy
+ double precision :: kinetic_energy_glob,potential_energy_glob,total_energy_glob
+
+! local parameters
+ real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc
+ real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
+ tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
+
+ kinetic_energy = 0.d0
+ potential_energy = 0.d0
+
+ if(ANISOTROPY .or. ATTENUATION) &
+ call exit_MPI(myrank,'calculation of total energy currently implemented only for media with no anisotropy and no attenuation')
+
+! loop over spectral elements
+ do ispec = 1,NSPEC_AB
+
+! if element is a CPML then do not compute energy in it, since it is non physical;
+! thus, we compute energy in the main domain only, without absorbing elements
+ if(PML_CONDITIONS) then
+ ! do not merge this second line with the first using an ".and." statement
+ ! because array is_CPML() is unallocated when PML_CONDITIONS is false
+ if(is_CPML(ispec)) cycle
+ endif
+
+ !---
+ !--- elastic spectral element
+ !---
+ if(ispec_is_elastic(ispec)) then
+
+ do k=1,NGLLZ
+ do j=1,NGLLY
+ do i=1,NGLLX
+ iglob = ibool(i,j,k,ispec)
+ dummyx_loc(i,j,k) = displ(1,iglob)
+ dummyy_loc(i,j,k) = displ(2,iglob)
+ dummyz_loc(i,j,k) = displ(3,iglob)
+ enddo
+ enddo
+ enddo
+
+ do k=1,NGLLZ
+ do j=1,NGLLY
+ do i=1,NGLLX
+
+ iglob = ibool(i,j,k,ispec)
+
+ tempx1(i,j,k) = 0._CUSTOM_REAL
+ tempx2(i,j,k) = 0._CUSTOM_REAL
+ tempx3(i,j,k) = 0._CUSTOM_REAL
+
+ tempy1(i,j,k) = 0._CUSTOM_REAL
+ tempy2(i,j,k) = 0._CUSTOM_REAL
+ tempy3(i,j,k) = 0._CUSTOM_REAL
+
+ tempz1(i,j,k) = 0._CUSTOM_REAL
+ tempz2(i,j,k) = 0._CUSTOM_REAL
+ tempz3(i,j,k) = 0._CUSTOM_REAL
+
+ do l=1,NGLLX
+ hp1 = hprime_xx(i,l)
+ tempx1(i,j,k) = tempx1(i,j,k) + dummyx_loc(l,j,k)*hp1
+ tempy1(i,j,k) = tempy1(i,j,k) + dummyy_loc(l,j,k)*hp1
+ tempz1(i,j,k) = tempz1(i,j,k) + dummyz_loc(l,j,k)*hp1
+
+ !!! can merge these loops because NGLLX = NGLLY = NGLLZ
+ hp2 = hprime_yy(j,l)
+ tempx2(i,j,k) = tempx2(i,j,k) + dummyx_loc(i,l,k)*hp2
+ tempy2(i,j,k) = tempy2(i,j,k) + dummyy_loc(i,l,k)*hp2
+ tempz2(i,j,k) = tempz2(i,j,k) + dummyz_loc(i,l,k)*hp2
+
+ !!! can merge these loops because NGLLX = NGLLY = NGLLZ
+ hp3 = hprime_zz(k,l)
+ tempx3(i,j,k) = tempx3(i,j,k) + dummyx_loc(i,j,l)*hp3
+ tempy3(i,j,k) = tempy3(i,j,k) + dummyy_loc(i,j,l)*hp3
+ tempz3(i,j,k) = tempz3(i,j,k) + dummyz_loc(i,j,l)*hp3
+ enddo
+
+ ! get derivatives of ux, uy and uz with respect to x, y and z
+ xixl = xix(i,j,k,ispec)
+ xiyl = xiy(i,j,k,ispec)
+ xizl = xiz(i,j,k,ispec)
+ etaxl = etax(i,j,k,ispec)
+ etayl = etay(i,j,k,ispec)
+ etazl = etaz(i,j,k,ispec)
+ gammaxl = gammax(i,j,k,ispec)
+ gammayl = gammay(i,j,k,ispec)
+ gammazl = gammaz(i,j,k,ispec)
+ jacobianl = jacobian(i,j,k,ispec)
+
+ duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
+ duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
+ duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
+
+ duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
+ duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
+ duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
+
+ duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
+ duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
+ duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
+
+ ! precompute some sums to save CPU time
+ duxdxl_plus_duydyl = duxdxl + duydyl
+ duxdxl_plus_duzdzl = duxdxl + duzdzl
+ duydyl_plus_duzdzl = duydyl + duzdzl
+ duxdyl_plus_duydxl = duxdyl + duydxl
+ duzdxl_plus_duxdzl = duzdxl + duxdzl
+ duzdyl_plus_duydzl = duzdyl + duydzl
+
+ ! compute the strain
+ epsilon_xx = duxdxl
+ epsilon_yy = duydyl
+ epsilon_zz = duzdzl
+ epsilon_xy = 0.5 * duxdyl_plus_duydxl
+ epsilon_xz = 0.5 * duzdxl_plus_duxdzl
+ epsilon_yz = 0.5 * duzdyl_plus_duydzl
+
+ ! define symmetric components of epsilon
+ epsilon_yx = epsilon_xy
+ epsilon_zx = epsilon_xz
+ epsilon_zy = epsilon_yz
+
+ kappal = kappastore(i,j,k,ispec)
+ mul = mustore(i,j,k,ispec)
+ rhol = rhostore(i,j,k,ispec)
+
+ ! isotropic case
+ lambdalplus2mul = kappal + FOUR_THIRDS * mul
+ lambdal = lambdalplus2mul - 2.*mul
+
+ ! compute stress sigma
+ sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
+ sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
+ sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
+
+ sigma_xy = mul*duxdyl_plus_duydxl
+ sigma_xz = mul*duzdxl_plus_duxdzl
+ sigma_yz = mul*duzdyl_plus_duydzl
+
+ ! define symmetric components of sigma
+ sigma_yx = sigma_xy
+ sigma_zx = sigma_xz
+ sigma_zy = sigma_yz
+
+ integration_weight = wxgll(i)*wygll(j)*wzgll(k)*jacobianl
+
+ ! compute kinetic energy 1/2 rho ||v||^2
+ kinetic_energy = kinetic_energy + integration_weight * rhol*(veloc(1,iglob)**2 + &
+ veloc(2,iglob)**2 + veloc(3,iglob)**2) / 2.
+
+ ! compute potential energy 1/2 sigma_ij epsilon_ij
+ potential_energy = potential_energy + integration_weight * &
+ (sigma_xx*epsilon_xx + sigma_xy*epsilon_xy + sigma_xz*epsilon_xz + &
+ sigma_yx*epsilon_yx + sigma_yy*epsilon_yy + sigma_yz*epsilon_yz + &
+ sigma_zx*epsilon_zx + sigma_zy*epsilon_zy + sigma_zz*epsilon_zz) / 2.
+
+ enddo
+ enddo
+ enddo
+
+ !---
+ !--- acoustic spectral element
+ !---
+ else if(ispec_is_acoustic(ispec)) then
+
+ ! for the definition of potential energy in an acoustic fluid, see for instance
+ ! equation (23) of M. Maess et al., Journal of Sound and Vibration 296 (2006) 264-276
+
+ ! in case of an acoustic medium, a potential Chi of (density * displacement) is used as in Chaljub and Valette,
+ ! Geophysical Journal International, vol. 158, p. 131-141 (2004) and *NOT* a velocity potential
+ ! as in Komatitsch and Tromp, Geophysical Journal International, vol. 150, p. 303-318 (2002).
+ ! This permits acoustic-elastic coupling based on a non-iterative time scheme.
+ ! Displacement is then: u = grad(Chi) / rho
+ ! Velocity is then: v = grad(Chi_dot) / rho (Chi_dot being the time derivative of Chi)
+ ! and pressure is: p = - Chi_dot_dot (Chi_dot_dot being the time second derivative of Chi).
+
+ do k=1,NGLLZ
+ do j=1,NGLLY
+ do i=1,NGLLX
+ iglob = ibool(i,j,k,ispec)
+ dummyx_loc(i,j,k) = potential_dot_acoustic(iglob)
+ enddo
+ enddo
+ enddo
+
+ do k=1,NGLLZ
+ do j=1,NGLLY
+ do i=1,NGLLX
+
+ iglob = ibool(i,j,k,ispec)
+
+ tempx1(i,j,k) = 0._CUSTOM_REAL
+ tempx2(i,j,k) = 0._CUSTOM_REAL
+ tempx3(i,j,k) = 0._CUSTOM_REAL
+
+ do l=1,NGLLX
+ hp1 = hprime_xx(i,l)
+ tempx1(i,j,k) = tempx1(i,j,k) + dummyx_loc(l,j,k)*hp1
+
+ !!! can merge these loops because NGLLX = NGLLY = NGLLZ
+ hp2 = hprime_yy(j,l)
+ tempx2(i,j,k) = tempx2(i,j,k) + dummyx_loc(i,l,k)*hp2
+
+ !!! can merge these loops because NGLLX = NGLLY = NGLLZ
+ hp3 = hprime_zz(k,l)
+ tempx3(i,j,k) = tempx3(i,j,k) + dummyx_loc(i,j,l)*hp3
+ enddo
+
+ ! get derivatives of ux, uy and uz with respect to x, y and z
+ xixl = xix(i,j,k,ispec)
+ xiyl = xiy(i,j,k,ispec)
+ xizl = xiz(i,j,k,ispec)
+ etaxl = etax(i,j,k,ispec)
+ etayl = etay(i,j,k,ispec)
+ etazl = etaz(i,j,k,ispec)
+ gammaxl = gammax(i,j,k,ispec)
+ gammayl = gammay(i,j,k,ispec)
+ gammazl = gammaz(i,j,k,ispec)
+ jacobianl = jacobian(i,j,k,ispec)
+
+ duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
+ duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
+ duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
+
+ rhol = rhostore(i,j,k,ispec)
+ kappal = kappastore(i,j,k,ispec)
+ cpl = sqrt(kappal / rhol)
+
+ ! Velocity is v = grad(Chi_dot) / rho (Chi_dot being the time derivative of Chi)
+ vx = duxdxl / rhol
+ vy = duxdyl / rhol
+ vz = duxdzl / rhol
+
+ ! pressure is p = - Chi_dot_dot (Chi_dot_dot being the time second derivative of Chi)
+ pressure = - potential_dot_dot_acoustic(iglob)
+
+ integration_weight = wxgll(i)*wygll(j)*wzgll(k)*jacobianl
+
+ ! compute kinetic energy 1/2 rho ||v||^2
+ kinetic_energy = kinetic_energy + integration_weight * rhol*(vx**2 + vy**2 + vz**2) / 2.
+
+ ! compute potential energy 1/2 sigma_ij epsilon_ij
+ potential_energy = potential_energy + integration_weight * pressure**2 / (2. * rhol * cpl**2)
+
+ enddo
+ enddo
+ enddo
+
+ else
+
+ call exit_MPI(myrank,'calculation of total energy implemented for acoustic and (visco)elastic elements only for now')
+
+ endif
+
+ enddo
+
+! compute the total using a reduction between all the processors
+ call sum_all_dp(kinetic_energy,kinetic_energy_glob)
+ call sum_all_dp(potential_energy,potential_energy_glob)
+ total_energy_glob = kinetic_energy_glob + potential_energy_glob
+
+! write the total to disk from the master
+ if(myrank == 0) write(IOUT_ENERGY,*) it,sngl(kinetic_energy_glob),sngl(potential_energy_glob),sngl(total_energy_glob)
+
+ end subroutine compute_total_energy
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/fault_solver_common.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/fault_solver_common.f90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/fault_solver_common.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -150,19 +150,19 @@
tmp_vec = 0._CUSTOM_REAL
if (bc%nspec>0) tmp_vec(1,bc%ibulk1) = bc%B
! assembles with other MPI processes
- call assemble_MPI_vector_ext_mesh(NPROC,NGLOB_AB,tmp_vec, &
- num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
- nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
- my_neighbours_ext_mesh)
+ call assemble_MPI_vector_blocking(NPROC,NGLOB_AB,tmp_vec, &
+ num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+ nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+ my_neighbours_ext_mesh)
if (bc%nspec>0) bc%B = tmp_vec(1,bc%ibulk1)
tmp_vec = 0._CUSTOM_REAL
if (bc%nspec>0) tmp_vec(:,bc%ibulk1) = nxyz
! assembles with other MPI processes
- call assemble_MPI_vector_ext_mesh(NPROC,NGLOB_AB,tmp_vec, &
- num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
- nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
- my_neighbours_ext_mesh)
+ call assemble_MPI_vector_blocking(NPROC,NGLOB_AB,tmp_vec, &
+ num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
+ nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
+ my_neighbours_ext_mesh)
if (bc%nspec>0) nxyz = tmp_vec(:,bc%ibulk1)
endif
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/finalize_simulation.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/finalize_simulation.f90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/finalize_simulation.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -38,6 +38,7 @@
implicit none
integer :: irec_local
+ integer :: ier
! write gravity perturbations
if (GRAVITY_SIMULATION) call gravity_output()
@@ -45,46 +46,51 @@
! save last frame
if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
- open(unit=27,file=prname(1:len_trim(prname))//'save_forward_arrays.bin',&
- status='unknown',form='unformatted')
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'save_forward_arrays.bin',&
+ status='unknown',form='unformatted',iostat=ier)
+ if( ier /= 0 ) then
+ print*,'error: opening save_forward_arrays.bin'
+ print*,'path: ',prname(1:len_trim(prname))//'save_forward_arrays.bin'
+ call exit_mpi(myrank,'error opening file save_forward_arrays.bin')
+ endif
if( ACOUSTIC_SIMULATION ) then
- write(27) potential_acoustic
- write(27) potential_dot_acoustic
- write(27) potential_dot_dot_acoustic
+ write(IOUT) potential_acoustic
+ write(IOUT) potential_dot_acoustic
+ write(IOUT) potential_dot_dot_acoustic
endif
if( ELASTIC_SIMULATION ) then
- write(27) displ
- write(27) veloc
- write(27) accel
+ write(IOUT) displ
+ write(IOUT) veloc
+ write(IOUT) accel
if (ATTENUATION) then
- if(FULL_ATTENUATION_SOLID) write(27) R_trace !ZN
- write(27) R_xx
- write(27) R_yy
- write(27) R_xy
- write(27) R_xz
- write(27) R_yz
- if(FULL_ATTENUATION_SOLID) write(27) epsilondev_trace !ZN
- write(27) epsilondev_xx
- write(27) epsilondev_yy
- write(27) epsilondev_xy
- write(27) epsilondev_xz
- write(27) epsilondev_yz
+ if(FULL_ATTENUATION_SOLID) write(IOUT) R_trace !ZN
+ write(IOUT) R_xx
+ write(IOUT) R_yy
+ write(IOUT) R_xy
+ write(IOUT) R_xz
+ write(IOUT) R_yz
+ if(FULL_ATTENUATION_SOLID) write(IOUT) epsilondev_trace !ZN
+ write(IOUT) epsilondev_xx
+ write(IOUT) epsilondev_yy
+ write(IOUT) epsilondev_xy
+ write(IOUT) epsilondev_xz
+ write(IOUT) epsilondev_yz
endif
endif
if( POROELASTIC_SIMULATION ) then
- write(27) displs_poroelastic
- write(27) velocs_poroelastic
- write(27) accels_poroelastic
- write(27) displw_poroelastic
- write(27) velocw_poroelastic
- write(27) accelw_poroelastic
+ write(IOUT) displs_poroelastic
+ write(IOUT) velocs_poroelastic
+ write(IOUT) accels_poroelastic
+ write(IOUT) displw_poroelastic
+ write(IOUT) velocw_poroelastic
+ write(IOUT) accelw_poroelastic
endif
- close(27)
+ close(IOUT)
! adjoint simulations
else if (SIMULATION_TYPE == 3) then
@@ -123,7 +129,11 @@
do irec_local = 1, nrec_local
write(outputname,'(a,i5.5)') OUTPUT_FILES_PATH(1:len_trim(OUTPUT_FILES_PATH)) // &
'/src_frechet.',number_receiver_global(irec_local)
- open(unit=27,file=trim(outputname),status='unknown')
+ open(unit=IOUT,file=trim(outputname),status='unknown',iostat=ier)
+ if( ier /= 0 ) then
+ print*,'error opening file: ',trim(outputname)
+ call exit_mpi(myrank,'error opening file src_frechet.**')
+ endif
!
! r -> z, theta -> -y, phi -> x
!
@@ -133,16 +143,16 @@
! Mrt = -Myz
! Mrp = Mxz
! Mtp = -Mxy
- write(27,*) Mzz_der(irec_local)
- write(27,*) Myy_der(irec_local)
- write(27,*) Mxx_der(irec_local)
- write(27,*) -Myz_der(irec_local)
- write(27,*) Mxz_der(irec_local)
- write(27,*) -Mxy_der(irec_local)
- write(27,*) sloc_der(1,irec_local)
- write(27,*) sloc_der(2,irec_local)
- write(27,*) sloc_der(3,irec_local)
- close(27)
+ write(IOUT,*) Mzz_der(irec_local)
+ write(IOUT,*) Myy_der(irec_local)
+ write(IOUT,*) Mxx_der(irec_local)
+ write(IOUT,*) -Myz_der(irec_local)
+ write(IOUT,*) Mxz_der(irec_local)
+ write(IOUT,*) -Mxy_der(irec_local)
+ write(IOUT,*) sloc_der(1,irec_local)
+ write(IOUT,*) sloc_der(2,irec_local)
+ write(IOUT,*) sloc_der(3,irec_local)
+ close(IOUT)
enddo
endif
endif
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/initialize_simulation.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/initialize_simulation.f90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/initialize_simulation.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -137,16 +137,16 @@
! reads in numbers of spectral elements and points for the part of the mesh handled by this process
call create_name_database(prname,myrank,LOCAL_PATH)
if (OLD_TEST_TO_FIX_ONE_DAY) call create_name_database(dsmname,myrank,TRAC_PATH) !! VM VM
- open(unit=27,file=prname(1:len_trim(prname))//'external_mesh.bin',status='old',&
+ open(unit=IIN,file=prname(1:len_trim(prname))//'external_mesh.bin',status='old',&
action='read',form='unformatted',iostat=ier)
if( ier /= 0 ) then
print*,'error: could not open database '
print*,'path: ',prname(1:len_trim(prname))//'external_mesh.bin'
call exit_mpi(myrank,'error opening database')
endif
- read(27) NSPEC_AB
- read(27) NGLOB_AB
- close(27)
+ read(IIN) NSPEC_AB
+ read(IIN) NGLOB_AB
+ close(IIN)
! attenuation arrays size
if( ATTENUATION ) then
@@ -403,6 +403,14 @@
NSPEC_BOUN = 1
endif
+ ! transversely isotropic kernel flags
+ if( SIMULATION_TYPE == 3 ) then
+ if( SAVE_TRANSVERSE_KL .eqv. .true. .and. ANISOTROPIC_KL .eqv. .false. ) then
+ call exit_mpi(myrank, &
+ 'for kernel simulations with SAVE_TRANSVERSE_KL set to .true., please also set ANISOTROPIC_KL to .true. in constants.h')
+ endif
+ endif
+
end subroutine initialize_simulation_adjoint
!
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/iterate_time.F90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/iterate_time.F90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/iterate_time.F90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -37,8 +37,8 @@
implicit none
-!---- create a Gnuplot script to display the energy curve in log scale
- if(output_energy .and. myrank == 0) then
+ !---- create a Gnuplot script to display the energy curve in log scale
+ if( OUTPUT_ENERGY .and. myrank == 0) then
open(unit=IOUT_ENERGY,file=trim(OUTPUT_FILES)//'plot_energy.gnu',status='unknown',action='write')
write(IOUT_ENERGY,*) 'set term wxt'
write(IOUT_ENERGY,*) '#set term postscript landscape color solid "Helvetica" 22'
@@ -58,8 +58,9 @@
close(IOUT_ENERGY)
endif
-! open the file in which we will store the energy curve
- if(output_energy .and. myrank == 0) open(unit=IOUT_ENERGY,file=trim(OUTPUT_FILES)//'energy.dat',status='unknown',action='write')
+ ! open the file in which we will store the energy curve
+ if( OUTPUT_ENERGY .and. myrank == 0 ) &
+ open(unit=IOUT_ENERGY,file=trim(OUTPUT_FILES)//'energy.dat',status='unknown',action='write')
!
! s t a r t t i m e i t e r a t i o n s
@@ -94,65 +95,85 @@
do it = 1,NSTEP
! simulation status output and stability check
- if(mod(it,NTSTEP_BETWEEN_OUTPUT_INFO) == 0 .or. it == 5 .or. it == NSTEP) call it_check_stability()
+ if( mod(it,NTSTEP_BETWEEN_OUTPUT_INFO) == 0 .or. it == 5 .or. it == NSTEP ) &
+ call check_stability()
! simulation status output and stability check
- if(output_energy .and. (mod(it,NTSTEP_BETWEEN_OUTPUT_ENERGY) == 0 .or. it == 5 .or. it == NSTEP)) &
- call it_compute_total_energy()
+ if( OUTPUT_ENERGY ) then
+ if( mod(it,NTSTEP_BETWEEN_OUTPUT_ENERGY) == 0 .or. it == 5 .or. it == NSTEP ) &
+ call compute_total_energy()
+ endif
- ! update displacement using Newmark time scheme
- call it_update_displacement_scheme()
+ ! updates wavefields using Newmark time scheme
+ call update_displacement_scheme()
- if(.not. GPU_MODE)then
- if(SIMULATION_TYPE == 3)then
- if(ELASTIC_SIMULATION .and. ACOUSTIC_SIMULATION)then
- if( ELASTIC_SIMULATION ) call compute_forces_viscoelastic()
- if( ACOUSTIC_SIMULATION ) call compute_forces_acoustic()
- else
- if( ACOUSTIC_SIMULATION ) call compute_forces_acoustic()
- if( ELASTIC_SIMULATION ) call compute_forces_viscoelastic()
- endif
- else
+ ! calculates stiffness term
+ if( .not. GPU_MODE )then
+ ! wavefields on CPU
+
+ ! note: the order of the computations for acoustic and elastic domains is crucial for coupled simulations
+ if( SIMULATION_TYPE == 3 ) then
+ ! kernel/adjoint simulations
+
+ ! adjoint wavefields
+ if( ELASTIC_SIMULATION .and. ACOUSTIC_SIMULATION )then
+ ! coupled acoustic-elastic simulations
+ ! 1. elastic domain w/ adjoint wavefields
+ call compute_forces_viscoelastic()
+ ! 2. acoustic domain w/ adjoint wavefields
+ call compute_forces_acoustic()
+ else
+ ! non-coupled simulations
+ ! (purely acoustic or elastic)
if( ACOUSTIC_SIMULATION ) call compute_forces_acoustic()
if( ELASTIC_SIMULATION ) call compute_forces_viscoelastic()
- endif
+ endif
- if(SIMULATION_TYPE == 3)then
- ! acoustic solver
- ! (needs to be done after elastic one)
- if( ACOUSTIC_SIMULATION ) call compute_forces_acoustic_bpwf()
- ! elastic solver
- ! (needs to be done first, before poroelastic one)
- if( ELASTIC_SIMULATION ) call compute_forces_viscoelastic_bpwf()
- endif
+ ! backward/reconstructed wavefields
+ ! acoustic solver
+ ! (needs to be done after elastic one)
+ if( ACOUSTIC_SIMULATION ) call compute_forces_acoustic_bpwf()
+ ! elastic solver
+ ! (needs to be done first, before poroelastic one)
+ if( ELASTIC_SIMULATION ) call compute_forces_viscoelastic_bpwf()
+
+ else
+ ! forward simulations
+
+ ! 1. acoustic domain
+ if( ACOUSTIC_SIMULATION ) call compute_forces_acoustic()
+ ! 2. elastic domain
+ if( ELASTIC_SIMULATION ) call compute_forces_viscoelastic()
+ endif
+
+ ! poroelastic solver
+ if( POROELASTIC_SIMULATION ) call compute_forces_poroelastic()
+
else
- ! acoustic solver
- ! (needs to be done after elastic one)
- if( ACOUSTIC_SIMULATION ) call compute_forces_acoustic_GPU()
- ! elastic solver
- ! (needs to be done first, before poroelastic one)
- if( ELASTIC_SIMULATION ) call compute_forces_viscoelastic_GPU()
+ ! wavefields on GPU
+ ! acoustic solver
+ if( ACOUSTIC_SIMULATION ) call compute_forces_acoustic_GPU()
+ ! elastic solver
+ ! (needs to be done first, before poroelastic one)
+ if( ELASTIC_SIMULATION ) call compute_forces_viscoelastic_GPU()
endif
- ! poroelastic solver
- if( POROELASTIC_SIMULATION ) call compute_forces_poroelastic()
-
! restores last time snapshot saved for backward/reconstruction of wavefields
! note: this must be read in after the Newmark time scheme
if( SIMULATION_TYPE == 3 .and. it == 1 ) then
- call it_read_forward_arrays()
+ call it_read_forward_arrays()
endif
! write the seismograms with time shift (GPU_MODE transfer included)
- if (nrec_local > 0 .or. ( WRITE_SEISMOGRAMS_BY_MASTER .and. myrank == 0 ) ) then
+ if( nrec_local > 0 .or. ( WRITE_SEISMOGRAMS_BY_MASTER .and. myrank == 0 ) ) then
call write_seismograms()
endif
! calculating gravity field at current timestep
- if (GRAVITY_SIMULATION) call gravity_timeseries()
+ if( GRAVITY_SIMULATION ) call gravity_timeseries()
! resetting d/v/a/R/eps for the backward reconstruction with attenuation
- if (ATTENUATION ) then
+ if( ATTENUATION ) then
call it_store_attenuation_arrays()
endif
@@ -167,7 +188,7 @@
endif
! first step of noise tomography, i.e., save a surface movie at every time step
- if ( NOISE_TOMOGRAPHY == 1) then
+ if( NOISE_TOMOGRAPHY == 1 ) then
if( num_free_surface_faces > 0) then
call noise_save_surface_movie(displ,ibool, &
noise_surface_movie,it, &
@@ -185,691 +206,17 @@
call it_print_elapsed_time()
! Transfer fields from GPU card to host for further analysis
- if(GPU_MODE) call it_transfer_from_GPU()
+ if( GPU_MODE ) call it_transfer_from_GPU()
!---- close energy file
- if(output_energy .and. myrank == 0) close(IOUT_ENERGY)
+ if( OUTPUT_ENERGY .and. myrank == 0 ) close(IOUT_ENERGY)
end subroutine iterate_time
!=====================================================================
- subroutine it_check_stability()
-! computes the maximum of the norm of the displacement
-! in all the slices using an MPI reduction
-! and output timestamp file to check that simulation is running fine
-
- use specfem_par
- use specfem_par_elastic
- use specfem_par_poroelastic
- use specfem_par_acoustic
- implicit none
-
- double precision :: tCPU,t_remain,t_total
- integer :: ihours,iminutes,iseconds,int_tCPU, &
- ihours_remain,iminutes_remain,iseconds_remain,int_t_remain, &
- ihours_total,iminutes_total,iseconds_total,int_t_total
-
- ! maximum of the norm of the displacement
- real(kind=CUSTOM_REAL) Usolidnorm,Usolidnorm_all ! elastic
- real(kind=CUSTOM_REAL) Usolidnormp,Usolidnormp_all ! acoustic
- real(kind=CUSTOM_REAL) Usolidnorms,Usolidnorms_all ! solid poroelastic
- real(kind=CUSTOM_REAL) Usolidnormw,Usolidnormw_all ! fluid (w.r.t.s) poroelastic
-
- ! norm of the backward displacement
- real(kind=CUSTOM_REAL) b_Usolidnorm, b_Usolidnorm_all
-
- ! initializes
- Usolidnorm_all = 0.0_CUSTOM_REAL
- Usolidnormp_all = 0.0_CUSTOM_REAL
- Usolidnorms_all = 0.0_CUSTOM_REAL
- Usolidnormw_all = 0.0_CUSTOM_REAL
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-!chris: Rewrite to get norm for each material when coupled simulations
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-! compute maximum of norm of displacement in each slice
- if( ELASTIC_SIMULATION ) then
- if( GPU_MODE) then
- ! way 2: just get maximum of field from GPU
- call get_norm_elastic_from_device(Usolidnorm,Mesh_pointer,1)
- else
- Usolidnorm = maxval(sqrt(displ(1,:)**2 + displ(2,:)**2 + displ(3,:)**2))
- endif
-
- ! check stability of the code, exit if unstable
- ! negative values can occur with some compilers when the unstable value is greater
- ! than the greatest possible floating-point number of the machine
- !if(Usolidnorm > STABILITY_THRESHOLD .or. Usolidnorm < 0.0_CUSTOM_REAL) &
- ! call exit_MPI(myrank,'single forward simulation became unstable and blew up')
-
- ! compute the maximum of the maxima for all the slices using an MPI reduction
- call max_all_cr(Usolidnorm,Usolidnorm_all)
- endif
-
- if( ACOUSTIC_SIMULATION ) then
- if(GPU_MODE) then
- ! way 2: just get maximum of field from GPU
- call get_norm_acoustic_from_device(Usolidnormp,Mesh_pointer,1)
- else
- Usolidnormp = maxval(abs(potential_dot_dot_acoustic(:)))
- endif
-
- ! compute the maximum of the maxima for all the slices using an MPI reduction
- call max_all_cr(Usolidnormp,Usolidnormp_all)
- endif
-
- if( POROELASTIC_SIMULATION ) then
- Usolidnorms = maxval(sqrt(displs_poroelastic(1,:)**2 + displs_poroelastic(2,:)**2 + &
- displs_poroelastic(3,:)**2))
- Usolidnormw = maxval(sqrt(displw_poroelastic(1,:)**2 + displw_poroelastic(2,:)**2 + &
- displw_poroelastic(3,:)**2))
-
- ! compute the maximum of the maxima for all the slices using an MPI reduction
- call max_all_cr(Usolidnorms,Usolidnorms_all)
- call max_all_cr(Usolidnormw,Usolidnormw_all)
- endif
-
-
- ! adjoint simulations
- if( SIMULATION_TYPE == 3 ) then
- if( ELASTIC_SIMULATION ) then
- ! way 2
- if(GPU_MODE) then
- call get_norm_elastic_from_device(b_Usolidnorm,Mesh_pointer,3)
- else
- b_Usolidnorm = maxval(sqrt(b_displ(1,:)**2 + b_displ(2,:)**2 + b_displ(3,:)**2))
- endif
- endif
- if( ACOUSTIC_SIMULATION ) then
- ! way 2
- if(GPU_MODE) then
- call get_norm_acoustic_from_device(b_Usolidnorm,Mesh_pointer,3)
- else
- b_Usolidnorm = maxval(abs(b_potential_dot_dot_acoustic(:)))
- endif
- endif
- if( POROELASTIC_SIMULATION ) then
- b_Usolidnorm = maxval(sqrt(b_displs_poroelastic(1,:)**2 + b_displs_poroelastic(2,:)**2 + &
- b_displs_poroelastic(3,:)**2))
- endif
- ! check stability of the code, exit if unstable
- ! negative values can occur with some compilers when the unstable value is greater
- ! than the greatest possible floating-point number of the machine
- !if(b_Usolidnorm > STABILITY_THRESHOLD .or. b_Usolidnorm < 0.0_CUSTOM_REAL) &
- ! call exit_MPI(myrank,'single backward simulation became unstable and blew up')
-
- ! compute max of all slices
- call max_all_cr(b_Usolidnorm,b_Usolidnorm_all)
- endif
-
- ! user output
- if(myrank == 0) then
-
- write(IMAIN,*) 'Time step # ',it
- write(IMAIN,*) 'Time: ',sngl((it-1)*DT-t0),' seconds'
-
- ! elapsed time since beginning of the simulation
- tCPU = wtime() - time_start
- int_tCPU = int(tCPU)
- ihours = int_tCPU / 3600
- iminutes = (int_tCPU - 3600*ihours) / 60
- iseconds = int_tCPU - 3600*ihours - 60*iminutes
- write(IMAIN,*) 'Elapsed time in seconds = ',tCPU
- write(IMAIN,"(' Elapsed time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds
- write(IMAIN,*) 'Mean elapsed time per time step in seconds = ',sngl(tCPU/dble(it))
- if( ELASTIC_SIMULATION ) then
- write(IMAIN,*) 'Max norm displacement vector U in all slices (m) = ',Usolidnorm_all
- endif
- if( ACOUSTIC_SIMULATION ) then
- write(IMAIN,*) 'Max norm pressure P in all slices (Pa) = ',Usolidnormp_all
- endif
- if( POROELASTIC_SIMULATION ) then
- write(IMAIN,*) 'Max norm displacement vector Us in all slices (m) = ',Usolidnorms_all
- write(IMAIN,*) 'Max norm displacement vector W in all slices (m) = ',Usolidnormw_all
- endif
- ! adjoint simulations
- if (SIMULATION_TYPE == 3) write(IMAIN,*) &
- 'Max norm U (backward) in all slices = ',b_Usolidnorm_all
-
- ! compute estimated remaining simulation time
- t_remain = (NSTEP - it) * (tCPU/dble(it))
- int_t_remain = int(t_remain)
- ihours_remain = int_t_remain / 3600
- iminutes_remain = (int_t_remain - 3600*ihours_remain) / 60
- iseconds_remain = int_t_remain - 3600*ihours_remain - 60*iminutes_remain
- write(IMAIN,*) 'Time steps done = ',it,' out of ',NSTEP
- write(IMAIN,*) 'Time steps remaining = ',NSTEP - it
- write(IMAIN,*) 'Estimated remaining time in seconds = ',sngl(t_remain)
- write(IMAIN,"(' Estimated remaining time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") &
- ihours_remain,iminutes_remain,iseconds_remain
-
- ! compute estimated total simulation time
- t_total = t_remain + tCPU
- int_t_total = int(t_total)
- ihours_total = int_t_total / 3600
- iminutes_total = (int_t_total - 3600*ihours_total) / 60
- iseconds_total = int_t_total - 3600*ihours_total - 60*iminutes_total
- write(IMAIN,*) 'Estimated total run time in seconds = ',sngl(t_total)
- write(IMAIN,"(' Estimated total run time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") &
- ihours_total,iminutes_total,iseconds_total
- write(IMAIN,*) 'We have done ',sngl(100.d0*dble(it)/dble(NSTEP)),'% of that'
-
- if(it < 100) then
- write(IMAIN,*) '************************************************************'
- write(IMAIN,*) '**** BEWARE: the above time estimates are not reliable'
- write(IMAIN,*) '**** because fewer than 100 iterations have been performed'
- write(IMAIN,*) '************************************************************'
- endif
- write(IMAIN,*)
-
- ! flushes file buffer for main output file (IMAIN)
- call flush_IMAIN()
-
- ! write time stamp file to give information about progression of simulation
- write(outputname,"('/timestamp',i6.6)") it
- open(unit=IOUT,file=trim(OUTPUT_FILES)//outputname,status='unknown')
- write(IOUT,*) 'Time step # ',it
- write(IOUT,*) 'Time: ',sngl((it-1)*DT-t0),' seconds'
- write(IOUT,*) 'Elapsed time in seconds = ',tCPU
- write(IOUT,"(' Elapsed time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds
- write(IOUT,*) 'Mean elapsed time per time step in seconds = ',tCPU/dble(it)
- if( ELASTIC_SIMULATION ) then
- write(IOUT,*) 'Max norm displacement vector U in all slices (m) = ',Usolidnorm_all
- endif
- if( ACOUSTIC_SIMULATION ) then
- write(IOUT,*) 'Max norm pressure P in all slices (Pa) = ',Usolidnormp_all
- endif
- if( POROELASTIC_SIMULATION ) then
- write(IOUT,*) 'Max norm displacement vector Us in all slices (m) = ',Usolidnorms_all
- write(IOUT,*) 'Max norm displacement vector W in all slices (m) = ',Usolidnormw_all
- endif
- ! adjoint simulations
- if (SIMULATION_TYPE == 3) write(IOUT,*) &
- 'Max norm U (backward) in all slices = ',b_Usolidnorm_all
- ! estimation
- write(IOUT,*) 'Time steps done = ',it,' out of ',NSTEP
- write(IOUT,*) 'Time steps remaining = ',NSTEP - it
- write(IOUT,*) 'Estimated remaining time in seconds = ',t_remain
- write(IOUT,"(' Estimated remaining time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") &
- ihours_remain,iminutes_remain,iseconds_remain
- write(IOUT,*) 'Estimated total run time in seconds = ',t_total
- write(IOUT,"(' Estimated total run time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") &
- ihours_total,iminutes_total,iseconds_total
- write(IOUT,*) 'We have done ',sngl(100.d0*dble(it)/dble(NSTEP)),'% of that'
- close(IOUT)
-
- ! check stability of the code, exit if unstable
- ! negative values can occur with some compilers when the unstable value is greater
- ! than the greatest possible floating-point number of the machine
- if(Usolidnorm_all > STABILITY_THRESHOLD .or. Usolidnorm_all < 0.0_CUSTOM_REAL &
- .or. Usolidnormp_all > STABILITY_THRESHOLD .or. Usolidnormp_all < 0.0_CUSTOM_REAL &
- .or. Usolidnorms_all > STABILITY_THRESHOLD .or. Usolidnorms_all < 0.0_CUSTOM_REAL &
- .or. Usolidnormw_all > STABILITY_THRESHOLD .or. Usolidnormw_all < 0.0_CUSTOM_REAL) &
- call exit_MPI(myrank,'forward simulation became unstable and blew up')
- ! adjoint simulations
- if(SIMULATION_TYPE == 3 .and. (b_Usolidnorm_all > STABILITY_THRESHOLD &
- .or. b_Usolidnorm_all < 0.0)) &
- call exit_MPI(myrank,'backward simulation became unstable and blew up')
-
- endif ! myrank
-
- end subroutine it_check_stability
-
-!=====================================================================
-
- subroutine it_compute_total_energy()
-
-! computes kinetic, potential and total energy
-! in all the slices using an MPI reduction
-! and output that to an energy file
-
- use specfem_par
- use specfem_par_elastic
- use specfem_par_acoustic
- use pml_par
-
- implicit none
-
-! local variables
- integer :: i,j,k,l,ispec,iglob
-
- real(kind=CUSTOM_REAL) :: xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl
- real(kind=CUSTOM_REAL) :: duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl
-
- real(kind=CUSTOM_REAL) :: duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl
- real(kind=CUSTOM_REAL) :: duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl
-
- real(kind=CUSTOM_REAL) :: epsilon_xx,epsilon_yy,epsilon_zz,epsilon_xy,epsilon_xz,epsilon_yz,epsilon_yx,epsilon_zx,epsilon_zy
- real(kind=CUSTOM_REAL) :: sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz,sigma_yx,sigma_zx,sigma_zy
- real(kind=CUSTOM_REAL) :: vx,vy,vz,pressure
-
- real(kind=CUSTOM_REAL) :: hp1,hp2,hp3
-
- real(kind=CUSTOM_REAL) :: lambdal,mul,lambdalplus2mul,rhol,cpl
- real(kind=CUSTOM_REAL) :: kappal
-
- real(kind=CUSTOM_REAL) :: integration_weight
- double precision :: kinetic_energy,potential_energy
- double precision :: kinetic_energy_glob,potential_energy_glob,total_energy_glob
-
-! local parameters
- real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: dummyx_loc,dummyy_loc,dummyz_loc
- real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ) :: &
- tempx1,tempx2,tempx3,tempy1,tempy2,tempy3,tempz1,tempz2,tempz3
-
- kinetic_energy = 0.d0
- potential_energy = 0.d0
-
- if(ANISOTROPY .or. ATTENUATION) &
- call exit_MPI(myrank,'calculation of total energy currently implemented only for media with no anisotropy and no attenuation')
-
-! loop over spectral elements
- do ispec = 1,NSPEC_AB
-
-! if element is a CPML then do not compute energy in it, since it is non physical;
-! thus, we compute energy in the main domain only, without absorbing elements
- if(PML_CONDITIONS) then
- ! do not merge this second line with the first using an ".and." statement
- ! because array is_CPML() is unallocated when PML_CONDITIONS is false
- if(is_CPML(ispec)) cycle
- endif
-
- !---
- !--- elastic spectral element
- !---
- if(ispec_is_elastic(ispec)) then
-
- do k=1,NGLLZ
- do j=1,NGLLY
- do i=1,NGLLX
- iglob = ibool(i,j,k,ispec)
- dummyx_loc(i,j,k) = displ(1,iglob)
- dummyy_loc(i,j,k) = displ(2,iglob)
- dummyz_loc(i,j,k) = displ(3,iglob)
- enddo
- enddo
- enddo
-
- do k=1,NGLLZ
- do j=1,NGLLY
- do i=1,NGLLX
-
- iglob = ibool(i,j,k,ispec)
-
- tempx1(i,j,k) = 0._CUSTOM_REAL
- tempx2(i,j,k) = 0._CUSTOM_REAL
- tempx3(i,j,k) = 0._CUSTOM_REAL
-
- tempy1(i,j,k) = 0._CUSTOM_REAL
- tempy2(i,j,k) = 0._CUSTOM_REAL
- tempy3(i,j,k) = 0._CUSTOM_REAL
-
- tempz1(i,j,k) = 0._CUSTOM_REAL
- tempz2(i,j,k) = 0._CUSTOM_REAL
- tempz3(i,j,k) = 0._CUSTOM_REAL
-
- do l=1,NGLLX
- hp1 = hprime_xx(i,l)
- tempx1(i,j,k) = tempx1(i,j,k) + dummyx_loc(l,j,k)*hp1
- tempy1(i,j,k) = tempy1(i,j,k) + dummyy_loc(l,j,k)*hp1
- tempz1(i,j,k) = tempz1(i,j,k) + dummyz_loc(l,j,k)*hp1
-
- !!! can merge these loops because NGLLX = NGLLY = NGLLZ
- hp2 = hprime_yy(j,l)
- tempx2(i,j,k) = tempx2(i,j,k) + dummyx_loc(i,l,k)*hp2
- tempy2(i,j,k) = tempy2(i,j,k) + dummyy_loc(i,l,k)*hp2
- tempz2(i,j,k) = tempz2(i,j,k) + dummyz_loc(i,l,k)*hp2
-
- !!! can merge these loops because NGLLX = NGLLY = NGLLZ
- hp3 = hprime_zz(k,l)
- tempx3(i,j,k) = tempx3(i,j,k) + dummyx_loc(i,j,l)*hp3
- tempy3(i,j,k) = tempy3(i,j,k) + dummyy_loc(i,j,l)*hp3
- tempz3(i,j,k) = tempz3(i,j,k) + dummyz_loc(i,j,l)*hp3
- enddo
-
- ! get derivatives of ux, uy and uz with respect to x, y and z
- xixl = xix(i,j,k,ispec)
- xiyl = xiy(i,j,k,ispec)
- xizl = xiz(i,j,k,ispec)
- etaxl = etax(i,j,k,ispec)
- etayl = etay(i,j,k,ispec)
- etazl = etaz(i,j,k,ispec)
- gammaxl = gammax(i,j,k,ispec)
- gammayl = gammay(i,j,k,ispec)
- gammazl = gammaz(i,j,k,ispec)
- jacobianl = jacobian(i,j,k,ispec)
-
- duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
- duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
- duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
-
- duydxl = xixl*tempy1(i,j,k) + etaxl*tempy2(i,j,k) + gammaxl*tempy3(i,j,k)
- duydyl = xiyl*tempy1(i,j,k) + etayl*tempy2(i,j,k) + gammayl*tempy3(i,j,k)
- duydzl = xizl*tempy1(i,j,k) + etazl*tempy2(i,j,k) + gammazl*tempy3(i,j,k)
-
- duzdxl = xixl*tempz1(i,j,k) + etaxl*tempz2(i,j,k) + gammaxl*tempz3(i,j,k)
- duzdyl = xiyl*tempz1(i,j,k) + etayl*tempz2(i,j,k) + gammayl*tempz3(i,j,k)
- duzdzl = xizl*tempz1(i,j,k) + etazl*tempz2(i,j,k) + gammazl*tempz3(i,j,k)
-
- ! precompute some sums to save CPU time
- duxdxl_plus_duydyl = duxdxl + duydyl
- duxdxl_plus_duzdzl = duxdxl + duzdzl
- duydyl_plus_duzdzl = duydyl + duzdzl
- duxdyl_plus_duydxl = duxdyl + duydxl
- duzdxl_plus_duxdzl = duzdxl + duxdzl
- duzdyl_plus_duydzl = duzdyl + duydzl
-
- ! compute the strain
- epsilon_xx = duxdxl
- epsilon_yy = duydyl
- epsilon_zz = duzdzl
- epsilon_xy = 0.5 * duxdyl_plus_duydxl
- epsilon_xz = 0.5 * duzdxl_plus_duxdzl
- epsilon_yz = 0.5 * duzdyl_plus_duydzl
-
- ! define symmetric components of epsilon
- epsilon_yx = epsilon_xy
- epsilon_zx = epsilon_xz
- epsilon_zy = epsilon_yz
-
- kappal = kappastore(i,j,k,ispec)
- mul = mustore(i,j,k,ispec)
- rhol = rhostore(i,j,k,ispec)
-
- ! isotropic case
- lambdalplus2mul = kappal + FOUR_THIRDS * mul
- lambdal = lambdalplus2mul - 2.*mul
-
- ! compute stress sigma
- sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl
- sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl
- sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl
-
- sigma_xy = mul*duxdyl_plus_duydxl
- sigma_xz = mul*duzdxl_plus_duxdzl
- sigma_yz = mul*duzdyl_plus_duydzl
-
- ! define symmetric components of sigma
- sigma_yx = sigma_xy
- sigma_zx = sigma_xz
- sigma_zy = sigma_yz
-
- integration_weight = wxgll(i)*wygll(j)*wzgll(k)*jacobianl
-
- ! compute kinetic energy 1/2 rho ||v||^2
- kinetic_energy = kinetic_energy + integration_weight * rhol*(veloc(1,iglob)**2 + &
- veloc(2,iglob)**2 + veloc(3,iglob)**2) / 2.
-
- ! compute potential energy 1/2 sigma_ij epsilon_ij
- potential_energy = potential_energy + integration_weight * &
- (sigma_xx*epsilon_xx + sigma_xy*epsilon_xy + sigma_xz*epsilon_xz + &
- sigma_yx*epsilon_yx + sigma_yy*epsilon_yy + sigma_yz*epsilon_yz + &
- sigma_zx*epsilon_zx + sigma_zy*epsilon_zy + sigma_zz*epsilon_zz) / 2.
-
- enddo
- enddo
- enddo
-
- !---
- !--- acoustic spectral element
- !---
- else if(ispec_is_acoustic(ispec)) then
-
- ! for the definition of potential energy in an acoustic fluid, see for instance
- ! equation (23) of M. Maess et al., Journal of Sound and Vibration 296 (2006) 264-276
-
- ! in case of an acoustic medium, a potential Chi of (density * displacement) is used as in Chaljub and Valette,
- ! Geophysical Journal International, vol. 158, p. 131-141 (2004) and *NOT* a velocity potential
- ! as in Komatitsch and Tromp, Geophysical Journal International, vol. 150, p. 303-318 (2002).
- ! This permits acoustic-elastic coupling based on a non-iterative time scheme.
- ! Displacement is then: u = grad(Chi) / rho
- ! Velocity is then: v = grad(Chi_dot) / rho (Chi_dot being the time derivative of Chi)
- ! and pressure is: p = - Chi_dot_dot (Chi_dot_dot being the time second derivative of Chi).
-
- do k=1,NGLLZ
- do j=1,NGLLY
- do i=1,NGLLX
- iglob = ibool(i,j,k,ispec)
- dummyx_loc(i,j,k) = potential_dot_acoustic(iglob)
- enddo
- enddo
- enddo
-
- do k=1,NGLLZ
- do j=1,NGLLY
- do i=1,NGLLX
-
- iglob = ibool(i,j,k,ispec)
-
- tempx1(i,j,k) = 0._CUSTOM_REAL
- tempx2(i,j,k) = 0._CUSTOM_REAL
- tempx3(i,j,k) = 0._CUSTOM_REAL
-
- do l=1,NGLLX
- hp1 = hprime_xx(i,l)
- tempx1(i,j,k) = tempx1(i,j,k) + dummyx_loc(l,j,k)*hp1
-
- !!! can merge these loops because NGLLX = NGLLY = NGLLZ
- hp2 = hprime_yy(j,l)
- tempx2(i,j,k) = tempx2(i,j,k) + dummyx_loc(i,l,k)*hp2
-
- !!! can merge these loops because NGLLX = NGLLY = NGLLZ
- hp3 = hprime_zz(k,l)
- tempx3(i,j,k) = tempx3(i,j,k) + dummyx_loc(i,j,l)*hp3
- enddo
-
- ! get derivatives of ux, uy and uz with respect to x, y and z
- xixl = xix(i,j,k,ispec)
- xiyl = xiy(i,j,k,ispec)
- xizl = xiz(i,j,k,ispec)
- etaxl = etax(i,j,k,ispec)
- etayl = etay(i,j,k,ispec)
- etazl = etaz(i,j,k,ispec)
- gammaxl = gammax(i,j,k,ispec)
- gammayl = gammay(i,j,k,ispec)
- gammazl = gammaz(i,j,k,ispec)
- jacobianl = jacobian(i,j,k,ispec)
-
- duxdxl = xixl*tempx1(i,j,k) + etaxl*tempx2(i,j,k) + gammaxl*tempx3(i,j,k)
- duxdyl = xiyl*tempx1(i,j,k) + etayl*tempx2(i,j,k) + gammayl*tempx3(i,j,k)
- duxdzl = xizl*tempx1(i,j,k) + etazl*tempx2(i,j,k) + gammazl*tempx3(i,j,k)
-
- rhol = rhostore(i,j,k,ispec)
- kappal = kappastore(i,j,k,ispec)
- cpl = sqrt(kappal / rhol)
-
- ! Velocity is v = grad(Chi_dot) / rho (Chi_dot being the time derivative of Chi)
- vx = duxdxl / rhol
- vy = duxdyl / rhol
- vz = duxdzl / rhol
-
- ! pressure is p = - Chi_dot_dot (Chi_dot_dot being the time second derivative of Chi)
- pressure = - potential_dot_dot_acoustic(iglob)
-
- integration_weight = wxgll(i)*wygll(j)*wzgll(k)*jacobianl
-
- ! compute kinetic energy 1/2 rho ||v||^2
- kinetic_energy = kinetic_energy + integration_weight * rhol*(vx**2 + vy**2 + vz**2) / 2.
-
- ! compute potential energy 1/2 sigma_ij epsilon_ij
- potential_energy = potential_energy + integration_weight * pressure**2 / (2. * rhol * cpl**2)
-
- enddo
- enddo
- enddo
-
- else
-
- call exit_MPI(myrank,'calculation of total energy implemented for acoustic and (visco)elastic elements only for now')
-
- endif
-
- enddo
-
-! compute the total using a reduction between all the processors
- call sum_all_dp(kinetic_energy,kinetic_energy_glob)
- call sum_all_dp(potential_energy,potential_energy_glob)
- total_energy_glob = kinetic_energy_glob + potential_energy_glob
-
-! write the total to disk from the master
- if(myrank == 0) write(IOUT_ENERGY,*) it,sngl(kinetic_energy_glob),sngl(potential_energy_glob),sngl(total_energy_glob)
-
- end subroutine it_compute_total_energy
-
-!=====================================================================
-
- subroutine it_update_displacement_scheme()
-
-! explicit Newmark time scheme with acoustic & elastic domains:
-! (see e.g. Hughes, 1987; Chaljub et al., 2003)
-!
-! chi(t+delta_t) = chi(t) + delta_t chi_dot(t) + 1/2 delta_t**2 chi_dot_dot(t)
-! chi_dot(t+delta_t) = chi_dot(t) + 1/2 delta_t chi_dot_dot(t) + 1/2 delta_t chi_dot_dot(t+delta_t)
-! chi_dot_dot(t+delta_t) = 1/M_acoustic( -K_acoustic chi(t+delta) + B_acoustic u(t+delta_t) + f(t+delta_t) )
-!
-! u(t+delta_t) = u(t) + delta_t v(t) + 1/2 delta_t**2 a(t)
-! v(t+delta_t) = v(t) + 1/2 delta_t a(t) + 1/2 delta_t a(t+delta_t)
-! a(t+delta_t) = 1/M_elastic ( -K_elastic u(t+delta) + B_elastic chi_dot_dot(t+delta_t) + f( t+delta_t) )
-!
-! where
-! chi, chi_dot, chi_dot_dot are acoustic (fluid) potentials ( dotted with respect to time)
-! u, v, a are displacement,velocity & acceleration
-! M is mass matrix, K stiffness matrix and B boundary term for acoustic/elastic domains
-! f denotes a source term (acoustic/elastic)
-!
-! note that this stage calculates the predictor terms
-!
-! for
-! potential chi_dot(t+delta) requires + 1/2 delta_t chi_dot_dot(t+delta_t)
-! at a later stage (corrector) once where chi_dot_dot(t+delta) is calculated
-! and similar,
-! velocity v(t+delta_t) requires + 1/2 delta_t a(t+delta_t)
-! at a later stage once where a(t+delta) is calculated
-! also:
-! boundary term B_elastic requires chi_dot_dot(t+delta)
-! thus chi_dot_dot has to be updated first before the elastic boundary term is considered
-
- use specfem_par
- use specfem_par_acoustic
- use specfem_par_elastic
- use specfem_par_poroelastic
- use pml_par
-
- implicit none
-
-! updates acoustic potentials
- if( ACOUSTIC_SIMULATION ) then
-
- if(.NOT. GPU_MODE) then
- ! on CPU
- potential_acoustic(:) = potential_acoustic(:) &
- + deltat * potential_dot_acoustic(:) &
- + deltatsqover2 * potential_dot_dot_acoustic(:)
- potential_dot_acoustic(:) = potential_dot_acoustic(:) &
- + deltatover2 * potential_dot_dot_acoustic(:)
- potential_dot_dot_acoustic(:) = 0._CUSTOM_REAL
- else
- ! on GPU
- call it_update_displacement_ac_cuda(Mesh_pointer, NGLOB_AB, &
- deltat, deltatsqover2, deltatover2, &
- b_deltat, b_deltatsqover2, b_deltatover2)
- endif
-
- endif ! ACOUSTIC_SIMULATION
-
-! updates elastic displacement and velocity
- if( ELASTIC_SIMULATION ) then
-
- if(.NOT. GPU_MODE) then
- ! on CPU
- displ(:,:) = displ(:,:) + deltat*veloc(:,:) + deltatsqover2*accel(:,:)
- veloc(:,:) = veloc(:,:) + deltatover2*accel(:,:)
- if( SIMULATION_TYPE /= 1 ) accel_adj_coupling(:,:) = accel(:,:)
- accel(:,:) = 0._CUSTOM_REAL
- else
- ! on GPU
- ! Includes SIM_TYPE 1 & 3 (for noise tomography)
- call it_update_displacement_cuda(Mesh_pointer, size(displ), deltat, deltatsqover2,&
- deltatover2, b_deltat, b_deltatsqover2, b_deltatover2)
- endif
- endif
-
-! updates poroelastic displacements and velocities
- if( POROELASTIC_SIMULATION ) then
- ! solid phase
- displs_poroelastic(:,:) = displs_poroelastic(:,:) + deltat*velocs_poroelastic(:,:) + &
- deltatsqover2*accels_poroelastic(:,:)
- velocs_poroelastic(:,:) = velocs_poroelastic(:,:) + deltatover2*accels_poroelastic(:,:)
- accels_poroelastic(:,:) = 0._CUSTOM_REAL
-
- ! fluid phase
- displw_poroelastic(:,:) = displw_poroelastic(:,:) + deltat*velocw_poroelastic(:,:) + &
- deltatsqover2*accelw_poroelastic(:,:)
- velocw_poroelastic(:,:) = velocw_poroelastic(:,:) + deltatover2*accelw_poroelastic(:,:)
- accelw_poroelastic(:,:) = 0._CUSTOM_REAL
- endif
-
-! adjoint simulations
- if (SIMULATION_TYPE == 3 .and. .NOT. GPU_MODE) then
- ! acoustic backward fields
- if( ACOUSTIC_SIMULATION ) then
- if(PML_CONDITIONS)then
- if(nglob_interface_PML_acoustic > 0)then
- call read_potential_on_pml_interface(b_potential_dot_dot_acoustic,b_potential_dot_acoustic,b_potential_acoustic,&
- nglob_interface_PML_acoustic,b_PML_potential,b_reclen_PML_potential)
- endif
- endif
- b_potential_acoustic(:) = b_potential_acoustic(:) &
- + b_deltat * b_potential_dot_acoustic(:) &
- + b_deltatsqover2 * b_potential_dot_dot_acoustic(:)
- b_potential_dot_acoustic(:) = b_potential_dot_acoustic(:) &
- + b_deltatover2 * b_potential_dot_dot_acoustic(:)
- b_potential_dot_dot_acoustic(:) = 0._CUSTOM_REAL
- endif
-
- ! elastic backward fields
- if( ELASTIC_SIMULATION ) then
- if(PML_CONDITIONS)then
- if(nglob_interface_PML_elastic > 0)then
- call read_field_on_pml_interface(b_accel,b_veloc,b_displ,nglob_interface_PML_elastic,&
- b_PML_field,b_reclen_PML_field)
- endif
- endif
- b_displ(:,:) = b_displ(:,:) + b_deltat*b_veloc(:,:) + b_deltatsqover2*b_accel(:,:)
- b_veloc(:,:) = b_veloc(:,:) + b_deltatover2*b_accel(:,:)
- b_accel(:,:) = 0._CUSTOM_REAL
- endif
- ! poroelastic backward fields
- if( POROELASTIC_SIMULATION ) then
- ! solid phase
- b_displs_poroelastic(:,:) = b_displs_poroelastic(:,:) + b_deltat*b_velocs_poroelastic(:,:) + &
- b_deltatsqover2*b_accels_poroelastic(:,:)
- b_velocs_poroelastic(:,:) = b_velocs_poroelastic(:,:) + b_deltatover2*b_accels_poroelastic(:,:)
- b_accels_poroelastic(:,:) = 0._CUSTOM_REAL
-
- ! fluid phase
- b_displw_poroelastic(:,:) = b_displw_poroelastic(:,:) + b_deltat*b_velocw_poroelastic(:,:) + &
- b_deltatsqover2*b_accelw_poroelastic(:,:)
- b_velocw_poroelastic(:,:) = b_velocw_poroelastic(:,:) + b_deltatover2*b_accelw_poroelastic(:,:)
- b_accelw_poroelastic(:,:) = 0._CUSTOM_REAL
- endif
- endif
-
-! adjoint simulations: moho kernel
- if (SAVE_MOHO_MESH .and. SIMULATION_TYPE == 3) then
- ispec2D_moho_top = 0
- ispec2D_moho_bot = 0
- endif
-
-
- end subroutine it_update_displacement_scheme
-
-!=====================================================================
-
subroutine it_read_forward_arrays()
use specfem_par
@@ -886,7 +233,7 @@
! that is, index it for adjoint sources will match index NSTEP - 1 for backward/reconstructed wavefields
! reads in wavefields
- open(unit=27,file=trim(prname)//'save_forward_arrays.bin',status='old',&
+ open(unit=IIN,file=trim(prname)//'save_forward_arrays.bin',status='old',&
action='read',form='unformatted',iostat=ier)
if( ier /= 0 ) then
print*,'error: opening save_forward_arrays'
@@ -895,65 +242,65 @@
endif
if( ACOUSTIC_SIMULATION ) then
- read(27) b_potential_acoustic
- read(27) b_potential_dot_acoustic
- read(27) b_potential_dot_dot_acoustic
+ read(IIN) b_potential_acoustic
+ read(IIN) b_potential_dot_acoustic
+ read(IIN) b_potential_dot_dot_acoustic
! transfers fields onto GPU
- if(GPU_MODE) &
+ if(GPU_MODE) then
call transfer_b_fields_ac_to_device(NGLOB_AB,b_potential_acoustic, &
- b_potential_dot_acoustic, b_potential_dot_dot_acoustic, Mesh_pointer)
+ b_potential_dot_acoustic, b_potential_dot_dot_acoustic, Mesh_pointer)
+ endif
endif
! elastic wavefields
if( ELASTIC_SIMULATION ) then
- read(27) b_displ
- read(27) b_veloc
- read(27) b_accel
+ read(IIN) b_displ
+ read(IIN) b_veloc
+ read(IIN) b_accel
! puts elastic wavefield to GPU
- if(GPU_MODE) &
+ if(GPU_MODE) then
call transfer_b_fields_to_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel,Mesh_pointer)
+ endif
! memory variables if attenuation
if( ATTENUATION ) then
- if(FULL_ATTENUATION_SOLID) read(27) b_R_trace
- read(27) b_R_xx
- read(27) b_R_yy
- read(27) b_R_xy
- read(27) b_R_xz
- read(27) b_R_yz
- if(FULL_ATTENUATION_SOLID) read(27) b_epsilondev_trace
- read(27) b_epsilondev_xx
- read(27) b_epsilondev_yy
- read(27) b_epsilondev_xy
- read(27) b_epsilondev_xz
- read(27) b_epsilondev_yz
+ if(FULL_ATTENUATION_SOLID) read(IIN) b_R_trace
+ read(IIN) b_R_xx
+ read(IIN) b_R_yy
+ read(IIN) b_R_xy
+ read(IIN) b_R_xz
+ read(IIN) b_R_yz
+ if(FULL_ATTENUATION_SOLID) read(IIN) b_epsilondev_trace
+ read(IIN) b_epsilondev_xx
+ read(IIN) b_epsilondev_yy
+ read(IIN) b_epsilondev_xy
+ read(IIN) b_epsilondev_xz
+ read(IIN) b_epsilondev_yz
! puts elastic attenuation arrays to GPU
- if(GPU_MODE) &
+ if(GPU_MODE) then
call transfer_b_fields_att_to_device(Mesh_pointer, &
- b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz,size(b_R_xx), &
-!!! b_R_trace,b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz,size(b_R_xx), & ! please change the above line with this
+ b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz, &
+ size(b_R_xx), &
b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy,b_epsilondev_xz,b_epsilondev_yz, &
-!!! b_epsilondev_trace,b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy,b_epsilondev_xz,b_epsilondev_yz, &
-!!! ! please change the above line with this
size(b_epsilondev_xx))
- endif
-
+ endif
+ endif ! ATTENUATION
endif
! poroelastic wavefields
if( POROELASTIC_SIMULATION ) then
- read(27) b_displs_poroelastic
- read(27) b_velocs_poroelastic
- read(27) b_accels_poroelastic
- read(27) b_displw_poroelastic
- read(27) b_velocw_poroelastic
- read(27) b_accelw_poroelastic
+ read(IIN) b_displs_poroelastic
+ read(IIN) b_velocs_poroelastic
+ read(IIN) b_accels_poroelastic
+ read(IIN) b_displw_poroelastic
+ read(IIN) b_velocw_poroelastic
+ read(IIN) b_accelw_poroelastic
endif
- close(27)
+ close(IIN)
end subroutine it_read_forward_arrays
@@ -969,6 +316,8 @@
implicit none
+ integer :: ier
+
if( it > 1 .and. it < NSTEP) then
! adjoint simulations
@@ -977,35 +326,43 @@
! reconstucted wavefield b_displ() at it corresponds to time (NSTEP-it-1)*DT - t0
! we read in the reconstructed wavefield at the end of the time iteration loop, i.e. after the Newmark scheme,
! thus, indexing is NSTEP-it (rather than something like NSTEP-(it-1) )
+
if (SIMULATION_TYPE == 3 .and. mod(NSTEP-it,NSTEP_Q_SAVE) == 0) then
! reads files content
write(outputname,"('save_Q_arrays_',i6.6,'.bin')") NSTEP-it
- open(unit=27,file=trim(prname_Q)//trim(outputname),status='old',&
- action='read',form='unformatted')
+ open(unit=IIN,file=trim(prname_Q)//trim(outputname),status='old',&
+ action='read',form='unformatted',iostat=ier)
+ if( ier /= 0 ) then
+ print*,'error: opening save_Q_arrays'
+ print*,'path: ',trim(prname_Q)//trim(outputname)
+ call exit_mpi(myrank,'error open file save_Q_arrays_***.bin for reading')
+ endif
+
if( ELASTIC_SIMULATION ) then
! reads arrays from disk files
- read(27) b_displ
- read(27) b_veloc
- read(27) b_accel
+ read(IIN) b_displ
+ read(IIN) b_veloc
+ read(IIN) b_accel
! puts elastic fields onto GPU
if(GPU_MODE) then
! wavefields
call transfer_b_fields_to_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel, Mesh_pointer)
endif
- if(FULL_ATTENUATION_SOLID) read(27) b_R_trace
- read(27) b_R_xx
- read(27) b_R_yy
- read(27) b_R_xy
- read(27) b_R_xz
- read(27) b_R_yz
- if(FULL_ATTENUATION_SOLID) read(27) b_epsilondev_trace
- read(27) b_epsilondev_xx
- read(27) b_epsilondev_yy
- read(27) b_epsilondev_xy
- read(27) b_epsilondev_xz
- read(27) b_epsilondev_yz
+ if(FULL_ATTENUATION_SOLID) read(IIN) b_R_trace
+ read(IIN) b_R_xx
+ read(IIN) b_R_yy
+ read(IIN) b_R_xy
+ read(IIN) b_R_xz
+ read(IIN) b_R_yz
+ if(FULL_ATTENUATION_SOLID) read(IIN) b_epsilondev_trace
+ read(IIN) b_epsilondev_xx
+ read(IIN) b_epsilondev_yy
+ read(IIN) b_epsilondev_xy
+ read(IIN) b_epsilondev_xz
+ read(IIN) b_epsilondev_yz
+
! puts elastic fields onto GPU
if(GPU_MODE) then
! attenuation arrays
@@ -1020,9 +377,9 @@
if( ACOUSTIC_SIMULATION ) then
! reads arrays from disk files
- read(27) b_potential_acoustic
- read(27) b_potential_dot_acoustic
- read(27) b_potential_dot_dot_acoustic
+ read(IIN) b_potential_acoustic
+ read(IIN) b_potential_dot_acoustic
+ read(IIN) b_potential_dot_dot_acoustic
! puts acoustic fields onto GPU
if(GPU_MODE) &
@@ -1030,12 +387,19 @@
b_potential_dot_acoustic, b_potential_dot_dot_acoustic, Mesh_pointer)
endif
- close(27)
+ close(IIN)
+
else if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD .and. mod(it,NSTEP_Q_SAVE) == 0) then
! stores files content
write(outputname,"('save_Q_arrays_',i6.6,'.bin')") it
- open(unit=27,file=trim(prname_Q)//trim(outputname),status='unknown',&
- action='write',form='unformatted')
+ open(unit=IOUT,file=trim(prname_Q)//trim(outputname),status='unknown',&
+ action='write',form='unformatted',iostat=ier)
+ if( ier /= 0 ) then
+ print*,'error: opening save_Q_arrays'
+ print*,'path: ',trim(prname_Q)//trim(outputname)
+ call exit_mpi(myrank,'error open file save_Q_arrays_***.bin for writing')
+ endif
+
if( ELASTIC_SIMULATION ) then
! gets elastic fields from GPU onto CPU
if(GPU_MODE) then
@@ -1043,9 +407,9 @@
endif
! writes to disk file
- write(27) displ
- write(27) veloc
- write(27) accel
+ write(IOUT) displ
+ write(IOUT) veloc
+ write(IOUT) accel
if(GPU_MODE) then
! attenuation arrays
@@ -1057,31 +421,34 @@
size(epsilondev_xx))
endif
- if(FULL_ATTENUATION_SOLID) write(27) R_trace
- write(27) R_xx
- write(27) R_yy
- write(27) R_xy
- write(27) R_xz
- write(27) R_yz
- if(FULL_ATTENUATION_SOLID) write(27) epsilondev_trace
- write(27) epsilondev_xx
- write(27) epsilondev_yy
- write(27) epsilondev_xy
- write(27) epsilondev_xz
- write(27) epsilondev_yz
+ if(FULL_ATTENUATION_SOLID) write(IOUT) R_trace
+ write(IOUT) R_xx
+ write(IOUT) R_yy
+ write(IOUT) R_xy
+ write(IOUT) R_xz
+ write(IOUT) R_yz
+ if(FULL_ATTENUATION_SOLID) write(IOUT) epsilondev_trace
+ write(IOUT) epsilondev_xx
+ write(IOUT) epsilondev_yy
+ write(IOUT) epsilondev_xy
+ write(IOUT) epsilondev_xz
+ write(IOUT) epsilondev_yz
endif
+
if( ACOUSTIC_SIMULATION ) then
- ! gets acoustic fields from GPU onto CPU
- if(GPU_MODE) &
+ ! gets acoustic fields from GPU onto CPU
+ if(GPU_MODE) then
call transfer_fields_ac_from_device(NGLOB_AB,potential_acoustic, &
- potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
+ potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
+ endif
! writes to disk file
- write(27) potential_acoustic
- write(27) potential_dot_acoustic
- write(27) potential_dot_dot_acoustic
+ write(IOUT) potential_acoustic
+ write(IOUT) potential_dot_acoustic
+ write(IOUT) potential_dot_dot_acoustic
endif
- close(27)
+ close(IOUT)
+
endif ! SIMULATION_TYPE
endif ! it
@@ -1091,27 +458,28 @@
subroutine it_print_elapsed_time()
- use specfem_par
- use specfem_par_elastic
- use specfem_par_acoustic
- implicit none
+ use specfem_par
+ use specfem_par_elastic
+ use specfem_par_acoustic
- ! local parameters
- double precision :: tCPU
- integer :: ihours,iminutes,iseconds,int_tCPU
+ implicit none
- if(myrank == 0) then
- ! elapsed time since beginning of the simulation
- tCPU = wtime() - time_start
- int_tCPU = int(tCPU)
- ihours = int_tCPU / 3600
- iminutes = (int_tCPU - 3600*ihours) / 60
- iseconds = int_tCPU - 3600*ihours - 60*iminutes
- write(IMAIN,*) 'Time-Loop Complete. Timing info:'
- write(IMAIN,*) 'Total elapsed time in seconds = ',tCPU
- write(IMAIN,"(' Total elapsed time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds
- endif
+ ! local parameters
+ double precision :: tCPU
+ integer :: ihours,iminutes,iseconds,int_tCPU
+ if( myrank == 0 ) then
+ ! elapsed time since beginning of the simulation
+ tCPU = wtime() - time_start
+ int_tCPU = int(tCPU)
+ ihours = int_tCPU / 3600
+ iminutes = (int_tCPU - 3600*ihours) / 60
+ iseconds = int_tCPU - 3600*ihours - 60*iminutes
+ write(IMAIN,*) 'Time-Loop Complete. Timing info:'
+ write(IMAIN,*) 'Total elapsed time in seconds = ',tCPU
+ write(IMAIN,"(' Total elapsed time in hh:mm:ss = ',i4,' h ',i2.2,' m ',i2.2,' s')") ihours,iminutes,iseconds
+ endif
+
end subroutine it_print_elapsed_time
!=====================================================================
@@ -1127,16 +495,17 @@
implicit none
! to store forward wave fields
- if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+ if( SIMULATION_TYPE == 1 .and. SAVE_FORWARD ) then
! acoustic potentials
if( ACOUSTIC_SIMULATION ) &
call transfer_fields_ac_from_device(NGLOB_AB,potential_acoustic, &
- potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
+ potential_dot_acoustic, potential_dot_dot_acoustic, &
+ Mesh_pointer)
! elastic wavefield
if( ELASTIC_SIMULATION ) then
- call transfer_fields_el_from_device(NDIM*NGLOB_AB,displ,veloc, accel, Mesh_pointer)
+ call transfer_fields_el_from_device(NDIM*NGLOB_AB,displ,veloc,accel,Mesh_pointer)
if (ATTENUATION) &
call transfer_fields_att_from_device(Mesh_pointer, &
@@ -1147,7 +516,7 @@
size(epsilondev_xx))
endif
- else if (SIMULATION_TYPE == 3) then
+ else if( SIMULATION_TYPE == 3 ) then
! to store kernels
! acoustic domains
@@ -1166,7 +535,7 @@
!call transfer_b_fields_from_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel, Mesh_pointer)
! elastic kernels
- call transfer_kernels_el_to_host(Mesh_pointer,rho_kl,mu_kl,kappa_kl,NSPEC_AB)
+ call transfer_kernels_el_to_host(Mesh_pointer,rho_kl,mu_kl,kappa_kl,cijkl_kl,NSPEC_AB)
endif
! specific noise strength kernel
@@ -1175,7 +544,7 @@
endif
! approximative hessian for preconditioning kernels
- if ( APPROXIMATE_HESS_KL ) then
+ if( APPROXIMATE_HESS_KL ) then
if( ELASTIC_SIMULATION ) call transfer_kernels_hess_el_tohost(Mesh_pointer,hess_kl,NSPEC_AB)
if( ACOUSTIC_SIMULATION ) call transfer_kernels_hess_ac_tohost(Mesh_pointer,hess_ac_kl,NSPEC_AB)
endif
@@ -1184,7 +553,6 @@
! frees allocated memory on GPU
call prepare_cleanup_device(Mesh_pointer, &
- SAVE_FORWARD, &
ACOUSTIC_SIMULATION,ELASTIC_SIMULATION, &
STACEY_ABSORBING_CONDITIONS,NOISE_TOMOGRAPHY,COMPUTE_AND_STORE_STRAIN, &
ATTENUATION,ANISOTROPY,APPROXIMATE_OCEAN_LOAD, &
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/prepare_timerun.F90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/prepare_timerun.F90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/prepare_timerun.F90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -250,12 +250,12 @@
deallocate(rmassz_acoustic)
endif
- call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_acoustic,&
+ call assemble_MPI_scalar_blocking(NPROC,NGLOB_AB,rmass_acoustic,&
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
my_neighbours_ext_mesh)
- call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_acoustic_interface, &
+ call assemble_MPI_scalar_blocking(NPROC,NGLOB_AB,rmass_acoustic_interface, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh,&
my_neighbours_ext_mesh)
@@ -284,15 +284,15 @@
deallocate(rmass)
! assemble mass matrix
- call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmassx, &
+ call assemble_MPI_scalar_blocking(NPROC,NGLOB_AB,rmassx, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
my_neighbours_ext_mesh)
- call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmassy, &
+ call assemble_MPI_scalar_blocking(NPROC,NGLOB_AB,rmassy, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
my_neighbours_ext_mesh)
- call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmassz, &
+ call assemble_MPI_scalar_blocking(NPROC,NGLOB_AB,rmassz, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
my_neighbours_ext_mesh)
@@ -307,7 +307,7 @@
if(PML_CONDITIONS)then
if(ACOUSTIC_SIMULATION)then
- call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_elastic_interface, &
+ call assemble_MPI_scalar_blocking(NPROC,NGLOB_AB,rmass_elastic_interface, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
my_neighbours_ext_mesh)
@@ -318,7 +318,7 @@
! ocean load
if(APPROXIMATE_OCEAN_LOAD ) then
- call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_ocean_load, &
+ call assemble_MPI_scalar_blocking(NPROC,NGLOB_AB,rmass_ocean_load, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
my_neighbours_ext_mesh)
@@ -328,12 +328,12 @@
endif
if(POROELASTIC_SIMULATION) then
- call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_solid_poroelastic, &
+ call assemble_MPI_scalar_blocking(NPROC,NGLOB_AB,rmass_solid_poroelastic, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
my_neighbours_ext_mesh)
- call assemble_MPI_scalar_ext_mesh(NPROC,NGLOB_AB,rmass_fluid_poroelastic, &
+ call assemble_MPI_scalar_blocking(NPROC,NGLOB_AB,rmass_fluid_poroelastic, &
num_interfaces_ext_mesh,max_nibool_interfaces_ext_mesh, &
nibool_interfaces_ext_mesh,ibool_interfaces_ext_mesh, &
my_neighbours_ext_mesh)
@@ -852,12 +852,17 @@
! elastic domain
if( ELASTIC_SIMULATION ) then
rho_kl(:,:,:,:) = 0._CUSTOM_REAL
- mu_kl(:,:,:,:) = 0._CUSTOM_REAL
- kappa_kl(:,:,:,:) = 0._CUSTOM_REAL
- cijkl_kl(:,:,:,:,:) = 0._CUSTOM_REAL
- if ( APPROXIMATE_HESS_KL ) &
+ if (ANISOTROPIC_KL) then
+ cijkl_kl(:,:,:,:,:) = 0._CUSTOM_REAL
+ else
+ mu_kl(:,:,:,:) = 0._CUSTOM_REAL
+ kappa_kl(:,:,:,:) = 0._CUSTOM_REAL
+ endif
+
+ if ( APPROXIMATE_HESS_KL ) then
hess_kl(:,:,:,:) = 0._CUSTOM_REAL
+ endif
! reconstructed/backward elastic wavefields
b_displ = 0._CUSTOM_REAL
@@ -961,7 +966,7 @@
! total file size
filesize = b_reclen_field
- filesize = filesize*NSTEP
+ filesize = filesize * NSTEP
if (SIMULATION_TYPE == 3) then
! opens existing files
@@ -1108,8 +1113,9 @@
endif
endif
else
+ ! num_abs_boundary_faces is zero
! needs dummy array
- b_num_abs_boundary_faces = 1
+ b_num_abs_boundary_faces = 0
if( ELASTIC_SIMULATION ) then
allocate(b_absorb_field(NDIM,NGLLSQUARE,b_num_abs_boundary_faces),stat=ier)
if( ier /= 0 ) stop 'error allocating array b_absorb_field'
@@ -1128,7 +1134,7 @@
endif
else ! STACEY_ABSORBING_CONDITIONS
! needs dummy array
- b_num_abs_boundary_faces = 1
+ b_num_abs_boundary_faces = 0
if( ELASTIC_SIMULATION ) then
allocate(b_absorb_field(NDIM,NGLLSQUARE,b_num_abs_boundary_faces),stat=ier)
if( ier /= 0 ) stop 'error allocating array b_absorb_field'
@@ -1246,100 +1252,94 @@
! prepares general fields on GPU
!§!§ JC JC here we will need to add GPU support for the new C-PML routines
call prepare_constants_device(Mesh_pointer, &
- NGLLX, NSPEC_AB, NGLOB_AB, &
- xix, xiy, xiz, etax,etay,etaz, gammax, gammay, gammaz, &
- kappastore, mustore,ibool, &
- num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, &
- nibool_interfaces_ext_mesh, ibool_interfaces_ext_mesh, &
- hprime_xx, &
- hprimewgll_xx, &
- wgllwgll_xy, wgllwgll_xz, wgllwgll_yz, &
- STACEY_ABSORBING_CONDITIONS, &
- abs_boundary_ispec, abs_boundary_ijk, &
- abs_boundary_normal, &
- abs_boundary_jacobian2Dw, &
- num_abs_boundary_faces, &
- ispec_is_inner, &
- NSOURCES, nsources_local, &
- sourcearrays, islice_selected_source, ispec_selected_source, &
- number_receiver_global, ispec_selected_rec, &
- nrec, nrec_local, &
- SIMULATION_TYPE, &
- USE_MESH_COLORING_GPU, &
- nspec_acoustic,nspec_elastic,&
- my_neighbours_ext_mesh,&
- request_send_vector_ext_mesh,&
- request_recv_vector_ext_mesh,&
- buffer_recv_vector_ext_mesh)
+ NGLLX, NSPEC_AB, NGLOB_AB, &
+ xix, xiy, xiz, etax,etay,etaz, gammax, gammay, gammaz, &
+ kappastore, mustore, &
+ ibool, &
+ num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, &
+ nibool_interfaces_ext_mesh, ibool_interfaces_ext_mesh, &
+ hprime_xx,hprimewgll_xx, &
+ wgllwgll_xy, wgllwgll_xz, wgllwgll_yz, &
+ STACEY_ABSORBING_CONDITIONS, &
+ abs_boundary_ispec, abs_boundary_ijk, &
+ abs_boundary_normal, &
+ abs_boundary_jacobian2Dw, &
+ num_abs_boundary_faces, &
+ ispec_is_inner, &
+ NSOURCES, nsources_local, &
+ sourcearrays, islice_selected_source, ispec_selected_source, &
+ number_receiver_global, ispec_selected_rec, &
+ nrec, nrec_local, &
+ SIMULATION_TYPE, &
+ USE_MESH_COLORING_GPU, &
+ nspec_acoustic,nspec_elastic,&
+ myrank,SAVE_FORWARD)
! prepares fields on GPU for acoustic simulations
if( ACOUSTIC_SIMULATION ) then
- call prepare_fields_acoustic_device(Mesh_pointer,rmass_acoustic,rhostore,kappastore, &
- num_phase_ispec_acoustic,phase_ispec_inner_acoustic, &
- ispec_is_acoustic, &
- NOISE_TOMOGRAPHY,num_free_surface_faces, &
- free_surface_ispec,free_surface_ijk, &
- b_reclen_potential,b_absorb_potential, &
- ELASTIC_SIMULATION, num_coupling_ac_el_faces, &
- coupling_ac_el_ispec,coupling_ac_el_ijk, &
- coupling_ac_el_normal,coupling_ac_el_jacobian2Dw, &
- num_colors_outer_acoustic,num_colors_inner_acoustic, &
- num_elem_colors_acoustic)
+ call prepare_fields_acoustic_device(Mesh_pointer, &
+ rmass_acoustic,rhostore,kappastore, &
+ num_phase_ispec_acoustic,phase_ispec_inner_acoustic, &
+ ispec_is_acoustic, &
+ NOISE_TOMOGRAPHY,num_free_surface_faces, &
+ free_surface_ispec,free_surface_ijk, &
+ b_reclen_potential,b_absorb_potential, &
+ ELASTIC_SIMULATION, num_coupling_ac_el_faces, &
+ coupling_ac_el_ispec,coupling_ac_el_ijk, &
+ coupling_ac_el_normal,coupling_ac_el_jacobian2Dw, &
+ num_colors_outer_acoustic,num_colors_inner_acoustic, &
+ num_elem_colors_acoustic)
if( SIMULATION_TYPE == 3 ) &
call prepare_fields_acoustic_adj_dev(Mesh_pointer, &
- APPROXIMATE_HESS_KL)
+ APPROXIMATE_HESS_KL)
endif
! prepares fields on GPU for elastic simulations
!§!§ JC JC here we will need to add GPU support for the new C-PML routines
if( ELASTIC_SIMULATION ) then
- call prepare_fields_elastic_device(Mesh_pointer, NDIM*NGLOB_AB, &
- rmassx,rmassy,rmassz, &
- rho_vp,rho_vs, &
- num_phase_ispec_elastic,phase_ispec_inner_elastic, &
- ispec_is_elastic, &
- b_absorb_field,b_reclen_field, &
- SAVE_FORWARD, &
- COMPUTE_AND_STORE_STRAIN, &
- epsilondev_xx,epsilondev_yy,epsilondev_xy, &
-!!! epsilondev_trace,epsilondev_xx,epsilondev_yy,epsilondev_xy, &
- epsilondev_xz,epsilondev_yz, &
- ATTENUATION, &
- size(R_xx), &
- R_xx,R_yy,R_xy,R_xz,R_yz, &
-!!! R_trace,R_xx,R_yy,R_xy,R_xz,R_yz, &
- one_minus_sum_beta,factor_common, &
-!!! one_minus_sum_beta_kappa,factor_commonkappa, &
- alphaval,betaval,gammaval, &
- APPROXIMATE_OCEAN_LOAD,rmass_ocean_load, &
- NOISE_TOMOGRAPHY, &
- free_surface_normal,free_surface_ispec,free_surface_ijk, &
- num_free_surface_faces, &
- ACOUSTIC_SIMULATION, &
- num_colors_outer_elastic,num_colors_inner_elastic, &
- num_elem_colors_elastic, &
- ANISOTROPY, &
- c11store,c12store,c13store,c14store,c15store,c16store, &
- c22store,c23store,c24store,c25store,c26store, &
- c33store,c34store,c35store,c36store, &
- c44store,c45store,c46store,c55store,c56store,c66store)
+ call prepare_fields_elastic_device(Mesh_pointer, &
+ rmassx,rmassy,rmassz, &
+ rho_vp,rho_vs, &
+ num_phase_ispec_elastic,phase_ispec_inner_elastic, &
+ ispec_is_elastic, &
+ b_absorb_field,b_reclen_field, &
+ COMPUTE_AND_STORE_STRAIN, &
+ epsilondev_xx,epsilondev_yy,epsilondev_xy, &
+ epsilondev_xz,epsilondev_yz, &
+ ATTENUATION, &
+ size(R_xx), &
+ R_xx,R_yy,R_xy,R_xz,R_yz, &
+ one_minus_sum_beta,factor_common, &
+ alphaval,betaval,gammaval, &
+ APPROXIMATE_OCEAN_LOAD,rmass_ocean_load, &
+ NOISE_TOMOGRAPHY, &
+ free_surface_normal,free_surface_ispec,free_surface_ijk, &
+ num_free_surface_faces, &
+ ACOUSTIC_SIMULATION, &
+ num_colors_outer_elastic,num_colors_inner_elastic, &
+ num_elem_colors_elastic, &
+ ANISOTROPY, &
+ c11store,c12store,c13store,c14store,c15store,c16store, &
+ c22store,c23store,c24store,c25store,c26store, &
+ c33store,c34store,c35store,c36store, &
+ c44store,c45store,c46store,c55store,c56store,c66store)
if( SIMULATION_TYPE == 3 ) &
- call prepare_fields_elastic_adj_dev(Mesh_pointer, NDIM*NGLOB_AB, &
- COMPUTE_AND_STORE_STRAIN, &
- epsilon_trace_over_3, &
- b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy, &
-!!! b_epsilondev_trace,b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy, &
- b_epsilondev_xz,b_epsilondev_yz, &
- b_epsilon_trace_over_3, &
- ATTENUATION,size(R_xx), &
- b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz, &
-!!! b_R_trace,b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz, &
- b_alphaval,b_betaval,b_gammaval, &
- APPROXIMATE_HESS_KL)
+ call prepare_fields_elastic_adj_dev(Mesh_pointer, &
+ NDIM*NGLOB_AB, &
+ COMPUTE_AND_STORE_STRAIN, &
+ epsilon_trace_over_3, &
+ b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy, &
+ b_epsilondev_xz,b_epsilondev_yz, &
+ b_epsilon_trace_over_3, &
+ ATTENUATION,size(R_xx), &
+ b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz, &
+ b_alphaval,b_betaval,b_gammaval, &
+ ANISOTROPIC_KL, &
+ APPROXIMATE_HESS_KL)
endif
@@ -1348,44 +1348,51 @@
stop 'todo poroelastic simulations on GPU'
endif
+ ! synchronizes processes
+ !call sync_all()
+
! prepares needed receiver array for adjoint runs
if( SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3 ) &
call prepare_sim2_or_3_const_device(Mesh_pointer, &
- islice_selected_rec,size(islice_selected_rec), &
- nadj_rec_local,nrec,myrank)
+ islice_selected_rec,size(islice_selected_rec), &
+ nadj_rec_local,nrec)
! prepares fields on GPU for noise simulations
if ( NOISE_TOMOGRAPHY > 0 ) then
! note: noise tomography is only supported for elastic domains so far.
! copies noise arrays to GPU
- call prepare_fields_noise_device(Mesh_pointer, NSPEC_AB, NGLOB_AB, &
- free_surface_ispec, &
- free_surface_ijk, &
- num_free_surface_faces, &
- NOISE_TOMOGRAPHY, &
- NSTEP,noise_sourcearray, &
- normal_x_noise,normal_y_noise,normal_z_noise, &
- mask_noise,free_surface_jacobian2Dw)
+ call prepare_fields_noise_device(Mesh_pointer, &
+ NSPEC_AB, NGLOB_AB, &
+ free_surface_ispec, &
+ free_surface_ijk, &
+ num_free_surface_faces, &
+ NOISE_TOMOGRAPHY, &
+ NSTEP,noise_sourcearray, &
+ normal_x_noise,normal_y_noise,normal_z_noise, &
+ mask_noise,free_surface_jacobian2Dw)
endif ! NOISE_TOMOGRAPHY
! prepares gravity arrays
if( GRAVITY ) then
call prepare_fields_gravity_device(Mesh_pointer,GRAVITY, &
- minus_deriv_gravity,minus_g,wgll_cube,&
- ACOUSTIC_SIMULATION,rhostore)
+ minus_deriv_gravity,minus_g,wgll_cube,&
+ ACOUSTIC_SIMULATION,rhostore)
endif
+ ! synchronizes processes
+ call sync_all()
+
! sends initial data to device
! puts acoustic initial fields onto GPU
if( ACOUSTIC_SIMULATION ) then
call transfer_fields_ac_to_device(NGLOB_AB,potential_acoustic, &
- potential_dot_acoustic,potential_dot_dot_acoustic,Mesh_pointer)
+ potential_dot_acoustic,potential_dot_dot_acoustic,Mesh_pointer)
if( SIMULATION_TYPE == 3 ) &
call transfer_b_fields_ac_to_device(NGLOB_AB,b_potential_acoustic, &
- b_potential_dot_acoustic,b_potential_dot_dot_acoustic,Mesh_pointer)
+ b_potential_dot_acoustic,b_potential_dot_dot_acoustic,Mesh_pointer)
endif
! puts elastic initial fields onto GPU
@@ -1396,6 +1403,9 @@
call transfer_b_fields_to_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel,Mesh_pointer)
endif
+ ! synchronizes processes
+ call sync_all()
+
! outputs GPU usage to files for all processes
call output_free_device_memory(myrank)
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/read_mesh_databases.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/read_mesh_databases.f90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/read_mesh_databases.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -436,6 +436,12 @@
! absorbing boundary surface
read(27) num_abs_boundary_faces
+
+ ! checks
+ if( num_abs_boundary_faces < 0 ) then
+ print*,'read_mesh_databases: reading in negative num_abs_boundary_faces ',num_abs_boundary_faces,'...resetting to zero'
+ num_abs_boundary_faces = 0
+ endif
allocate(abs_boundary_ispec(num_abs_boundary_faces), &
abs_boundary_ijk(3,NGLLSQUARE,num_abs_boundary_faces), &
abs_boundary_jacobian2Dw(NGLLSQUARE,num_abs_boundary_faces), &
@@ -696,23 +702,34 @@
!endif
! MPI communications
- allocate(buffer_send_vector_ext_mesh(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
- buffer_recv_vector_ext_mesh(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
- buffer_send_scalar_ext_mesh(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
- buffer_recv_scalar_ext_mesh(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
- request_send_vector_ext_mesh(num_interfaces_ext_mesh), &
- request_recv_vector_ext_mesh(num_interfaces_ext_mesh), &
- request_send_scalar_ext_mesh(num_interfaces_ext_mesh), &
- request_recv_scalar_ext_mesh(num_interfaces_ext_mesh), &
- buffer_send_vector_ext_mesh_s(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
- buffer_recv_vector_ext_mesh_s(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
- buffer_send_vector_ext_mesh_w(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
- buffer_recv_vector_ext_mesh_w(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
- request_send_vector_ext_mesh_s(num_interfaces_ext_mesh), &
- request_recv_vector_ext_mesh_s(num_interfaces_ext_mesh), &
- request_send_vector_ext_mesh_w(num_interfaces_ext_mesh), &
- request_recv_vector_ext_mesh_w(num_interfaces_ext_mesh),stat=ier)
- if( ier /= 0 ) stop 'error allocating array buffer_send_vector_ext_mesh etc.'
+ if( ACOUSTIC_SIMULATION ) then
+ allocate(buffer_send_scalar_ext_mesh(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
+ buffer_recv_scalar_ext_mesh(max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
+ request_send_scalar_ext_mesh(num_interfaces_ext_mesh), &
+ request_recv_scalar_ext_mesh(num_interfaces_ext_mesh), &
+ stat=ier)
+ if( ier /= 0 ) stop 'error allocating array buffer_send_scalar_ext_mesh,.. for acoustic simulations'
+ endif
+ if( ELASTIC_SIMULATION ) then
+ allocate(buffer_send_vector_ext_mesh(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
+ buffer_recv_vector_ext_mesh(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
+ request_send_vector_ext_mesh(num_interfaces_ext_mesh), &
+ request_recv_vector_ext_mesh(num_interfaces_ext_mesh), &
+ stat=ier)
+ if( ier /= 0 ) stop 'error allocating array buffer_send_vector_ext_mesh,.. for elastic simulations'
+ endif
+ if( POROELASTIC_SIMULATION ) then
+ allocate(buffer_send_vector_ext_mesh_s(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
+ buffer_recv_vector_ext_mesh_s(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
+ buffer_send_vector_ext_mesh_w(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
+ buffer_recv_vector_ext_mesh_w(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
+ request_send_vector_ext_mesh_s(num_interfaces_ext_mesh), &
+ request_recv_vector_ext_mesh_s(num_interfaces_ext_mesh), &
+ request_send_vector_ext_mesh_w(num_interfaces_ext_mesh), &
+ request_recv_vector_ext_mesh_w(num_interfaces_ext_mesh), &
+ stat=ier)
+ if( ier /= 0 ) stop 'error allocating array buffer_send_vector_ext_mesh_s,.. for poroelastic simulations'
+ endif
! gets model dimensions
minl = minval( xstore )
@@ -800,31 +817,25 @@
! density kernel
allocate(rho_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
if( ier /= 0 ) stop 'error allocating array rho_kl'
- ! shear modulus kernel
- allocate(mu_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
- if( ier /= 0 ) stop 'error allocating array mu_kl'
- ! compressional modulus kernel
- allocate(kappa_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
- if( ier /= 0 ) stop 'error allocating array kappa_kl'
- ! anisotropic kernels
-!! DK DK commented this out for now; must be made optional
-! allocate(cijkl_kl(21,NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
-! if( ier /= 0 ) stop 'error allocating array cijkl_kl'
-!! DK DK added this for now
- allocate(cijkl_kl(1,1,1,1,1),stat=ier)
+ if (ANISOTROPIC_KL) then
+ ! anisotropic kernels
+ allocate(cijkl_kl(21,NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
+ if( ier /= 0 ) stop 'error allocating array cijkl_kl'
+ !dummy
+ allocate(mu_kl(1,1,1,1))
+ allocate(kappa_kl(1,1,1,1))
+ else
+ ! shear modulus kernel
+ allocate(mu_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
+ if( ier /= 0 ) stop 'error allocating array mu_kl'
+ ! compressional modulus kernel
+ allocate(kappa_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
+ if( ier /= 0 ) stop 'error allocating array kappa_kl'
+ !dummy
+ allocate(cijkl_kl(1,1,1,1,1))
+ endif
- ! derived kernels
- ! density prime kernel
- allocate(rhop_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
- if( ier /= 0 ) stop 'error allocating array rhop_kl'
- ! vp kernel
- allocate(alpha_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
- if( ier /= 0 ) stop 'error allocating array alpha_kl'
- ! vs kernel
- allocate(beta_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
- if( ier /= 0 ) stop 'error allocating array beta_kl'
-
! noise source strength kernel
if (NOISE_TOMOGRAPHY == 3) then
allocate(sigma_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
@@ -843,17 +854,17 @@
! MPI handling
allocate(b_request_send_vector_ext_mesh(num_interfaces_ext_mesh), &
- b_request_recv_vector_ext_mesh(num_interfaces_ext_mesh), &
- b_buffer_send_vector_ext_mesh(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
- b_buffer_recv_vector_ext_mesh(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
+ b_request_recv_vector_ext_mesh(num_interfaces_ext_mesh), &
+ b_buffer_send_vector_ext_mesh(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh), &
+ b_buffer_recv_vector_ext_mesh(NDIM,max_nibool_interfaces_ext_mesh,num_interfaces_ext_mesh),stat=ier)
if( ier /= 0 ) stop 'error allocating array b_request_send_vector_ext_mesh etc.'
! allocates attenuation solids
allocate(b_R_xx(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS), &
- b_R_yy(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS), &
- b_R_xy(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS), &
- b_R_xz(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS), &
- b_R_yz(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS),stat=ier)
+ b_R_yy(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS), &
+ b_R_xy(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS), &
+ b_R_xz(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS), &
+ b_R_yz(NGLLX,NGLLY,NGLLZ,NSPEC_ATTENUATION_AB,N_SLS),stat=ier)
if( ier /= 0 ) stop 'error allocating array b_R_xx etc.'
! note: these arrays are needed for attenuation and/or kernel computations
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/save_adjoint_kernels.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/save_adjoint_kernels.f90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/save_adjoint_kernels.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -68,10 +68,10 @@
enddo ! j
enddo ! k
enddo ! ispec
- open(unit=27,file=prname(1:len_trim(prname))//'weights_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'weights_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file weights_kernel.bin'
- write(27) weights_kernel
- close(27)
+ write(IOUT) weights_kernel
+ close(IOUT)
endif
! for noise simulations --- noise strength kernel
@@ -123,25 +123,25 @@
enddo
! save kernels to binary files
- open(unit=27,file=prname(1:len_trim(prname))//'rho_acoustic_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'rho_acoustic_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file rho_acoustic_kernel.bin'
- write(27) rho_ac_kl
- close(27)
+ write(IOUT) rho_ac_kl
+ close(IOUT)
- open(unit=27,file=prname(1:len_trim(prname))//'kappa_acoustic_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'kappa_acoustic_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file kappa_acoustic_kernel.bin'
- write(27) kappa_ac_kl
- close(27)
+ write(IOUT) kappa_ac_kl
+ close(IOUT)
- open(unit=27,file=prname(1:len_trim(prname))//'rhop_acoustic_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'rhop_acoustic_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file rhop_acoustic_kernel.bin'
- write(27) rhop_ac_kl
- close(27)
+ write(IOUT) rhop_ac_kl
+ close(IOUT)
- open(unit=27,file=prname(1:len_trim(prname))//'alpha_acoustic_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'alpha_acoustic_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file alpha_acoustic_kernel.bin'
- write(27) alpha_ac_kl
- close(27)
+ write(IOUT) alpha_ac_kl
+ close(IOUT)
end subroutine save_kernels_acoustic
@@ -159,15 +159,48 @@
! local parameters
integer:: ispec,i,j,k,iglob,ier
real(kind=CUSTOM_REAL) :: rhol,mul,kappal
- real(kind=CUSTOM_REAL),dimension(21) :: cijkl_kl_local
! Transverse isotropic paramters
real(kind=CUSTOM_REAL) :: A,N,C,L,F,eta
- real(kind=CUSTOM_REAL), dimension(21) :: an_kl
- real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT):: &
- alphav_kl,alphah_kl,betav_kl,betah_kl, &
- eta_kl
+ real(kind=CUSTOM_REAL), dimension(21) :: cijkl_kl_local
+ real(kind=CUSTOM_REAL), dimension(5) :: an_kl
+ real(kind=CUSTOM_REAL), dimension(:,:,:,:),allocatable :: alphav_kl,alphah_kl,betav_kl,betah_kl,eta_kl
+ ! temporary isotropic kernels
+ real(kind=CUSTOM_REAL), dimension(:,:,:,:),allocatable :: rhop_kl,alpha_kl,beta_kl
+
+ ! allocates temporary transversely isotropic kernels
+ if( ANISOTROPIC_KL ) then
+ if( SAVE_TRANSVERSE_KL ) then
+ allocate(alphav_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT), &
+ alphah_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT), &
+ betav_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT), &
+ betah_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT), &
+ eta_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT), &
+ stat=ier)
+ if( ier /=0 ) stop 'error allocating arrays alphav_kl,...'
+
+ ! derived kernels
+ ! vp kernel
+ allocate(alpha_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
+ if( ier /= 0 ) stop 'error allocating array alpha_kl'
+ ! vs kernel
+ allocate(beta_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
+ if( ier /= 0 ) stop 'error allocating array beta_kl'
+ endif
+ else
+ ! derived kernels
+ ! vp kernel
+ allocate(alpha_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
+ if( ier /= 0 ) stop 'error allocating array alpha_kl'
+ ! vs kernel
+ allocate(beta_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
+ if( ier /= 0 ) stop 'error allocating array beta_kl'
+ ! density prime kernel
+ allocate(rhop_kl(NGLLX,NGLLY,NGLLZ,NSPEC_ADJOINT),stat=ier)
+ if( ier /= 0 ) stop 'error allocating array rhop_kl'
+ endif
+
! finalizes calculation of rhop, beta, alpha kernels
do ispec = 1, NSPEC_AB
@@ -180,71 +213,68 @@
iglob = ibool(i,j,k,ispec)
! Store local material values
- rhol = rho_vs(i,j,k,ispec)*rho_vs(i,j,k,ispec) / mustore(i,j,k,ispec)
- mul = mustore(i,j,k,ispec)
- kappal = kappastore(i,j,k,ispec)
+ rhol = rho_vs(i,j,k,ispec)*rho_vs(i,j,k,ispec) / mustore(i,j,k,ispec)
+ mul = mustore(i,j,k,ispec)
+ kappal = kappastore(i,j,k,ispec)
- if (ANISOTROPIC_KL) then
+ if( ANISOTROPIC_KL ) then
+ if( SAVE_TRANSVERSE_KL ) then
+ cijkl_kl_local(:) = - cijkl_kl(:,i,j,k,ispec)
- cijkl_kl_local = - cijkl_kl(:,i,j,k,ispec)
+ ! Computes parameters for an isotropic model
+ A = kappal + FOUR_THIRDS * mul
+ C = A
+ L = mul
+ N = mul
+ F = kappal - 2._CUSTOM_REAL/3._CUSTOM_REAL * mul
+ eta = 1._CUSTOM_REAL
- if (SAVE_TRANSVERSE_KL) then
+ ! note: cijkl_kl_local() is fully anisotropic C_ij kernel components (non-dimensionalized)
+ ! for GLL point at (i,j,k,ispec)
- ! Computes parameters for an isotropic model
- A = kappal + FOUR_THIRDS * mul
- C = A
- L = mul
- N = mul
- F = kappal - 2._CUSTOM_REAL/3._CUSTOM_REAL * mul
- eta = 1._CUSTOM_REAL
+ ! Purpose : compute the kernels for the An coeffs (an_kl)
+ ! from the kernels for Cij (cijkl_kl_local)
- ! note: cijkl_kl_local() is fully anisotropic C_ij kernel components (non-dimensionalized)
- ! for GLL point at (i,j,k,ispec)
+ ! Definition of the input array cij_kl :
+ ! cij_kl(1) = C11 ; cij_kl(2) = C12 ; cij_kl(3) = C13
+ ! cij_kl(4) = C14 ; cij_kl(5) = C15 ; cij_kl(6) = C16
+ ! cij_kl(7) = C22 ; cij_kl(8) = C23 ; cij_kl(9) = C24
+ ! cij_kl(10) = C25 ; cij_kl(11) = C26 ; cij_kl(12) = C33
+ ! cij_kl(13) = C34 ; cij_kl(14) = C35 ; cij_kl(15) = C36
+ ! cij_kl(16) = C44 ; cij_kl(17) = C45 ; cij_kl(18) = C46
+ ! cij_kl(19) = C55 ; cij_kl(20) = C56 ; cij_kl(21) = C66
+ ! where the Cij (Voigt's notation) are defined as function of
+ ! the components of the elastic tensor in spherical coordinates
+ ! by eq. (A.1) of Chen & Tromp, GJI 168 (2007)
- ! Purpose : compute the kernels for the An coeffs (an_kl)
- ! from the kernels for Cij (cijkl_kl_local)
+ ! From the relations giving Cij in function of An
+ ! Checked with Min Chen's results (routine build_cij)
- ! Definition of the input array cij_kl :
- ! cij_kl(1) = C11 ; cij_kl(2) = C12 ; cij_kl(3) = C13
- ! cij_kl(4) = C14 ; cij_kl(5) = C15 ; cij_kl(6) = C16
- ! cij_kl(7) = C22 ; cij_kl(8) = C23 ; cij_kl(9) = C24
- ! cij_kl(10) = C25 ; cij_kl(11) = C26 ; cij_kl(12) = C33
- ! cij_kl(13) = C34 ; cij_kl(14) = C35 ; cij_kl(15) = C36
- ! cij_kl(16) = C44 ; cij_kl(17) = C45 ; cij_kl(18) = C46
- ! cij_kl(19) = C55 ; cij_kl(20) = C56 ; cij_kl(21) = C66
- ! where the Cij (Voigt's notation) are defined as function of
- ! the components of the elastic tensor in spherical coordinates
- ! by eq. (A.1) of Chen & Tromp, GJI 168 (2007)
+ an_kl(1) = cijkl_kl_local(1)+cijkl_kl_local(2)+cijkl_kl_local(7) !A
+ an_kl(2) = cijkl_kl_local(12) !C
+ an_kl(3) = -2*cijkl_kl_local(2)+cijkl_kl_local(21) !N
+ an_kl(4) = cijkl_kl_local(16)+cijkl_kl_local(19) !L
+ an_kl(5) = cijkl_kl_local(3)+cijkl_kl_local(8) !F
- ! From the relations giving Cij in function of An
- ! Checked with Min Chen's results (routine build_cij)
+ ! for parameterization: ( alpha_v, alpha_h, beta_v, beta_h, eta, rho )
+ ! K_alpha_v
+ alphav_kl(i,j,k,ispec) = 2.0 * C * an_kl(2)
+ ! K_alpha_h
+ alphah_kl(i,j,k,ispec) = 2.0 * A * an_kl(1) + 2.0 * A * eta * an_kl(5)
+ ! K_beta_v
+ betav_kl(i,j,k,ispec) = 2.0 * L * an_kl(4) - 4.0 * L * eta * an_kl(5)
+ ! K_beta_h
+ betah_kl(i,j,k,ispec) = 2.0 * N * an_kl(3)
+ ! K_eta
+ eta_kl(i,j,k,ispec) = F * an_kl(5)
- an_kl(1) = cijkl_kl_local(1)+cijkl_kl_local(2)+cijkl_kl_local(7) !A
- an_kl(2) = cijkl_kl_local(12) !C
- an_kl(3) = -2*cijkl_kl_local(2)+cijkl_kl_local(21) !N
- an_kl(4) = cijkl_kl_local(16)+cijkl_kl_local(19) !L
- an_kl(5) = cijkl_kl_local(3)+cijkl_kl_local(8) !F
-
- ! for parameterization: ( alpha_v, alpha_h, beta_v, beta_h, eta, rho )
- ! K_alpha_v
- alphav_kl(i,j,k,ispec) = 2*C*an_kl(2)
- ! K_alpha_h
- alphah_kl(i,j,k,ispec) = 2*A*an_kl(1) + 2*A*eta*an_kl(5)
- ! K_beta_v
- betav_kl(i,j,k,ispec) = 2*L*an_kl(4) - 4*L*eta*an_kl(5)
- ! K_beta_h
- betah_kl(i,j,k,ispec) = 2*N*an_kl(3)
- ! K_eta
- eta_kl(i,j,k,ispec) = F*an_kl(5)
-
- ! to check: isotropic kernels from transverse isotropic ones
- alpha_kl(i,j,k,ispec) = alphav_kl(i,j,k,ispec) &
+ ! to check: isotropic kernels from transverse isotropic ones
+ alpha_kl(i,j,k,ispec) = alphav_kl(i,j,k,ispec) &
+ alphah_kl(i,j,k,ispec)
- beta_kl(i,j,k,ispec) = betav_kl(i,j,k,ispec) &
+ beta_kl(i,j,k,ispec) = betav_kl(i,j,k,ispec) &
+ betah_kl(i,j,k,ispec)
+ endif ! SAVE_TRANSVERSE_KL
- endif ! SAVE_TRANSVERSE_KL
-
else
! isotropic kernels
@@ -283,91 +313,96 @@
enddo
- if (ANISOTROPIC_KL) then
+ if( ANISOTROPIC_KL ) then
+ ! outputs transverse isotropic kernels only
+ if( SAVE_TRANSVERSE_KL ) then
+ ! transverse isotropic kernels
+ ! (alpha_v, alpha_h, beta_v, beta_h, eta, rho ) parameterization
+ open(unit=IOUT,file=trim(prname)//'alphav_kernel.bin',status='unknown',form='unformatted',action='write')
+ write(IOUT) alphav_kl
+ close(IOUT)
+ open(unit=IOUT,file=trim(prname)//'alphah_kernel.bin',status='unknown',form='unformatted',action='write')
+ write(IOUT) alphah_kl
+ close(IOUT)
+ open(unit=IOUT,file=trim(prname)//'betav_kernel.bin',status='unknown',form='unformatted',action='write')
+ write(IOUT) betav_kl
+ close(IOUT)
+ open(unit=IOUT,file=trim(prname)//'betah_kernel.bin',status='unknown',form='unformatted',action='write')
+ write(IOUT) betah_kl
+ close(IOUT)
+ open(unit=IOUT,file=trim(prname)//'eta_kernel.bin',status='unknown',form='unformatted',action='write')
+ write(IOUT) eta_kl
+ close(IOUT)
- ! outputs transverse isotropic kernels only
- if (SAVE_TRANSVERSE_KL) then
- ! transverse isotropic kernels
- ! (alpha_v, alpha_h, beta_v, beta_h, eta, rho ) parameterization
- open(unit=27,file=trim(prname)//'alphav_kernel.bin',status='unknown',form='unformatted',action='write')
- write(27) alphav_kl
- close(27)
- open(unit=27,file=trim(prname)//'alphah_kernel.bin',status='unknown',form='unformatted',action='write')
- write(27) alphah_kl
- close(27)
- open(unit=27,file=trim(prname)//'betav_kernel.bin',status='unknown',form='unformatted',action='write')
- write(27) betav_kl
- close(27)
- open(unit=27,file=trim(prname)//'betah_kernel.bin',status='unknown',form='unformatted',action='write')
- write(27) betah_kl
- close(27)
- open(unit=27,file=trim(prname)//'eta_kernel.bin',status='unknown',form='unformatted',action='write')
- write(27) eta_kl
- close(27)
-
- ! transverse isotropic test kernels
- open(unit=27,file=trim(prname)//'alpha_kernel.bin',status='unknown',form='unformatted',action='write')
- write(27) alpha_kl
- close(27)
- open(unit=27,file=trim(prname)//'beta_kernel.bin',status='unknown',form='unformatted',action='write')
- write(27) beta_kl
- close(27)
-
- else
- ! fully anisotropic kernels
- ! note: the C_ij and density kernels are not for relative perturbations (delta ln( m_i) = delta m_i / m_i),
- ! but absolute perturbations (delta m_i = m_i - m_0).
- ! Kappa and mu are for absolute perturbations, can be used to check with purely isotropic versions.
- open(unit=27,file=trim(prname)//'rho_kernel.bin',status='unknown',form='unformatted',action='write')
- write(27) - rho_kl
- close(27)
- open(unit=27,file=trim(prname)//'cijkl_kernel.bin',status='unknown',form='unformatted',action='write')
- write(27) - cijkl_kl
- close(27)
-
- endif
-
+ ! transverse isotropic test kernels
+ open(unit=IOUT,file=trim(prname)//'alpha_kernel.bin',status='unknown',form='unformatted',action='write')
+ write(IOUT) alpha_kl
+ close(IOUT)
+ open(unit=IOUT,file=trim(prname)//'beta_kernel.bin',status='unknown',form='unformatted',action='write')
+ write(IOUT) beta_kl
+ close(IOUT)
+ else
+ ! fully anisotropic kernels
+ ! note: the C_ij and density kernels are not for relative perturbations (delta ln( m_i) = delta m_i / m_i),
+ ! but absolute perturbations (delta m_i = m_i - m_0).
+ ! Kappa and mu are for absolute perturbations, can be used to check with purely isotropic versions.
+ open(unit=IOUT,file=trim(prname)//'rho_kernel.bin',status='unknown',form='unformatted',action='write')
+ write(IOUT) - rho_kl
+ close(IOUT)
+ open(unit=IOUT,file=trim(prname)//'cijkl_kernel.bin',status='unknown',form='unformatted',action='write')
+ write(IOUT) - cijkl_kl
+ close(IOUT)
+ endif
else
-
! save kernels to binary files
- open(unit=27,file=prname(1:len_trim(prname))//'rho_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'rho_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file rho_kernel.bin'
- write(27) rho_kl
- close(27)
+ write(IOUT) rho_kl
+ close(IOUT)
- open(unit=27,file=prname(1:len_trim(prname))//'mu_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'mu_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file mu_kernel.bin'
- write(27) mu_kl
- close(27)
+ write(IOUT) mu_kl
+ close(IOUT)
- open(unit=27,file=prname(1:len_trim(prname))//'kappa_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'kappa_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file kappa_kernel.bin'
- write(27) kappa_kl
- close(27)
+ write(IOUT) kappa_kl
+ close(IOUT)
- open(unit=27,file=prname(1:len_trim(prname))//'rhop_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'rhop_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file rhop_kernel.bin'
- write(27) rhop_kl
- close(27)
+ write(IOUT) rhop_kl
+ close(IOUT)
- open(unit=27,file=prname(1:len_trim(prname))//'beta_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'beta_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file beta_kernel.bin'
- write(27) beta_kl
- close(27)
+ write(IOUT) beta_kl
+ close(IOUT)
- open(unit=27,file=prname(1:len_trim(prname))//'alpha_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'alpha_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file alpha_kernel.bin'
- write(27) alpha_kl
- close(27)
- endif
+ write(IOUT) alpha_kl
+ close(IOUT)
+ endif ! ANISOTROPIC_KL
if (SAVE_MOHO_MESH) then
- open(unit=27,file=prname(1:len_trim(prname))//'moho_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'moho_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file moho_kernel.bin'
- write(27) moho_kl
- close(27)
+ write(IOUT) moho_kl
+ close(IOUT)
endif
+ ! frees temporary arrays
+ if( ANISOTROPIC_KL ) then
+ if( SAVE_TRANSVERSE_KL ) then
+ deallocate(alphav_kl,alphah_kl,betav_kl,betah_kl,eta_kl)
+ deallocate(alpha_kl,beta_kl)
+ endif
+ else
+ deallocate(rhop_kl,alpha_kl,beta_kl)
+ endif
+
end subroutine save_kernels_elastic
!
@@ -610,98 +645,98 @@
! save kernels to binary files
! primary kernels
- open(unit=27,file=prname(1:len_trim(prname))//'rhot_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'rhot_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file rhot_primeporo_kernel.bin'
- write(27) rhot_kl
- close(27)
- open(unit=27,file=prname(1:len_trim(prname))//'rhof_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ write(IOUT) rhot_kl
+ close(IOUT)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'rhof_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file rhof_primeporo_kernel.bin'
- write(27) rhof_kl
- close(27)
- open(unit=27,file=prname(1:len_trim(prname))//'sm_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ write(IOUT) rhof_kl
+ close(IOUT)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'sm_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file sm_primeporo_kernel.bin'
- write(27) sm_kl
- close(27)
- open(unit=27,file=prname(1:len_trim(prname))//'eta_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ write(IOUT) sm_kl
+ close(IOUT)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'eta_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file eta_primeporo_kernel.bin'
- write(27) eta_kl
- close(27)
- open(unit=27,file=prname(1:len_trim(prname))//'mufr_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ write(IOUT) eta_kl
+ close(IOUT)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'mufr_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file mufr_primeporo_kernel.bin'
- write(27) mufr_kl
- close(27)
- open(unit=27,file=prname(1:len_trim(prname))//'B_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ write(IOUT) mufr_kl
+ close(IOUT)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'B_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file B_primeporo_kernel.bin'
- write(27) B_kl
- close(27)
- open(unit=27,file=prname(1:len_trim(prname))//'C_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ write(IOUT) B_kl
+ close(IOUT)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'C_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file C_primeporo_kernel.bin'
- write(27) C_kl
- close(27)
- open(unit=27,file=prname(1:len_trim(prname))//'M_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ write(IOUT) C_kl
+ close(IOUT)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'M_primeporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file M_primeporo_kernel.bin'
- write(27) M_kl
- close(27)
+ write(IOUT) M_kl
+ close(IOUT)
! density kernels
- open(unit=27,file=prname(1:len_trim(prname))//'rhob_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'rhob_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file rhob_densityporo_kernel.bin'
- write(27) rhob_kl
- close(27)
- open(unit=27,file=prname(1:len_trim(prname))//'rhofb_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ write(IOUT) rhob_kl
+ close(IOUT)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'rhofb_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file rhofb_densityporo_kernel.bin'
- write(27) rhofb_kl
- close(27)
- open(unit=27,file=prname(1:len_trim(prname))//'phi_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ write(IOUT) rhofb_kl
+ close(IOUT)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'phi_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file phi_densityporo_kernel.bin'
- write(27) phi_kl
- close(27)
- open(unit=27,file=prname(1:len_trim(prname))//'mufrb_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ write(IOUT) phi_kl
+ close(IOUT)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'mufrb_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file mufrb_densityporo_kernel.bin'
- write(27) mufrb_kl
- close(27)
- open(unit=27,file=prname(1:len_trim(prname))//'Bb_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ write(IOUT) mufrb_kl
+ close(IOUT)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'Bb_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file Bb_densityporo_kernel.bin'
- write(27) Bb_kl
- close(27)
- open(unit=27,file=prname(1:len_trim(prname))//'Cb_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ write(IOUT) Bb_kl
+ close(IOUT)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'Cb_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file Cb_densityporo_kernel.bin'
- write(27) Cb_kl
- close(27)
- open(unit=27,file=prname(1:len_trim(prname))//'Mb_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ write(IOUT) Cb_kl
+ close(IOUT)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'Mb_densityporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file Mb_densityporo_kernel.bin'
- write(27) Mb_kl
- close(27)
+ write(IOUT) Mb_kl
+ close(IOUT)
! wavespeed kernels
- open(unit=27,file=prname(1:len_trim(prname))//'rhobb_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'rhobb_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file rhobb_waveporo_kernel.bin'
- write(27) rhobb_kl
- close(27)
- open(unit=27,file=prname(1:len_trim(prname))//'rhofbb_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ write(IOUT) rhobb_kl
+ close(IOUT)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'rhofbb_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file rhofbb_waveporo_kernel.bin'
- write(27) rhofbb_kl
- close(27)
- open(unit=27,file=prname(1:len_trim(prname))//'phib_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ write(IOUT) rhofbb_kl
+ close(IOUT)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'phib_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file phib_waveporo_kernel.bin'
- write(27) phib_kl
- close(27)
- open(unit=27,file=prname(1:len_trim(prname))//'cs_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ write(IOUT) phib_kl
+ close(IOUT)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'cs_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file cs_waveporo_kernel.bin'
- write(27) cs_kl
- close(27)
- open(unit=27,file=prname(1:len_trim(prname))//'cpI_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ write(IOUT) cs_kl
+ close(IOUT)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'cpI_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file cpI_waveporo_kernel.bin'
- write(27) cpI_kl
- close(27)
- open(unit=27,file=prname(1:len_trim(prname))//'cpII_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ write(IOUT) cpI_kl
+ close(IOUT)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'cpII_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file cpII_waveporo_kernel.bin'
- write(27) cpII_kl
- close(27)
- open(unit=27,file=prname(1:len_trim(prname))//'ratio_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
+ write(IOUT) cpII_kl
+ close(IOUT)
+ open(unit=IOUT,file=prname(1:len_trim(prname))//'ratio_waveporo_kernel.bin',status='unknown',form='unformatted',iostat=ier)
if( ier /= 0 ) stop 'error opening file ratio_waveporo_kernel.bin'
- write(27) ratio_kl
- close(27)
+ write(IOUT) ratio_kl
+ close(IOUT)
end subroutine save_kernels_poroelastic
@@ -724,11 +759,11 @@
hess_ac_kl(:,:,:,:) = 2._CUSTOM_REAL * hess_ac_kl(:,:,:,:)
! stores into file
- open(unit=27,file=trim(prname)//'hess_acoustic_kernel.bin', &
+ open(unit=IOUT,file=trim(prname)//'hess_acoustic_kernel.bin', &
status='unknown',form='unformatted',action='write',iostat=ier)
if( ier /= 0 ) stop 'error opening file hess_acoustic_kernel.bin'
- write(27) hess_ac_kl
- close(27)
+ write(IOUT) hess_ac_kl
+ close(IOUT)
endif
! elastic domains
@@ -737,11 +772,11 @@
hess_kl(:,:,:,:) = 2._CUSTOM_REAL * hess_kl(:,:,:,:)
! stores into file
- open(unit=27,file=trim(prname)//'hess_kernel.bin', &
+ open(unit=IOUT,file=trim(prname)//'hess_kernel.bin', &
status='unknown',form='unformatted',action='write',iostat=ier)
if( ier /= 0 ) stop 'error opening file hess_kernel.bin'
- write(27) hess_kl
- close(27)
+ write(IOUT) hess_kl
+ close(IOUT)
endif
end subroutine save_kernels_hessian
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/setup_sources_receivers.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/setup_sources_receivers.f90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/setup_sources_receivers.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -896,7 +896,7 @@
enddo
! writes out to VTK file
- write(IOVTK,*) xmesh,ymesh,zmesh
+ write(IOVTK,'(3e18.6)') xmesh,ymesh,zmesh
endif
enddo ! NSOURCES
endif
@@ -941,7 +941,7 @@
enddo
! writes out to VTK file
- write(IOVTK,*) xmesh,ymesh,zmesh
+ write(IOVTK,'(3e18.6)') xmesh,ymesh,zmesh
endif
enddo
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/specfem3D_par.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/specfem3D_par.f90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/specfem3D_par.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -363,8 +363,7 @@
real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: b_epsilon_trace_over_3
! adjoint kernels
- real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: rho_kl, mu_kl, kappa_kl, &
- rhop_kl, beta_kl, alpha_kl
+ real(kind=CUSTOM_REAL), dimension(:,:,:,:), allocatable :: rho_kl, mu_kl, kappa_kl
! anisotropic kernels
real(kind=CUSTOM_REAL), dimension(:,:,:,:,:), allocatable :: cijkl_kl
Added: seismo/3D/SPECFEM3D/trunk/src/specfem3D/update_displacement_scheme.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/update_displacement_scheme.f90 (rev 0)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/update_displacement_scheme.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -0,0 +1,259 @@
+!=====================================================================
+!
+! S p e c f e m 3 D V e r s i o n 2 . 1
+! ---------------------------------------
+!
+! Main authors: Dimitri Komatitsch and Jeroen Tromp
+! Princeton University, USA and CNRS / INRIA / University of Pau
+! (c) Princeton University / California Institute of Technology and CNRS / INRIA / University of Pau
+! July 2012
+!
+! This program is free software; you can redistribute it and/or modify
+! it under the terms of the GNU General Public License as published by
+! the Free Software Foundation; either version 2 of the License, or
+! (at your option) any later version.
+!
+! This program is distributed in the hope that it will be useful,
+! but WITHOUT ANY WARRANTY; without even the implied warranty of
+! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+! GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along
+! with this program; if not, write to the Free Software Foundation, Inc.,
+! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+!
+!=====================================================================
+
+
+ subroutine update_displacement_scheme()
+
+! explicit Newmark time scheme with acoustic & elastic domains:
+! (see e.g. Hughes, 1987; Chaljub et al., 2003)
+!
+! chi(t+delta_t) = chi(t) + delta_t chi_dot(t) + 1/2 delta_t**2 chi_dot_dot(t)
+! chi_dot(t+delta_t) = chi_dot(t) + 1/2 delta_t chi_dot_dot(t) + 1/2 delta_t chi_dot_dot(t+delta_t)
+! chi_dot_dot(t+delta_t) = 1/M_acoustic( -K_acoustic chi(t+delta) + B_acoustic u(t+delta_t) + f(t+delta_t) )
+!
+! u(t+delta_t) = u(t) + delta_t v(t) + 1/2 delta_t**2 a(t)
+! v(t+delta_t) = v(t) + 1/2 delta_t a(t) + 1/2 delta_t a(t+delta_t)
+! a(t+delta_t) = 1/M_elastic ( -K_elastic u(t+delta) + B_elastic chi_dot_dot(t+delta_t) + f( t+delta_t) )
+!
+! where
+! chi, chi_dot, chi_dot_dot are acoustic (fluid) potentials ( dotted with respect to time)
+! u, v, a are displacement,velocity & acceleration
+! M is mass matrix, K stiffness matrix and B boundary term for acoustic/elastic domains
+! f denotes a source term (acoustic/elastic)
+!
+! note that this stage calculates the predictor terms
+!
+! for
+! potential chi_dot(t+delta) requires + 1/2 delta_t chi_dot_dot(t+delta_t)
+! at a later stage (corrector) once where chi_dot_dot(t+delta) is calculated
+! and similar,
+! velocity v(t+delta_t) requires + 1/2 delta_t a(t+delta_t)
+! at a later stage once where a(t+delta) is calculated
+! also:
+! boundary term B_elastic requires chi_dot_dot(t+delta)
+! thus chi_dot_dot has to be updated first before the elastic boundary term is considered
+
+ use specfem_par
+ use specfem_par_acoustic
+ use specfem_par_elastic
+ use specfem_par_poroelastic
+ use pml_par
+
+ implicit none
+
+ ! time marching
+
+ ! acoustic domain
+ if( ACOUSTIC_SIMULATION ) call update_displacement_acoustic()
+
+ ! elastic domain
+ if( ELASTIC_SIMULATION ) call update_displacement_elastic()
+
+ ! poroelastic domain
+ if( POROELASTIC_SIMULATION ) call update_displacement_poroelastic()
+
+ ! adjoint simulations: moho kernel
+ if (SAVE_MOHO_MESH .and. SIMULATION_TYPE == 3) then
+ ispec2D_moho_top = 0
+ ispec2D_moho_bot = 0
+ endif
+
+ end subroutine update_displacement_scheme
+
+!
+!--------------------------------------------------------------------------------------------------------------
+!
+
+ subroutine update_displacement_acoustic()
+
+! updates acoustic potentials
+
+ use specfem_par
+ use specfem_par_acoustic
+ use pml_par
+
+ implicit none
+
+ ! Newmark time marching
+
+ if( .not. GPU_MODE ) then
+ ! wavefields on CPU
+
+ ! updates (forward) acoustic potentials
+ potential_acoustic(:) = potential_acoustic(:) &
+ + deltat * potential_dot_acoustic(:) &
+ + deltatsqover2 * potential_dot_dot_acoustic(:)
+ potential_dot_acoustic(:) = potential_dot_acoustic(:) &
+ + deltatover2 * potential_dot_dot_acoustic(:)
+ potential_dot_dot_acoustic(:) = 0._CUSTOM_REAL
+
+ ! adjoint simulations
+ if( SIMULATION_TYPE == 3 ) then
+ ! updates acoustic backward/reconstructed fields
+ if( PML_CONDITIONS )then
+ if( nglob_interface_PML_acoustic > 0 )then
+ call read_potential_on_pml_interface(b_potential_dot_dot_acoustic,b_potential_dot_acoustic,b_potential_acoustic,&
+ nglob_interface_PML_acoustic,b_PML_potential,b_reclen_PML_potential)
+ endif
+ endif
+ b_potential_acoustic(:) = b_potential_acoustic(:) &
+ + b_deltat * b_potential_dot_acoustic(:) &
+ + b_deltatsqover2 * b_potential_dot_dot_acoustic(:)
+ b_potential_dot_acoustic(:) = b_potential_dot_acoustic(:) &
+ + b_deltatover2 * b_potential_dot_dot_acoustic(:)
+ b_potential_dot_dot_acoustic(:) = 0._CUSTOM_REAL
+ endif
+
+ else
+ ! wavefields on GPU
+ ! check
+ if( SIMULATION_TYPE == 3 ) then
+ if( PML_CONDITIONS )then
+ call exit_MPI(myrank,'acoustic time marching scheme with PML_CONDITIONS on GPU not implemented yet...')
+ endif
+ endif
+
+ ! updates acoustic potentials
+ call it_update_displacement_ac_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,b_deltat,b_deltatsqover2,b_deltatover2)
+ endif ! GPU_MODE
+
+ end subroutine update_displacement_acoustic
+
+
+!
+!--------------------------------------------------------------------------------------------------------------
+!
+
+
+ subroutine update_displacement_elastic()
+
+! updates elastic wavefields
+
+ use specfem_par
+ use specfem_par_elastic
+ use pml_par
+
+ implicit none
+
+ ! Newmark time marching
+
+ if( .not. GPU_MODE ) then
+ ! wavefields on CPU
+
+ ! updates elastic displacement and velocity
+ displ(:,:) = displ(:,:) + deltat*veloc(:,:) + deltatsqover2*accel(:,:)
+ veloc(:,:) = veloc(:,:) + deltatover2*accel(:,:)
+ if( SIMULATION_TYPE /= 1 ) accel_adj_coupling(:,:) = accel(:,:)
+ accel(:,:) = 0._CUSTOM_REAL
+
+ ! adjoint simulations
+ if( SIMULATION_TYPE == 3 ) then
+ ! elastic backward fields
+ if(PML_CONDITIONS)then
+ if(nglob_interface_PML_elastic > 0)then
+ call read_field_on_pml_interface(b_accel,b_veloc,b_displ,nglob_interface_PML_elastic,&
+ b_PML_field,b_reclen_PML_field)
+ endif
+ endif
+ b_displ(:,:) = b_displ(:,:) + b_deltat*b_veloc(:,:) + b_deltatsqover2*b_accel(:,:)
+ b_veloc(:,:) = b_veloc(:,:) + b_deltatover2*b_accel(:,:)
+ b_accel(:,:) = 0._CUSTOM_REAL
+ endif
+
+ else
+ ! wavefields on GPU
+
+ ! check
+ if( SIMULATION_TYPE == 3 ) then
+ if( PML_CONDITIONS )then
+ if(nglob_interface_PML_elastic > 0)then
+ call exit_MPI(myrank,'elastic time marching scheme with PML_CONDITIONS on GPU not implemented yet...')
+ endif
+ endif
+ endif
+
+ ! updates elastic displacement and velocity
+ ! Includes SIM_TYPE 1 & 3 (for noise tomography)
+ call it_update_displacement_cuda(Mesh_pointer,deltat,deltatsqover2,deltatover2,b_deltat,b_deltatsqover2,b_deltatover2)
+ endif ! GPU_MODE
+
+ end subroutine update_displacement_elastic
+
+!
+!--------------------------------------------------------------------------------------------------------------
+!
+
+ subroutine update_displacement_poroelastic()
+
+! updates poroelastic wavefields
+
+ use specfem_par
+ use specfem_par_poroelastic
+
+ implicit none
+
+ ! Newmark time marching
+
+ if( .not. GPU_MODE ) then
+ ! wavefields on CPU
+
+ ! updates poroelastic displacements and velocities
+ ! solid phase
+ displs_poroelastic(:,:) = displs_poroelastic(:,:) + deltat*velocs_poroelastic(:,:) + &
+ deltatsqover2*accels_poroelastic(:,:)
+ velocs_poroelastic(:,:) = velocs_poroelastic(:,:) + deltatover2*accels_poroelastic(:,:)
+ accels_poroelastic(:,:) = 0._CUSTOM_REAL
+
+ ! fluid phase
+ displw_poroelastic(:,:) = displw_poroelastic(:,:) + deltat*velocw_poroelastic(:,:) + &
+ deltatsqover2*accelw_poroelastic(:,:)
+ velocw_poroelastic(:,:) = velocw_poroelastic(:,:) + deltatover2*accelw_poroelastic(:,:)
+ accelw_poroelastic(:,:) = 0._CUSTOM_REAL
+
+ ! adjoint simulations
+ if( SIMULATION_TYPE == 3 ) then
+ ! poroelastic backward fields
+ ! solid phase
+ b_displs_poroelastic(:,:) = b_displs_poroelastic(:,:) + b_deltat*b_velocs_poroelastic(:,:) + &
+ b_deltatsqover2*b_accels_poroelastic(:,:)
+ b_velocs_poroelastic(:,:) = b_velocs_poroelastic(:,:) + b_deltatover2*b_accels_poroelastic(:,:)
+ b_accels_poroelastic(:,:) = 0._CUSTOM_REAL
+
+ ! fluid phase
+ b_displw_poroelastic(:,:) = b_displw_poroelastic(:,:) + b_deltat*b_velocw_poroelastic(:,:) + &
+ b_deltatsqover2*b_accelw_poroelastic(:,:)
+ b_velocw_poroelastic(:,:) = b_velocw_poroelastic(:,:) + b_deltatover2*b_accelw_poroelastic(:,:)
+ b_accelw_poroelastic(:,:) = 0._CUSTOM_REAL
+ endif
+
+ else
+ ! wavefields on GPU
+ call exit_MPI(myrank,'poroelastic time marching scheme on GPU not implemented yet...')
+ endif ! GPU_MODE
+
+ end subroutine update_displacement_poroelastic
+
+
Modified: seismo/3D/SPECFEM3D/trunk/src/specfem3D/write_seismograms.f90
===================================================================
--- seismo/3D/SPECFEM3D/trunk/src/specfem3D/write_seismograms.f90 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/src/specfem3D/write_seismograms.f90 2013-08-20 14:13:26 UTC (rev 22718)
@@ -87,204 +87,204 @@
if(.not. GPU_MODE .or. (GPU_MODE .and. (.not. USE_CUDA_SEISMOGRAMS))) then
- do irec_local = 1,nrec_local
+ do irec_local = 1,nrec_local
- ! gets global number of that receiver
- irec = number_receiver_global(irec_local)
+ ! gets global number of that receiver
+ irec = number_receiver_global(irec_local)
- ! gets local receiver interpolators
- ! (1-D Lagrange interpolators)
- hxir(:) = hxir_store(irec_local,:)
- hetar(:) = hetar_store(irec_local,:)
- hgammar(:) = hgammar_store(irec_local,:)
+ ! gets local receiver interpolators
+ ! (1-D Lagrange interpolators)
+ hxir(:) = hxir_store(irec_local,:)
+ hetar(:) = hetar_store(irec_local,:)
+ hgammar(:) = hgammar_store(irec_local,:)
- ! forward simulations
- select case( SIMULATION_TYPE )
- case( 1 )
+ ! forward simulations
+ select case( SIMULATION_TYPE )
+ case( 1 )
- ! receiver's spectral element
- ispec = ispec_selected_rec(irec)
+ ! receiver's spectral element
+ ispec = ispec_selected_rec(irec)
- ! elastic wave field
- if( ispec_is_elastic(ispec) ) then
- ! interpolates displ/veloc/accel at receiver locations
- call compute_interpolated_dva(displ,veloc,accel,NGLOB_AB, &
- ispec,NSPEC_AB,ibool, &
- xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
- hxir,hetar,hgammar, &
- dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
- endif !elastic
+ ! elastic wave field
+ if( ispec_is_elastic(ispec) ) then
+ ! interpolates displ/veloc/accel at receiver locations
+ call compute_interpolated_dva(displ,veloc,accel,NGLOB_AB, &
+ ispec,NSPEC_AB,ibool, &
+ xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+ hxir,hetar,hgammar, &
+ dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
+ endif !elastic
- ! acoustic wave field
- if( ispec_is_acoustic(ispec) ) then
- ! displacement vector
- call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
- potential_acoustic, displ_element,&
- hprime_xx,hprime_yy,hprime_zz, &
- xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
- ibool,rhostore,GRAVITY)
- ! velocity vector
- call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
- potential_dot_acoustic, veloc_element,&
- hprime_xx,hprime_yy,hprime_zz, &
- xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
- ibool,rhostore,GRAVITY)
+ ! acoustic wave field
+ if( ispec_is_acoustic(ispec) ) then
+ ! displacement vector
+ call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
+ potential_acoustic, displ_element,&
+ hprime_xx,hprime_yy,hprime_zz, &
+ xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+ ibool,rhostore,GRAVITY)
+ ! velocity vector
+ call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
+ potential_dot_acoustic, veloc_element,&
+ hprime_xx,hprime_yy,hprime_zz, &
+ xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+ ibool,rhostore,GRAVITY)
- ! interpolates displ/veloc/pressure at receiver locations
- call compute_interpolated_dva_ac(displ_element,veloc_element,&
- potential_dot_dot_acoustic,potential_dot_acoustic,&
- potential_acoustic,NGLOB_AB, &
- ispec,NSPEC_AB,ibool, &
- xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
- hxir,hetar,hgammar, &
- dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
- endif ! acoustic
+ ! interpolates displ/veloc/pressure at receiver locations
+ call compute_interpolated_dva_ac(displ_element,veloc_element,&
+ potential_dot_dot_acoustic,potential_dot_acoustic,&
+ potential_acoustic,NGLOB_AB, &
+ ispec,NSPEC_AB,ibool, &
+ xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+ hxir,hetar,hgammar, &
+ dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
+ endif ! acoustic
- ! poroelastic wave field
- if( ispec_is_poroelastic(ispec) ) then
- ! interpolates displ/veloc/accel at receiver locations
- ! call compute_interpolated_dva(displw_poroelastic,velocw_poroelastic,accelw_poroelastic,NGLOB_AB, &
- call compute_interpolated_dva(displs_poroelastic,velocs_poroelastic,accels_poroelastic,NGLOB_AB, &
- ispec,NSPEC_AB,ibool, &
- xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
- hxir,hetar,hgammar, &
- dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
- endif !poroelastic
+ ! poroelastic wave field
+ if( ispec_is_poroelastic(ispec) ) then
+ ! interpolates displ/veloc/accel at receiver locations
+ ! call compute_interpolated_dva(displw_poroelastic,velocw_poroelastic,accelw_poroelastic,NGLOB_AB, &
+ call compute_interpolated_dva(displs_poroelastic,velocs_poroelastic,accels_poroelastic,NGLOB_AB, &
+ ispec,NSPEC_AB,ibool, &
+ xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+ hxir,hetar,hgammar, &
+ dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
+ endif !poroelastic
- !adjoint simulations
- case( 2 )
+ !adjoint simulations
+ case( 2 )
- ! adjoint source is placed at receiver
- ispec = ispec_selected_source(irec)
+ ! adjoint source is placed at receiver
+ ispec = ispec_selected_source(irec)
- ! elastic wave field
- if( ispec_is_elastic(ispec) ) then
- ! interpolates displ/veloc/accel at receiver locations
- call compute_interpolated_dva(displ,veloc,accel,NGLOB_AB, &
- ispec,NSPEC_AB,ibool, &
- xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
- hxir,hetar,hgammar, &
- dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
+ ! elastic wave field
+ if( ispec_is_elastic(ispec) ) then
+ ! interpolates displ/veloc/accel at receiver locations
+ call compute_interpolated_dva(displ,veloc,accel,NGLOB_AB, &
+ ispec,NSPEC_AB,ibool, &
+ xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+ hxir,hetar,hgammar, &
+ dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
- ! stores elements displacement field
- do k = 1,NGLLZ
- do j = 1,NGLLY
- do i = 1,NGLLX
- iglob = ibool(i,j,k,ispec)
- displ_element(:,i,j,k) = displ(:,iglob)
+ ! stores elements displacement field
+ do k = 1,NGLLZ
+ do j = 1,NGLLY
+ do i = 1,NGLLX
+ iglob = ibool(i,j,k,ispec)
+ displ_element(:,i,j,k) = displ(:,iglob)
+ enddo
enddo
enddo
- enddo
- ! gets derivatives of local receiver interpolators
- hpxir(:) = hpxir_store(irec_local,:)
- hpetar(:) = hpetar_store(irec_local,:)
- hpgammar(:) = hpgammar_store(irec_local,:)
+ ! gets derivatives of local receiver interpolators
+ hpxir(:) = hpxir_store(irec_local,:)
+ hpetar(:) = hpetar_store(irec_local,:)
+ hpgammar(:) = hpgammar_store(irec_local,:)
- ! computes the integrated derivatives of source parameters (M_jk and X_s)
- call compute_adj_source_frechet(displ_element,Mxx(irec),Myy(irec),Mzz(irec),&
- Mxy(irec),Mxz(irec),Myz(irec),eps_s,eps_m_s, &
- hxir,hetar,hgammar,hpxir,hpetar,hpgammar, &
- hprime_xx,hprime_yy,hprime_zz, &
- xix(:,:,:,ispec),xiy(:,:,:,ispec),xiz(:,:,:,ispec), &
- etax(:,:,:,ispec),etay(:,:,:,ispec),etaz(:,:,:,ispec), &
- gammax(:,:,:,ispec),gammay(:,:,:,ispec),gammaz(:,:,:,ispec))
+ ! computes the integrated derivatives of source parameters (M_jk and X_s)
+ call compute_adj_source_frechet(displ_element,Mxx(irec),Myy(irec),Mzz(irec),&
+ Mxy(irec),Mxz(irec),Myz(irec),eps_s,eps_m_s, &
+ hxir,hetar,hgammar,hpxir,hpetar,hpgammar, &
+ hprime_xx,hprime_yy,hprime_zz, &
+ xix(:,:,:,ispec),xiy(:,:,:,ispec),xiz(:,:,:,ispec), &
+ etax(:,:,:,ispec),etay(:,:,:,ispec),etaz(:,:,:,ispec), &
+ gammax(:,:,:,ispec),gammay(:,:,:,ispec),gammaz(:,:,:,ispec))
- stf = comp_source_time_function(dble(NSTEP-it)*DT-t0-tshift_src(irec),hdur_gaussian(irec))
- stf_deltat = stf * deltat
- Mxx_der(irec_local) = Mxx_der(irec_local) + eps_s(1,1) * stf_deltat
- Myy_der(irec_local) = Myy_der(irec_local) + eps_s(2,2) * stf_deltat
- Mzz_der(irec_local) = Mzz_der(irec_local) + eps_s(3,3) * stf_deltat
- Mxy_der(irec_local) = Mxy_der(irec_local) + 2 * eps_s(1,2) * stf_deltat
- Mxz_der(irec_local) = Mxz_der(irec_local) + 2 * eps_s(1,3) * stf_deltat
- Myz_der(irec_local) = Myz_der(irec_local) + 2 * eps_s(2,3) * stf_deltat
+ stf = comp_source_time_function(dble(NSTEP-it)*DT-t0-tshift_src(irec),hdur_gaussian(irec))
+ stf_deltat = stf * deltat
+ Mxx_der(irec_local) = Mxx_der(irec_local) + eps_s(1,1) * stf_deltat
+ Myy_der(irec_local) = Myy_der(irec_local) + eps_s(2,2) * stf_deltat
+ Mzz_der(irec_local) = Mzz_der(irec_local) + eps_s(3,3) * stf_deltat
+ Mxy_der(irec_local) = Mxy_der(irec_local) + 2 * eps_s(1,2) * stf_deltat
+ Mxz_der(irec_local) = Mxz_der(irec_local) + 2 * eps_s(1,3) * stf_deltat
+ Myz_der(irec_local) = Myz_der(irec_local) + 2 * eps_s(2,3) * stf_deltat
- sloc_der(:,irec_local) = sloc_der(:,irec_local) + eps_m_s(:) * stf_deltat
- endif ! elastic
+ sloc_der(:,irec_local) = sloc_der(:,irec_local) + eps_m_s(:) * stf_deltat
+ endif ! elastic
- ! acoustic wave field
- if( ispec_is_acoustic(ispec) ) then
- ! displacement vector
- call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
- potential_acoustic, displ_element,&
- hprime_xx,hprime_yy,hprime_zz, &
- xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
- ibool,rhostore,GRAVITY)
- ! velocity vector
- call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
- potential_dot_acoustic, veloc_element,&
- hprime_xx,hprime_yy,hprime_zz, &
- xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
- ibool,rhostore,GRAVITY)
+ ! acoustic wave field
+ if( ispec_is_acoustic(ispec) ) then
+ ! displacement vector
+ call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
+ potential_acoustic, displ_element,&
+ hprime_xx,hprime_yy,hprime_zz, &
+ xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+ ibool,rhostore,GRAVITY)
+ ! velocity vector
+ call compute_gradient(ispec,NSPEC_AB,NGLOB_AB, &
+ potential_dot_acoustic, veloc_element,&
+ hprime_xx,hprime_yy,hprime_zz, &
+ xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+ ibool,rhostore,GRAVITY)
- ! interpolates displ/veloc/pressure at receiver locations
- call compute_interpolated_dva_ac(displ_element,veloc_element,&
- potential_dot_dot_acoustic,potential_dot_acoustic,&
- potential_acoustic,NGLOB_AB, &
- ispec,NSPEC_AB,ibool, &
- xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
- hxir,hetar,hgammar, &
- dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
- endif ! acoustic
+ ! interpolates displ/veloc/pressure at receiver locations
+ call compute_interpolated_dva_ac(displ_element,veloc_element,&
+ potential_dot_dot_acoustic,potential_dot_acoustic,&
+ potential_acoustic,NGLOB_AB, &
+ ispec,NSPEC_AB,ibool, &
+ xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+ hxir,hetar,hgammar, &
+ dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
+ endif ! acoustic
- !adjoint simulations
- case( 3 )
+ !adjoint simulations
+ case( 3 )
- ispec = ispec_selected_rec(irec)
+ ispec = ispec_selected_rec(irec)
- ! elastic wave field
- if( ispec_is_elastic(ispec) ) then
- ! backward fields: interpolates displ/veloc/accel at receiver locations
- call compute_interpolated_dva(b_displ,b_veloc,b_accel,NGLOB_ADJOINT,&
- ispec,NSPEC_AB,ibool, &
- xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
- hxir,hetar,hgammar, &
- dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
- endif ! elastic
+ ! elastic wave field
+ if( ispec_is_elastic(ispec) ) then
+ ! backward fields: interpolates displ/veloc/accel at receiver locations
+ call compute_interpolated_dva(b_displ,b_veloc,b_accel,NGLOB_ADJOINT,&
+ ispec,NSPEC_AB,ibool, &
+ xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+ hxir,hetar,hgammar, &
+ dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
+ endif ! elastic
- ! acoustic wave field
- if( ispec_is_acoustic(ispec) ) then
- ! backward fields: displacement vector
- call compute_gradient(ispec,NSPEC_AB,NGLOB_ADJOINT, &
- b_potential_acoustic, displ_element,&
- hprime_xx,hprime_yy,hprime_zz, &
- xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
- ibool,rhostore,GRAVITY)
- ! backward fields: velocity vector
- call compute_gradient(ispec,NSPEC_AB,NGLOB_ADJOINT, &
- b_potential_dot_acoustic, veloc_element,&
- hprime_xx,hprime_yy,hprime_zz, &
- xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
- ibool,rhostore,GRAVITY)
+ ! acoustic wave field
+ if( ispec_is_acoustic(ispec) ) then
+ ! backward fields: displacement vector
+ call compute_gradient(ispec,NSPEC_AB,NGLOB_ADJOINT, &
+ b_potential_acoustic, displ_element,&
+ hprime_xx,hprime_yy,hprime_zz, &
+ xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+ ibool,rhostore,GRAVITY)
+ ! backward fields: velocity vector
+ call compute_gradient(ispec,NSPEC_AB,NGLOB_ADJOINT, &
+ b_potential_dot_acoustic, veloc_element,&
+ hprime_xx,hprime_yy,hprime_zz, &
+ xix,xiy,xiz,etax,etay,etaz,gammax,gammay,gammaz, &
+ ibool,rhostore,GRAVITY)
- ! backward fields: interpolates displ/veloc/pressure at receiver locations
- call compute_interpolated_dva_ac(displ_element,veloc_element,&
- b_potential_dot_dot_acoustic,b_potential_dot_acoustic,&
- b_potential_acoustic,NGLOB_ADJOINT, &
- ispec,NSPEC_AB,ibool, &
- xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
- hxir,hetar,hgammar, &
- dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
- endif ! acoustic
+ ! backward fields: interpolates displ/veloc/pressure at receiver locations
+ call compute_interpolated_dva_ac(displ_element,veloc_element,&
+ b_potential_dot_dot_acoustic,b_potential_dot_acoustic,&
+ b_potential_acoustic,NGLOB_ADJOINT, &
+ ispec,NSPEC_AB,ibool, &
+ xi_receiver(irec),eta_receiver(irec),gamma_receiver(irec), &
+ hxir,hetar,hgammar, &
+ dxd,dyd,dzd,vxd,vyd,vzd,axd,ayd,azd)
+ endif ! acoustic
- end select ! SIMULATION_TYPE
+ end select ! SIMULATION_TYPE
- ! store North, East and Vertical components
- ! distinguish between single and double precision for reals
- if(CUSTOM_REAL == SIZE_REAL) then
- seismograms_d(:,irec_local,it) = sngl((nu(:,1,irec)*dxd + nu(:,2,irec)*dyd + nu(:,3,irec)*dzd))
- seismograms_v(:,irec_local,it) = sngl((nu(:,1,irec)*vxd + nu(:,2,irec)*vyd + nu(:,3,irec)*vzd))
- seismograms_a(:,irec_local,it) = sngl((nu(:,1,irec)*axd + nu(:,2,irec)*ayd + nu(:,3,irec)*azd))
- else
- seismograms_d(:,irec_local,it) = (nu(:,1,irec)*dxd + nu(:,2,irec)*dyd + nu(:,3,irec)*dzd)
- seismograms_v(:,irec_local,it) = (nu(:,1,irec)*vxd + nu(:,2,irec)*vyd + nu(:,3,irec)*vzd)
- seismograms_a(:,irec_local,it) = (nu(:,1,irec)*axd + nu(:,2,irec)*ayd + nu(:,3,irec)*azd)
- endif
+ ! store North, East and Vertical components
+ ! distinguish between single and double precision for reals
+ if(CUSTOM_REAL == SIZE_REAL) then
+ seismograms_d(:,irec_local,it) = sngl((nu(:,1,irec)*dxd + nu(:,2,irec)*dyd + nu(:,3,irec)*dzd))
+ seismograms_v(:,irec_local,it) = sngl((nu(:,1,irec)*vxd + nu(:,2,irec)*vyd + nu(:,3,irec)*vzd))
+ seismograms_a(:,irec_local,it) = sngl((nu(:,1,irec)*axd + nu(:,2,irec)*ayd + nu(:,3,irec)*azd))
+ else
+ seismograms_d(:,irec_local,it) = (nu(:,1,irec)*dxd + nu(:,2,irec)*dyd + nu(:,3,irec)*dzd)
+ seismograms_v(:,irec_local,it) = (nu(:,1,irec)*vxd + nu(:,2,irec)*vyd + nu(:,3,irec)*vzd)
+ seismograms_a(:,irec_local,it) = (nu(:,1,irec)*axd + nu(:,2,irec)*ayd + nu(:,3,irec)*azd)
+ endif
- !adjoint simulations
- if (SIMULATION_TYPE == 2) seismograms_eps(:,:,irec_local,it) = eps_s(:,:)
+ !adjoint simulations
+ if (SIMULATION_TYPE == 2) seismograms_eps(:,:,irec_local,it) = eps_s(:,:)
- enddo ! nrec_local
+ enddo ! nrec_local
endif
Modified: seismo/3D/SPECFEM3D/trunk/todo_list_please_dont_remove.txt
===================================================================
--- seismo/3D/SPECFEM3D/trunk/todo_list_please_dont_remove.txt 2013-08-20 00:07:29 UTC (rev 22717)
+++ seismo/3D/SPECFEM3D/trunk/todo_list_please_dont_remove.txt 2013-08-20 14:13:26 UTC (rev 22718)
@@ -323,7 +323,7 @@
------------------------------------------------
+ fault_solver_common:
- - make ordered version of subroutine assemble_MPI_vector_ext_mesh, and use it in subroutine initialize_fault
+ - make ordered version of subroutine assemble_MPI_vector_blocking, and use it in subroutine initialize_fault
+ fault_solver_dynamic:
- many hard-coded ad hoc features need to be set instead as user options
More information about the CIG-COMMITS
mailing list