[cig-commits] r19137 - in seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src: cuda meshfem3D shared specfem3D
danielpeter at geodynamics.org
danielpeter at geodynamics.org
Mon Oct 31 14:45:36 PDT 2011
Author: danielpeter
Date: 2011-10-31 14:45:35 -0700 (Mon, 31 Oct 2011)
New Revision: 19137
Modified:
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/meshfem3D/check_mesh_quality.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/check_mesh_resolution.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/write_c_binary.c
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_acoustic.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_elastic.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/noise_tomography.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/read_mesh_databases.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/write_seismograms.f90
Log:
updates preparation for acoustic & elastic adjoint simulations
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu 2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu 2011-10-31 21:45:35 UTC (rev 19137)
@@ -58,8 +58,8 @@
int* islice_selected_source,
int* ispec_selected_source,
int* ispec_is_elastic,
- int NSOURCES,
- float* d_debug) {
+ int NSOURCES //,float* d_debug
+ ) {
int i = threadIdx.x;
int j = threadIdx.y;
int k = threadIdx.z;
@@ -82,10 +82,12 @@
//if(i==0 && j==0 && k==0) printf("add sources kernel: stf = %e\n",stf);
iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+
atomicAdd(&accel[iglob*3],
sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf);
atomicAdd(&accel[iglob*3+1],
sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 1, i,j,k)]*stf);
+
// if((iglob*3+2 == 304598)) {
// atomicAdd(&d_debug[0],1.0f);
// d_debug[1] = accel[iglob*3+2];
@@ -93,6 +95,7 @@
// d_debug[3] = stf;
// }
// d_debug[4] = 42.0f;
+
atomicAdd(&accel[iglob*3+2],
sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 2, i,j,k)]*stf);
}
@@ -137,7 +140,6 @@
//int USE_FORCE_POINT_SOURCE = *USE_FORCE_POINT_SOURCEf;
int myrank = *myrankf;
- float* d_debug;
int num_blocks_x = NSOURCES;
int num_blocks_y = 1;
@@ -152,16 +154,15 @@
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(5,5,5);
+
+ //float* d_debug;
// (float* accel, int* ibool, int* ispec_is_inner, int phase_is_inner,
// float* sourcearrays, double* stf_pre_compute,int myrank,
// int* islice_selected_source, int* ispec_selected_source,
// int* ispec_is_elastic, int NSOURCES)
-
- //daniel
//printf("add sources : nsources_local = %d\n",mp->nsources_local);
//printf("add sources : stf = %e\n",h_stf_pre_compute[0]);
-
compute_add_sources_kernel<<<grid,threads>>>(mp->d_accel,
mp->d_ibool,
mp->d_ispec_is_inner,
@@ -172,8 +173,8 @@
mp->d_islice_selected_source,
mp->d_ispec_selected_source,
mp->d_ispec_is_elastic,
- NSOURCES,
- d_debug);
+ NSOURCES //,d_debug
+ );
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("compute_add_sources_kernel");
@@ -218,7 +219,7 @@
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(5,5,5);
- float* d_debug;
+ //float* d_debug;
// float* h_debug = (float*)calloc(128,sizeof(float));
// cudaMalloc((void**)&d_debug,128*sizeof(float));
// cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
@@ -230,8 +231,8 @@
*myrank,
mp->d_islice_selected_source,mp->d_ispec_selected_source,
mp->d_ispec_is_elastic,
- NSOURCES,
- d_debug);
+ NSOURCES //,d_debug
+ );
// cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
// for(int i=0;i<10;i++) {
@@ -249,7 +250,8 @@
/* ----------------------------------------------------------------------------------------------- */
-__global__ void add_source_master_rec_noise_cuda_kernel(int* ibool, int* ispec_selected_rec,
+__global__ void add_source_master_rec_noise_cuda_kernel(int* ibool,
+ int* ispec_selected_rec,
int irec_master_noise,
realw* accel,
realw* noise_sourcearray,
@@ -285,12 +287,17 @@
int it = *it_f-1; // -1 for Fortran -> C indexing differences
int irec_master_noise = *irec_master_noise_f;
int myrank = *myrank_f;
+
dim3 grid(1,1,1);
dim3 threads(125,1,1);
+
if(myrank == islice_selected_rec[irec_master_noise-1]) {
- add_source_master_rec_noise_cuda_kernel<<<grid,threads>>>(mp->d_ibool, mp->d_ispec_selected_rec,
- irec_master_noise, mp->d_accel,
- mp->d_noise_sourcearray, it);
+ add_source_master_rec_noise_cuda_kernel<<<grid,threads>>>(mp->d_ibool,
+ mp->d_ispec_selected_rec,
+ irec_master_noise,
+ mp->d_accel,
+ mp->d_noise_sourcearray,
+ it);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("add_source_master_rec_noise_cuda_kernel");
@@ -304,48 +311,52 @@
/* ----------------------------------------------------------------------------------------------- */
-__global__ void add_sources_SIM_TYPE_2_OR_3_kernel(float* accel, int nrec,
- float* adj_sourcearrays,
- int* ibool,
- int* ispec_is_inner,
- int* ispec_selected_rec,
- int phase_is_inner,
- int* islice_selected_rec,
- int* pre_computed_irec,
- int nadj_rec_local,
- int NTSTEP_BETWEEN_ADJSRC,
- int myrank,
- int* debugi,
- float* debugf) {
+__global__ void add_sources_el_SIM_TYPE_2_OR_3_kernel(float* accel,
+ int nrec,
+ float* adj_sourcearrays,
+ int* ibool,
+ int* ispec_is_inner,
+ int* ispec_is_elastic,
+ int* ispec_selected_rec,
+ int phase_is_inner,
+ int* islice_selected_rec,
+ int* pre_computed_irec,
+ int nadj_rec_local //,int myrank //,int* debugi,float* debugf
+ ) {
+
int irec_local = blockIdx.x + gridDim.x*blockIdx.y;
- if(irec_local<nadj_rec_local) { // when nrec > 65535, but mod(nspec_top,2) > 0, we end up with an extra block.
+ if(irec_local < nadj_rec_local) { // when nrec > 65535, but mod(nspec_top,2) > 0, we end up with an extra block.
+
int irec = pre_computed_irec[irec_local];
- int ispec_selected = ispec_selected_rec[irec]-1;
- if(ispec_is_inner[ispec_selected] == phase_is_inner) {
- int i = threadIdx.x;
- int j = threadIdx.y;
- int k = threadIdx.z;
- int iglob = ibool[i+5*(j+5*(k+5*ispec_selected))]-1;
+ int ispec = ispec_selected_rec[irec]-1;
+ if( ispec_is_elastic[ispec] ){
- // atomic operations are absolutely necessary for correctness!
- atomicAdd(&(accel[0+3*iglob]),adj_sourcearrays[INDEX5(5,5,5,3,
- i,j,k,
- 0,
- irec_local)]);
+ if(ispec_is_inner[ispec] == phase_is_inner) {
+ int i = threadIdx.x;
+ int j = threadIdx.y;
+ int k = threadIdx.z;
+ //int iglob = ibool[i+5*(j+5*(k+5*ispec))]-1;
+ int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
- atomicAdd(&accel[1+3*iglob], adj_sourcearrays[INDEX5(5,5,5,3,
- i,j,k,
- 1,
- irec_local)]);
+ // atomic operations are absolutely necessary for correctness!
+ atomicAdd(&(accel[0+3*iglob]),adj_sourcearrays[INDEX5(5,5,5,3,
+ i,j,k,
+ 0,
+ irec_local)]);
- atomicAdd(&accel[2+3*iglob],adj_sourcearrays[INDEX5(5,5,5,3,
- i,j,k,
- 2,
- irec_local)]);
- }
+ atomicAdd(&accel[1+3*iglob], adj_sourcearrays[INDEX5(5,5,5,3,
+ i,j,k,
+ 1,
+ irec_local)]);
+ atomicAdd(&accel[2+3*iglob],adj_sourcearrays[INDEX5(5,5,5,3,
+ i,j,k,
+ 2,
+ irec_local)]);
+ }
+ } // ispec_is_elastic
}
}
@@ -353,55 +364,72 @@
/* ----------------------------------------------------------------------------------------------- */
extern "C"
-void FC_FUNC_(add_sources_sim_type_2_or_3,
- ADD_SOURCES_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
- float* h_adj_sourcearrays,
- int* size_adj_sourcearrays, int* ispec_is_inner,
- int* phase_is_inner, int* ispec_selected_rec,
- int* ibool,
- int* myrank, int* nrec, int* time_index,
- int* h_islice_selected_rec,int* nadj_rec_local,
- int* NTSTEP_BETWEEN_READ_ADJSRC) {
+void FC_FUNC_(add_sources_el_sim_type_2_or_3,
+ ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
+ float* h_adj_sourcearrays,
+ int* phase_is_inner,
+ int* h_ispec_is_inner,
+ int* h_ispec_is_elastic,
+ int* h_ispec_selected_rec,
+ int* myrank,
+ int* nrec,
+ int* time_index,
+ int* h_islice_selected_rec,
+ int* nadj_rec_local,
+ int* NTSTEP_BETWEEN_READ_ADJSRC) {
-TRACE("add_sources_sim_type_2_or_3");
+TRACE("add_sources_el_sim_type_2_or_3");
- if(*nadj_rec_local > 0) {
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
- Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
- int rank;
- MPI_Comm_rank(MPI_COMM_WORLD,&rank);
+ // checks
+ if( *nadj_rec_local != mp->nadj_rec_local) exit_on_error("add_sources_el_sim_type_2_or_3: nadj_rec_local not equal\n");
- // make sure grid dimension is less than 65535 in x dimension
- int num_blocks_x = *nadj_rec_local;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
- num_blocks_y = num_blocks_y*2;
- }
- dim3 grid(num_blocks_x,num_blocks_y,1);
- dim3 threads(5,5,5);
+ //int rank;
+ //MPI_Comm_rank(MPI_COMM_WORLD,&rank);
- float* d_adj_sourcearrays;
- print_CUDA_error_if_any(cudaMalloc((void**)&d_adj_sourcearrays,
- (*nadj_rec_local)*3*125*sizeof(float)),1);
- float* h_adj_sourcearrays_slice = (float*)malloc((*nadj_rec_local)*3*125*sizeof(float));
+ // make sure grid dimension is less than 65535 in x dimension
+ int num_blocks_x = mp->nadj_rec_local;
+ int num_blocks_y = 1;
+ while(num_blocks_x > 65535) {
+ num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_y = num_blocks_y*2;
+ }
- int* h_pre_computed_irec = new int[*nadj_rec_local];
- int* d_pre_computed_irec;
- cudaMalloc((void**)&d_pre_computed_irec,(*nadj_rec_local)*sizeof(int));
+ dim3 grid(num_blocks_x,num_blocks_y,1);
+ dim3 threads(5,5,5);
- // build slice of adj_sourcearrays because full array is *very* large.
- int irec_local = 0;
- for(int irec = 0;irec<*nrec;irec++) {
- if(*myrank == h_islice_selected_rec[irec]) {
- irec_local++;
- h_pre_computed_irec[irec_local-1] = irec;
- if(ispec_is_inner[ispec_selected_rec[irec]-1] == *phase_is_inner) {
- for(int k=0;k<5;k++) {
- for(int j=0;j<5;j++) {
- for(int i=0;i<5;i++) {
+ //float* d_adj_sourcearrays;
+ //print_CUDA_error_if_any(cudaMalloc((void**)&d_adj_sourcearrays,
+ // (*nadj_rec_local)*3*125*sizeof(float)),1);
- h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
+ //float* h_adj_sourcearrays_slice = (float*)malloc((*nadj_rec_local)*3*125*sizeof(float));
+
+ //int* h_pre_computed_irec = new int[*nadj_rec_local];
+
+ //int* d_pre_computed_irec;
+ //cudaMalloc((void**)&d_pre_computed_irec,(*nadj_rec_local)*sizeof(int));
+
+ // build slice of adj_sourcearrays because full array is *very* large.
+ // note: this extracts array values for local adjoint sources at given time step "time_index"
+ // from large adj_sourcearrays array into h_adj_sourcearrays_slice
+ int ispec,i,j,k;
+ int irec_local = 0;
+ for(int irec = 0; irec < *nrec; irec++) {
+ if(*myrank == h_islice_selected_rec[irec]) {
+ irec_local++;
+ //h_pre_computed_irec[irec_local-1] = irec;
+
+ // takes only elastic sources
+ ispec = h_ispec_selected_rec[irec]-1;
+ if( h_ispec_is_elastic[ispec] ){
+
+ if( h_ispec_is_inner[ispec] == *phase_is_inner) {
+ for(k=0;k<5;k++) {
+ for(j=0;j<5;j++) {
+ for(i=0;i<5;i++) {
+
+ mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
i,j,k,0,
irec_local-1)]
= h_adj_sourcearrays[INDEX6(*nadj_rec_local,
@@ -411,7 +439,7 @@
*time_index-1,
0,i,j,k)];
- h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
+ mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
i,j,k,1,
irec_local-1)]
= h_adj_sourcearrays[INDEX6(*nadj_rec_local,
@@ -421,7 +449,7 @@
*time_index-1,
1,i,j,k)];
- h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
+ mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
i,j,k,2,
irec_local-1)]
= h_adj_sourcearrays[INDEX6(*nadj_rec_local,
@@ -433,100 +461,107 @@
}
}
}
- }
- }
+ } // phase_is_inner
+ } // h_ispec_is_elastic
}
- // printf("irec_local vs. *nadj_rec_local -> %d vs. %d\n",irec_local,*nadj_rec_local);
- // for(int ispec=0;ispec<(*nadj_rec_local);ispec++) {
- // for(int i=0;i<5;i++)
- // for(int j=0;j<5;j++)
- // for(int k=0;k<5;k++) {
- // h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,0,ispec)] =
- // h_adj_sourcearrays[INDEX6(*nadj_rec_local,*NTSTEP_BETWEEN_READ_ADJSRC,3,5,5,
- // ispec,
- // *time_index-1,
- // 0,
- // i,j,k)];
- // h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,1,ispec)] =
- // h_adj_sourcearrays[INDEX6(*nadj_rec_local,*NTSTEP_BETWEEN_READ_ADJSRC,3,5,5,
- // ispec,
- // *time_index-1,
- // 1,
- // i,j,k)];
- // h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,2,ispec)] =
- // h_adj_sourcearrays[INDEX6(*nadj_rec_local,*NTSTEP_BETWEEN_ADJSRC,3,5,5,
- // ispec,
- // *time_index-1,
- // 2,
- // i,j,k)];
- // }
+ }
+ // check all local sources were added
+ if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
- // }
+ // printf("irec_local vs. *nadj_rec_local -> %d vs. %d\n",irec_local,*nadj_rec_local);
+ // for(int ispec=0;ispec<(*nadj_rec_local);ispec++) {
+ // for(int i=0;i<5;i++)
+ // for(int j=0;j<5;j++)
+ // for(int k=0;k<5;k++) {
+ // h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,0,ispec)] =
+ // h_adj_sourcearrays[INDEX6(*nadj_rec_local,*NTSTEP_BETWEEN_READ_ADJSRC,3,5,5,
+ // ispec,
+ // *time_index-1,
+ // 0,
+ // i,j,k)];
+ // h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,1,ispec)] =
+ // h_adj_sourcearrays[INDEX6(*nadj_rec_local,*NTSTEP_BETWEEN_READ_ADJSRC,3,5,5,
+ // ispec,
+ // *time_index-1,
+ // 1,
+ // i,j,k)];
+ // h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,2,ispec)] =
+ // h_adj_sourcearrays[INDEX6(*nadj_rec_local,*NTSTEP_BETWEEN_ADJSRC,3,5,5,
+ // ispec,
+ // *time_index-1,
+ // 2,
+ // i,j,k)];
+ // }
- cudaMemcpy(d_adj_sourcearrays, h_adj_sourcearrays_slice,(*nadj_rec_local)*3*125*sizeof(float),
- cudaMemcpyHostToDevice);
+ // }
+ // copies extracted array values onto GPU
+ cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
+ (mp->nadj_rec_local)*3*125*sizeof(float),cudaMemcpyHostToDevice);
- // the irec_local variable needs to be precomputed (as
- // h_pre_comp..), because normally it is in the loop updating accel,
- // and due to how it's incremented, it cannot be parallelized
- // int irec_local=0;
- // for(int irec=0;irec<*nrec;irec++) {
- // if(*myrank == h_islice_selected_rec[irec]) {
- // h_pre_computed_irec_local_index[irec] = irec_local;
- // irec_local++;
- // if(irec_local==1) {
- // // printf("%d:first useful irec==%d\n",rank,irec);
- // }
- // }
- // else h_pre_computed_irec_local_index[irec] = 0;
- // }
- cudaMemcpy(d_pre_computed_irec,h_pre_computed_irec,
- (*nadj_rec_local)*sizeof(int),cudaMemcpyHostToDevice);
- // pause_for_debugger(1);
- int* d_debugi, *h_debugi;
- float* d_debugf, *h_debugf;
- h_debugi = (int*)calloc(num_blocks_x,sizeof(int));
- cudaMalloc((void**)&d_debugi,num_blocks_x*sizeof(int));
- cudaMemcpy(d_debugi,h_debugi,num_blocks_x*sizeof(int),cudaMemcpyHostToDevice);
- h_debugf = (float*)calloc(num_blocks_x,sizeof(float));
- cudaMalloc((void**)&d_debugf,num_blocks_x*sizeof(float));
- cudaMemcpy(d_debugf,h_debugf,num_blocks_x*sizeof(float),cudaMemcpyHostToDevice);
+ // the irec_local variable needs to be precomputed (as
+ // h_pre_comp..), because normally it is in the loop updating accel,
+ // and due to how it's incremented, it cannot be parallelized
- add_sources_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_accel, *nrec,
- d_adj_sourcearrays, mp->d_ibool,
- mp->d_ispec_is_inner,
- mp->d_ispec_selected_rec,
- *phase_is_inner,
- mp->d_islice_selected_rec,
- d_pre_computed_irec,
- *nadj_rec_local,
- *NTSTEP_BETWEEN_READ_ADJSRC,
- *myrank,
- d_debugi,d_debugf);
+ // int irec_local=0;
+ // for(int irec=0;irec<*nrec;irec++) {
+ // if(*myrank == h_islice_selected_rec[irec]) {
+ // h_pre_computed_irec_local_index[irec] = irec_local;
+ // irec_local++;
+ // if(irec_local==1) {
+ // // printf("%d:first useful irec==%d\n",rank,irec);
+ // }
+ // }
+ // else h_pre_computed_irec_local_index[irec] = 0;
+ // }
+ //cudaMemcpy(mp->d_pre_computed_irec,mp->h_pre_computed_irec,
+ // (mp->nadj_rec_local)*sizeof(int),cudaMemcpyHostToDevice);
- cudaMemcpy(h_debugi,d_debugi,num_blocks_x*sizeof(int),cudaMemcpyDeviceToHost);
- cudaMemcpy(h_debugf,d_debugf,num_blocks_x*sizeof(float),cudaMemcpyDeviceToHost);
+ // pause_for_debugger(1);
+ //int* d_debugi, *h_debugi;
+ //float* d_debugf, *h_debugf;
+ //h_debugi = (int*)calloc(num_blocks_x,sizeof(int));
+ //cudaMalloc((void**)&d_debugi,num_blocks_x*sizeof(int));
+ //cudaMemcpy(d_debugi,h_debugi,num_blocks_x*sizeof(int),cudaMemcpyHostToDevice);
+ //h_debugf = (float*)calloc(num_blocks_x,sizeof(float));
+ //cudaMalloc((void**)&d_debugf,num_blocks_x*sizeof(float));
+ //cudaMemcpy(d_debugf,h_debugf,num_blocks_x*sizeof(float),cudaMemcpyHostToDevice);
- // printf("%d: pre_com0:%d\n",rank,h_pre_computed_irec_local_index[0]);
- // printf("%d: pre_com1:%d\n",rank,h_pre_computed_irec_local_index[1]);
- // printf("%d: pre_com2:%d\n",rank,h_pre_computed_irec_local_index[2]);
- // for(int i=156;i<(156+30);i++) {
- // if(rank==0) printf("%d:debug[%d] = i/f = %d / %e\n",rank,i,h_debugi[i],h_debugf[i]);
- // }
+ add_sources_el_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_accel,
+ *nrec,
+ mp->d_adj_sourcearrays,
+ mp->d_ibool,
+ mp->d_ispec_is_inner,
+ mp->d_ispec_is_elastic,
+ mp->d_ispec_selected_rec,
+ *phase_is_inner,
+ mp->d_islice_selected_rec,
+ mp->d_pre_computed_irec,
+ mp->nadj_rec_local //,*myrank //,d_debugi,d_debugf
+ );
+ //cudaMemcpy(h_debugi,d_debugi,num_blocks_x*sizeof(int),cudaMemcpyDeviceToHost);
+ //cudaMemcpy(h_debugf,d_debugf,num_blocks_x*sizeof(float),cudaMemcpyDeviceToHost);
+
+ // printf("%d: pre_com0:%d\n",rank,h_pre_computed_irec_local_index[0]);
+ // printf("%d: pre_com1:%d\n",rank,h_pre_computed_irec_local_index[1]);
+ // printf("%d: pre_com2:%d\n",rank,h_pre_computed_irec_local_index[2]);
+ // for(int i=156;i<(156+30);i++) {
+ // if(rank==0) printf("%d:debug[%d] = i/f = %d / %e\n",rank,i,h_debugi[i],h_debugf[i]);
+ // }
+
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- // MPI_Barrier(MPI_COMM_WORLD);
- exit_on_cuda_error("add_sources_SIM_TYPE_2_OR_3_kernel");
+ // MPI_Barrier(MPI_COMM_WORLD);
+ exit_on_cuda_error("add_sources_SIM_TYPE_2_OR_3_kernel");
- // printf("Proc %d exiting with successful kernel\n",rank);
- // exit(1);
+ // printf("Proc %d exiting with successful kernel\n",rank);
+ // exit(1);
#endif
- delete h_pre_computed_irec;
- cudaFree(d_adj_sourcearrays);
- cudaFree(d_pre_computed_irec);
- }
+ //cudaFree(d_adj_sourcearrays);
+ //cudaFree(d_pre_computed_irec);
+ //free(h_adj_sourcearrays_slice);
+ //delete h_pre_computed_irec;
}
/* ----------------------------------------------------------------------------------------------- */
@@ -711,65 +746,61 @@
/* ----------------------------------------------------------------------------------------------- */
-__global__ void add_sources_acoustic_SIM_TYPE_2_OR_3_kernel(float* potential_dot_dot_acoustic,
- int nrec,
- int pre_computed_index,
- float* adj_sourcearrays,
- int* ibool,
- int* ispec_is_inner,
- int* ispec_is_acoustic,
- float* kappastore,
- int* ispec_selected_rec,
- int phase_is_inner,
- int* islice_selected_rec,
- int* pre_computed_irec_local_index,
- int nadj_rec_local,
- int NTSTEP_BETWEEN_ADJSRC,
- int myrank) {
+__global__ void add_sources_ac_SIM_TYPE_2_OR_3_kernel(float* potential_dot_dot_acoustic,
+ int nrec,
+ float* adj_sourcearrays,
+ int* ibool,
+ int* ispec_is_inner,
+ int* ispec_is_acoustic,
+ int* ispec_selected_rec,
+ int phase_is_inner,
+ int* islice_selected_rec,
+ int* pre_computed_irec,
+ int nadj_rec_local,
+ float* kappastore) {
- int irec = blockIdx.x + gridDim.x*blockIdx.y;
+ int irec_local = blockIdx.x + gridDim.x*blockIdx.y;
- //float kappal;
- int i,j,k,iglob,ispec;
+ // because of grid shape, irec_local can be too big
+ if(irec_local < nadj_rec_local) {
- // because of grid shape, irec can be too big
- if(irec<nrec) {
+ int irec = pre_computed_irec[irec_local];
- // adds source only if this proc carries the sources
- if( myrank == islice_selected_rec[irec] ){
+ int ispec = ispec_selected_rec[irec]-1;
+ if( ispec_is_acoustic[ispec] ){
- // adds acoustic source
- ispec = ispec_selected_rec[irec]-1;
- if( ispec_is_acoustic[ispec] ){
+ // checks if element is in phase_is_inner run
+ if(ispec_is_inner[ispec] == phase_is_inner) {
+ int i = threadIdx.x;
+ int j = threadIdx.y;
+ int k = threadIdx.z;
+ int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
- // checks if element is in phase_is_inner run
- if(ispec_is_inner[ispec] == phase_is_inner) {
- i = threadIdx.x;
- j = threadIdx.y;
- k = threadIdx.z;
- iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+ //kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
- //kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
+ //potential_dot_dot_acoustic[iglob] += adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5,
+ // pre_computed_irec_local_index[irec],
+ // pre_computed_index,
+ // 0,
+ // i,j,k)]/kappal;
- //potential_dot_dot_acoustic[iglob] += adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5,
- // pre_computed_irec_local_index[irec],
- // pre_computed_index,
- // 0,
- // i,j,k)]/kappal;
+ // beware, for acoustic medium, a pressure source would be taking the negative
+ // and divide by Kappa of the fluid;
+ // this would have to be done when constructing the adjoint source.
+ //
+ // note: we take the first component of the adj_sourcearrays
+ // the idea is to have e.g. a pressure source, where all 3 components would be the same
- // beware, for acoustic medium, a pressure source would be taking the negative
- // and divide by Kappa of the fluid;
- // this would have to be done when constructing the adjoint source.
- //
- // note: we take the first component of the adj_sourcearrays
- // the idea is to have e.g. a pressure source, where all 3 components would be the same
+ atomicAdd(&potential_dot_dot_acoustic[iglob],adj_sourcearrays[INDEX5(5,5,5,3,
+ i,j,k,
+ 0,
+ irec_local)] // / kappal
+ );
- atomicAdd(&potential_dot_dot_acoustic[iglob],
- +adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5,
- pre_computed_irec_local_index[irec],pre_computed_index-1,
- 0,i,j,k)] // / kappal
- );
- }
+ //+adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5,
+ // pre_computed_irec_local_index[irec],pre_computed_index-1,
+ // 0,i,j,k)] // / kappal
+ // );
}
}
}
@@ -781,85 +812,111 @@
extern "C"
void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer,
- float* h_adj_sourcearrays,
- int* size_adj_sourcearrays,
- int* phase_is_inner,
- int* myrank,
- int* nrec,
- int* pre_computed_index,
- int* h_islice_selected_rec,
- int* nadj_rec_local,
- int* NTSTEP_BETWEEN_ADJSRC) {
+ float* h_adj_sourcearrays,
+ int* phase_is_inner,
+ int* h_ispec_is_inner,
+ int* h_ispec_is_acoustic,
+ int* h_ispec_selected_rec,
+ int* myrank,
+ int* nrec,
+ int* time_index,
+ int* h_islice_selected_rec,
+ int* nadj_rec_local,
+ int* NTSTEP_BETWEEN_READ_ADJSRC) {
TRACE("add_sources_ac_sim_2_or_3_cuda");
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+ // checks
+ if( *nadj_rec_local != mp->nadj_rec_local) exit_on_cuda_error("add_sources_ac_sim_type_2_or_3: nadj_rec_local not equal\n");
+
// make sure grid dimension is less than 65535 in x dimension
- int num_blocks_x = *nrec;
+ int num_blocks_x = mp->nadj_rec_local;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
num_blocks_x = ceil(num_blocks_x/2.0);
num_blocks_y = num_blocks_y*2;
}
+
dim3 grid(num_blocks_x,num_blocks_y,1);
dim3 threads(5,5,5);
- // copies source arrays
- // daniel: todo workaround -- adj_sourcearrays can be very big, but here only at
- // specific time it (pre_computed_irec_local_index) is actually needed...
- float* d_adj_sourcearrays;
- print_CUDA_error_if_any(cudaMalloc((void**)&d_adj_sourcearrays,
- (*size_adj_sourcearrays)*sizeof(float)),731);
- print_CUDA_error_if_any(cudaMemcpy(d_adj_sourcearrays, h_adj_sourcearrays,
- (*size_adj_sourcearrays)*sizeof(float),cudaMemcpyHostToDevice),732);
+ // build slice of adj_sourcearrays because full array is *very* large.
+ // note: this extracts array values for local adjoint sources at given time step "time_index"
+ // from large adj_sourcearrays array into h_adj_sourcearrays_slice
+ int ispec,i,j,k;
+ int irec_local = 0;
+ for(int irec = 0; irec < *nrec; irec++) {
+ if(*myrank == h_islice_selected_rec[irec]) {
+ irec_local++;
- //int* h_pre_computed_irec_local_index = new int[*nadj_rec_local];
- int* h_pre_computed_irec_local_index = (int*) calloc(*nrec,sizeof(int));
+ // takes only acoustic sources
+ ispec = h_ispec_selected_rec[irec]-1;
+ if( h_ispec_is_acoustic[ispec] ){
- int* d_pre_computed_irec_local_index;
- print_CUDA_error_if_any(cudaMalloc((void**)&d_pre_computed_irec_local_index,
- (*nrec)*sizeof(int)),741);
+ if( h_ispec_is_inner[ispec] == *phase_is_inner) {
+ for(k=0;k<5;k++) {
+ for(j=0;j<5;j++) {
+ for(i=0;i<5;i++) {
- // the irec_local variable needs to be precomputed (as
- // h_pre_comp..), because normally it is in the loop updating accel,
- // and due to how it's incremented, it cannot be parallized
- int irec_local=0;
- for(int irec=0;irec<*nrec;irec++) {
- if(*myrank == h_islice_selected_rec[irec]) {
- h_pre_computed_irec_local_index[irec] = irec_local;
- irec_local++;
+ mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
+ i,j,k,0,
+ irec_local-1)]
+ = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
+ *NTSTEP_BETWEEN_READ_ADJSRC,
+ 3,5,5,
+ irec_local-1,
+ *time_index-1,
+ 0,i,j,k)];
+
+ mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
+ i,j,k,1,
+ irec_local-1)]
+ = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
+ *NTSTEP_BETWEEN_READ_ADJSRC,
+ 3,5,5,
+ irec_local-1,
+ *time_index-1,
+ 1,i,j,k)];
+
+ mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
+ i,j,k,2,
+ irec_local-1)]
+ = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
+ *NTSTEP_BETWEEN_READ_ADJSRC,
+ 3,5,5,
+ irec_local-1,
+ *time_index-1,
+ 2,i,j,k)];
+ }
+ }
+ }
+ } // phase_is_inner
+ } // h_ispec_is_acoustic
}
- else h_pre_computed_irec_local_index[irec] = 0;
}
+ // check all local sources were added
+ if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
- //daniel
- //printf("irec local: rank=%d irec_local=%d nadj_rec_local=%d nrec=%d \n",*myrank,irec_local,*nadj_rec_local,*nrec);
+ // copies extracted array values onto GPU
+ cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
+ (mp->nadj_rec_local)*3*125*sizeof(float),cudaMemcpyHostToDevice);
- print_CUDA_error_if_any(cudaMemcpy(d_pre_computed_irec_local_index,h_pre_computed_irec_local_index,
- (*nrec)*sizeof(int),cudaMemcpyHostToDevice),742);
+ // launches cuda kernel for acoustic adjoint sources
+ add_sources_ac_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
+ *nrec,
+ mp->d_adj_sourcearrays,
+ mp->d_ibool,
+ mp->d_ispec_is_inner,
+ mp->d_ispec_is_acoustic,
+ mp->d_ispec_selected_rec,
+ *phase_is_inner,
+ mp->d_islice_selected_rec,
+ mp->d_pre_computed_irec,
+ mp->nadj_rec_local,
+ mp->d_kappastore);
- add_sources_acoustic_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
- *nrec,
- *pre_computed_index,
- d_adj_sourcearrays,
- mp->d_ibool,
- mp->d_ispec_is_inner,
- mp->d_ispec_is_acoustic,
- mp->d_kappastore,
- mp->d_ispec_selected_rec,
- *phase_is_inner,
- mp->d_islice_selected_rec,
- d_pre_computed_irec_local_index,
- *nadj_rec_local,
- *NTSTEP_BETWEEN_ADJSRC,
- *myrank);
-
- //delete h_pre_computed_irec_local_index;
- free(h_pre_computed_irec_local_index);
- cudaFree(d_adj_sourcearrays);
- cudaFree(d_pre_computed_irec_local_index);
-
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("add_sources_acoustic_SIM_TYPE_2_OR_3_kernel");
#endif
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu 2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu 2011-10-31 21:45:35 UTC (rev 19137)
@@ -93,14 +93,9 @@
// (normal points outwards of acoustic element)
displ_n = displ_x*nx + displ_y*ny + displ_z*nz;
-
// gets associated, weighted jacobian
jacobianw = coupling_ac_el_jacobian2Dw[INDEX2(NGLL2,igll,iface)];
- //daniel
- //if( igll == 0 ) printf("gpu: %i %i %i %i %i %e \n",i,j,k,ispec,iglob,jacobianw);
-
-
// continuity of pressure and normal displacement on global point
// note: newark time scheme together with definition of scalar potential:
@@ -147,10 +142,7 @@
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
-//daniel
-// printf("gpu: %i %i %i \n",num_coupling_ac_el_faces,SIMULATION_TYPE,phase_is_inner);
-
-
+ // launches GPU kernel
compute_coupling_acoustic_el_kernel<<<grid,threads>>>(mp->d_displ,
mp->d_potential_dot_dot_acoustic,
num_coupling_ac_el_faces,
@@ -237,10 +229,6 @@
// gets associated, weighted jacobian
jacobianw = coupling_ac_el_jacobian2Dw[INDEX2(NGLL2,igll,iface)];
- //daniel
- //if( igll == 0 ) printf("gpu: %i %i %i %i %i %e \n",i,j,k,ispec,iglob,jacobianw);
-
-
// continuity of displacement and pressure on global point
//
// note: newark time scheme together with definition of scalar potential:
@@ -288,10 +276,7 @@
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
- //daniel
- // printf("gpu: %i %i %i \n",num_coupling_ac_el_faces,SIMULATION_TYPE,phase_is_inner);
-
-
+ // launches GPU kernel
compute_coupling_elastic_ac_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
mp->d_accel,
num_coupling_ac_el_faces,
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu 2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu 2011-10-31 21:45:35 UTC (rev 19137)
@@ -467,7 +467,7 @@
offset1 = K*NGLL2+J*NGLLX+l;
temp1l += s_dummy_loc[offset1]*hp1;
- // daniel: not more assumes that hprime_xx = hprime_yy = hprime_zz
+ //no more assumes that hprime_xx = hprime_yy = hprime_zz
hp2 = hprime_yy[l*NGLLX+J];
offset2 = K*NGLL2+l*NGLLX+I;
temp2l += s_dummy_loc[offset2]*hp2;
@@ -547,7 +547,7 @@
offset1 = K*NGLL2+J*NGLLX+l;
temp1l += s_temp1[offset1]*fac1;
- //daniel: not more assumes hprimewgll_xx = hprimewgll_yy = hprimewgll_zz
+ //no more assumes hprimewgll_xx = hprimewgll_yy = hprimewgll_zz
fac2 = hprimewgll_yy[J*NGLLX+l];
offset2 = K*NGLL2+l*NGLLX+I;
temp2l += s_temp2[offset2]*fac2;
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu 2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu 2011-10-31 21:45:35 UTC (rev 19137)
@@ -41,11 +41,11 @@
// cuda constant arrays
__constant__ float d_hprime_xx[NGLL2];
-__constant__ float d_hprime_yy[NGLL2]; // daniel: remove only if certain: check if hprime_yy == hprime_xx
-__constant__ float d_hprime_zz[NGLL2]; // daniel: check if hprime_zz == hprime_xx
+__constant__ float d_hprime_yy[NGLL2];
+__constant__ float d_hprime_zz[NGLL2];
__constant__ float d_hprimewgll_xx[NGLL2];
-__constant__ float d_hprimewgll_yy[NGLL2]; // daniel: check if hprimewgll_yy == hprimewgll_xx
-__constant__ float d_hprimewgll_zz[NGLL2]; // daniel: remove only if certain...
+__constant__ float d_hprimewgll_yy[NGLL2];
+__constant__ float d_hprimewgll_zz[NGLL2];
__constant__ float d_wgllwgll_xy[NGLL2];
__constant__ float d_wgllwgll_xz[NGLL2];
__constant__ float d_wgllwgll_yz[NGLL2];
@@ -64,7 +64,7 @@
float* d_etax, float* d_etay, float* d_etaz,
float* d_gammax, float* d_gammay, float* d_gammaz,
float* d_kappav, float* d_muv,
- float* d_debug,
+ //float* d_debug,
int COMPUTE_AND_STORE_STRAIN,
float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,
float* epsilondev_xz,float* epsilondev_yz,float* epsilon_trace_over_3,
@@ -471,7 +471,7 @@
// debugging
//printf("Starting with grid %dx%d for %d blocks\n",num_blocks_x,num_blocks_y,nb_blocks_to_compute);
- float* d_debug;
+// float* d_debug;
// float* h_debug;
// h_debug = (float*)calloc(128,sizeof(float));
// cudaMalloc((void**)&d_debug,128*sizeof(float));
@@ -494,7 +494,7 @@
mp->d_etax, mp->d_etay, mp->d_etaz,
mp->d_gammax, mp->d_gammay, mp->d_gammaz,
mp->d_kappav, mp->d_muv,
- d_debug,
+ //d_debug,
COMPUTE_AND_STORE_STRAIN,
mp->d_epsilondev_xx,
mp->d_epsilondev_yy,
@@ -535,7 +535,7 @@
mp->d_etax, mp->d_etay, mp->d_etaz,
mp->d_gammax, mp->d_gammay, mp->d_gammaz,
mp->d_kappav, mp->d_muv,
- d_debug,
+ //d_debug,
COMPUTE_AND_STORE_STRAIN,
mp->d_b_epsilondev_xx,
mp->d_b_epsilondev_yy,
@@ -611,7 +611,7 @@
float* d_etax, float* d_etay, float* d_etaz,
float* d_gammax, float* d_gammay, float* d_gammaz,
float* d_kappav, float* d_muv,
- float* d_debug,
+ //float* d_debug,
int COMPUTE_AND_STORE_STRAIN,
float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,
float* epsilondev_xz,float* epsilondev_yz,
@@ -1450,12 +1450,11 @@
//if(igll == 0 ) printf("igll %d %d %d %d\n",igll,i,j,k,iglob);
// only update this global point once
- // daniel: todo - workaround to not use the temporary update array
- // atomicExch returns the old value, i.e. 0 indicates that we still have to do this point
+ // daniel: TODO - there might be better ways to implement a mutex like below,
+ // and find a workaround to not use the temporary update array.
+ // atomicExch: returns the old value, i.e. 0 indicates that we still have to do this point
- //if(igll == 0 ) printf("updated_dof %d %d\n",iglob,updated_dof_ocean_load[iglob]);
-
if( atomicExch(&updated_dof_ocean_load[iglob],1) == 0){
// get normal
@@ -1470,7 +1469,7 @@
additional_term = (rmass_ocean_load[iglob] - rmass[iglob]) * force_normal_comp;
- // daniel: probably wouldn't need atomicAdd anymore, but just to be sure...
+ // probably wouldn't need atomicAdd anymore, but just to be sure...
atomicAdd(&accel[iglob*3], + additional_term * nx);
atomicAdd(&accel[iglob*3+1], + additional_term * ny);
atomicAdd(&accel[iglob*3+2], + additional_term * nz);
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu 2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu 2011-10-31 21:45:35 UTC (rev 19137)
@@ -65,8 +65,8 @@
float* kappa_kl,
float* epsilon_trace_over_3,
float* b_epsilon_trace_over_3,
- int NSPEC_AB,
- float* d_debug) {
+ int NSPEC_AB //,float* d_debug
+ ) {
int ispec = blockIdx.x + blockIdx.y*gridDim.x;
@@ -147,13 +147,11 @@
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
- float* d_debug;
- /*
- float* h_debug;
- h_debug = (float*)calloc(128,sizeof(float));
- cudaMalloc((void**)&d_debug,128*sizeof(float));
- cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
- */
+ //float* d_debug;
+ //float* h_debug;
+ //h_debug = (float*)calloc(128,sizeof(float));
+ //cudaMalloc((void**)&d_debug,128*sizeof(float));
+ //cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
compute_kernels_cudakernel<<<grid,threads>>>(mp->d_ispec_is_elastic,mp->d_ibool,
mp->d_accel, mp->d_b_displ,
@@ -173,8 +171,8 @@
mp->d_kappa_kl,
mp->d_epsilon_trace_over_3,
mp->d_b_epsilon_trace_over_3,
- mp->NSPEC_AB,
- d_debug);
+ mp->NSPEC_AB //,d_debug
+ );
/*
cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
cudaFree(d_debug);
@@ -221,8 +219,8 @@
float* normal_z_noise,
float* Sigma_kl,
float deltat,
- int num_free_surface_faces,
- float* d_debug) {
+ int num_free_surface_faces //,float* d_debug
+ ) {
int iface = blockIdx.x + blockIdx.y*gridDim.x;
if(iface < num_free_surface_faces) {
@@ -264,18 +262,17 @@
void FC_FUNC_(compute_kernels_strgth_noise_cu,
COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer,
float* h_noise_surface_movie,
- int* num_free_surface_faces_f,
float* deltat) {
TRACE("compute_kernels_strgth_noise_cu");
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
- int num_free_surface_faces = *num_free_surface_faces_f;
- cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie,3*25*num_free_surface_faces*sizeof(float),cudaMemcpyHostToDevice);
+ cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie,
+ 3*25*(mp->num_free_surface_faces)*sizeof(float),cudaMemcpyHostToDevice);
- int num_blocks_x = num_free_surface_faces;
+ int num_blocks_x = mp->num_free_surface_faces;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
num_blocks_x = ceil(num_blocks_x/2.0);
@@ -286,7 +283,7 @@
dim3 threads(25,1,1);
// float* h_debug = (float*)calloc(128,sizeof(float));
- float* d_debug;
+ //float* d_debug;
// cudaMalloc((void**)&d_debug,128*sizeof(float));
// cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
@@ -299,8 +296,8 @@
mp->d_normal_y_noise,
mp->d_normal_z_noise,
mp->d_Sigma_kl,*deltat,
- num_free_surface_faces,
- d_debug);
+ mp->num_free_surface_faces //,d_debug
+ );
// cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
// for(int i=0;i<8;i++) {
@@ -310,7 +307,6 @@
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("compute_kernels_strength_noise_cuda_kernel");
#endif
-
}
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu 2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu 2011-10-31 21:45:35 UTC (rev 19137)
@@ -92,9 +92,6 @@
// gets associated, weighted jacobian
jacobianw = abs_boundary_jacobian2Dw[INDEX2(NGLL2,igll,iface)];
-//daniel
-//if( igll == 0 ) printf("gpu: %i %i %i %i %i %e %e %e\n",i,j,k,ispec,iglob,rhol,kappal,jacobianw);
-
// Sommerfeld condition
atomicAdd(&potential_dot_dot_acoustic[iglob],-potential_dot_acoustic[iglob]*jacobianw/cpl/rhol);
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h 2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h 2011-10-31 21:45:35 UTC (rev 19137)
@@ -89,7 +89,7 @@
#define INDEX4_PADDED(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*128
-//daniel: TODO check speed of alternatives
+//daniel: TODO -- check speed of these alternative definitions
//#define INDEX2(xsize,x,y) x + (y)*xsize
//#define INDEX3(xsize,ysize,x,y,z) x + xsize*(y + ysize*z)
//#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*(z + zsize*i))
@@ -107,6 +107,8 @@
void exit_on_cuda_error(char* kernel_name);
+void exit_on_error(char* info);
+
/* ----------------------------------------------------------------------------------------------- */
// cuda constant arrays
@@ -207,6 +209,11 @@
float* d_station_seismo_field;
float* h_station_seismo_field;
+ int nadj_rec_local;
+ float* d_adj_sourcearrays;
+ float* h_adj_sourcearrays_slice;
+ int* d_pre_computed_irec;
+
// surface elements (to save for noise tomography and acoustic simulations)
int* d_free_surface_ispec;
int* d_free_surface_ijk;
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu 2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu 2011-10-31 21:45:35 UTC (rev 19137)
@@ -40,31 +40,6 @@
/* ----------------------------------------------------------------------------------------------- */
-__global__ void transfer_surface_to_host_kernel(int* free_surface_ispec,int* free_surface_ijk,
- int num_free_surface_faces, int* ibool,
- realw* displ, realw* noise_surface_movie) {
- int igll = threadIdx.x;
- int iface = blockIdx.x + blockIdx.y*gridDim.x;
-
- // int id = tx + blockIdx.x*blockDim.x + blockIdx.y*blockDim.x*gridDim.x;
-
- if(iface < num_free_surface_faces) {
- int ispec = free_surface_ispec[iface]-1; //-1 for C-based indexing
-
- int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
- int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
- int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
-
- int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
-
- noise_surface_movie[INDEX3(NDIM,NGLL2,0,igll,iface)] = displ[iglob*3];
- noise_surface_movie[INDEX3(NDIM,NGLL2,1,igll,iface)] = displ[iglob*3+1];
- noise_surface_movie[INDEX3(NDIM,NGLL2,2,igll,iface)] = displ[iglob*3+2];
- }
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
extern "C"
void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){
TRACE("fortranflush");
@@ -124,15 +99,43 @@
/* ----------------------------------------------------------------------------------------------- */
+__global__ void transfer_surface_to_host_kernel(int* free_surface_ispec,
+ int* free_surface_ijk,
+ int num_free_surface_faces,
+ int* ibool,
+ realw* displ,
+ realw* noise_surface_movie) {
+ int igll = threadIdx.x;
+ int iface = blockIdx.x + blockIdx.y*gridDim.x;
+
+ // int id = tx + blockIdx.x*blockDim.x + blockIdx.y*blockDim.x*gridDim.x;
+
+ if(iface < num_free_surface_faces) {
+ int ispec = free_surface_ispec[iface]-1; //-1 for C-based indexing
+
+ int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
+ int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
+ int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
+
+ int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+
+ noise_surface_movie[INDEX3(NDIM,NGLL2,0,igll,iface)] = displ[iglob*3];
+ noise_surface_movie[INDEX3(NDIM,NGLL2,1,igll,iface)] = displ[iglob*3+1];
+ noise_surface_movie[INDEX3(NDIM,NGLL2,2,igll,iface)] = displ[iglob*3+2];
+ }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
extern "C"
void FC_FUNC_(transfer_surface_to_host,
TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer_f,
- realw* h_noise_surface_movie,
- int* num_free_surface_faces) {
+ realw* h_noise_surface_movie) {
TRACE("transfer_surface_to_host");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
- int num_blocks_x = *num_free_surface_faces;
+
+ int num_blocks_x = mp->num_free_surface_faces;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
num_blocks_x = ceil(num_blocks_x/2.0);
@@ -143,13 +146,13 @@
transfer_surface_to_host_kernel<<<grid,threads>>>(mp->d_free_surface_ispec,
mp->d_free_surface_ijk,
- *num_free_surface_faces,
+ mp->num_free_surface_faces,
mp->d_ibool,
mp->d_displ,
mp->d_noise_surface_movie);
cudaMemcpy(h_noise_surface_movie,mp->d_noise_surface_movie,
- 3*25*(*num_free_surface_faces)*sizeof(realw),cudaMemcpyDeviceToHost);
+ 3*25*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyDeviceToHost);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("transfer_surface_to_host");
@@ -167,9 +170,8 @@
realw* normal_y_noise,
realw* normal_z_noise,
realw* mask_noise,
- realw* free_surface_jacobian2Dw,
- realw* wgllwgll_xy,
- float* d_debug) {
+ realw* free_surface_jacobian2Dw //,float* d_debug
+ ) {
int iface = blockIdx.x + gridDim.x*blockIdx.y; // surface element id
@@ -234,22 +236,24 @@
extern "C"
void FC_FUNC_(noise_read_add_surface_movie_cu,
NOISE_READ_ADD_SURFACE_MOVIE_CU)(long* Mesh_pointer_f,
- realw* h_noise_surface_movie,
- int* num_free_surface_faces_f,
- int* NOISE_TOMOGRAPHYf) {
+ realw* h_noise_surface_movie,
+ int* NOISE_TOMOGRAPHYf) {
TRACE("noise_read_add_surface_movie_cu");
// EPIK_TRACER("noise_read_add_surface_movie_cu");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- int num_free_surface_faces = *num_free_surface_faces_f;
int NOISE_TOMOGRAPHY = *NOISE_TOMOGRAPHYf;
- float* d_noise_surface_movie;
- cudaMalloc((void**)&d_noise_surface_movie,3*25*num_free_surface_faces*sizeof(float));
- cudaMemcpy(d_noise_surface_movie, h_noise_surface_movie,
- 3*25*num_free_surface_faces*sizeof(realw),cudaMemcpyHostToDevice);
- int num_blocks_x = num_free_surface_faces;
+ //float* d_noise_surface_movie;
+ //cudaMalloc((void**)&d_noise_surface_movie,3*25*num_free_surface_faces*sizeof(float));
+ //cudaMemcpy(d_noise_surface_movie, h_noise_surface_movie,
+ // 3*25*num_free_surface_faces*sizeof(realw),cudaMemcpyHostToDevice);
+
+ cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie,
+ 3*25*(mp->num_free_surface_faces)*sizeof(float),cudaMemcpyHostToDevice);
+
+ int num_blocks_x = mp->num_free_surface_faces;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
num_blocks_x = ceil(num_blocks_x/2.0);
@@ -259,50 +263,46 @@
dim3 threads(25,1,1);
// float* h_debug = (float*)calloc(128,sizeof(float));
- float* d_debug;
+ //float* d_debug;
// cudaMalloc((void**)&d_debug,128*sizeof(float));
// cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
if(NOISE_TOMOGRAPHY == 2) { // add surface source to forward field
noise_read_add_surface_movie_cuda_kernel<<<grid,threads>>>(mp->d_accel,
- mp->d_ibool,
- mp->d_free_surface_ispec,
- mp->d_free_surface_ijk,
- num_free_surface_faces,
- d_noise_surface_movie,
- mp->d_normal_x_noise,
- mp->d_normal_y_noise,
- mp->d_normal_z_noise,
- mp->d_mask_noise,
- mp->d_free_surface_jacobian2Dw,
- mp->d_wgllwgll_xy,
- d_debug);
+ mp->d_ibool,
+ mp->d_free_surface_ispec,
+ mp->d_free_surface_ijk,
+ mp->num_free_surface_faces,
+ mp->d_noise_surface_movie,
+ mp->d_normal_x_noise,
+ mp->d_normal_y_noise,
+ mp->d_normal_z_noise,
+ mp->d_mask_noise,
+ mp->d_free_surface_jacobian2Dw //,d_debug
+ );
}
else if(NOISE_TOMOGRAPHY == 3) { // add surface source to adjoint (backward) field
noise_read_add_surface_movie_cuda_kernel<<<grid,threads>>>(mp->d_b_accel,
- mp->d_ibool,
- mp->d_free_surface_ispec,
- mp->d_free_surface_ijk,
- num_free_surface_faces,
- d_noise_surface_movie,
- mp->d_normal_x_noise,
- mp->d_normal_y_noise,
- mp->d_normal_z_noise,
- mp->d_mask_noise,
- mp->d_free_surface_jacobian2Dw,
- mp->d_wgllwgll_xy,
- d_debug);
+ mp->d_ibool,
+ mp->d_free_surface_ispec,
+ mp->d_free_surface_ijk,
+ mp->num_free_surface_faces,
+ mp->d_noise_surface_movie,
+ mp->d_normal_x_noise,
+ mp->d_normal_y_noise,
+ mp->d_normal_z_noise,
+ mp->d_mask_noise,
+ mp->d_free_surface_jacobian2Dw //,d_debug
+ );
}
-
// cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
// for(int i=0;i<8;i++) {
// printf("debug[%d]= %e\n",i,h_debug[i]);
// }
// MPI_Abort(MPI_COMM_WORLD,1);
+ //cudaFree(d_noise_surface_movie);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("noise_read_add_surface_movie_cuda_kernel");
#endif
-
- cudaFree(d_noise_surface_movie);
}
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu 2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu 2011-10-31 21:45:35 UTC (rev 19137)
@@ -221,7 +221,7 @@
/* ----------------------------------------------------------------------------------------------- */
-//daniel
+//daniel: helper function
/*
__global__ void check_phase_ispec_kernel(int num_phase_ispec,
int* phase_ispec,
@@ -290,7 +290,7 @@
*/
/* ----------------------------------------------------------------------------------------------- */
-//daniel
+//daniel: helper function
/*
__global__ void check_ispec_is_kernel(int NSPEC_AB,
int* ispec_is,
@@ -353,7 +353,7 @@
}
*/
/* ----------------------------------------------------------------------------------------------- */
-//daniel
+//daniel: helper function
/*
__global__ void check_array_ispec_kernel(int num_array_ispec,
int* array_ispec,
@@ -481,7 +481,7 @@
}
// allocates mesh parameter structure
- Mesh* mp = (Mesh*)malloc(sizeof(Mesh));
+ Mesh* mp = (Mesh*) malloc( sizeof(Mesh) );
if (mp == NULL) exit_on_error("error allocating mesh pointer");
*Mesh_pointer = (long)mp;
@@ -664,7 +664,10 @@
PREPARE_SIM2_OR_3_CONST_DEVICE)(
long* Mesh_pointer_f,
int* islice_selected_rec,
- int* islice_selected_rec_size) {
+ int* islice_selected_rec_size,
+ int* nadj_rec_local,
+ int* nrec,
+ int* myrank) {
TRACE("prepare_sim2_or_3_const_device");
@@ -673,11 +676,44 @@
// allocates arrays for receivers
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_islice_selected_rec,
*islice_selected_rec_size*sizeof(int)),7001);
-
// copies arrays to GPU device
print_CUDA_error_if_any(cudaMemcpy(mp->d_islice_selected_rec,islice_selected_rec,
*islice_selected_rec_size*sizeof(int),cudaMemcpyHostToDevice),7002);
+ // adjoint source arrays
+ mp->nadj_rec_local = *nadj_rec_local;
+ if( mp->nadj_rec_local > 0 ){
+ print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_adj_sourcearrays,
+ (mp->nadj_rec_local)*3*125*sizeof(float)),7003);
+ print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_pre_computed_irec,
+ (mp->nadj_rec_local)*sizeof(int)),7004);
+
+ // prepares local irec array:
+ // the irec_local variable needs to be precomputed (as
+ // h_pre_comp..), because normally it is in the loop updating accel,
+ // and due to how it's incremented, it cannot be parallelized
+ int* h_pre_computed_irec = (int*) malloc( (mp->nadj_rec_local)*sizeof(int) );
+ if( h_pre_computed_irec == NULL ) exit_on_error("prepare_sim2_or_3_const_device: h_pre_computed_irec not allocated\n");
+
+ int irec_local = 0;
+ for(int irec = 0; irec < *nrec; irec++) {
+ if(*myrank == islice_selected_rec[irec]) {
+ irec_local++;
+ h_pre_computed_irec[irec_local-1] = irec;
+ }
+ }
+ if( irec_local != mp->nadj_rec_local ) exit_on_error("prepare_sim2_or_3_const_device: irec_local not equal\n");
+ // copies values onto GPU
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_pre_computed_irec,h_pre_computed_irec,
+ (mp->nadj_rec_local)*sizeof(int),cudaMemcpyHostToDevice),7010);
+ free(h_pre_computed_irec);
+
+ // temporary array to prepare extracted source array values
+ mp->h_adj_sourcearrays_slice = (float*) malloc( (mp->nadj_rec_local)*3*125*sizeof(float) );
+ if( mp->h_adj_sourcearrays_slice == NULL ) exit_on_error("h_adj_sourcearrays_slice not allocated\n");
+
+ }
+
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("prepare_sim2_or_3_const_device");
#endif
@@ -785,7 +821,7 @@
if( mp->nrec_local > 0 ){
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_potential),
mp->nrec_local*125*sizeof(float)),9107);
- mp->h_station_seismo_potential = (float*)malloc(mp->nrec_local*125*sizeof(float));
+ mp->h_station_seismo_potential = (float*) malloc( mp->nrec_local*125*sizeof(float) );
if( mp->h_station_seismo_potential == NULL) exit_on_error("error allocating h_station_seismo_potential");
}
@@ -848,7 +884,7 @@
// initializes kernel values to zero
print_CUDA_error_if_any(cudaMemset(mp->d_rho_ac_kl,0,
- 125*mp->NSPEC_AB*sizeof(float)),9019);
+ 125*mp->NSPEC_AB*sizeof(float)),9019);
print_CUDA_error_if_any(cudaMemset(mp->d_kappa_ac_kl,0,
125*mp->NSPEC_AB*sizeof(float)),9020);
@@ -945,7 +981,8 @@
if( mp->nrec_local > 0 ){
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_field),
3*125*(mp->nrec_local)*sizeof(float)),8015);
- mp->h_station_seismo_field = (float*)malloc(3*125*(mp->nrec_local)*sizeof(float));
+ mp->h_station_seismo_field = (float*) malloc( 3*125*(mp->nrec_local)*sizeof(float) );
+ if( mp->h_station_seismo_field == NULL) exit_on_error("h_station_seismo_field not allocated \n");
}
// absorbing conditions
@@ -1055,7 +1092,7 @@
}
-
+
if( *OCEANS ){
// oceans needs a free surface
mp->num_free_surface_faces = *num_free_surface_faces;
@@ -1131,12 +1168,12 @@
// initializes kernel values to zero
print_CUDA_error_if_any(cudaMemset(mp->d_rho_kl,0,
- 125*mp->NSPEC_AB*sizeof(float)),8207);
+ 125*mp->NSPEC_AB*sizeof(float)),8207);
print_CUDA_error_if_any(cudaMemset(mp->d_mu_kl,0,
- 125*mp->NSPEC_AB*sizeof(float)),8208);
+ 125*mp->NSPEC_AB*sizeof(float)),8208);
print_CUDA_error_if_any(cudaMemset(mp->d_kappa_kl,0,
- 125*mp->NSPEC_AB*sizeof(float)),8209);
-
+ 125*mp->NSPEC_AB*sizeof(float)),8209);
+
// strains used for attenuation and kernel simulations
if( *COMPUTE_AND_STORE_STRAIN ){
// strains
@@ -1319,8 +1356,8 @@
125*(mp->NSPEC_AB)*sizeof(float)),4401);
// initializes kernel values to zero
print_CUDA_error_if_any(cudaMemset(mp->d_Sigma_kl,0,
- 125*mp->NSPEC_AB*sizeof(float)),4403);
-
+ 125*mp->NSPEC_AB*sizeof(float)),4403);
+
}
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -1520,7 +1557,14 @@
} // ELASTIC_SIMULATION
// purely adjoint & kernel array
- if( *SIMULATION_TYPE == 2 || *SIMULATION_TYPE == 3 ) cudaFree(mp->d_islice_selected_rec);
+ if( *SIMULATION_TYPE == 2 || *SIMULATION_TYPE == 3 ){
+ cudaFree(mp->d_islice_selected_rec);
+ if(mp->nadj_rec_local > 0 ){
+ cudaFree(mp->d_adj_sourcearrays);
+ cudaFree(mp->d_pre_computed_irec);
+ free(mp->h_adj_sourcearrays_slice);
+ }
+ }
// NOISE arrays
if( *NOISE_TOMOGRAPHY > 0 ){
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c 2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/save_and_compare_cpu_vs_gpu.c 2011-10-31 21:45:35 UTC (rev 19137)
@@ -86,10 +86,10 @@
FILE* fp; int it;
printf("Opening %s for analysis\n",filename);
fp = fopen(filename,"rb");
- char* errorstr;
+ //char* errorstr;
if(fp == 0) {
- errorstr = strerror(errno);
- printf("FILE ERROR:%s\n",errorstr);
+ //errorstr = (char*) strerror(errno);
+ printf("FILE ERROR:%s\n",strerror(errno));
perror("file error\n");
exit(1);
}
@@ -165,18 +165,20 @@
FILE* fp_cpu;
fp_cpu = fopen(cpu_file,"rb");
- char* errorstr;
+ //char* errorstr;
if(fp_cpu == 0) {
- errorstr = strerror(errno);
- printf("CPU FILE ERROR:%s\n",errorstr);
+ //errorstr = (char*) strerror(errno);
+ //printf("CPU FILE ERROR:%s\n",errorstr);
+ printf("CPU FILE ERROR:%s\n",strerror(errno));
perror("cpu file error\n");
}
FILE* fp_gpu;
fp_gpu = fopen(gpu_file,"rb");
if(fp_gpu == NULL) {
- errorstr = strerror(errno);
- printf("GPU FILE ERROR:%s\n",errorstr);
+ //errorstr = (char*) strerror(errno);
+ //printf("GPU FILE ERROR:%s\n",errorstr);
+ printf("GPU FILE ERROR:%s\n",strerror(errno));
perror("gpu file error\n");
}
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c 2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c 2011-10-31 21:45:35 UTC (rev 19137)
@@ -70,14 +70,18 @@
int* irec_master_noise_f,
int* islice_selected_rec){}
-void FC_FUNC_(add_sources_sim_type_2_or_3,
- ADD_SOURCES_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
+void FC_FUNC_(add_sources_el_sim_type_2_or_3,
+ ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
float* h_adj_sourcearrays,
- int* size_adj_sourcearrays, int* ispec_is_inner,
- int* phase_is_inner, int* ispec_selected_rec,
- int* ibool,
- int* myrank, int* nrec, int* time_index,
- int* h_islice_selected_rec,int* nadj_rec_local,
+ int* phase_is_inner,
+ int* h_ispec_is_inner,
+ int* h_ispec_is_elastic,
+ int* h_ispec_selected_rec,
+ int* myrank,
+ int* nrec,
+ int* time_index,
+ int* h_islice_selected_rec,
+ int* nadj_rec_local,
int* NTSTEP_BETWEEN_READ_ADJSRC){}
void FC_FUNC_(compute_add_sources_ac_cuda,
@@ -100,15 +104,17 @@
void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer,
- float* h_adj_sourcearrays,
- int* size_adj_sourcearrays,
- int* phase_is_inner,
- int* myrank,
- int* nrec,
- int* pre_computed_index,
- int* h_islice_selected_rec,
- int* nadj_rec_local,
- int* NTSTEP_BETWEEN_ADJSRC){}
+ float* h_adj_sourcearrays,
+ int* phase_is_inner,
+ int* h_ispec_is_inner,
+ int* h_ispec_is_acoustic,
+ int* h_ispec_selected_rec,
+ int* myrank,
+ int* nrec,
+ int* time_index,
+ int* h_islice_selected_rec,
+ int* nadj_rec_local,
+ int* NTSTEP_BETWEEN_ADJSRC){}
/* from compute_coupling_cuda.cu */
@@ -232,9 +238,8 @@
void FC_FUNC_(compute_kernels_strgth_noise_cu,
COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer,
- float* h_noise_surface_movie,
- int* num_free_surface_faces_f,
- float* deltat){}
+ float* h_noise_surface_movie,
+ float* deltat){}
void FC_FUNC_(compute_kernels_acoustic_cuda,
COMPUTE_KERNELS_ACOUSTIC_CUDA)(
@@ -301,13 +306,11 @@
void FC_FUNC_(transfer_surface_to_host,
TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer_f,
- realw* h_noise_surface_movie,
- int* num_free_surface_faces){}
+ realw* h_noise_surface_movie){}
void FC_FUNC_(noise_read_add_surface_movie_cu,
NOISE_READ_ADD_SURFACE_MOVIE_CU)(long* Mesh_pointer_f,
realw* h_noise_surface_movie,
- int* num_free_surface_faces_f,
int* NOISE_TOMOGRAPHYf){}
@@ -363,7 +366,10 @@
PREPARE_SIM2_OR_3_CONST_DEVICE)(
long* Mesh_pointer_f,
int* islice_selected_rec,
- int* islice_selected_rec_size){}
+ int* islice_selected_rec_size,
+ int* nadj_rec_local,
+ int* nrec,
+ int* myrank){}
void FC_FUNC_(prepare_fields_acoustic_device,
PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f,
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu 2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu 2011-10-31 21:45:35 UTC (rev 19137)
@@ -48,7 +48,8 @@
int* ibool,
float* station_seismo_field,
float* desired_field,
- int nrec_local,int* debug_index) {
+ int nrec_local //,int* debug_index
+ ) {
int blockID = blockIdx.x + blockIdx.y*gridDim.x;
if(blockID<nrec_local) {
//int nodeID = threadIdx.x + blockID*blockDim.x;
@@ -98,7 +99,7 @@
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
- int* d_debug_index;
+ //int* d_debug_index;
//int* h_debug_index;
//cudaMalloc((void**)&d_debug_index,125*sizeof(int));
//h_debug_index = (int*)calloc(125,sizeof(int));
@@ -107,11 +108,12 @@
// prepare field transfer array on device
transfer_stations_fields_from_device_kernel<<<grid,threads>>>(mp->d_number_receiver_global,
- d_ispec_selected,
- mp->d_ibool,
- mp->d_station_seismo_field,
- d_field,
- mp->nrec_local,d_debug_index);
+ d_ispec_selected,
+ mp->d_ibool,
+ mp->d_station_seismo_field,
+ d_field,
+ mp->nrec_local //,d_debug_index
+ );
//cudaMemcpy(h_debug_index,d_debug_index,125*sizeof(int),cudaMemcpyDeviceToHost);
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/meshfem3D/check_mesh_quality.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/meshfem3D/check_mesh_quality.f90 2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/meshfem3D/check_mesh_quality.f90 2011-10-31 21:45:35 UTC (rev 19137)
@@ -90,6 +90,7 @@
!logical :: USE_OPENDX
!character(len=256):: line
+ integer,dimension(1) :: tmp_ispec_max_skewness,tmp_ispec_max_skewness_MPI
if (myrank == 0) then
@@ -167,7 +168,8 @@
if((myrank == skewness_max_rank) .and. (myrank /= 0)) then
- call send_i_t(ispec_max_skewness,1,0)
+ tmp_ispec_max_skewness(1) = ispec_max_skewness
+ call send_i_t(tmp_ispec_max_skewness,1,0)
end if
@@ -175,7 +177,8 @@
if(skewness_max_rank /= myrank) then
- call recv_i_t(ispec_max_skewness_MPI,1,skewness_max_rank)
+ call recv_i_t(tmp_ispec_max_skewness_MPI,1,skewness_max_rank)
+ ispec_max_skewness_MPI = tmp_ispec_max_skewness_MPI(1)
else
ispec_max_skewness_MPI = ispec_max_skewness
end if
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/check_mesh_resolution.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/check_mesh_resolution.f90 2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/check_mesh_resolution.f90 2011-10-31 21:45:35 UTC (rev 19137)
@@ -71,6 +71,7 @@
logical :: has_vs_zero
+ real(kind=CUSTOM_REAL),dimension(1) :: tmp_val
! initializations
if( DT <= 0.0d0) then
@@ -296,13 +297,16 @@
model_speed_max = vsmax_glob
endif
endif
- call bcast_all_cr(model_speed_max,1)
+ tmp_val(1) = model_speed_max
+ call bcast_all_cr(tmp_val,1)
+ model_speed_max = tmp_val(1)
! returns minimum period
if( myrank == 0 ) min_resolved_period = pmax_glob
- call bcast_all_cr(min_resolved_period,1)
+ tmp_val(1) = min_resolved_period
+ call bcast_all_cr(tmp_val,1)
+ min_resolved_period = tmp_val(1)
-
end subroutine check_mesh_resolution
!
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/write_c_binary.c
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/write_c_binary.c 2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/shared/write_c_binary.c 2011-10-31 21:45:35 UTC (rev 19137)
@@ -137,7 +137,7 @@
}
/*
-//daniel: debug checks file size
+//debug checks file size
// see:
//https://www.securecoding.cert.org/confluence/display/seccode/FIO19-C.+Do+not+use+fseek()+and+ftell()+to+compute+the+size+of+a+file
printf("file size: %lld \n",*filesize);
@@ -253,7 +253,7 @@
ret = 0;
/*
-//daniel: debug
+//debug
float dat[*length/4];
memcpy(dat,buffer,*length);
printf("buffer length: %d %d\n",*length,*index);
@@ -295,10 +295,8 @@
}
}
-//daniel: debug
-// printf("buffer done length: %d %d\n",donelen,*length);
-
-
+ //debug
+ // printf("buffer done length: %d %d\n",donelen,*length);
}
//void
@@ -355,7 +353,7 @@
}
/*
-//daniel: debug
+// debug
printf("position: %lld %d %d \n",pos,*length,*index);
printf("buffer done length: %d %d\n",donelen,*length);
float dat[*length/4];
@@ -367,7 +365,6 @@
printf("return buffer: %d %e \n",i,dat[i]);
}
*/
-
}
@@ -428,7 +425,7 @@
}
/*
- // daniel: debug check filesize below or above 2 GB
+ // debug check filesize below or above 2 GB
// filesize gives bytes needed; 4-byte integer limited to +- 2,147,483,648 bytes ~ 2 GB
float s = *filesize / 1024. / 1024. / 1024.;
if( s > 2.0 ){
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_acoustic.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_acoustic.f90 2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_acoustic.f90 2011-10-31 21:45:35 UTC (rev 19137)
@@ -339,12 +339,7 @@
if( it < NSTEP ) then
! receivers act as sources
- if( GPU_MODE) then
- call add_sources_ac_sim_2_or_3_cuda(Mesh_pointer, adj_sourcearrays, &
- size(adj_sourcearrays), phase_is_inner, myrank, nrec, &
- NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC),&
- islice_selected_rec, nadj_rec_local, NTSTEP_BETWEEN_READ_ADJSRC)
- else
+ if( .NOT. GPU_MODE ) then
irec_local = 0
do irec = 1,nrec
! add the source (only if this proc carries the source)
@@ -378,6 +373,15 @@
endif
endif
enddo ! nrec
+ else
+ ! on GPU
+ call add_sources_ac_sim_2_or_3_cuda(Mesh_pointer,adj_sourcearrays,phase_is_inner, &
+ ispec_is_inner,ispec_is_acoustic, &
+ ispec_selected_rec,myrank,nrec, &
+ NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), &
+ islice_selected_rec,nadj_rec_local, &
+ NTSTEP_BETWEEN_READ_ADJSRC)
+
endif ! GPU_MODE
endif ! it
endif ! nadj_rec_local > 0
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_elastic.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_elastic.f90 2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_elastic.f90 2011-10-31 21:45:35 UTC (rev 19137)
@@ -238,29 +238,32 @@
! adjoint simulations
if (SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3) then
- ! read in adjoint sources block by block (for memory consideration)
- ! e.g., in exploration experiments, both the number of receivers (nrec) and
- ! the number of time steps (NSTEP) are huge,
- ! which may cause problems since we have a large array:
- ! adj_sourcearrays(nadj_rec_local,NSTEP,NDIM,NGLLX,NGLLY,NGLLZ)
+ ! adds adjoint source in this partitions
+ if( nadj_rec_local > 0 ) then
- ! figure out if we need to read in a chunk of the adjoint source at this timestep
- it_sub_adj = ceiling( dble(it)/dble(NTSTEP_BETWEEN_READ_ADJSRC) ) !chunk_number
- ibool_read_adj_arrays = (((mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC) == 0)) .and. (nadj_rec_local > 0))
+ ! read in adjoint sources block by block (for memory consideration)
+ ! e.g., in exploration experiments, both the number of receivers (nrec) and
+ ! the number of time steps (NSTEP) are huge,
+ ! which may cause problems since we have a large array:
+ ! adj_sourcearrays(nadj_rec_local,NSTEP,NDIM,NGLLX,NGLLY,NGLLZ)
- ! needs to read in a new chunk/block of the adjoint source
- ! note that for each partition, we divide it into two parts --- boundaries and interior --- indicated by 'phase_is_inner'
- ! we first do calculations for the boudaries, and then start communication
- ! with other partitions while calculate for the inner part
- ! this must be done carefully, otherwise the adjoint sources may be added twice
- if (ibool_read_adj_arrays .and. (.not. phase_is_inner)) then
+ ! figure out if we need to read in a chunk of the adjoint source at this timestep
+ it_sub_adj = ceiling( dble(it)/dble(NTSTEP_BETWEEN_READ_ADJSRC) ) !chunk_number
+ ibool_read_adj_arrays = (((mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC) == 0)) .and. (nadj_rec_local > 0))
+ ! needs to read in a new chunk/block of the adjoint source
+ ! note that for each partition, we divide it into two parts --- boundaries and interior --- indicated by 'phase_is_inner'
+ ! we first do calculations for the boudaries, and then start communication
+ ! with other partitions while calculate for the inner part
+ ! this must be done carefully, otherwise the adjoint sources may be added twice
+ if (ibool_read_adj_arrays .and. (.not. phase_is_inner)) then
+
! allocates temporary source array
allocate(adj_sourcearray(NTSTEP_BETWEEN_READ_ADJSRC,NDIM,NGLLX,NGLLY,NGLLZ),stat=ier)
if( ier /= 0 ) stop 'error allocating array adj_sourcearray'
if (.not. SU_FORMAT) then
-!!! read ascii adjoint sources
+ !!! read ascii adjoint sources
irec_local = 0
do irec = 1, nrec
! compute source arrays
@@ -279,7 +282,7 @@
endif
enddo
else
-!!! read SU adjoint sources
+ !!! read SU adjoint sources
! range of the block we need to read
it_start = NSTEP - it_sub_adj*NTSTEP_BETWEEN_READ_ADJSRC + 1
it_end = it_start + NTSTEP_BETWEEN_READ_ADJSRC - 1
@@ -322,49 +325,53 @@
deallocate(adj_sourcearray)
- endif ! if(ibool_read_adj_arrays)
+ endif ! if(ibool_read_adj_arrays)
- if( it < NSTEP ) then
+ if( it < NSTEP ) then
if(.NOT. GPU_MODE) then
- ! receivers act as sources
- irec_local = 0
- do irec = 1,nrec
+ ! receivers act as sources
+ irec_local = 0
+ do irec = 1,nrec
- ! add the source (only if this proc carries the source)
- if (myrank == islice_selected_rec(irec)) then
- irec_local = irec_local + 1
+ ! add the source (only if this proc carries the source)
+ if (myrank == islice_selected_rec(irec)) then
+ irec_local = irec_local + 1
- ! checks if element is in phase_is_inner run
- if (ispec_is_inner(ispec_selected_rec(irec)) .eqv. phase_is_inner) then
+ ispec = ispec_selected_rec(irec)
+ if( ispec_is_elastic(ispec) ) then
- ! adds source array
- do k = 1,NGLLZ
- do j = 1,NGLLY
- do i = 1,NGLLX
- iglob = ibool(i,j,k,ispec_selected_rec(irec))
+ ! checks if element is in phase_is_inner run
+ if (ispec_is_inner(ispec_selected_rec(irec)) .eqv. phase_is_inner) then
- accel(:,iglob) = accel(:,iglob) &
+ ! adds source array
+ do k = 1,NGLLZ
+ do j = 1,NGLLY
+ do i = 1,NGLLX
+ iglob = ibool(i,j,k,ispec_selected_rec(irec))
+
+ accel(:,iglob) = accel(:,iglob) &
+ adj_sourcearrays(irec_local, &
NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), &
:,i,j,k)
- enddo
- enddo
+ enddo
enddo
-
- endif ! phase_is_inner
- endif
- enddo ! nrec
+ enddo
+ endif ! phase_is_inner
+ endif ! ispec_is_elastic
+ endif
+ enddo ! nrec
else ! GPU_MODE == .true.
- call add_sources_sim_type_2_or_3(Mesh_pointer, adj_sourcearrays, &
- size(adj_sourcearrays), ispec_is_inner,&
- phase_is_inner, ispec_selected_rec,ibool,myrank, nrec, &
- NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC),&
- islice_selected_rec, nadj_rec_local, NTSTEP_BETWEEN_READ_ADJSRC)
+ call add_sources_el_sim_type_2_or_3(Mesh_pointer,adj_sourcearrays,phase_is_inner, &
+ ispec_is_inner,ispec_is_elastic, &
+ ispec_selected_rec,myrank,nrec, &
+ NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), &
+ islice_selected_rec,nadj_rec_local, &
+ NTSTEP_BETWEEN_READ_ADJSRC)
endif ! GPU_MODE
- endif ! it
-
+ endif ! it
+ endif ! nadj_rec_local
endif !adjoint
! note: b_displ() is read in after Newark time scheme, thus
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90 2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90 2011-10-31 21:45:35 UTC (rev 19137)
@@ -443,9 +443,7 @@
! distributes routines according to chosen NGLLX in constants.h
-!daniel
-! note:
-! i put it here rather than in compute_forces_elastic_Dev.f90 because compiler complains that:
+!daniel: note -- i put it here rather than in compute_forces_elastic_Dev.f90 because compiler complains that:
! " The storage extent of the dummy argument exceeds that of the actual argument. "
subroutine compute_forces_elastic_Dev_sim1(iphase)
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/noise_tomography.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/noise_tomography.f90 2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/noise_tomography.f90 2011-10-31 21:45:35 UTC (rev 19137)
@@ -45,14 +45,14 @@
module user_noise_distribution
-!daniel: TODO -- this default value will produce errors
-! when using with the default example/noise_tomography
+!daniel: TODO -- setting USE_PIERO_DISTRIBUTION = .true. will produce errors
+! when using with the default example in "example/noise_tomography/"
! i left it here so that Max can run his example without changing this every time...
logical,parameter :: USE_PIERO_DISTRIBUTION = .true.
contains
-! wrapper function
+! wrapper function
! this subroutine must be modified by USERS for their own noise distribution
subroutine noise_distribution_direction(xcoord_in,ycoord_in,zcoord_in, &
@@ -116,7 +116,7 @@
! dummy assign to avoid compiler warnings
ldummy = xcoord_in
ldummy = ycoord_in
- ldummy = zcoord_in
+ ldummy = zcoord_in
end subroutine noise_distribution_direction_d
@@ -208,7 +208,7 @@
end subroutine noise_distribution_dir_non_uni
-
+
end module user_noise_distribution
@@ -250,7 +250,7 @@
logical, dimension(NSPEC_AB_VAL) :: ispec_is_acoustic
-!daniel: from global code...
+ !from global code...
!integer, dimension(NSPEC2D_TOP_VAL) :: ibelm_top ! equals free_surface_ispec
!integer :: NSPEC2D_TOP_VAL ! equals num_free_surface_faces
!integer :: nspec_top ! equals num_free_surface_faces
@@ -297,7 +297,7 @@
! noise distribution and noise direction
ipoin = 0
- !daniel: from global code, carefull: ngllz must not be face on top...
+ !from global code, carefull: ngllz must not be face on top...
! do ispec2D = 1, nspec_top
! ispec = ibelm_top(ispec2D)
! k = NGLLZ
@@ -321,13 +321,13 @@
ipoin = ipoin + 1
iglob = ibool(i,j,k,ispec)
-
+
! this subroutine must be modified by USERS in module user_noise_distribution
call noise_distribution_direction(xstore(iglob), &
ystore(iglob),zstore(iglob), &
normal_x_noise_out,normal_y_noise_out,normal_z_noise_out, &
- mask_noise_out)
-
+ mask_noise_out)
+
normal_x_noise(ipoin) = normal_x_noise_out
normal_y_noise(ipoin) = normal_y_noise_out
normal_z_noise(ipoin) = normal_z_noise_out
@@ -653,7 +653,7 @@
real(kind=CUSTOM_REAL),dimension(NDIM,NGLLSQUARE,num_free_surface_faces) :: noise_surface_movie
integer(kind=8) :: Mesh_pointer
logical :: GPU_MODE
-
+
! writes out wavefield at surface
if( num_free_surface_faces > 0 ) then
@@ -675,7 +675,7 @@
enddo
! TODO: Check if transfer_surface_to_hose is compatible with newer version above
else ! GPU_MODE == 1
- call transfer_surface_to_host(Mesh_pointer, noise_surface_movie, num_free_surface_faces)
+ call transfer_surface_to_host(Mesh_pointer,noise_surface_movie)
endif
! save surface motion to disk
@@ -743,8 +743,7 @@
call read_abs(2,noise_surface_movie,CUSTOM_REAL*NDIM*NGLLSQUARE*num_free_surface_faces,it)
if(GPU_MODE) then
- call noise_read_add_surface_movie_cu(Mesh_pointer, noise_surface_movie,&
- num_free_surface_faces,NOISE_TOMOGRAPHY)
+ call noise_read_add_surface_movie_cu(Mesh_pointer, noise_surface_movie,NOISE_TOMOGRAPHY)
else ! GPU_MODE==0
! get coordinates of surface mesh and surface displacement
@@ -780,7 +779,7 @@
endif ! GPU_MODE
endif
-
+
end subroutine noise_read_add_surface_movie
@@ -870,7 +869,7 @@
enddo
else ! GPU_MODE==1
- call compute_kernels_strgth_noise_cu(Mesh_pointer, noise_surface_movie,num_free_surface_faces,deltat)
+ call compute_kernels_strgth_noise_cu(Mesh_pointer,noise_surface_movie,deltat)
endif ! GPU_MODE
endif
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90 2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90 2011-10-31 21:45:35 UTC (rev 19137)
@@ -603,7 +603,7 @@
filesize = b_reclen_potential
filesize = filesize*NSTEP
- ! daniel: debug check size limit
+ ! debug check size limit
!if( NSTEP > 2147483647 / b_reclen_potential ) then
! print *,'file size needed exceeds integer 4-byte limit: ',b_reclen_potential,NSTEP
! print *,' ',CUSTOM_REAL, NGLLSQUARE, num_abs_boundary_faces,NSTEP
@@ -842,7 +842,8 @@
! prepares needed receiver array for adjoint runs
if( SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3 ) &
call prepare_sim2_or_3_const_device(Mesh_pointer, &
- islice_selected_rec,size(islice_selected_rec))
+ islice_selected_rec,size(islice_selected_rec), &
+ nadj_rec_local,nrec,myrank)
! prepares fields on GPU for noise simulations
if ( NOISE_TOMOGRAPHY > 0 ) then
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/read_mesh_databases.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/read_mesh_databases.f90 2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/read_mesh_databases.f90 2011-10-31 21:45:35 UTC (rev 19137)
@@ -323,7 +323,7 @@
! locate inner and outer elements
call rmd_setup_inner_outer_elemnts()
-!daniel: todo mesh coloring
+!daniel: TODO -- mesh coloring
! call rmd_setup_color_perm()
! gets model dimensions
@@ -513,7 +513,7 @@
!-------------------------------------------------------------------------------------------------
!
-!daniel: todo mesh coloring
+!daniel: TODO -- mesh coloring
subroutine rmd_setup_color_perm()
use specfem_par
@@ -697,7 +697,7 @@
! sorts array according to permutation
! SORT_MESH_INNER_OUTER
-!daniel
+!daniel: TODO -- mesh permutation
if( .false. ) then
! permutation of ibool
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/write_seismograms.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/write_seismograms.f90 2011-10-31 18:21:21 UTC (rev 19136)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/write_seismograms.f90 2011-10-31 21:45:35 UTC (rev 19137)
@@ -804,7 +804,7 @@
implicit none
character(len=256) procname,final_LOCAL_PATH
- integer :: irec_local,irec
+ integer :: irec_local,irec
! headers
integer,parameter :: nheader=240 ! 240 bytes
More information about the CIG-COMMITS
mailing list