[cig-commits] r19039 - in seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src: cuda specfem3D
danielpeter at geodynamics.org
danielpeter at geodynamics.org
Thu Oct 6 18:29:29 PDT 2011
Author: danielpeter
Date: 2011-10-06 18:29:29 -0700 (Thu, 06 Oct 2011)
New Revision: 19039
Modified:
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_kernels.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/initialize_simulation.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90
Log:
updates attenuation in elastic kernel; unravels elastic and noise simulation preparation
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu 2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu 2011-10-07 01:29:29 UTC (rev 19039)
@@ -172,7 +172,10 @@
/* ----------------------------------------------------------------------------------------------- */
-__global__ void add_source_master_rec_noise_cuda_kernel(int* ibool, int* ispec_selected_rec, int irec_master_noise, real* accel, real* noise_sourcearray, int it) {
+__global__ void add_source_master_rec_noise_cuda_kernel(int* ibool, int* ispec_selected_rec,
+ int irec_master_noise,
+ real* accel, real* noise_sourcearray,
+ int it) {
int tx = threadIdx.x;
int iglob = ibool[tx + 125*(ispec_selected_rec[irec_master_noise-1]-1)]-1;
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu 2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu 2011-10-07 01:29:29 UTC (rev 19039)
@@ -21,12 +21,33 @@
__constant__ float d_wgllwgll_yz[NGLL2];
-/* ----------------------------------------------------------------------------------------------- */
-
void Kernel_2(int nb_blocks_to_compute, Mesh* mp, int d_iphase,
- int COMPUTE_AND_STORE_STRAIN,int SIMULATION_TYPE);
+ int COMPUTE_AND_STORE_STRAIN,int SIMULATION_TYPE,int ATTENUATION);
+__global__ void Kernel_test(float* d_debug_output,int* d_phase_ispec_inner_elastic,
+ int num_phase_ispec_elastic, int d_iphase, int* d_ibool);
+__global__ void Kernel_2_impl(int nb_blocks_to_compute,int NGLOB, int* d_ibool,
+ int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic, int d_iphase,
+ float* d_displ, float* d_accel,
+ float* d_xix, float* d_xiy, float* d_xiz,
+ float* d_etax, float* d_etay, float* d_etaz,
+ float* d_gammax, float* d_gammay, float* d_gammaz,
+ float* d_kappav, float* d_muv,
+ float* d_debug,
+ int COMPUTE_AND_STORE_STRAIN,
+ float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,
+ float* epsilondev_xz,float* epsilondev_yz,float* epsilon_trace_over_3,
+ int SIMULATION_TYPE,
+ int ATTENUATION,int NSPEC,
+ float* one_minus_sum_beta,float* factor_common,
+ float* R_xx,float* R_yy,float* R_xy,float* R_xz,float* R_yz,
+ float* alphaval,float* betaval,float* gammaval);
+
+__global__ void kernel_3_cuda_device(real* veloc,
+ real* accel, int size,
+ real deltatover2, real* rmass);
+
/* ----------------------------------------------------------------------------------------------- */
// prepares a device array with with all inter-element edge-nodes -- this
@@ -237,7 +258,8 @@
int* nspec_outer_elastic,
int* nspec_inner_elastic,
int* COMPUTE_AND_STORE_STRAIN,
- int* SIMULATION_TYPE) {
+ int* SIMULATION_TYPE,
+ int* ATTENUATION) {
TRACE("compute_forces_elastic_cuda");
// EPIK_TRACER("compute_forces_elastic_cuda");
@@ -256,23 +278,135 @@
/* MPI_Comm_rank(MPI_COMM_WORLD,&myrank); */
/* if(myrank==0) { */
- Kernel_2(num_elements, mp, *iphase, *COMPUTE_AND_STORE_STRAIN,*SIMULATION_TYPE);
+ Kernel_2(num_elements,mp,*iphase,*COMPUTE_AND_STORE_STRAIN,*SIMULATION_TYPE,*ATTENUATION);
cudaThreadSynchronize();
/* MPI_Barrier(MPI_COMM_WORLD); */
}
+
/* ----------------------------------------------------------------------------------------------- */
-__global__ void Kernel_test(float* d_debug_output,int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic, int d_iphase, int* d_ibool);
+// updates stress
-__global__ void Kernel_2_impl(int nb_blocks_to_compute,int NGLOB, int* d_ibool,int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic, int d_iphase,float* d_displ, float* d_accel, float* d_xix, float* d_xiy, float* d_xiz, float* d_etax, float* d_etay, float* d_etaz, float* d_gammax, float* d_gammay, float* d_gammaz, float* d_kappav, float* d_muv,float* d_debug,int COMPUTE_AND_STORE_STRAIN,float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,float* epsilondev_xz,float* epsilondev_yz,float* epsilon_trace_over_3,int SIMULATION_TYPE);
+__device__ void compute_element_att_stress(int tx,int working_element,int NSPEC,
+ float* R_xx,
+ float* R_yy,
+ float* R_xy,
+ float* R_xz,
+ float* R_yz,
+ reald* sigma_xx,
+ reald* sigma_yy,
+ reald* sigma_zz,
+ reald* sigma_xy,
+ reald* sigma_xz,
+ reald* sigma_yz) {
+ int i_sls,offset_sls;
+ reald R_xx_val,R_yy_val;
+
+ for(i_sls = 0; i_sls < N_SLS; i_sls++){
+ // index
+ offset_sls = tx + 125*(working_element + NSPEC*i_sls);
+
+ R_xx_val = R_xx[offset_sls]; //(i,j,k,ispec,i_sls)
+ R_yy_val = R_yy[offset_sls];
+
+ *sigma_xx = *sigma_xx - R_xx_val;
+ *sigma_yy = *sigma_yy - R_yy_val;
+ *sigma_zz = *sigma_zz + R_xx_val + R_yy_val;
+ *sigma_xy = *sigma_xy - R_xy[offset_sls];
+ *sigma_xz = *sigma_xz - R_xz[offset_sls];
+ *sigma_yz = *sigma_yz - R_yz[offset_sls];
+ }
+ return;
+}
+
/* ----------------------------------------------------------------------------------------------- */
+// updates R_memory
+
+__device__ void compute_element_att_memory(int tx,int working_element,int NSPEC,
+ float* d_muv,
+ float* factor_common,
+ float* alphaval,float* betaval,float* gammaval,
+ float* R_xx,float* R_yy,float* R_xy,float* R_xz,float* R_yz,
+ float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,
+ float* epsilondev_xz,float* epsilondev_yz,
+ reald epsilondev_xx_loc,reald epsilondev_yy_loc,reald epsilondev_xy_loc,
+ reald epsilondev_xz_loc,reald epsilondev_yz_loc
+ ){
+
+ int i_sls;
+ int ijk_ispec;
+ int offset_sls,offset_align,offset_common;
+ reald mul;
+ reald alphaval_loc,betaval_loc,gammaval_loc;
+ reald factor_loc,Sn,Snp1;
+
+ // indices
+ offset_align = tx + 128 * working_element;
+ ijk_ispec = tx + 125 * working_element;
+
+ mul = d_muv[offset_align];
+
+ // use Runge-Kutta scheme to march in time
+ for(i_sls = 0; i_sls < N_SLS; i_sls++){
+
+ // indices
+ offset_common = i_sls + N_SLS*(tx + 125*working_element); // (i_sls,i,j,k,ispec)
+ offset_sls = tx + 125*(working_element + NSPEC*i_sls); // (i,j,k,ispec,i_sls)
+
+ factor_loc = mul * factor_common[offset_common]; //mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
+
+ alphaval_loc = alphaval[i_sls]; // (i_sls)
+ betaval_loc = betaval[i_sls];
+ gammaval_loc = gammaval[i_sls];
+
+ // term in xx
+ Sn = factor_loc * epsilondev_xx[ijk_ispec]; //(i,j,k,ispec)
+ Snp1 = factor_loc * epsilondev_xx_loc; //(i,j,k)
+
+ //R_xx(i,j,k,ispec,i_sls) = alphaval_loc * R_xx(i,j,k,ispec,i_sls) +
+ // betaval_loc * Sn + gammaval_loc * Snp1;
+
+ R_xx[offset_sls] = alphaval_loc * R_xx[offset_sls] +
+ betaval_loc * Sn + gammaval_loc * Snp1;
+
+ // term in yy
+ Sn = factor_loc * epsilondev_yy[ijk_ispec];
+ Snp1 = factor_loc * epsilondev_yy_loc;
+ R_yy[offset_sls] = alphaval_loc * R_yy[offset_sls] +
+ betaval_loc * Sn + gammaval_loc * Snp1;
+ // term in zz not computed since zero trace
+ // term in xy
+ Sn = factor_loc * epsilondev_xy[ijk_ispec];
+ Snp1 = factor_loc * epsilondev_xy_loc;
+ R_xy[offset_sls] = alphaval_loc * R_xy[offset_sls] +
+ betaval_loc * Sn + gammaval_loc * Snp1;
+ // term in xz
+ Sn = factor_loc * epsilondev_xz[ijk_ispec];
+ Snp1 = factor_loc * epsilondev_xz_loc;
+ R_xz[offset_sls] = alphaval_loc * R_xz[offset_sls] +
+ betaval_loc * Sn + gammaval_loc * Snp1;
+ // term in yz
+ Sn = factor_loc * epsilondev_yz[ijk_ispec];
+ Snp1 = factor_loc * epsilondev_yz_loc;
+ R_yz[offset_sls] = alphaval_loc * R_yz[offset_sls] +
+ betaval_loc * Sn + gammaval_loc * Snp1;
+ }
+ return;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// KERNEL 2
+
+/* ----------------------------------------------------------------------------------------------- */
+
void Kernel_2(int nb_blocks_to_compute, Mesh* mp, int d_iphase,
- int COMPUTE_AND_STORE_STRAIN,int SIMULATION_TYPE)
+ int COMPUTE_AND_STORE_STRAIN,int SIMULATION_TYPE,int ATTENUATION)
{
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -314,7 +448,8 @@
mp->d_xix, mp->d_xiy, mp->d_xiz,
mp->d_etax, mp->d_etay, mp->d_etaz,
mp->d_gammax, mp->d_gammay, mp->d_gammaz,
- mp->d_kappav, mp->d_muv,d_debug,
+ mp->d_kappav, mp->d_muv,
+ d_debug,
COMPUTE_AND_STORE_STRAIN,
mp->d_epsilondev_xx,
mp->d_epsilondev_yy,
@@ -322,8 +457,12 @@
mp->d_epsilondev_xz,
mp->d_epsilondev_yz,
mp->d_epsilon_trace_over_3,
- // 1);
- SIMULATION_TYPE);
+ SIMULATION_TYPE,
+ ATTENUATION,mp->NSPEC_AB,
+ mp->d_one_minus_sum_beta,mp->d_factor_common,
+ mp->d_R_xx,mp->d_R_yy,mp->d_R_xy,mp->d_R_xz,mp->d_R_yz,
+ mp->d_alphaval,mp->d_betaval,mp->d_gammaval
+ );
// cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
@@ -348,7 +487,8 @@
mp->d_xix, mp->d_xiy, mp->d_xiz,
mp->d_etax, mp->d_etay, mp->d_etaz,
mp->d_gammax, mp->d_gammay, mp->d_gammaz,
- mp->d_kappav, mp->d_muv,d_debug,
+ mp->d_kappav, mp->d_muv,
+ d_debug,
COMPUTE_AND_STORE_STRAIN,
mp->d_b_epsilondev_xx,
mp->d_b_epsilondev_yy,
@@ -356,7 +496,12 @@
mp->d_b_epsilondev_xz,
mp->d_b_epsilondev_yz,
mp->d_b_epsilon_trace_over_3,
- SIMULATION_TYPE);
+ SIMULATION_TYPE,
+ ATTENUATION,mp->NSPEC_AB,
+ mp->d_one_minus_sum_beta,mp->d_factor_common,
+ mp->d_b_R_xx,mp->d_b_R_yy,mp->d_b_R_xy,mp->d_b_R_xz,mp->d_b_R_yz,
+ mp->d_b_alphaval,mp->d_b_betaval,mp->d_b_gammaval
+ );
}
// cudaEventRecord( stop, 0 );
@@ -381,7 +526,8 @@
/* ----------------------------------------------------------------------------------------------- */
-__global__ void Kernel_test(float* d_debug_output,int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic, int d_iphase, int* d_ibool) {
+__global__ void Kernel_test(float* d_debug_output,int* d_phase_ispec_inner_elastic,
+ int num_phase_ispec_elastic, int d_iphase, int* d_ibool) {
int bx = blockIdx.x;
int tx = threadIdx.x;
int working_element;
@@ -411,18 +557,31 @@
// doesn't seem to change the performance.
// #define MANUALLY_UNROLLED_LOOPS
-/* ----------------------------------------------------------------------------------------------- */
-__global__ void Kernel_2_impl(int nb_blocks_to_compute,int NGLOB, int* d_ibool,int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic, int d_iphase,float* d_displ, float* d_accel, float* d_xix, float* d_xiy, float* d_xiz, float* d_etax, float* d_etay, float* d_etaz, float* d_gammax, float* d_gammay, float* d_gammaz, float* d_kappav, float* d_muv,float* d_debug,int COMPUTE_AND_STORE_STRAIN,float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,float* epsilondev_xz,float* epsilondev_yz,float* epsilon_trace_over_3,int SIMULATION_TYPE)
-{
+__global__ void Kernel_2_impl(int nb_blocks_to_compute,int NGLOB, int* d_ibool,
+ int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic, int d_iphase,
+ float* d_displ, float* d_accel,
+ float* d_xix, float* d_xiy, float* d_xiz,
+ float* d_etax, float* d_etay, float* d_etaz,
+ float* d_gammax, float* d_gammay, float* d_gammaz,
+ float* d_kappav, float* d_muv,
+ float* d_debug,
+ int COMPUTE_AND_STORE_STRAIN,
+ float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,
+ float* epsilondev_xz,float* epsilondev_yz,
+ float* epsilon_trace_over_3,
+ int SIMULATION_TYPE,
+ int ATTENUATION, int NSPEC,
+ float* one_minus_sum_beta,float* factor_common,
+ float* R_xx, float* R_yy, float* R_xy, float* R_xz, float* R_yz,
+ float* alphaval,float* betaval,float* gammaval
+ ){
/* int bx = blockIdx.y*blockDim.x+blockIdx.x; //possible bug in original code*/
int bx = blockIdx.y*gridDim.x+blockIdx.x;
/* int bx = blockIdx.x; */
- int tx = threadIdx.x;
-
+ int tx = threadIdx.x;
-
//const int NGLLX = 5;
// const int NGLL2 = 25;
const int NGLL3 = 125;
@@ -435,6 +594,7 @@
int active,offset;
int iglob = 0;
int working_element;
+
reald tempx1l,tempx2l,tempx3l,tempy1l,tempy2l,tempy3l,tempz1l,tempz2l,tempz3l;
reald xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
reald duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl;
@@ -442,6 +602,7 @@
reald duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl;
reald fac1,fac2,fac3,lambdal,mul,lambdalplus2mul,kappal;
reald sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz;
+ reald epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc;
#ifndef MANUALLY_UNROLLED_LOOPS
int l;
@@ -475,14 +636,14 @@
iglob = d_ibool[working_element*125 + tx]-1;
#ifdef USE_TEXTURES
- s_dummyx_loc[tx] = tex1Dfetch(tex_displ, iglob);
- s_dummyy_loc[tx] = tex1Dfetch(tex_displ, iglob + NGLOB);
- s_dummyz_loc[tx] = tex1Dfetch(tex_displ, iglob + 2*NGLOB);
+ s_dummyx_loc[tx] = tex1Dfetch(tex_displ, iglob);
+ s_dummyy_loc[tx] = tex1Dfetch(tex_displ, iglob + NGLOB);
+ s_dummyz_loc[tx] = tex1Dfetch(tex_displ, iglob + 2*NGLOB);
#else
- // changing iglob indexing to match fortran row changes fast style
- s_dummyx_loc[tx] = d_displ[iglob*3];
- s_dummyy_loc[tx] = d_displ[iglob*3 + 1];
- s_dummyz_loc[tx] = d_displ[iglob*3 + 2];
+ // changing iglob indexing to match fortran row changes fast style
+ s_dummyx_loc[tx] = d_displ[iglob*3];
+ s_dummyy_loc[tx] = d_displ[iglob*3 + 1];
+ s_dummyz_loc[tx] = d_displ[iglob*3 + 2];
#endif
}
@@ -497,179 +658,203 @@
#ifndef MANUALLY_UNROLLED_LOOPS
- tempx1l = 0.f;
- tempx2l = 0.f;
- tempx3l = 0.f;
+ tempx1l = 0.f;
+ tempx2l = 0.f;
+ tempx3l = 0.f;
- tempy1l = 0.f;
- tempy2l = 0.f;
- tempy3l = 0.f;
+ tempy1l = 0.f;
+ tempy2l = 0.f;
+ tempy3l = 0.f;
- tempz1l = 0.f;
- tempz2l = 0.f;
- tempz3l = 0.f;
+ tempz1l = 0.f;
+ tempz2l = 0.f;
+ tempz3l = 0.f;
- for (l=0;l<NGLLX;l++) {
- hp1 = d_hprime_xx[l*NGLLX+I];
- offset = K*NGLL2+J*NGLLX+l;
- tempx1l += s_dummyx_loc[offset]*hp1;
- tempy1l += s_dummyy_loc[offset]*hp1;
- tempz1l += s_dummyz_loc[offset]*hp1;
-
- hp2 = d_hprime_xx[l*NGLLX+J];
- offset = K*NGLL2+l*NGLLX+I;
- tempx2l += s_dummyx_loc[offset]*hp2;
- tempy2l += s_dummyy_loc[offset]*hp2;
- tempz2l += s_dummyz_loc[offset]*hp2;
+ for (l=0;l<NGLLX;l++) {
+ hp1 = d_hprime_xx[l*NGLLX+I];
+ offset = K*NGLL2+J*NGLLX+l;
+ tempx1l += s_dummyx_loc[offset]*hp1;
+ tempy1l += s_dummyy_loc[offset]*hp1;
+ tempz1l += s_dummyz_loc[offset]*hp1;
+
+ hp2 = d_hprime_xx[l*NGLLX+J];
+ offset = K*NGLL2+l*NGLLX+I;
+ tempx2l += s_dummyx_loc[offset]*hp2;
+ tempy2l += s_dummyy_loc[offset]*hp2;
+ tempz2l += s_dummyz_loc[offset]*hp2;
- hp3 = d_hprime_xx[l*NGLLX+K];
- offset = l*NGLL2+J*NGLLX+I;
- tempx3l += s_dummyx_loc[offset]*hp3;
- tempy3l += s_dummyy_loc[offset]*hp3;
- tempz3l += s_dummyz_loc[offset]*hp3;
+ hp3 = d_hprime_xx[l*NGLLX+K];
+ offset = l*NGLL2+J*NGLLX+I;
+ tempx3l += s_dummyx_loc[offset]*hp3;
+ tempy3l += s_dummyy_loc[offset]*hp3;
+ tempz3l += s_dummyz_loc[offset]*hp3;
- // if(working_element == 169 && tx == 0) {
- // atomicAdd(&d_debug[0],1.0);
- // d_debug[1+3*l] = tempz3l;
- // d_debug[2+3*l] = s_dummyz_loc[offset];
- // d_debug[3+3*l] = hp3;
- // }
-
- }
+ // if(working_element == 169 && tx == 0) {
+ // atomicAdd(&d_debug[0],1.0);
+ // d_debug[1+3*l] = tempz3l;
+ // d_debug[2+3*l] = s_dummyz_loc[offset];
+ // d_debug[3+3*l] = hp3;
+ // }
+
+ }
#else
- tempx1l = s_dummyx_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
- + s_dummyx_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
- + s_dummyx_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
- + s_dummyx_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
- + s_dummyx_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
-
- tempy1l = s_dummyy_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
- + s_dummyy_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
- + s_dummyy_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
- + s_dummyy_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
- + s_dummyy_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+ tempx1l = s_dummyx_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyx_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyx_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyx_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyx_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
- tempz1l = s_dummyz_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
- + s_dummyz_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
- + s_dummyz_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
- + s_dummyz_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
- + s_dummyz_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+ tempy1l = s_dummyy_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyy_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyy_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyy_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyy_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
- tempx2l = s_dummyx_loc[K*NGLL2+I]*d_hprime_xx[J]
- + s_dummyx_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
- + s_dummyx_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
- + s_dummyx_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
- + s_dummyx_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+ tempz1l = s_dummyz_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyz_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyz_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyz_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyz_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
- tempy2l = s_dummyy_loc[K*NGLL2+I]*d_hprime_xx[J]
- + s_dummyy_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
- + s_dummyy_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
- + s_dummyy_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
- + s_dummyy_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+ tempx2l = s_dummyx_loc[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyx_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyx_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyx_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyx_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
- tempz2l = s_dummyz_loc[K*NGLL2+I]*d_hprime_xx[J]
- + s_dummyz_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
- + s_dummyz_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
- + s_dummyz_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
- + s_dummyz_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+ tempy2l = s_dummyy_loc[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyy_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyy_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyy_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyy_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
- tempx3l = s_dummyx_loc[J*NGLLX+I]*d_hprime_xx[K]
- + s_dummyx_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
- + s_dummyx_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
- + s_dummyx_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
- + s_dummyx_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+ tempz2l = s_dummyz_loc[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyz_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyz_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyz_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyz_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
- tempy3l = s_dummyy_loc[J*NGLLX+I]*d_hprime_xx[K]
- + s_dummyy_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
- + s_dummyy_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
- + s_dummyy_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
- + s_dummyy_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+ tempx3l = s_dummyx_loc[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyx_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyx_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyx_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyx_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
- tempz3l = s_dummyz_loc[J*NGLLX+I]*d_hprime_xx[K]
- + s_dummyz_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
- + s_dummyz_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
- + s_dummyz_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
- + s_dummyz_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+ tempy3l = s_dummyy_loc[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyy_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyy_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyy_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyy_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+ tempz3l = s_dummyz_loc[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyz_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyz_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyz_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyz_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+
#endif
// compute derivatives of ux, uy and uz with respect to x, y and z
- offset = working_element*NGLL3_ALIGN + tx;
+ offset = working_element*NGLL3_ALIGN + tx;
- xixl = d_xix[offset];
- xiyl = d_xiy[offset];
- xizl = d_xiz[offset];
- etaxl = d_etax[offset];
- etayl = d_etay[offset];
- etazl = d_etaz[offset];
- gammaxl = d_gammax[offset];
- gammayl = d_gammay[offset];
- gammazl = d_gammaz[offset];
+ xixl = d_xix[offset];
+ xiyl = d_xiy[offset];
+ xizl = d_xiz[offset];
+ etaxl = d_etax[offset];
+ etayl = d_etay[offset];
+ etazl = d_etaz[offset];
+ gammaxl = d_gammax[offset];
+ gammayl = d_gammay[offset];
+ gammazl = d_gammaz[offset];
- duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l;
- duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l;
- duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l;
+ duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l;
+ duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l;
+ duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l;
- duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l;
- duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l;
- duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l;
+ duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l;
+ duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l;
+ duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l;
- duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l;
- duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l;
- duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l;
+ duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l;
+ duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l;
+ duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l;
-
-
- duxdxl_plus_duydyl = duxdxl + duydyl;
- duxdxl_plus_duzdzl = duxdxl + duzdzl;
- duydyl_plus_duzdzl = duydyl + duzdzl;
- duxdyl_plus_duydxl = duxdyl + duydxl;
- duzdxl_plus_duxdzl = duzdxl + duxdzl;
- duzdyl_plus_duydzl = duzdyl + duydzl;
- if(COMPUTE_AND_STORE_STRAIN) {
- float templ = 1.0f/3.0f * (duxdxl + duydyl + duzdzl);
- epsilondev_xx[offset] = duxdxl - templ;
- epsilondev_yy[offset] = duydyl - templ;
- epsilondev_xy[offset] = 0.5 * duxdyl_plus_duydxl;
- epsilondev_xz[offset] = 0.5 * duzdxl_plus_duxdzl;
- epsilondev_yz[offset] = 0.5 * duzdyl_plus_duydzl;
- if(SIMULATION_TYPE == 3) {
- epsilon_trace_over_3[tx + working_element*125] = templ;
- }
- }
-// compute elements with an elastic isotropic rheology
- kappal = d_kappav[offset];
- mul = d_muv[offset];
+ duxdxl_plus_duydyl = duxdxl + duydyl;
+ duxdxl_plus_duzdzl = duxdxl + duzdzl;
+ duydyl_plus_duzdzl = duydyl + duzdzl;
+ duxdyl_plus_duydxl = duxdyl + duydxl;
+ duzdxl_plus_duxdzl = duzdxl + duxdzl;
+ duzdyl_plus_duydzl = duzdyl + duydzl;
- lambdalplus2mul = kappal + 1.33333333333333333333f * mul; // 4./3. = 1.3333333
- lambdal = lambdalplus2mul - 2.f*mul;
+ // computes deviatoric strain attenuation and/or for kernel calculations
+ if(COMPUTE_AND_STORE_STRAIN) {
+ float templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
+ /*
+ epsilondev_xx[offset] = duxdxl - templ;
+ epsilondev_yy[offset] = duydyl - templ;
+ epsilondev_xy[offset] = 0.5f * duxdyl_plus_duydxl;
+ epsilondev_xz[offset] = 0.5f * duzdxl_plus_duxdzl;
+ epsilondev_yz[offset] = 0.5f * duzdyl_plus_duydzl;
+ */
+ // local storage: stresses at this current time step
+ epsilondev_xx_loc = duxdxl - templ;
+ epsilondev_yy_loc = duydyl - templ;
+ epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl;
+ epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl;
+ epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl;
-// compute the six components of the stress tensor sigma
- sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl;
- sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl;
- sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl;
+ if(SIMULATION_TYPE == 3) {
+ epsilon_trace_over_3[tx + working_element*125] = templ;
+ }
+ }
+
+ // compute elements with an elastic isotropic rheology
+ kappal = d_kappav[offset];
+ mul = d_muv[offset];
+
+ // attenuation
+ if(ATTENUATION){
+ // use unrelaxed parameters if attenuation
+ mul = mul * one_minus_sum_beta[tx+working_element*125]; // (i,j,k,ispec)
+ }
+
+ // isotropic case
+ lambdalplus2mul = kappal + 1.33333333333333333333f * mul; // 4./3. = 1.3333333
+ lambdal = lambdalplus2mul - 2.0f * mul;
+
+ // compute the six components of the stress tensor sigma
+ sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl;
+ sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl;
+ sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl;
- sigma_xy = mul*duxdyl_plus_duydxl;
- sigma_xz = mul*duzdxl_plus_duxdzl;
- sigma_yz = mul*duzdyl_plus_duydzl;
+ sigma_xy = mul*duxdyl_plus_duydxl;
+ sigma_xz = mul*duzdxl_plus_duxdzl;
+ sigma_yz = mul*duzdyl_plus_duydzl;
- jacobianl = 1.f / (xixl*(etayl*gammazl-etazl*gammayl)-xiyl*(etaxl*gammazl-etazl*gammaxl)+xizl*(etaxl*gammayl-etayl*gammaxl));
+ if(ATTENUATION){
+ // subtract memory variables if attenuation
+ compute_element_att_stress(tx,working_element,NSPEC,
+ R_xx,R_yy,R_xy,R_xz,R_yz,
+ &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_xz,&sigma_yz);
+ }
+
+ jacobianl = 1.0f / (xixl*(etayl*gammazl-etazl*gammayl)-xiyl*(etaxl*gammazl-etazl*gammaxl)+xizl*(etaxl*gammayl-etayl*gammaxl));
// form the dot product with the test vector
- s_tempx1[tx] = jacobianl * (sigma_xx*xixl + sigma_xy*xiyl + sigma_xz*xizl);
- s_tempy1[tx] = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_yz*xizl);
- s_tempz1[tx] = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl);
+ s_tempx1[tx] = jacobianl * (sigma_xx*xixl + sigma_xy*xiyl + sigma_xz*xizl);
+ s_tempy1[tx] = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_yz*xizl);
+ s_tempz1[tx] = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl);
- s_tempx2[tx] = jacobianl * (sigma_xx*etaxl + sigma_xy*etayl + sigma_xz*etazl);
- s_tempy2[tx] = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_yz*etazl);
- s_tempz2[tx] = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl);
+ s_tempx2[tx] = jacobianl * (sigma_xx*etaxl + sigma_xy*etayl + sigma_xz*etazl);
+ s_tempy2[tx] = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_yz*etazl);
+ s_tempz2[tx] = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl);
- s_tempx3[tx] = jacobianl * (sigma_xx*gammaxl + sigma_xy*gammayl + sigma_xz*gammazl);
- s_tempy3[tx] = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_yz*gammazl);
- s_tempz3[tx] = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl);
+ s_tempx3[tx] = jacobianl * (sigma_xx*gammaxl + sigma_xy*gammayl + sigma_xz*gammazl);
+ s_tempy3[tx] = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_yz*gammazl);
+ s_tempz3[tx] = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl);
}
@@ -682,152 +867,173 @@
#ifndef MANUALLY_UNROLLED_LOOPS
- tempx1l = 0.f;
- tempy1l = 0.f;
- tempz1l = 0.f;
+ tempx1l = 0.f;
+ tempy1l = 0.f;
+ tempz1l = 0.f;
- tempx2l = 0.f;
- tempy2l = 0.f;
- tempz2l = 0.f;
+ tempx2l = 0.f;
+ tempy2l = 0.f;
+ tempz2l = 0.f;
- tempx3l = 0.f;
- tempy3l = 0.f;
- tempz3l = 0.f;
+ tempx3l = 0.f;
+ tempy3l = 0.f;
+ tempz3l = 0.f;
- for (l=0;l<NGLLX;l++) {
+ for (l=0;l<NGLLX;l++) {
- fac1 = d_hprimewgll_xx[I*NGLLX+l];
- offset = K*NGLL2+J*NGLLX+l;
- tempx1l += s_tempx1[offset]*fac1;
- tempy1l += s_tempy1[offset]*fac1;
- tempz1l += s_tempz1[offset]*fac1;
-
- fac2 = d_hprimewgll_xx[J*NGLLX+l];
- offset = K*NGLL2+l*NGLLX+I;
- tempx2l += s_tempx2[offset]*fac2;
- tempy2l += s_tempy2[offset]*fac2;
- tempz2l += s_tempz2[offset]*fac2;
+ fac1 = d_hprimewgll_xx[I*NGLLX+l];
+ offset = K*NGLL2+J*NGLLX+l;
+ tempx1l += s_tempx1[offset]*fac1;
+ tempy1l += s_tempy1[offset]*fac1;
+ tempz1l += s_tempz1[offset]*fac1;
+
+ fac2 = d_hprimewgll_xx[J*NGLLX+l];
+ offset = K*NGLL2+l*NGLLX+I;
+ tempx2l += s_tempx2[offset]*fac2;
+ tempy2l += s_tempy2[offset]*fac2;
+ tempz2l += s_tempz2[offset]*fac2;
- fac3 = d_hprimewgll_xx[K*NGLLX+l];
- offset = l*NGLL2+J*NGLLX+I;
- tempx3l += s_tempx3[offset]*fac3;
- tempy3l += s_tempy3[offset]*fac3;
- tempz3l += s_tempz3[offset]*fac3;
+ fac3 = d_hprimewgll_xx[K*NGLLX+l];
+ offset = l*NGLL2+J*NGLLX+I;
+ tempx3l += s_tempx3[offset]*fac3;
+ tempy3l += s_tempy3[offset]*fac3;
+ tempz3l += s_tempz3[offset]*fac3;
- if(working_element == 169)
- if(l==0)
- if(I+J+K == 0) {
- // atomicAdd(&d_debug[0],1.0);
- // d_debug[0] = fac3;
- // d_debug[1] = offset;
- // d_debug[2] = s_tempz3[offset];
- }
- }
+ if(working_element == 169)
+ if(l==0)
+ if(I+J+K == 0) {
+ // atomicAdd(&d_debug[0],1.0);
+ // d_debug[0] = fac3;
+ // d_debug[1] = offset;
+ // d_debug[2] = s_tempz3[offset];
+ }
+ }
#else
- tempx1l = s_tempx1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
- + s_tempx1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
- + s_tempx1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
- + s_tempx1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
- + s_tempx1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
-
- tempy1l = s_tempy1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
- + s_tempy1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
- + s_tempy1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
- + s_tempy1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
- + s_tempy1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+ tempx1l = s_tempx1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+ + s_tempx1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+ + s_tempx1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+ + s_tempx1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+ + s_tempx1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
- tempz1l = s_tempz1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
- + s_tempz1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
- + s_tempz1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
- + s_tempz1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
- + s_tempz1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+ tempy1l = s_tempy1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+ + s_tempy1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+ + s_tempy1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+ + s_tempy1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+ + s_tempy1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
- tempx2l = s_tempx2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
- + s_tempx2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
- + s_tempx2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
- + s_tempx2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
- + s_tempx2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
+ tempz1l = s_tempz1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+ + s_tempz1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+ + s_tempz1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+ + s_tempz1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+ + s_tempz1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
- tempy2l = s_tempy2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
- + s_tempy2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
- + s_tempy2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
- + s_tempy2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
- + s_tempy2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
+ tempx2l = s_tempx2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
+ + s_tempx2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
+ + s_tempx2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
+ + s_tempx2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
+ + s_tempx2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
- tempz2l = s_tempz2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
- + s_tempz2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
- + s_tempz2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
- + s_tempz2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
- + s_tempz2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
+ tempy2l = s_tempy2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
+ + s_tempy2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
+ + s_tempy2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
+ + s_tempy2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
+ + s_tempy2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
- tempx3l = s_tempx3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
- + s_tempx3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
- + s_tempx3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
- + s_tempx3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
- + s_tempx3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
+ tempz2l = s_tempz2[K*NGLL2+I]*d_hprimewgll_xx[J*NGLLX]
+ + s_tempz2[K*NGLL2+NGLLX+I]*d_hprimewgll_xx[J*NGLLX+1]
+ + s_tempz2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+2]
+ + s_tempz2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+3]
+ + s_tempz2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_xx[J*NGLLX+4];
- tempy3l = s_tempy3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
- + s_tempy3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
- + s_tempy3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
- + s_tempy3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
- + s_tempy3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
+ tempx3l = s_tempx3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
+ + s_tempx3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
+ + s_tempx3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
+ + s_tempx3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
+ + s_tempx3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
- tempz3l = s_tempz3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
- + s_tempz3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
- + s_tempz3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
- + s_tempz3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
- + s_tempz3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
+ tempy3l = s_tempy3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
+ + s_tempy3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
+ + s_tempy3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
+ + s_tempy3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
+ + s_tempy3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
+ tempz3l = s_tempz3[J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX]
+ + s_tempz3[NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+1]
+ + s_tempz3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+2]
+ + s_tempz3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+3]
+ + s_tempz3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_xx[K*NGLLX+4];
+
#endif
- fac1 = d_wgllwgll_yz[K*NGLLX+J];
- fac2 = d_wgllwgll_xz[K*NGLLX+I];
- fac3 = d_wgllwgll_xy[J*NGLLX+I];
+ fac1 = d_wgllwgll_yz[K*NGLLX+J];
+ fac2 = d_wgllwgll_xz[K*NGLLX+I];
+ fac3 = d_wgllwgll_xy[J*NGLLX+I];
#ifdef USE_TEXTURES
- d_accel[iglob] = tex1Dfetch(tex_accel, iglob) - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
- d_accel[iglob + NGLOB] = tex1Dfetch(tex_accel, iglob + NGLOB) - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
- d_accel[iglob + 2*NGLOB] = tex1Dfetch(tex_accel, iglob + 2*NGLOB) - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
+ d_accel[iglob] = tex1Dfetch(tex_accel, iglob) - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
+ d_accel[iglob + NGLOB] = tex1Dfetch(tex_accel, iglob + NGLOB) - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
+ d_accel[iglob + 2*NGLOB] = tex1Dfetch(tex_accel, iglob + 2*NGLOB) - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
#else
/* OLD/To be implemented version that uses coloring to get around race condition. About 1.6x faster */
- // d_accel[iglob*3] -= (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
- // d_accel[iglob*3 + 1] -= (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
- // d_accel[iglob*3 + 2] -= (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
+ // d_accel[iglob*3] -= (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
+ // d_accel[iglob*3 + 1] -= (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
+ // d_accel[iglob*3 + 2] -= (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
- if(iglob*3+2 == 41153) {
- // int ot = d_debug[5];
- // d_debug[0+1+ot] = d_accel[iglob*3+2];
- // // d_debug[1+1+ot] = fac1*tempz1l;
- // // d_debug[2+1+ot] = fac2*tempz2l;
- // // d_debug[3+1+ot] = fac3*tempz3l;
- // d_debug[1+1+ot] = fac1;
- // d_debug[2+1+ot] = fac2;
- // d_debug[3+1+ot] = fac3;
- // d_debug[4+1+ot] = d_accel[iglob*3+2]-(fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
- // atomicAdd(&d_debug[0],1.0);
- // d_debug[6+ot] = d_displ[iglob*3+2];
- }
+ if(iglob*3+2 == 41153) {
+ // int ot = d_debug[5];
+ // d_debug[0+1+ot] = d_accel[iglob*3+2];
+ // // d_debug[1+1+ot] = fac1*tempz1l;
+ // // d_debug[2+1+ot] = fac2*tempz2l;
+ // // d_debug[3+1+ot] = fac3*tempz3l;
+ // d_debug[1+1+ot] = fac1;
+ // d_debug[2+1+ot] = fac2;
+ // d_debug[3+1+ot] = fac3;
+ // d_debug[4+1+ot] = d_accel[iglob*3+2]-(fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
+ // atomicAdd(&d_debug[0],1.0);
+ // d_debug[6+ot] = d_displ[iglob*3+2];
+ }
- atomicAdd(&d_accel[iglob*3],-(fac1*tempx1l + fac2*tempx2l + fac3*tempx3l));
- atomicAdd(&d_accel[iglob*3+1],-(fac1*tempy1l + fac2*tempy2l + fac3*tempy3l));
- atomicAdd(&d_accel[iglob*3+2],-(fac1*tempz1l + fac2*tempz2l + fac3*tempz3l));
+ atomicAdd(&d_accel[iglob*3],-(fac1*tempx1l + fac2*tempx2l + fac3*tempx3l));
+ atomicAdd(&d_accel[iglob*3+1],-(fac1*tempy1l + fac2*tempy2l + fac3*tempy3l));
+ atomicAdd(&d_accel[iglob*3+2],-(fac1*tempz1l + fac2*tempz2l + fac3*tempz3l));
#endif
+
+ // update memory variables based upon the Runge-Kutta scheme
+ if( ATTENUATION ){
+ compute_element_att_memory(tx,working_element,NSPEC,
+ d_muv,
+ factor_common,alphaval,betaval,gammaval,
+ R_xx,R_yy,R_xy,R_xz,R_yz,
+ epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz,
+ epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc);
+ }
+
+ // save deviatoric strain for Runge-Kutta scheme
+ if( COMPUTE_AND_STORE_STRAIN ){
+ int ijk_ispec = tx + working_element*125;
+
+ // fortran: epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+ epsilondev_xx[ijk_ispec] = epsilondev_xx_loc;
+ epsilondev_yy[ijk_ispec] = epsilondev_yy_loc;
+ epsilondev_xy[ijk_ispec] = epsilondev_xy_loc;
+ epsilondev_xz[ijk_ispec] = epsilondev_xz_loc;
+ epsilondev_yz[ijk_ispec] = epsilondev_yz_loc;
+ }
+
}
#else // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
- d_accel[iglob] -= 0.00000001f;
- d_accel[iglob + NGLOB] -= 0.00000001f;
- d_accel[iglob + 2*NGLOB] -= 0.00000001f;
+ d_accel[iglob] -= 0.00000001f;
+ d_accel[iglob + NGLOB] -= 0.00000001f;
+ d_accel[iglob + 2*NGLOB] -= 0.00000001f;
#endif // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
}
/* ----------------------------------------------------------------------------------------------- */
-__global__ void kernel_3_cuda_device(real* veloc,
- real* accel, int size,
- real deltatover2, real* rmass);
+// KERNEL 3
/* ----------------------------------------------------------------------------------------------- */
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu 2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu 2011-10-07 01:29:29 UTC (rev 19039)
@@ -159,29 +159,14 @@
#endif
}
+
/* ----------------------------------------------------------------------------------------------- */
-extern "C"
-void FC_FUNC_(transfer_sensitivity_kernels_to_host,
- TRANSFER_SENSITIVITY_KERNELS_TO_HOST)(long* Mesh_pointer, float* h_rho_kl,
- float* h_mu_kl, float* h_kappa_kl,
- float* h_Sigma_kl,int* NSPEC_AB,int* NSPEC_AB_VAL) {
+// NOISE SIMULATIONS
- Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
-
- print_CUDA_error_if_any(cudaMemcpy(h_rho_kl,mp->d_rho_kl,*NSPEC_AB*125*sizeof(float),
- cudaMemcpyDeviceToHost),1);
- print_CUDA_error_if_any(cudaMemcpy(h_mu_kl,mp->d_mu_kl,*NSPEC_AB*125*sizeof(float),
- cudaMemcpyDeviceToHost),1);
- print_CUDA_error_if_any(cudaMemcpy(h_kappa_kl,mp->d_kappa_kl,*NSPEC_AB*125*sizeof(float),
- cudaMemcpyDeviceToHost),1);
- print_CUDA_error_if_any(cudaMemcpy(h_Sigma_kl,mp->d_Sigma_kl,125*(*NSPEC_AB_VAL)*sizeof(float),
- cudaMemcpyHostToDevice),4);
-
-}
-
/* ----------------------------------------------------------------------------------------------- */
+
__global__ void compute_kernels_strength_noise_cuda_kernel(float* displ,
int* free_surface_ispec,
int* free_surface_ijk,
@@ -236,6 +221,9 @@
float* h_noise_surface_movie,
int* num_free_surface_faces_f,
float* deltat) {
+
+TRACE("compute_kernels_strength_noise_cuda");
+
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
int num_free_surface_faces = *num_free_surface_faces_f;
@@ -498,24 +486,3 @@
#endif
}
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(transfer_sensitivity_kernels_acoustic_to_host,
- TRANSFER_SENSITIVITY_KERNELS_ACOUSTIC_TO_HOST)(long* Mesh_pointer,
- float* h_rho_ac_kl,
- float* h_kappa_ac_kl,
- int* NSPEC_AB) {
-
-TRACE("transfer_sensitivity_kernels_acoustic_to_host");
-
- //get mesh pointer out of fortran integer container
- Mesh* mp = (Mesh*)(*Mesh_pointer);
- int size = *NSPEC_AB*125;
-
- // copies kernel values over to CPU host
- print_CUDA_error_if_any(cudaMemcpy(h_rho_ac_kl,mp->d_rho_ac_kl,size*sizeof(float),
- cudaMemcpyDeviceToHost),911);
- print_CUDA_error_if_any(cudaMemcpy(h_kappa_ac_kl,mp->d_kappa_ac_kl,size*sizeof(float),
- cudaMemcpyDeviceToHost),922);
-}
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu 2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu 2011-10-07 01:29:29 UTC (rev 19039)
@@ -53,7 +53,9 @@
// determines bulk sound speed
rhol = rhostore[INDEX4_PADDED(NGLLX,NGLLX,NGLLX,i,j,k,ispec)];
+
kappal = kappastore[INDEX4(NGLLX,NGLLX,NGLLX,i,j,k,ispec)];
+
cpl = sqrt( kappal / rhol );
// gets associated, weighted jacobian
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h 2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h 2011-10-07 01:29:29 UTC (rev 19039)
@@ -63,6 +63,7 @@
#define NDIM 3
#define NGLLX 5
#define NGLL2 25
+#define N_SLS 3
typedef float real; // type of variables passed into function
typedef float realw; // type of "working" variables
@@ -156,13 +157,29 @@
// surface movie elements to save for noise tomography
float* d_noise_surface_movie;
+ // attenuation
+ float* d_R_xx;
+ float* d_R_yy;
+ float* d_R_xy;
+ float* d_R_xz;
+ float* d_R_yz;
+
+ float* d_one_minus_sum_beta;
+ float* d_factor_common;
+
+ float* d_alphaval;
+ float* d_betaval;
+ float* d_gammaval;
+
+ // attenuation & kernel
float* d_epsilondev_xx;
float* d_epsilondev_yy;
float* d_epsilondev_xy;
float* d_epsilondev_xz;
float* d_epsilondev_yz;
float* d_epsilon_trace_over_3;
-
+
+ // noise
float* d_normal_x_noise;
float* d_normal_y_noise;
float* d_normal_z_noise;
@@ -171,17 +188,30 @@
float* d_noise_sourcearray;
+ // attenuation & kernel backward fields
+ float* d_b_R_xx;
+ float* d_b_R_yy;
+ float* d_b_R_xy;
+ float* d_b_R_xz;
+ float* d_b_R_yz;
+
float* d_b_epsilondev_xx;
float* d_b_epsilondev_yy;
float* d_b_epsilondev_xy;
float* d_b_epsilondev_xz;
float* d_b_epsilondev_yz;
float* d_b_epsilon_trace_over_3;
+
+ float* d_b_alphaval;
+ float* d_b_betaval;
+ float* d_b_gammaval;
// sensitivity kernels
float* d_rho_kl;
float* d_mu_kl;
float* d_kappa_kl;
+
+ // noise sensitivity kernel
float* d_Sigma_kl;
// ------------------------------------------------------------------ //
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu 2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu 2011-10-07 01:29:29 UTC (rev 19039)
@@ -120,7 +120,17 @@
/* ----------------------------------------------------------------------------------------------- */
-__global__ void noise_read_add_surface_movie_cuda_kernel(real* accel, int* ibool, int* free_surface_ispec,int* free_surface_ijk, int num_free_surface_faces, real* noise_surface_movie, real* normal_x_noise, real* normal_y_noise, real* normal_z_noise, real* mask_noise, real* free_surface_jacobian2Dw, real* wgllwgll_xy,float* d_debug) {
+__global__ void noise_read_add_surface_movie_cuda_kernel(real* accel, int* ibool,
+ int* free_surface_ispec,int* free_surface_ijk,
+ int num_free_surface_faces,
+ real* noise_surface_movie,
+ real* normal_x_noise,
+ real* normal_y_noise,
+ real* normal_z_noise,
+ real* mask_noise,
+ real* free_surface_jacobian2Dw,
+ real* wgllwgll_xy,
+ float* d_debug) {
int iface = blockIdx.x + gridDim.x*blockIdx.y; // surface element id
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h 2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h 2011-10-07 01:29:29 UTC (rev 19039)
@@ -92,7 +92,7 @@
void setConst_wgllwgll_xz(float* array, Mesh* mp);
void setConst_wgllwgll_yz(float* array, Mesh* mp);
-void exit_on_cuda_error(char* kernel_name);
-void show_free_memory(char* info_str);
+//void exit_on_cuda_error(char* kernel_name);
+//void show_free_memory(char* info_str);
#endif //CUDA_HEADER_H
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu 2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu 2011-10-07 01:29:29 UTC (rev 19039)
@@ -70,11 +70,24 @@
/* ----------------------------------------------------------------------------------------------- */
+void exit_on_error(char* info)
+{
+ printf("\nERROR: %s\n",info);
+ fflush(stdout);
+#ifdef USE_MPI
+ MPI_Abort(MPI_COMM_WORLD,1);
+#endif
+ exit(EXIT_FAILURE);
+ return;
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
void print_CUDA_error_if_any(cudaError_t err, int num)
{
if (cudaSuccess != err)
{
- printf("\nCUDA error !!!!! <%s> !!!!! at CUDA call # %d\n",cudaGetErrorString(err),num);
+ printf("\nCUDA error !!!!! <%s> !!!!! \nat CUDA call # %d\n",cudaGetErrorString(err),num);
fflush(stdout);
#ifdef USE_MPI
MPI_Abort(MPI_COMM_WORLD,1);
@@ -114,17 +127,6 @@
double free_db,used_db,total_db;
get_free_memory(&free_db,&used_db,&total_db);
- // size_t free_byte ;
- // size_t total_byte ;
- // cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ;
- // if ( cudaSuccess != cuda_status ){
- // printf("Error: cudaMemGetInfo fails, %s \n", cudaGetErrorString(cuda_status) );
- // exit(1);
- // }
- //
- // double free_db = (double)free_byte ;
- // double total_db = (double)total_byte ;
- // double used_db = total_db - free_db ;
sprintf(filename,"../in_out_files/OUTPUT_FILES/gpu_mem_usage_proc_%03d.txt",myrank);
fp = fopen(filename,"a+");
@@ -157,18 +159,6 @@
get_free_memory(&free_db,&used_db,&total_db);
-// size_t free_byte ;
-// size_t total_byte ;
-// cudaError_t cuda_status = cudaMemGetInfo( &free_byte, &total_byte ) ;
-// if ( cudaSuccess != cuda_status ){
-// printf("Error: cudaMemGetInfo fails, %s \n", cudaGetErrorString(cuda_status) );
-// exit(1);
-// }
-//
-// double free_db = (double)free_byte ;
-// double total_db = (double)total_byte ;
-// double used_db = total_db - free_db ;
-
printf("%d: @%s GPU memory usage: used = %f MB, free = %f MB, total = %f MB\n", myrank, info_str,
used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
@@ -209,183 +199,6 @@
/* ----------------------------------------------------------------------------------------------- */
-//void prepare_constants(int NGLLX, int NSPEC_AB, int NGLOB_AB,
-// float* h_xix, float* h_xiy, float* h_xiz,
-// float** d_xix, float** d_xiy, float** d_xiz,
-// float* h_etax, float* h_etay, float* h_etaz,
-// float** d_etax, float** d_etay, float** d_etaz,
-// float* h_gammax, float* h_gammay, float* h_gammaz,
-// float** d_gammax, float** d_gammay, float** d_gammaz,
-// float* h_kappav, float* h_muv,
-// float** d_kappav, float** d_muv,
-// int* h_ibool, int** d_ibool,
-// //int* h_phase_ispec_inner_elastic, int** d_phase_ispec_inner_elastic,
-// //int num_phase_ispec_elastic,
-// //float* h_rmass, float** d_rmass,
-// int num_interfaces_ext_mesh, int max_nibool_interfaces_ext_mesh,
-// int* h_nibool_interfaces_ext_mesh, int** d_nibool_interfaces_ext_mesh,
-// int* h_ibool_interfaces_ext_mesh, int** d_ibool_interfaces_ext_mesh,
-// float* h_hprime_xx, float* h_hprimewgll_xx,
-// float* h_wgllwgll_xy, float* h_wgllwgll_xz,
-// float* h_wgllwgll_yz,
-// int* h_abs_boundary_ispec, int** d_abs_boundary_ispec,
-// int* h_abs_boundary_ijk, int** d_abs_boundary_ijk,
-// float* h_abs_boundary_normal, float** d_abs_boundary_normal,
-// //float* h_rho_vp,float** d_rho_vp,
-// //float* h_rho_vs,float** d_rho_vs,
-// float* h_abs_boundary_jacobian2Dw,float** d_abs_boundary_jacobian2Dw,
-// float* h_b_absorb_field,float** d_b_absorb_field,
-// int num_abs_boundary_faces, int b_num_abs_boundary_faces,
-// int* h_ispec_is_inner, int** d_ispec_is_inner,
-// //int* h_ispec_is_elastic, int** d_ispec_is_elastic,
-// int NSOURCES,
-// float* h_sourcearrays,float** d_sourcearrays,
-// int* h_islice_selected_source, int** d_islice_selected_source,
-// int* h_ispec_selected_source, int** d_ispec_selected_source,
-// int SIMULATION_TYPE){
-//
-//TRACE("prepare_constants");
-//
-// // EPIK_USER_REG(r_name,"compute_forces");
-// // EPIK_USER_REG(r_name,
-//
-// /* Assuming NGLLX=5. Padded is then 128 (5^3+3) */
-// int size_padded = 128*NSPEC_AB;
-// int size = NGLLX*NGLLX*NGLLX*NSPEC_AB;
-//
-// // mesh
-// print_CUDA_error_if_any(cudaMalloc((void**) d_xix, size_padded*sizeof(float)),5);
-// print_CUDA_error_if_any(cudaMalloc((void**) d_xiy, size_padded*sizeof(float)),6);
-// print_CUDA_error_if_any(cudaMalloc((void**) d_xiz, size_padded*sizeof(float)),7);
-// print_CUDA_error_if_any(cudaMalloc((void**) d_etax, size_padded*sizeof(float)),8);
-// print_CUDA_error_if_any(cudaMalloc((void**) d_etay, size_padded*sizeof(float)),9);
-// print_CUDA_error_if_any(cudaMalloc((void**) d_etaz, size_padded*sizeof(float)),10);
-// print_CUDA_error_if_any(cudaMalloc((void**) d_gammax, size_padded*sizeof(float)),11);
-// print_CUDA_error_if_any(cudaMalloc((void**) d_gammay, size_padded*sizeof(float)),12);
-// print_CUDA_error_if_any(cudaMalloc((void**) d_gammaz, size_padded*sizeof(float)),13);
-// print_CUDA_error_if_any(cudaMalloc((void**) d_kappav, size_padded*sizeof(float)),14);
-// print_CUDA_error_if_any(cudaMalloc((void**) d_muv, size_padded*sizeof(float)),15);
-// print_CUDA_error_if_any(cudaMalloc((void**) d_ibool, size_padded*sizeof(int)),16);
-//
-//// print_CUDA_error_if_any(cudaMalloc((void**) d_phase_ispec_inner_elastic, num_phase_ispec_elastic*2*sizeof(int)),17);
-//// print_CUDA_error_if_any(cudaMalloc((void**) d_rmass, NGLOB_AB*sizeof(float)),17);
-//
-// // absorbing boundaries
-// if( num_abs_boundary_faces > 0 ){
-// print_CUDA_error_if_any(cudaMalloc((void**) d_abs_boundary_ispec,
-// num_abs_boundary_faces*sizeof(int)),769);
-// print_CUDA_error_if_any(cudaMemcpy(*d_abs_boundary_ispec, h_abs_boundary_ispec,
-// num_abs_boundary_faces*sizeof(int),cudaMemcpyHostToDevice),770);
-//
-// print_CUDA_error_if_any(cudaMalloc((void**) d_abs_boundary_ijk,
-// 3*25*num_abs_boundary_faces*sizeof(int)),772);
-// print_CUDA_error_if_any(cudaMemcpy(*d_abs_boundary_ijk, h_abs_boundary_ijk,
-// 3*25*num_abs_boundary_faces*sizeof(int),cudaMemcpyHostToDevice),773);
-//
-// print_CUDA_error_if_any(cudaMalloc((void**) d_abs_boundary_normal,
-// 3*25*num_abs_boundary_faces*sizeof(int)),783);
-// print_CUDA_error_if_any(cudaMemcpy(*d_abs_boundary_normal, h_abs_boundary_normal,
-// 3*25*num_abs_boundary_faces*sizeof(int),cudaMemcpyHostToDevice),783);
-//
-// print_CUDA_error_if_any(cudaMalloc((void**) d_abs_boundary_jacobian2Dw,
-// 25*num_abs_boundary_faces*sizeof(float)),784);
-// print_CUDA_error_if_any(cudaMemcpy(*d_abs_boundary_jacobian2Dw, h_abs_boundary_jacobian2Dw,
-// 25*num_abs_boundary_faces*sizeof(float),cudaMemcpyHostToDevice),784);
-// }
-//
-//
-///*
-// print_CUDA_error_if_any(cudaMalloc((void**) d_rho_vp, size*sizeof(float)),5);
-// print_CUDA_error_if_any(cudaMalloc((void**) d_rho_vs, size*sizeof(float)),6);
-// print_CUDA_error_if_any(cudaMemcpy(*d_rho_vp,h_rho_vp,size*sizeof(float),
-// cudaMemcpyHostToDevice),5);
-// print_CUDA_error_if_any(cudaMemcpy(*d_rho_vs,h_rho_vs,size*sizeof(float),
-// cudaMemcpyHostToDevice),5);
-// print_CUDA_error_if_any(cudaMalloc((void**) d_b_absorb_field, 3*25*b_num_abs_boundary_faces*sizeof(float)),7);
-// print_CUDA_error_if_any(cudaMemcpy(*d_b_absorb_field, h_b_absorb_field,
-// 3*25*b_num_abs_boundary_faces*sizeof(float),
-// cudaMemcpyHostToDevice),7);
-//
-// print_CUDA_error_if_any(cudaMemcpy(*d_rmass,h_rmass,NGLOB_AB*sizeof(float),cudaMemcpyHostToDevice),18);
-//*/
-//
-// // prepare interprocess-edge exchange information
-// print_CUDA_error_if_any(cudaMalloc((void**) d_nibool_interfaces_ext_mesh,
-// num_interfaces_ext_mesh*sizeof(int)),19);
-// print_CUDA_error_if_any(cudaMemcpy(*d_nibool_interfaces_ext_mesh,h_nibool_interfaces_ext_mesh,
-// num_interfaces_ext_mesh*sizeof(int),cudaMemcpyHostToDevice),19);
-//
-// print_CUDA_error_if_any(cudaMalloc((void**) d_ibool_interfaces_ext_mesh,
-// num_interfaces_ext_mesh*max_nibool_interfaces_ext_mesh*sizeof(int)),20);
-// print_CUDA_error_if_any(cudaMemcpy(*d_ibool_interfaces_ext_mesh,h_ibool_interfaces_ext_mesh,
-// num_interfaces_ext_mesh*max_nibool_interfaces_ext_mesh*sizeof(int),
-// cudaMemcpyHostToDevice),20);
-//
-// print_CUDA_error_if_any(cudaMalloc((void**) d_ispec_is_inner,NSPEC_AB*sizeof(int)),21);
-// print_CUDA_error_if_any(cudaMemcpy(*d_ispec_is_inner, h_ispec_is_inner,
-// NSPEC_AB*sizeof(int),
-// cudaMemcpyHostToDevice),21);
-//
-///* print_CUDA_error_if_any(cudaMalloc((void**) d_ispec_is_elastic,NSPEC_AB*sizeof(int)),21);
-// print_CUDA_error_if_any(cudaMemcpy(*d_ispec_is_elastic, h_ispec_is_elastic,
-// NSPEC_AB*sizeof(int),
-// cudaMemcpyHostToDevice),21);
-//*/
-//
-// print_CUDA_error_if_any(cudaMemcpy(*d_ibool, h_ibool,
-// size*sizeof(int) ,cudaMemcpyHostToDevice),512);
-//
-// // sources
-// if (SIMULATION_TYPE == 1 || SIMULATION_TYPE == 3){
-// print_CUDA_error_if_any(cudaMalloc((void**)d_sourcearrays, sizeof(float)*NSOURCES*3*125),22);
-// print_CUDA_error_if_any(cudaMemcpy(*d_sourcearrays, h_sourcearrays, sizeof(float)*NSOURCES*3*125,
-// cudaMemcpyHostToDevice),522);
-// }
-//
-//
-// print_CUDA_error_if_any(cudaMalloc((void**)d_islice_selected_source, sizeof(int)*NSOURCES),23);
-// print_CUDA_error_if_any(cudaMemcpy(*d_islice_selected_source, h_islice_selected_source, sizeof(int)*NSOURCES,
-// cudaMemcpyHostToDevice),523);
-//
-// print_CUDA_error_if_any(cudaMalloc((void**)d_ispec_selected_source, sizeof(int)*NSOURCES),24);
-// print_CUDA_error_if_any(cudaMemcpy(*d_ispec_selected_source, h_ispec_selected_source,sizeof(int)*NSOURCES,
-// cudaMemcpyHostToDevice),524);
-//
-// // transfer constant element data with padding
-// for(int i=0;i<NSPEC_AB;i++) {
-// print_CUDA_error_if_any(cudaMemcpy(*d_xix + i*128, &h_xix[i*125],
-// 125*sizeof(float),cudaMemcpyHostToDevice),70);
-// print_CUDA_error_if_any(cudaMemcpy(*d_xiy+i*128, &h_xiy[i*125],
-// 125*sizeof(float),cudaMemcpyHostToDevice),71);
-// print_CUDA_error_if_any(cudaMemcpy(*d_xiz+i*128, &h_xiz[i*125],
-// 125*sizeof(float),cudaMemcpyHostToDevice),72);
-// print_CUDA_error_if_any(cudaMemcpy(*d_etax+i*128, &h_etax[i*125],
-// 125*sizeof(float),cudaMemcpyHostToDevice),73);
-// print_CUDA_error_if_any(cudaMemcpy(*d_etay+i*128, &h_etay[i*125],
-// 125*sizeof(float),cudaMemcpyHostToDevice),74);
-// print_CUDA_error_if_any(cudaMemcpy(*d_etaz+i*128, &h_etaz[i*125],
-// 125*sizeof(float),cudaMemcpyHostToDevice),75);
-// print_CUDA_error_if_any(cudaMemcpy(*d_gammax+i*128,&h_gammax[i*125],
-// 125*sizeof(float),cudaMemcpyHostToDevice),76);
-// print_CUDA_error_if_any(cudaMemcpy(*d_gammay+i*128,&h_gammay[i*125],
-// 125*sizeof(float),cudaMemcpyHostToDevice),78);
-// print_CUDA_error_if_any(cudaMemcpy(*d_gammaz+i*128,&h_gammaz[i*125],
-// 125*sizeof(float),cudaMemcpyHostToDevice),79);
-// print_CUDA_error_if_any(cudaMemcpy(*d_kappav+i*128,&h_kappav[i*125],
-// 125*sizeof(float),cudaMemcpyHostToDevice),80);
-// print_CUDA_error_if_any(cudaMemcpy(*d_muv+i*128, &h_muv[i*125],
-// 125*sizeof(float),cudaMemcpyHostToDevice),81);
-// }
-//
-//// print_CUDA_error_if_any(cudaMemcpy(*d_phase_ispec_inner_elastic, h_phase_ispec_inner_elastic, num_phase_ispec_elastic*2*sizeof(int),cudaMemcpyHostToDevice),13);
-//
-//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-// exit_on_cuda_error("prepare_constants");
-//#endif
-//}
-
-/* ----------------------------------------------------------------------------------------------- */
-
extern "C"
void FC_FUNC_(prepare_constants_device,
PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
@@ -396,8 +209,6 @@
float* h_gammax, float* h_gammay, float* h_gammaz,
float* h_kappav, float* h_muv,
int* h_ibool,
- //int* h_phase_ispec_inner_elastic,int* num_phase_ispec_elastic,
- //float* h_rmass,
int* num_interfaces_ext_mesh, int* max_nibool_interfaces_ext_mesh,
int* h_nibool_interfaces_ext_mesh, int* h_ibool_interfaces_ext_mesh,
float* h_hprime_xx,
@@ -407,18 +218,12 @@
float* h_wgllwgll_xy,
float* h_wgllwgll_xz,
float* h_wgllwgll_yz,
- //float* h_hprime_xx, float* h_hprimewgll_xx,
- //float* h_wgllwgll_xy, float* h_wgllwgll_xz,
- //float* h_wgllwgll_yz,
int* h_abs_boundary_ispec, int* h_abs_boundary_ijk,
float* h_abs_boundary_normal,
- //float* h_rho_vp,
- //float* h_rho_vs,
float* h_abs_boundary_jacobian2Dw,
float* h_b_absorb_field,
int* num_abs_boundary_faces, int* b_num_abs_boundary_faces,
int* h_ispec_is_inner,
- //int* h_ispec_is_elastic,
int* NSOURCES,
float* h_sourcearrays,
int* h_islice_selected_source,
@@ -431,14 +236,18 @@
TRACE("prepare_constants_device");
- int device_count,procid;
+ int procid;
+ int device_count = 0;
// cuda initialization (needs -lcuda library)
- cuInit(0);
-
+ //cuInit(0);
+ CUresult status = cuInit(0);
+ if ( CUDA_SUCCESS != status ) exit_on_error("CUDA device initialization failed");
+
// Gets number of GPU devices
cudaGetDeviceCount(&device_count);
//printf("Cuda Devices: %d\n", device_count);
+ if (device_count == 0) exit_on_error("There is no device supporting CUDA\n");
// Gets rank number of MPI process
MPI_Comm_rank(MPI_COMM_WORLD, &procid);
@@ -451,15 +260,14 @@
exit_on_cuda_error("cudaSetDevice");
}
- //printf("GPU_MODE Active. Preparing Fields and Constants on Device.\n");
-
// allocates mesh parameter structure
Mesh* mp = (Mesh*)malloc(sizeof(Mesh));
+ if (mp == NULL) exit_on_error("error allocating mesh pointer");
*Mesh_pointer = (long)mp;
// checks if NGLLX == 5
if( *h_NGLLX != NGLLX ){
- exit_on_cuda_error("NGLLX must be 5 for CUDA devices");
+ exit_on_error("NGLLX must be 5 for CUDA devices");
}
// sets global parameters
@@ -476,150 +284,120 @@
setConst_wgllwgll_xy(h_wgllwgll_xy,mp);
setConst_wgllwgll_xz(h_wgllwgll_xz,mp);
setConst_wgllwgll_yz(h_wgllwgll_yz,mp);
-
-/* setConst_hprime_xx (h_hprime_xx );
- setConst_hprimewgll_xx(h_hprimewgll_xx);
- setConst_wgllwgll_xy (h_wgllwgll_xy,mp);
- setConst_wgllwgll_xz (h_wgllwgll_xz,mp);
- setConst_wgllwgll_yz (h_wgllwgll_yz,mp);
-*/
-
+
/* Assuming NGLLX=5. Padded is then 128 (5^3+3) */
int size_padded = 128 * (*NSPEC_AB);
int size = 125 * (*NSPEC_AB);
// mesh
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xix, size_padded*sizeof(float)),5);
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiy, size_padded*sizeof(float)),6);
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiz, size_padded*sizeof(float)),7);
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etax, size_padded*sizeof(float)),8);
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etay, size_padded*sizeof(float)),9);
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etaz, size_padded*sizeof(float)),10);
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammax, size_padded*sizeof(float)),11);
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammay, size_padded*sizeof(float)),12);
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammaz, size_padded*sizeof(float)),13);
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_kappav, size_padded*sizeof(float)),14);
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_muv, size_padded*sizeof(float)),15);
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool, size_padded*sizeof(int)),16);
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xix, size_padded*sizeof(float)),1001);
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiy, size_padded*sizeof(float)),1002);
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiz, size_padded*sizeof(float)),1003);
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etax, size_padded*sizeof(float)),1004);
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etay, size_padded*sizeof(float)),1005);
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etaz, size_padded*sizeof(float)),1006);
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammax, size_padded*sizeof(float)),1007);
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammay, size_padded*sizeof(float)),1008);
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammaz, size_padded*sizeof(float)),1009);
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_kappav, size_padded*sizeof(float)),1010);
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_muv, size_padded*sizeof(float)),1011);
+ // global indexing
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool, size_padded*sizeof(int)),1021);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool, h_ibool,
+ size*sizeof(int) ,cudaMemcpyHostToDevice),1022);
+
-// print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_phase_ispec_inner_elastic, *num_phase_ispec_elastic*2*sizeof(int)),17);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_elastic, h_phase_ispec_inner_elastic, *num_phase_ispec_elastic*2*sizeof(int),cudaMemcpyHostToDevice),13);
-
-// print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_rmass, *NGLOB_AB*sizeof(float)),17);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass,h_rmass,*NGLOB_AB*sizeof(float),cudaMemcpyHostToDevice),18);
+ // prepare interprocess-edge exchange information
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nibool_interfaces_ext_mesh,
+ *num_interfaces_ext_mesh*sizeof(int)),1201);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_nibool_interfaces_ext_mesh,h_nibool_interfaces_ext_mesh,
+ *num_interfaces_ext_mesh*sizeof(int),cudaMemcpyHostToDevice),1202);
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_interfaces_ext_mesh,
+ *num_interfaces_ext_mesh* *max_nibool_interfaces_ext_mesh*
+ sizeof(int)),1203);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_interfaces_ext_mesh,h_ibool_interfaces_ext_mesh,
+ *num_interfaces_ext_mesh* *max_nibool_interfaces_ext_mesh*sizeof(int),
+ cudaMemcpyHostToDevice),1204);
+
+ // inner elements
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ispec_is_inner,*NSPEC_AB*sizeof(int)),1205);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_inner, h_ispec_is_inner,
+ *NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),1206);
+
+
// absorbing boundaries
if( *num_abs_boundary_faces > 0 ){
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_abs_boundary_ispec,
- (*num_abs_boundary_faces)*sizeof(int)),771);
+ (*num_abs_boundary_faces)*sizeof(int)),1101);
print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_ispec, h_abs_boundary_ispec,
- (*num_abs_boundary_faces)*sizeof(int),
- cudaMemcpyHostToDevice),771);
+ (*num_abs_boundary_faces)*sizeof(int),
+ cudaMemcpyHostToDevice),1102);
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_abs_boundary_ijk,
- 3*25*(*num_abs_boundary_faces)*sizeof(int)),772);
+ 3*25*(*num_abs_boundary_faces)*sizeof(int)),1103);
print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_ijk, h_abs_boundary_ijk,
- 3*25*(*num_abs_boundary_faces)*sizeof(int),
- cudaMemcpyHostToDevice),772);
+ 3*25*(*num_abs_boundary_faces)*sizeof(int),
+ cudaMemcpyHostToDevice),1104);
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_abs_boundary_normal,
- 3*25*(*num_abs_boundary_faces)*sizeof(int)),773);
+ 3*25*(*num_abs_boundary_faces)*sizeof(int)),1105);
print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_normal, h_abs_boundary_normal,
- 3*25*(*num_abs_boundary_faces)*sizeof(int),
- cudaMemcpyHostToDevice),773);
+ 3*25*(*num_abs_boundary_faces)*sizeof(int),
+ cudaMemcpyHostToDevice),1106);
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_abs_boundary_jacobian2Dw,
- 25*(*num_abs_boundary_faces)*sizeof(float)),774);
+ 25*(*num_abs_boundary_faces)*sizeof(float)),1107);
print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_jacobian2Dw, h_abs_boundary_jacobian2Dw,
- 25*(*num_abs_boundary_faces)*sizeof(float),
- cudaMemcpyHostToDevice),774);
+ 25*(*num_abs_boundary_faces)*sizeof(float),
+ cudaMemcpyHostToDevice),1108);
}
-
-/*
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_rho_vp, size*sizeof(float)),5);
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_rho_vs, size*sizeof(float)),6);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vp,h_rho_vp,size*sizeof(float),
- cudaMemcpyHostToDevice),5);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vs,h_rho_vs,size*sizeof(float),
- cudaMemcpyHostToDevice),5);
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_b_absorb_field, 3*25* *b_num_abs_boundary_faces*sizeof(float)),7);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field, h_b_absorb_field,
- 3*25* *b_num_abs_boundary_faces*sizeof(float),
- cudaMemcpyHostToDevice),7);
-*/
-
- // prepare interprocess-edge exchange information
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nibool_interfaces_ext_mesh,
- *num_interfaces_ext_mesh*sizeof(int)),19);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_nibool_interfaces_ext_mesh,h_nibool_interfaces_ext_mesh,
- *num_interfaces_ext_mesh*sizeof(int),cudaMemcpyHostToDevice),19);
-
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_interfaces_ext_mesh,
- *num_interfaces_ext_mesh* *max_nibool_interfaces_ext_mesh*
- sizeof(int)),20);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_interfaces_ext_mesh,h_ibool_interfaces_ext_mesh,
- *num_interfaces_ext_mesh* *max_nibool_interfaces_ext_mesh*sizeof(int),
- cudaMemcpyHostToDevice),20);
-
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ispec_is_inner,*NSPEC_AB*sizeof(int)),21);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_inner, h_ispec_is_inner,
- *NSPEC_AB*sizeof(int),
- cudaMemcpyHostToDevice),21);
-
-// print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ispec_is_elastic,*NSPEC_AB*sizeof(int)),21);
-// print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_elastic, h_ispec_is_elastic,
-// *NSPEC_AB*sizeof(int),
-// cudaMemcpyHostToDevice),21);
-
- print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool, h_ibool,
- size*sizeof(int) ,cudaMemcpyHostToDevice),22);
-
// sources
if (*SIMULATION_TYPE == 1 || *SIMULATION_TYPE == 3){
// not needed in case of pure adjoint simulations (SIMULATION_TYPE == 2)
- print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_sourcearrays, sizeof(float)* *NSOURCES*3*125),522);
+ print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_sourcearrays, sizeof(float)* *NSOURCES*3*125),1301);
print_CUDA_error_if_any(cudaMemcpy(mp->d_sourcearrays, h_sourcearrays, sizeof(float)* *NSOURCES*3*125,
- cudaMemcpyHostToDevice),522);
+ cudaMemcpyHostToDevice),1302);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_stf_pre_compute),
- *NSOURCES*sizeof(double)),525);
+ *NSOURCES*sizeof(double)),1303);
}
- print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_islice_selected_source, sizeof(int) * *NSOURCES),523);
+ print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_islice_selected_source, sizeof(int) * *NSOURCES),1401);
print_CUDA_error_if_any(cudaMemcpy(mp->d_islice_selected_source, h_islice_selected_source, sizeof(int)* *NSOURCES,
- cudaMemcpyHostToDevice),523);
+ cudaMemcpyHostToDevice),1402);
- print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_ispec_selected_source, sizeof(int)* *NSOURCES),524);
+ print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_ispec_selected_source, sizeof(int)* *NSOURCES),1403);
print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_selected_source, h_ispec_selected_source,sizeof(int)* *NSOURCES,
- cudaMemcpyHostToDevice),524);
+ cudaMemcpyHostToDevice),1404);
// transfer constant element data with padding
for(int i=0;i<*NSPEC_AB;i++) {
print_CUDA_error_if_any(cudaMemcpy(mp->d_xix + i*128, &h_xix[i*125],
- 125*sizeof(float),cudaMemcpyHostToDevice),70);
+ 125*sizeof(float),cudaMemcpyHostToDevice),1501);
print_CUDA_error_if_any(cudaMemcpy(mp->d_xiy+i*128, &h_xiy[i*125],
- 125*sizeof(float),cudaMemcpyHostToDevice),71);
+ 125*sizeof(float),cudaMemcpyHostToDevice),1502);
print_CUDA_error_if_any(cudaMemcpy(mp->d_xiz+i*128, &h_xiz[i*125],
- 125*sizeof(float),cudaMemcpyHostToDevice),72);
+ 125*sizeof(float),cudaMemcpyHostToDevice),1503);
print_CUDA_error_if_any(cudaMemcpy(mp->d_etax+i*128, &h_etax[i*125],
- 125*sizeof(float),cudaMemcpyHostToDevice),73);
+ 125*sizeof(float),cudaMemcpyHostToDevice),1504);
print_CUDA_error_if_any(cudaMemcpy(mp->d_etay+i*128, &h_etay[i*125],
- 125*sizeof(float),cudaMemcpyHostToDevice),74);
+ 125*sizeof(float),cudaMemcpyHostToDevice),1505);
print_CUDA_error_if_any(cudaMemcpy(mp->d_etaz+i*128, &h_etaz[i*125],
- 125*sizeof(float),cudaMemcpyHostToDevice),75);
+ 125*sizeof(float),cudaMemcpyHostToDevice),1506);
print_CUDA_error_if_any(cudaMemcpy(mp->d_gammax+i*128,&h_gammax[i*125],
- 125*sizeof(float),cudaMemcpyHostToDevice),76);
+ 125*sizeof(float),cudaMemcpyHostToDevice),1507);
print_CUDA_error_if_any(cudaMemcpy(mp->d_gammay+i*128,&h_gammay[i*125],
- 125*sizeof(float),cudaMemcpyHostToDevice),77);
+ 125*sizeof(float),cudaMemcpyHostToDevice),1508);
print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaz+i*128,&h_gammaz[i*125],
- 125*sizeof(float),cudaMemcpyHostToDevice),78);
+ 125*sizeof(float),cudaMemcpyHostToDevice),1509);
print_CUDA_error_if_any(cudaMemcpy(mp->d_kappav+i*128,&h_kappav[i*125],
- 125*sizeof(float),cudaMemcpyHostToDevice),79);
+ 125*sizeof(float),cudaMemcpyHostToDevice),1510);
print_CUDA_error_if_any(cudaMemcpy(mp->d_muv+i*128, &h_muv[i*125],
- 125*sizeof(float),cudaMemcpyHostToDevice),80);
+ 125*sizeof(float),cudaMemcpyHostToDevice),1511);
}
@@ -633,15 +411,12 @@
mp->nrec_local = nrec_local;
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_number_receiver_global),nrec_local*sizeof(int)),1);
print_CUDA_error_if_any(cudaMemcpy(mp->d_number_receiver_global,h_number_receiver_global,
- nrec_local*sizeof(int),cudaMemcpyHostToDevice),602);
+ nrec_local*sizeof(int),cudaMemcpyHostToDevice),1512);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_selected_rec),nrec*sizeof(int)),603);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_selected_rec),nrec*sizeof(int)),1513);
print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_selected_rec,h_ispec_selected_rec,
- nrec*sizeof(int),cudaMemcpyHostToDevice),604);
+ nrec*sizeof(int),cudaMemcpyHostToDevice),1514);
-// print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_field),3*125*nrec_local*sizeof(float)),605);
-// mp->h_station_seismo_field = (float*)malloc(3*125*nrec_local*sizeof(float));
-
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("prepare_constants_device");
#endif
@@ -649,254 +424,10 @@
/* ----------------------------------------------------------------------------------------------- */
-extern "C"
-void FC_FUNC_(prepare_and_transfer_noise_backward_fields,
- PREPARE_AND_TRANSFER_NOISE_BACKWARD_FIELDS)(long* Mesh_pointer_f,
- int* size,
- real* b_displ,
- real* b_veloc,
- real* b_accel,
- real* b_epsilondev_xx,
- real* b_epsilondev_yy,
- real* b_epsilondev_xy,
- real* b_epsilondev_xz,
- real* b_epsilondev_yz,
- int* NSPEC_STRAIN_ONLY) {
+// purely adjoint & kernel simulations
-TRACE("prepare_and_transfer_noise_backward_fields_");
-
- //show_free_memory("prep_and_xfer_noise_bwd_fields");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f);
- int epsilondev_size = 128*(*NSPEC_STRAIN_ONLY);
-
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ),*size*sizeof(real)),1);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc),*size*sizeof(real)),2);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel),*size*sizeof(real)),3);
-
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xx),
- epsilondev_size*sizeof(real)),4);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yy),
- epsilondev_size*sizeof(real)),4);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xy),
- epsilondev_size*sizeof(real)),4);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xz),
- epsilondev_size*sizeof(real)),4);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yz),
- epsilondev_size*sizeof(real)),4);
-
-
- cudaMemcpy(mp->d_b_displ,b_displ,*size*sizeof(real),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_veloc,b_veloc,*size*sizeof(real),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_accel,b_accel,*size*sizeof(real),cudaMemcpyHostToDevice);
-
- cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx,
- epsilondev_size*sizeof(real),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy,
- epsilondev_size*sizeof(real),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy,
- epsilondev_size*sizeof(real),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz,
- epsilondev_size*sizeof(real),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz,
- epsilondev_size*sizeof(real),cudaMemcpyHostToDevice);
-
-}
-
/* ----------------------------------------------------------------------------------------------- */
-extern "C"
-void FC_FUNC_(prepare_and_transfer_noise_backward_constants,
- PREPARE_AND_TRANSFER_NOISE_BACKWARD_CONSTANTS)(long* Mesh_pointer_f,
- float* normal_x_noise,
- float* normal_y_noise,
- float* normal_z_noise,
- float* mask_noise,
- float* free_surface_jacobian2Dw,
- int* nfaces_surface_ext_mesh
- ) {
-
-TRACE("prepare_and_transfer_noise_backward_constants_");
-
- //show_free_memory("prep_and_xfer_noise_bwd_constants");
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
- int nface_size = 5*5*(*nfaces_surface_ext_mesh);
-
- print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_x_noise,
- nface_size*sizeof(float)),1);
- print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_y_noise,
- nface_size*sizeof(float)),2);
- print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_z_noise,
- nface_size*sizeof(float)),3);
- print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_mask_noise, nface_size*sizeof(float)),4);
-
- print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_free_surface_jacobian2Dw,
- nface_size*sizeof(float)),5);
-
- cudaMemcpy(mp->d_normal_x_noise, normal_x_noise, nface_size*sizeof(float),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_normal_y_noise, normal_y_noise, nface_size*sizeof(float),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_normal_z_noise, normal_z_noise, nface_size*sizeof(float),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_mask_noise, mask_noise, nface_size*sizeof(float),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_free_surface_jacobian2Dw, free_surface_jacobian2Dw, nface_size*sizeof(float),cudaMemcpyHostToDevice);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- printf("jacobian_size = %d\n",25*(*nfaces_surface_ext_mesh));
- exit_on_cuda_error("prepare_and_transfer_noise_backward_constants_");
-#endif
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(prepare_noise_constants_device,
- PREPARE_NOISE_CONSTANTS_DEVICE)(long* Mesh_pointer_f,
- int* h_NGLLX,
- int* NSPEC_AB, int* NGLOB_AB,
- int* free_surface_ispec,int* free_surface_ijk,
- int* num_free_surface_faces,
- int* size_free_surface_ijk, int* SIMULATION_TYPE) {
-
-TRACE("prepare_noise_constants_device_");
-
- Mesh* mp = (Mesh*)(*Mesh_pointer_f);
-
- mp->num_free_surface_faces = *num_free_surface_faces;
-
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ispec, *num_free_surface_faces*sizeof(int)),1);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec, free_surface_ispec, *num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),1);
-
- // alloc storage for the surface buffer to be copied
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_noise_surface_movie, 3*25*(*num_free_surface_faces)*sizeof(float)),1);
-
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ijk, (*size_free_surface_ijk)*sizeof(float)),1);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,(*size_free_surface_ijk)*sizeof(float),cudaMemcpyHostToDevice),1);
-
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(prepare_sensitivity_kernels,
- PREPARE_SENSITIVITY_KERNELS)(long* Mesh_pointer_f,
- float* rho_kl,
- float* mu_kl,
- float* kappa_kl,
- float* epsilon_trace_over_3,
- float* b_epsilon_trace_over_3,
- float* Sigma_kl,
- int* NSPEC_ADJOINTf) {
-
-TRACE("prepare_sensitivity_kernels_");
-
- Mesh* mp = (Mesh*)(*Mesh_pointer_f);
- int NSPEC_ADJOINT = *NSPEC_ADJOINTf;
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_kl),
- 125*mp->NSPEC_AB*sizeof(float)),800);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_mu_kl),
- 125*mp->NSPEC_AB*sizeof(float)),801);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_kl),
- 125*mp->NSPEC_AB*sizeof(float)),802);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_epsilon_trace_over_3),
- 125*mp->NSPEC_AB*sizeof(float)),803);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilon_trace_over_3),
- 125*mp->NSPEC_AB*sizeof(float)),804);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_Sigma_kl),
- 125*(mp->NSPEC_AB)*sizeof(float)),805);
-
- cudaMemcpy(mp->d_rho_kl,rho_kl, 125*NSPEC_ADJOINT*sizeof(float),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_mu_kl,mu_kl, 125*NSPEC_ADJOINT*sizeof(float),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_kappa_kl,kappa_kl, 125*NSPEC_ADJOINT*sizeof(float),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_epsilon_trace_over_3,epsilon_trace_over_3,
- 125*NSPEC_ADJOINT*sizeof(float),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_epsilon_trace_over_3 ,b_epsilon_trace_over_3,
- 125*NSPEC_ADJOINT*sizeof(float),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_Sigma_kl, Sigma_kl, 125*(NSPEC_ADJOINT)*sizeof(float),
- cudaMemcpyHostToDevice);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("prepare_sensitivity_kernels");
-#endif
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(prepare_adjoint_constants_device,
- PREPARE_ADJOINT_CONSTANTS_DEVICE)(long* Mesh_pointer_f,
- //int* ispec_selected_rec,
- //int* islice_selected_rec,
- //int* islice_selected_rec_size,
- //int* nrec,
- float* noise_sourcearray,
- int* NSTEP,
- float* epsilondev_xx,
- float* epsilondev_yy,
- float* epsilondev_xy,
- float* epsilondev_xz,
- float* epsilondev_yz,
- int* NSPEC_STRAIN_ONLY
- ) {
-TRACE("prepare_adjoint_constants_device_");
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("prepare_adjoint_constants_device 1");
-#endif
-
- Mesh* mp = (Mesh*)(*Mesh_pointer_f);
- int epsilondev_size = 128*(*NSPEC_STRAIN_ONLY);
-
- // already done earlier
- // print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_ispec_selected_rec,
- // *nrec*sizeof(int)),1);
- // cudaMemcpy(mp->d_ispec_selected_rec,ispec_selected_rec, *nrec*sizeof(int),
- // cudaMemcpyHostToDevice);
-
- //print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_islice_selected_rec,
- // *islice_selected_rec_size*sizeof(int)),901);
-
- print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_noise_sourcearray,
- 3*125*(*NSTEP)*sizeof(float)),902);
-
-
- cudaMemcpy(mp->d_noise_sourcearray, noise_sourcearray,
- 3*125*(*NSTEP)*sizeof(float),
- cudaMemcpyHostToDevice);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("prepare_adjoint_constants_device 2");
-#endif
-
- print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xx,
- epsilondev_size*sizeof(float)),903);
- cudaMemcpy(mp->d_epsilondev_xx,epsilondev_xx,epsilondev_size*sizeof(float),
- cudaMemcpyHostToDevice);
- print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yy,
- epsilondev_size*sizeof(float)),904);
- cudaMemcpy(mp->d_epsilondev_yy,epsilondev_yy,epsilondev_size*sizeof(float),
- cudaMemcpyHostToDevice);
- print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xy,
- epsilondev_size*sizeof(float)),905);
- cudaMemcpy(mp->d_epsilondev_xy,epsilondev_xy,epsilondev_size*sizeof(float),
- cudaMemcpyHostToDevice);
- print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xz,
- epsilondev_size*sizeof(float)),906);
- cudaMemcpy(mp->d_epsilondev_xz,epsilondev_xz,epsilondev_size*sizeof(float),
- cudaMemcpyHostToDevice);
- print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yz,
- epsilondev_size*sizeof(float)),907);
- cudaMemcpy(mp->d_epsilondev_yz,epsilondev_yz,epsilondev_size*sizeof(float),
- cudaMemcpyHostToDevice);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("prepare_adjoint_constants_device 3");
-#endif
-
- // these don't seem necessary and crash code for NOISE_TOMOGRAPHY >
- // 0 b/c rho_kl, etc not yet allocated when NT=1
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
extern "C"
void FC_FUNC_(prepare_adjoint_sim2_or_3_constants_device,
PREPARE_ADJOINT_SIM2_OR_3_CONSTANTS_DEVICE)(
@@ -910,11 +441,11 @@
// allocates arrays for receivers
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_islice_selected_rec,
- *islice_selected_rec_size*sizeof(int)),802);
+ *islice_selected_rec_size*sizeof(int)),7001);
// copies arrays to GPU device
print_CUDA_error_if_any(cudaMemcpy(mp->d_islice_selected_rec,islice_selected_rec,
- *islice_selected_rec_size*sizeof(int),cudaMemcpyHostToDevice),804);
+ *islice_selected_rec_size*sizeof(int),cudaMemcpyHostToDevice),7002);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("prepare_adjoint_sim2_or_3_constants_device");
@@ -923,28 +454,126 @@
/* ----------------------------------------------------------------------------------------------- */
-/*
-extern "C" {
- void prepare_fields_device_(long* Mesh_pointer_f, int* size);
- void transfer_fields_to_device_(int* size, float* displ, float* veloc, float* accel,long* Mesh_pointer_f);
- void transfer_fields_from_device_(int* size, float* displ, float* veloc, float* accel,long* Mesh_pointer_f);
-}
-*/
+// for ACOUSTIC simulations
/* ----------------------------------------------------------------------------------------------- */
-/*
-void prepare_fields_device_(long* Mesh_pointer_f, int* size) {
+extern "C"
+void FC_FUNC_(prepare_fields_acoustic_device,
+ PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f,
+ float* rmass_acoustic,
+ float* rhostore,
+ float* kappastore,
+ int* num_phase_ispec_acoustic,
+ int* phase_ispec_inner_acoustic,
+ int* ispec_is_acoustic,
+ int* NOISE_TOMOGRAPHY,
+ int* num_free_surface_faces,
+ int* free_surface_ispec,
+ int* free_surface_ijk,
+ int* ABSORBING_CONDITIONS,
+ int* b_reclen_potential,
+ float* b_absorb_potential,
+ int* SIMULATION_TYPE,
+ float* rho_ac_kl,
+ float* kappa_ac_kl
+ ) {
+ TRACE("prepare_fields_acoustic_device");
+
Mesh* mp = (Mesh*)(*Mesh_pointer_f);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ),sizeof(float)*(*size)),0);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(float)*(*size)),1);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel),sizeof(float)*(*size)),2);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),sizeof(float)*(*size)),2);
-
+ /* Assuming NGLLX==5. Padded is then 128 (5^3+3) */
+ int size_padded = 128 * mp->NSPEC_AB;
+ int size_nonpadded = 125 * mp->NSPEC_AB;
+ int size = mp->NGLOB_AB;
+
+ // allocates arrays on device (GPU)
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_acoustic),sizeof(float)*size),9001);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_acoustic),sizeof(float)*size),9002);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_dot_acoustic),sizeof(float)*size),9003);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_potential_dot_dot_buffer),sizeof(float)*size),9004);
+
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_acoustic),sizeof(float)*size),9005);
+
+ // padded array
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rhostore),size_padded*sizeof(float)),9006);
+
+ // non-padded array
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappastore),size_nonpadded*sizeof(float)),9007);
+
+ mp->num_phase_ispec_acoustic = *num_phase_ispec_acoustic;
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_acoustic), mp->num_phase_ispec_acoustic*2*sizeof(int)),9008);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_acoustic),mp->NSPEC_AB*sizeof(int)),9009);
+
+ // free surface
+ if( *NOISE_TOMOGRAPHY == 0 ){
+ // allocate surface arrays
+ mp->num_free_surface_faces = *num_free_surface_faces;
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec),mp->num_free_surface_faces*sizeof(int)),9201);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk),3*25*mp->num_free_surface_faces*sizeof(int)),9202);
+
+ // transfers values onto GPU
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec,
+ mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),9203);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
+ 3*25*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),9204);
+ }
+
+ // absorbing boundaries
+ if( *ABSORBING_CONDITIONS == 1 ){
+ mp->d_b_reclen_potential = *b_reclen_potential;
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_potential),mp->d_b_reclen_potential),9301);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_potential,b_absorb_potential,
+ mp->d_b_reclen_potential,cudaMemcpyHostToDevice),9302);
+ }
+
+ // kernel simulations
+ if( *SIMULATION_TYPE == 3 ){
+ // allocates backward/reconstructed arrays on device (GPU)
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_acoustic),sizeof(float)*size),9014);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_acoustic),sizeof(float)*size),9015);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_dot_acoustic),sizeof(float)*size),9016);
+
+ // allocates kernels
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_ac_kl),125*mp->NSPEC_AB*sizeof(float)),9017);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_ac_kl),125*mp->NSPEC_AB*sizeof(float)),9018);
+ // copies over initial values
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_ac_kl,rho_ac_kl,
+ 125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),9019);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_kappa_ac_kl,kappa_ac_kl,
+ 125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),9020);
+
+ }
+
+ // transfer element data
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_acoustic,rmass_acoustic,
+ sizeof(float)*size,cudaMemcpyHostToDevice),9100);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_acoustic,phase_ispec_inner_acoustic,
+ mp->num_phase_ispec_acoustic*2*sizeof(int),cudaMemcpyHostToDevice),9101);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_acoustic,ispec_is_acoustic,
+ mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),9102);
+
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_kappastore,kappastore,
+ size_nonpadded*sizeof(float),cudaMemcpyHostToDevice),9105);
+
+ // transfer constant element data with padding
+ for(int i=0;i<mp->NSPEC_AB;i++) {
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_rhostore+i*128, &rhostore[i*125],
+ 125*sizeof(float),cudaMemcpyHostToDevice),9106);
+ }
+
+ // for seismograms
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_potential),
+ mp->nrec_local*125*sizeof(float)),9107);
+ mp->h_station_seismo_potential = (float*)malloc(mp->nrec_local*125*sizeof(float));
+ if( mp->h_station_seismo_potential == NULL) exit_on_error("error allocating h_station_seismo_potential");
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("prepare_fields_acoustic_device");
+#endif
}
-*/
+
/* ----------------------------------------------------------------------------------------------- */
// for ELASTIC simulations
@@ -963,164 +592,549 @@
int* ispec_is_elastic,
int* ABSORBING_CONDITIONS,
float* h_b_absorb_field,
- int* b_num_abs_boundary_faces){
+ int* b_num_abs_boundary_faces,
+ int* SIMULATION_TYPE,
+ float* rho_kl,
+ float* mu_kl,
+ float* kappa_kl,
+ int* COMPUTE_AND_STORE_STRAIN,
+ float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,
+ float* epsilondev_xz,float* epsilondev_yz,
+ float* epsilon_trace_over_3,
+ float* b_epsilondev_xx,float* b_epsilondev_yy,float* b_epsilondev_xy,
+ float* b_epsilondev_xz,float* b_epsilondev_yz,
+ float* b_epsilon_trace_over_3,
+ int* ATTENUATION, int* R_size,
+ float* R_xx,float* R_yy,float* R_xy,float* R_xz,float* R_yz,
+ float* b_R_xx,float* b_R_yy,float* b_R_xy,float* b_R_xz,float* b_R_yz,
+ float* one_minus_sum_beta,float* factor_common,
+ float* alphaval,float* betaval,float* gammaval,
+ float* b_alphaval,float* b_betaval,float* b_gammaval
+ ){
TRACE("prepare_fields_elastic_device");
Mesh* mp = (Mesh*)(*Mesh_pointer_f);
/* Assuming NGLLX==5. Padded is then 128 (5^3+3) */
- int size_padded = 128 * mp->NSPEC_AB;
+ //int size_padded = 128 * mp->NSPEC_AB;
int size_nonpadded = 125 * mp->NSPEC_AB;
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ),sizeof(float)*(*size)),200);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(float)*(*size)),201);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel),sizeof(float)*(*size)),202);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),sizeof(float)*(*size)),203);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ),sizeof(float)*(*size)),8001);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(float)*(*size)),8002);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel),sizeof(float)*(*size)),8003);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),sizeof(float)*(*size)),8004);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass),sizeof(float)*mp->NGLOB_AB),204);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vp),size_padded*sizeof(float)),205);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vs),size_padded*sizeof(float)),206);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass),sizeof(float)*mp->NGLOB_AB),8005);
+ // non-padded arrays
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vp),size_nonpadded*sizeof(float)),8006);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vs),size_nonpadded*sizeof(float)),8007);
+
+ // element indices
mp->d_num_phase_ispec_elastic = *num_phase_ispec_elastic;
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_elastic), mp->d_num_phase_ispec_elastic*2*sizeof(int)),207);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_elastic),mp->NSPEC_AB*sizeof(int)),208);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_elastic),
+ mp->d_num_phase_ispec_elastic*2*sizeof(int)),8008);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_elastic),mp->NSPEC_AB*sizeof(int)),8009);
// transfer element data
print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass,rmass,
- sizeof(float)*mp->NGLOB_AB,cudaMemcpyHostToDevice),209);
+ sizeof(float)*mp->NGLOB_AB,cudaMemcpyHostToDevice),8010);
print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_elastic,phase_ispec_inner_elastic,
- mp->d_num_phase_ispec_elastic*2*sizeof(int),cudaMemcpyHostToDevice),210);
+ mp->d_num_phase_ispec_elastic*2*sizeof(int),cudaMemcpyHostToDevice),8011);
print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_elastic,ispec_is_elastic,
- mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),211);
+ mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),8012);
// daniel: not sure if rho_vp, rho_vs needs padding... they are needed for stacey boundary condition
print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vp, rho_vp,
- size_nonpadded*sizeof(float),cudaMemcpyHostToDevice),212);
+ size_nonpadded*sizeof(float),cudaMemcpyHostToDevice),8013);
print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vs, rho_vs,
- size_nonpadded*sizeof(float),cudaMemcpyHostToDevice),213);
+ size_nonpadded*sizeof(float),cudaMemcpyHostToDevice),8014);
// for seismograms
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_field),3*125*(mp->nrec_local)*sizeof(float)),214);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_field),
+ 3*125*(mp->nrec_local)*sizeof(float)),8015);
mp->h_station_seismo_field = (float*)malloc(3*125*(mp->nrec_local)*sizeof(float));
// absorbing conditions
if( *ABSORBING_CONDITIONS == 1 ){
mp->b_num_abs_boundary_faces = *b_num_abs_boundary_faces;
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_field),
- 3*25*mp->b_num_abs_boundary_faces*sizeof(float)),791);
+ 3*25*mp->b_num_abs_boundary_faces*sizeof(float)),8016);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field, h_b_absorb_field,
- 3*25*mp->b_num_abs_boundary_faces*sizeof(float),cudaMemcpyHostToDevice),792);
+ 3*25*mp->b_num_abs_boundary_faces*sizeof(float),cudaMemcpyHostToDevice),8017);
}
+
+ // kernel simulations
+ if( *SIMULATION_TYPE == 3 ){
+ // allocates backward/reconstructed arrays on device (GPU)
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ),sizeof(float)*(*size)),8201);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc),sizeof(float)*(*size)),8202);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel),sizeof(float)*(*size)),8203);
+
+ // allocates kernels
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_kl),125*mp->NSPEC_AB*sizeof(float)),8204);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_mu_kl),125*mp->NSPEC_AB*sizeof(float)),8205);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_kl),125*mp->NSPEC_AB*sizeof(float)),8206);
+
+ // copies over initial values
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_kl,rho_kl,
+ 125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),8207);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_mu_kl,mu_kl,
+ 125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),8208);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_kappa_kl,kappa_kl,
+ 125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),8209);
+
+ }
+ // strains used for attenuation and kernel simulations
+ if( *COMPUTE_AND_STORE_STRAIN == 1 ){
+ // strains
+ int epsilondev_size = 125*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
+
+ print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xx,
+ epsilondev_size*sizeof(float)),8301);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xx,epsilondev_xx,epsilondev_size*sizeof(float),
+ cudaMemcpyHostToDevice),8302);
+ print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yy,
+ epsilondev_size*sizeof(float)),8302);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yy,epsilondev_yy,epsilondev_size*sizeof(float),
+ cudaMemcpyHostToDevice),8303);
+ print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xy,
+ epsilondev_size*sizeof(float)),8304);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xy,epsilondev_xy,epsilondev_size*sizeof(float),
+ cudaMemcpyHostToDevice),8305);
+ print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xz,
+ epsilondev_size*sizeof(float)),8306);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xz,epsilondev_xz,epsilondev_size*sizeof(float),
+ cudaMemcpyHostToDevice),8307);
+ print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yz,
+ epsilondev_size*sizeof(float)),8308);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yz,epsilondev_yz,epsilondev_size*sizeof(float),
+ cudaMemcpyHostToDevice),8309);
+
+ if( *SIMULATION_TYPE == 3 ){
+ // solid pressure
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_epsilon_trace_over_3),
+ 125*mp->NSPEC_AB*sizeof(float)),8310);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilon_trace_over_3,epsilon_trace_over_3,
+ 125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),8311);
+ // backward solid pressure
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilon_trace_over_3),
+ 125*mp->NSPEC_AB*sizeof(float)),8312);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilon_trace_over_3 ,b_epsilon_trace_over_3,
+ 125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),8313);
+ // prepares backward strains
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xx),
+ epsilondev_size*sizeof(float)),8321);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yy),
+ epsilondev_size*sizeof(float)),8322);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xy),
+ epsilondev_size*sizeof(float)),8323);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xz),
+ epsilondev_size*sizeof(float)),8324);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yz),
+ epsilondev_size*sizeof(float)),8325);
+
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx,
+ epsilondev_size*sizeof(float),cudaMemcpyHostToDevice),8326);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy,
+ epsilondev_size*sizeof(float),cudaMemcpyHostToDevice),8327);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy,
+ epsilondev_size*sizeof(float),cudaMemcpyHostToDevice),8328);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz,
+ epsilondev_size*sizeof(float),cudaMemcpyHostToDevice),8329);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz,
+ epsilondev_size*sizeof(float),cudaMemcpyHostToDevice),8330);
+ }
+ }
+
+ // attenuation memory variables
+ if( *ATTENUATION == 1 ){
+ // memory arrays
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xx),
+ (*R_size)*sizeof(float)),8401);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xx,R_xx,(*R_size)*sizeof(float),
+ cudaMemcpyHostToDevice),8402);
+
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yy),
+ (*R_size)*sizeof(float)),8403);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yy,R_yy,(*R_size)*sizeof(float),
+ cudaMemcpyHostToDevice),8404);
+
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xy),
+ (*R_size)*sizeof(float)),8405);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xy,R_xy,(*R_size)*sizeof(float),
+ cudaMemcpyHostToDevice),8406);
+
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xz),
+ (*R_size)*sizeof(float)),8407);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xz,R_xz,(*R_size)*sizeof(float),
+ cudaMemcpyHostToDevice),8408);
+
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yz),
+ (*R_size)*sizeof(float)),8409);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yz,R_yz,(*R_size)*sizeof(float),
+ cudaMemcpyHostToDevice),8410);
+ if( *SIMULATION_TYPE == 3 ){
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xx),
+ (*R_size)*sizeof(float)),8421);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xx,b_R_xx,(*R_size)*sizeof(float),
+ cudaMemcpyHostToDevice),8422);
+
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yy),
+ (*R_size)*sizeof(float)),8423);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yy,b_R_yy,(*R_size)*sizeof(float),
+ cudaMemcpyHostToDevice),8424);
+
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xy),
+ (*R_size)*sizeof(float)),8425);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xy,b_R_xy,(*R_size)*sizeof(float),
+ cudaMemcpyHostToDevice),8426);
+
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xz),
+ (*R_size)*sizeof(float)),8427);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xz,b_R_xz,(*R_size)*sizeof(float),
+ cudaMemcpyHostToDevice),8428);
+
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yz),
+ (*R_size)*sizeof(float)),8429);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yz,b_R_yz,(*R_size)*sizeof(float),
+ cudaMemcpyHostToDevice),8420);
+ }
+
+ // attenuation factors
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_one_minus_sum_beta),
+ 125*mp->NSPEC_AB*sizeof(float)),8430);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_one_minus_sum_beta ,one_minus_sum_beta,
+ 125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),8431);
+
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_factor_common),
+ N_SLS*125*mp->NSPEC_AB*sizeof(float)),8432);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_factor_common ,factor_common,
+ N_SLS*125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),8433);
+
+ // alpha,beta,gamma factors
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_alphaval),
+ N_SLS*sizeof(float)),8434);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_alphaval ,alphaval,
+ N_SLS*sizeof(float),cudaMemcpyHostToDevice),8435);
+
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_betaval),
+ N_SLS*sizeof(float)),8436);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_betaval ,betaval,
+ N_SLS*sizeof(float),cudaMemcpyHostToDevice),8437);
+
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_gammaval),
+ N_SLS*sizeof(float)),8438);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaval ,gammaval,
+ N_SLS*sizeof(float),cudaMemcpyHostToDevice),8439);
+
+ if( *SIMULATION_TYPE == 3 ){
+ // alpha,beta,gamma factors for backward fields
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_alphaval),
+ N_SLS*sizeof(float)),8434);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_alphaval ,b_alphaval,
+ N_SLS*sizeof(float),cudaMemcpyHostToDevice),8435);
+
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_betaval),
+ N_SLS*sizeof(float)),8436);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_betaval ,b_betaval,
+ N_SLS*sizeof(float),cudaMemcpyHostToDevice),8437);
+
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_gammaval),
+ N_SLS*sizeof(float)),8438);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_gammaval ,b_gammaval,
+ N_SLS*sizeof(float),cudaMemcpyHostToDevice),8439);
+ }
+
+
+ }
+
+
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("prepare_fields_elastic_device");
#endif
}
+
/* ----------------------------------------------------------------------------------------------- */
-// for ACOUSTIC simulations
+// for NOISE simulations
/* ----------------------------------------------------------------------------------------------- */
extern "C"
-void FC_FUNC_(prepare_fields_acoustic_device,
- PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f,
- float* rmass_acoustic,
- float* rhostore,
- float* kappastore,
- int* num_phase_ispec_acoustic,
- int* phase_ispec_inner_acoustic,
- int* ispec_is_acoustic,
- int* num_free_surface_faces,
- int* free_surface_ispec,
- int* free_surface_ijk,
- int* ABSORBING_CONDITIONS,
- int* b_reclen_potential,
- float* b_absorb_potential,
- int* SIMULATION_TYPE,
- float* rho_ac_kl,
- float* kappa_ac_kl) {
+void FC_FUNC_(prepare_fields_noise_device,
+ PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f,
+ int* NSPEC_AB, int* NGLOB_AB,
+ int* free_surface_ispec,int* free_surface_ijk,
+ int* num_free_surface_faces,
+ int* size_free_surface_ijk,
+ int* SIMULATION_TYPE,
+ int* NOISE_TOMOGRAPHY,
+ int* NSTEP,
+ float* noise_sourcearray,
+ float* normal_x_noise,
+ float* normal_y_noise,
+ float* normal_z_noise,
+ float* mask_noise,
+ float* free_surface_jacobian2Dw,
+ float* Sigma_kl
+ ) {
-TRACE("prepare_fields_acoustic_device");
+ TRACE("prepare_fields_noise_device");
Mesh* mp = (Mesh*)(*Mesh_pointer_f);
- /* Assuming NGLLX==5. Padded is then 128 (5^3+3) */
- int size_padded = 128 * mp->NSPEC_AB;
- int size_nonpadded = 125 * mp->NSPEC_AB;
- int size = mp->NGLOB_AB;
- // allocates arrays on device (GPU)
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_acoustic),sizeof(float)*size),100);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_acoustic),sizeof(float)*size),101);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_dot_acoustic),sizeof(float)*size),102);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_potential_dot_dot_buffer),sizeof(float)*size),103);
+ // free surface
+ mp->num_free_surface_faces = *num_free_surface_faces;
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_acoustic),sizeof(float)*size),104);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rhostore),size_padded*sizeof(float)),105);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappastore),size_padded*sizeof(float)),106);
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ispec,
+ *num_free_surface_faces*sizeof(int)),4001);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec, free_surface_ispec,
+ *num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4002);
- mp->num_phase_ispec_acoustic = *num_phase_ispec_acoustic;
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_acoustic), mp->num_phase_ispec_acoustic*2*sizeof(int)),107);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_acoustic),mp->NSPEC_AB*sizeof(int)),108);
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ijk,
+ (*size_free_surface_ijk)*sizeof(int)),4003);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
+ (*size_free_surface_ijk)*sizeof(int),cudaMemcpyHostToDevice),4004);
- mp->num_free_surface_faces = *num_free_surface_faces;
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec),mp->num_free_surface_faces*sizeof(int)),109);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk),3*25*mp->num_free_surface_faces*sizeof(int)),110);
+ // alloc storage for the surface buffer to be copied
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_noise_surface_movie,
+ 3*25*(*num_free_surface_faces)*sizeof(float)),4005);
+ // prepares noise source array
+ if( *NOISE_TOMOGRAPHY == 1 ){
+ print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_noise_sourcearray,
+ 3*125*(*NSTEP)*sizeof(float)),4101);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_noise_sourcearray, noise_sourcearray,
+ 3*125*(*NSTEP)*sizeof(float),cudaMemcpyHostToDevice),4102);
+ }
+
+ // prepares noise directions
+ if( *NOISE_TOMOGRAPHY > 1 ){
+ int nface_size = 25*(*num_free_surface_faces);
+ // allocates memory on GPU
+ print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_x_noise,
+ nface_size*sizeof(float)),4301);
+ print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_y_noise,
+ nface_size*sizeof(float)),4302);
+ print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_z_noise,
+ nface_size*sizeof(float)),4303);
+ print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_mask_noise,
+ nface_size*sizeof(float)),4304);
+ print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_free_surface_jacobian2Dw,
+ nface_size*sizeof(float)),4305);
+ // transfers data onto GPU
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_x_noise, normal_x_noise,
+ nface_size*sizeof(float),cudaMemcpyHostToDevice),4306);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_y_noise, normal_y_noise,
+ nface_size*sizeof(float),cudaMemcpyHostToDevice),4307);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_z_noise, normal_z_noise,
+ nface_size*sizeof(float),cudaMemcpyHostToDevice),4308);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_mask_noise, mask_noise,
+ nface_size*sizeof(float),cudaMemcpyHostToDevice),4309);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_jacobian2Dw, free_surface_jacobian2Dw,
+ nface_size*sizeof(float),cudaMemcpyHostToDevice),4310);
+ }
+
+ // prepares noise strength kernel
+ if( *NOISE_TOMOGRAPHY == 3 ){
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_Sigma_kl),
+ 125*(mp->NSPEC_AB)*sizeof(float)),4401);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_Sigma_kl, Sigma_kl,
+ 125*(mp->NSPEC_AB)*sizeof(float),cudaMemcpyHostToDevice),4403);
+ }
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ //printf("jacobian_size = %d\n",25*(*num_free_surface_faces));
+ exit_on_cuda_error("prepare_fields_noise_device");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// cleanup
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(prepare_cleanup_device,
+ PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f,
+ int* num_abs_boundary_faces,
+ int* SIMULATION_TYPE,
+ int* ACOUSTIC_SIMULATION,
+ int* ELASTIC_SIMULATION,
+ int* ABSORBING_CONDITIONS,
+ int* NOISE_TOMOGRAPHY,
+ int* COMPUTE_AND_STORE_STRAIN,
+ int* ATTENUATION) {
+
+TRACE("prepare_cleanup_device");
+
+ // frees allocated memory arrays
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+ // frees memory on GPU
+ // mesh
+ cudaFree(mp->d_xix);
+ cudaFree(mp->d_xiy);
+ cudaFree(mp->d_xiz);
+ cudaFree(mp->d_etax);
+ cudaFree(mp->d_etay);
+ cudaFree(mp->d_etaz);
+ cudaFree(mp->d_gammax);
+ cudaFree(mp->d_gammay);
+ cudaFree(mp->d_gammaz);
+ cudaFree(mp->d_muv);
+
// absorbing boundaries
- if( *ABSORBING_CONDITIONS == 1 ){
- mp->d_b_reclen_potential = *b_reclen_potential;
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_potential),mp->d_b_reclen_potential),111);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_potential,b_absorb_potential,
- mp->d_b_reclen_potential,cudaMemcpyHostToDevice),112);
+ if( *num_abs_boundary_faces > 0 ){
+ cudaFree(mp->d_abs_boundary_ispec);
+ cudaFree(mp->d_abs_boundary_ijk);
+ cudaFree(mp->d_abs_boundary_normal);
+ cudaFree(mp->d_abs_boundary_jacobian2Dw);
}
- // kernel simulations
- if( *SIMULATION_TYPE == 3 ){
- // allocates backward/reconstructed arrays on device (GPU)
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_acoustic),sizeof(float)*size),113);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_acoustic),sizeof(float)*size),114);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_dot_acoustic),sizeof(float)*size),115);
+ // interfaces
+ cudaFree(mp->d_nibool_interfaces_ext_mesh);
+ cudaFree(mp->d_ibool_interfaces_ext_mesh);
+
+ // global indexing
+ cudaFree(mp->d_ispec_is_inner);
+ cudaFree(mp->d_ibool);
+
+ // sources
+ if (*SIMULATION_TYPE == 1 || *SIMULATION_TYPE == 3){
+ cudaFree(mp->d_sourcearrays);
+ cudaFree(mp->d_stf_pre_compute);
+ }
+
+ cudaFree(mp->d_islice_selected_source);
+ cudaFree(mp->d_ispec_selected_source);
+
+ // receivers
+ cudaFree(mp->d_number_receiver_global);
+ cudaFree(mp->d_ispec_selected_rec);
+
+ // ACOUSTIC arrays
+ if( *ACOUSTIC_SIMULATION == 1 ){
+ cudaFree(mp->d_potential_acoustic);
+ cudaFree(mp->d_potential_dot_acoustic);
+ cudaFree(mp->d_potential_dot_dot_acoustic);
+ cudaFree(mp->d_send_potential_dot_dot_buffer);
+ cudaFree(mp->d_rmass_acoustic);
+ cudaFree(mp->d_rhostore);
+ cudaFree(mp->d_kappastore);
+ cudaFree(mp->d_phase_ispec_inner_acoustic);
+ cudaFree(mp->d_ispec_is_acoustic);
- // allocates kernels
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_ac_kl),125*mp->NSPEC_AB*sizeof(float)),181);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_ac_kl),125*mp->NSPEC_AB*sizeof(float)),182);
- // copies over initial values
- print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_ac_kl,rho_ac_kl,
- 125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),183);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_kappa_ac_kl,kappa_ac_kl,
- 125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),184);
+ if( *NOISE_TOMOGRAPHY == 0 ){
+ cudaFree(mp->d_free_surface_ispec);
+ cudaFree(mp->d_free_surface_ijk);
+ }
+ if( *ABSORBING_CONDITIONS == 1 ) cudaFree(mp->d_b_absorb_potential);
+
+ if( *SIMULATION_TYPE == 3 ) {
+ cudaFree(mp->d_b_potential_acoustic);
+ cudaFree(mp->d_b_potential_dot_acoustic);
+ cudaFree(mp->d_b_potential_dot_dot_acoustic);
+ cudaFree(mp->d_rho_ac_kl);
+ cudaFree(mp->d_kappa_ac_kl);
+ }
+
+ cudaFree(mp->d_station_seismo_potential);
+
+ free(mp->h_station_seismo_potential);
}
+
+ // ELASTIC arrays
+ if( *ELASTIC_SIMULATION == 1 ){
+ cudaFree(mp->d_displ);
+ cudaFree(mp->d_veloc);
+ cudaFree(mp->d_accel);
+ cudaFree(mp->d_send_accel_buffer);
+ cudaFree(mp->d_rmass);
+ cudaFree(mp->d_rho_vp);
+ cudaFree(mp->d_rho_vs);
+ cudaFree(mp->d_phase_ispec_inner_elastic);
+ cudaFree(mp->d_ispec_is_elastic);
+ cudaFree(mp->d_station_seismo_field);
+
+ if( *ABSORBING_CONDITIONS == 1 ) cudaFree(mp->d_b_absorb_field);
+
+ if( *SIMULATION_TYPE == 3 ) {
+ cudaFree(mp->d_b_displ);
+ cudaFree(mp->d_b_veloc);
+ cudaFree(mp->d_b_accel);
+ cudaFree(mp->d_rho_kl);
+ cudaFree(mp->d_mu_kl);
+ cudaFree(mp->d_kappa_kl);
+ }
+
+ if( *COMPUTE_AND_STORE_STRAIN == 1 ){
+ cudaFree(mp->d_epsilondev_xx);
+ cudaFree(mp->d_epsilondev_yy);
+ cudaFree(mp->d_epsilondev_xy);
+ cudaFree(mp->d_epsilondev_xz);
+ cudaFree(mp->d_epsilondev_yz);
+ if( *SIMULATION_TYPE == 3 ){
+ cudaFree(mp->d_epsilon_trace_over_3);
+ cudaFree(mp->d_b_epsilon_trace_over_3);
+ cudaFree(mp->d_b_epsilondev_xx);
+ cudaFree(mp->d_b_epsilondev_yy);
+ cudaFree(mp->d_b_epsilondev_xy);
+ cudaFree(mp->d_b_epsilondev_xz);
+ cudaFree(mp->d_b_epsilondev_yz);
+ }
+ }
+
+ if( *ATTENUATION == 1 ){
+ cudaFree(mp->d_factor_common);
+ cudaFree(mp->d_one_minus_sum_beta);
+ cudaFree(mp->d_alphaval);
+ cudaFree(mp->d_betaval);
+ cudaFree(mp->d_gammaval);
+ cudaFree(mp->d_R_xx);
+ cudaFree(mp->d_R_yy);
+ cudaFree(mp->d_R_xy);
+ cudaFree(mp->d_R_xz);
+ cudaFree(mp->d_R_yz);
+ if( *SIMULATION_TYPE == 3){
+ cudaFree(mp->d_b_R_xx);
+ cudaFree(mp->d_b_R_yy);
+ cudaFree(mp->d_b_R_xy);
+ cudaFree(mp->d_b_R_xz);
+ cudaFree(mp->d_b_R_yz);
+ cudaFree(mp->d_b_alphaval);
+ cudaFree(mp->d_b_betaval);
+ cudaFree(mp->d_b_gammaval);
+ }
+ }
+ }
- // transfer element data
- print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_acoustic,rmass_acoustic,
- sizeof(float)*size,cudaMemcpyHostToDevice),116);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_acoustic,phase_ispec_inner_acoustic,
- mp->num_phase_ispec_acoustic*2*sizeof(int),cudaMemcpyHostToDevice),117);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_acoustic,ispec_is_acoustic,
- mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),118);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec,
- mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),119);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
- 3*25*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),120);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_kappastore,kappastore,
- size_nonpadded*sizeof(float),cudaMemcpyHostToDevice),121);
-
- // transfer constant element data with padding
- for(int i=0;i<mp->NSPEC_AB;i++) {
- print_CUDA_error_if_any(cudaMemcpy(mp->d_rhostore+i*128, &rhostore[i*125],
- 125*sizeof(float),cudaMemcpyHostToDevice),122);
+ // purely adjoint & kernel array
+ if( *SIMULATION_TYPE == 2 || *SIMULATION_TYPE == 3 ) cudaFree(mp->d_islice_selected_rec);
+
+ // NOISE arrays
+ if( *NOISE_TOMOGRAPHY > 0 ){
+ cudaFree(mp->d_free_surface_ispec);
+ cudaFree(mp->d_free_surface_ijk);
+ cudaFree(mp->d_noise_surface_movie);
+ if( *NOISE_TOMOGRAPHY == 1 ) cudaFree(mp->d_noise_sourcearray);
+ if( *NOISE_TOMOGRAPHY > 1 ){
+ cudaFree(mp->d_normal_x_noise);
+ cudaFree(mp->d_normal_y_noise);
+ cudaFree(mp->d_normal_z_noise);
+ cudaFree(mp->d_mask_noise);
+ cudaFree(mp->d_free_surface_jacobian2Dw);
+ }
+ if( *NOISE_TOMOGRAPHY == 3 ) cudaFree(mp->d_Sigma_kl);
}
- // for seismograms
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_potential),mp->nrec_local*125*sizeof(float)),123);
- mp->h_station_seismo_potential = (float*)malloc(mp->nrec_local*125*sizeof(float));
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("prepare_fields_acoustic_device");
-#endif
-}
-
+ // mesh pointer - not needed anymore
+ free(mp);
+}
\ No newline at end of file
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu 2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu 2011-10-07 01:29:29 UTC (rev 19039)
@@ -24,8 +24,6 @@
/* ----------------------------------------------------------------------------------------------- */
-/* ----------------------------------------------------------------------------------------------- */
-
extern "C"
void FC_FUNC_(transfer_b_fields_to_device,
TRANSFER_B_FIELDS_TO_DEVICE)(int* size, float* b_displ, float* b_veloc, float* b_accel,
@@ -50,9 +48,9 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- print_CUDA_error_if_any(cudaMemcpy(mp->d_displ,displ,sizeof(float)*(*size),cudaMemcpyHostToDevice),3);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_veloc,veloc,sizeof(float)*(*size),cudaMemcpyHostToDevice),4);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_accel,accel,sizeof(float)*(*size),cudaMemcpyHostToDevice),5);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_displ,displ,sizeof(float)*(*size),cudaMemcpyHostToDevice),40003);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_veloc,veloc,sizeof(float)*(*size),cudaMemcpyHostToDevice),40004);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_accel,accel,sizeof(float)*(*size),cudaMemcpyHostToDevice),40005);
}
@@ -82,9 +80,9 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(float)*(*size),cudaMemcpyDeviceToHost),6);
- print_CUDA_error_if_any(cudaMemcpy(veloc,mp->d_veloc,sizeof(float)*(*size),cudaMemcpyDeviceToHost),7);
- print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel,sizeof(float)*(*size),cudaMemcpyDeviceToHost),8);
+ print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40006);
+ print_CUDA_error_if_any(cudaMemcpy(veloc,mp->d_veloc,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40007);
+ print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40008);
// printf("Transfered Fields From Device\n");
// int procid;
@@ -106,7 +104,7 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- print_CUDA_error_if_any(cudaMemcpy(mp->d_accel,accel,sizeof(float)*(*size),cudaMemcpyHostToDevice),6);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_accel,accel,sizeof(float)*(*size),cudaMemcpyHostToDevice),40016);
}
@@ -120,7 +118,7 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel,sizeof(float)*(*size),cudaMemcpyDeviceToHost),6);
+ print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40026);
}
@@ -134,7 +132,7 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- print_CUDA_error_if_any(cudaMemcpy(b_accel,mp->d_b_accel,sizeof(float)*(*size),cudaMemcpyDeviceToHost),6);
+ print_CUDA_error_if_any(cudaMemcpy(b_accel,mp->d_b_accel,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40036);
}
@@ -148,7 +146,7 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- print_CUDA_error_if_any(cudaMemcpy(sigma_kl,mp->d_Sigma_kl,sizeof(float)*(*size),cudaMemcpyDeviceToHost),6);
+ print_CUDA_error_if_any(cudaMemcpy(sigma_kl,mp->d_Sigma_kl,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40046);
}
@@ -162,7 +160,7 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(float)*(*size),cudaMemcpyDeviceToHost),6);
+ print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40056);
}
@@ -176,7 +174,7 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(float)*(*size),cudaMemcpyDeviceToHost),6);
+ print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40066);
}
@@ -198,7 +196,7 @@
}
/* ----------------------------------------------------------------------------------------------- */
-
+/*
extern "C"
void FC_FUNC_(transfer_compute_kernel_fields_from_device,
TRANSFER_COMPUTE_KERNEL_FIELDS_FROM_DEVICE)(long* Mesh_pointer,
@@ -249,17 +247,132 @@
exit_on_cuda_error("after transfer_compute_kernel_fields_from_device");
#endif
}
+*/
+/* ----------------------------------------------------------------------------------------------- */
+// attenuation fields
+
+extern "C"
+void FC_FUNC_(transfer_b_fields_att_to_device,
+ TRANSFER_B_FIELDS_ATT_TO_DEVICE)(long* Mesh_pointer,
+ float* b_R_xx,float* b_R_yy,float* b_R_xy,float* b_R_xz,float* b_R_yz,
+ int* size_R,
+ float* b_epsilondev_xx,
+ float* b_epsilondev_yy,
+ float* b_epsilondev_xy,
+ float* b_epsilondev_xz,
+ float* b_epsilondev_yz,
+ int* size_epsilondev) {
+ TRACE("transfer_b_fields_att_to_device");
+ //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer);
+
+ cudaMemcpy(mp->d_b_R_xx,b_R_xx,*size_R*sizeof(float),cudaMemcpyHostToDevice);
+ cudaMemcpy(mp->d_b_R_yy,b_R_yy,*size_R*sizeof(float),cudaMemcpyHostToDevice);
+ cudaMemcpy(mp->d_b_R_xy,b_R_xy,*size_R*sizeof(float),cudaMemcpyHostToDevice);
+ cudaMemcpy(mp->d_b_R_xz,b_R_xz,*size_R*sizeof(float),cudaMemcpyHostToDevice);
+ cudaMemcpy(mp->d_b_R_yz,b_R_yz,*size_R*sizeof(float),cudaMemcpyHostToDevice);
+
+ cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx,*size_epsilondev*sizeof(float),cudaMemcpyHostToDevice);
+ cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy,*size_epsilondev*sizeof(float),cudaMemcpyHostToDevice);
+ cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy,*size_epsilondev*sizeof(float),cudaMemcpyHostToDevice);
+ cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz,*size_epsilondev*sizeof(float),cudaMemcpyHostToDevice);
+ cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz,*size_epsilondev*sizeof(float),cudaMemcpyHostToDevice);
+
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("after transfer_b_fields_att_to_device");
+#endif
+}
+
/* ----------------------------------------------------------------------------------------------- */
-// for ACOUSTIC simulations
+// attenuation fields
+extern "C"
+void FC_FUNC_(transfer_fields_att_from_device,
+ TRANSFER_FIELDS_ATT_FROM_DEVICE)(long* Mesh_pointer,
+ float* R_xx,float* R_yy,float* R_xy,float* R_xz,float* R_yz,
+ int* size_R,
+ float* epsilondev_xx,
+ float* epsilondev_yy,
+ float* epsilondev_xy,
+ float* epsilondev_xz,
+ float* epsilondev_yz,
+ int* size_epsilondev) {
+ TRACE("transfer_fields_att_from_device");
+ //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer);
+
+ cudaMemcpy(R_xx,mp->d_R_xx,*size_R*sizeof(float),cudaMemcpyDeviceToHost);
+ cudaMemcpy(R_yy,mp->d_R_yy,*size_R*sizeof(float),cudaMemcpyDeviceToHost);
+ cudaMemcpy(R_xy,mp->d_R_xy,*size_R*sizeof(float),cudaMemcpyDeviceToHost);
+ cudaMemcpy(R_xz,mp->d_R_xz,*size_R*sizeof(float),cudaMemcpyDeviceToHost);
+ cudaMemcpy(R_yz,mp->d_R_yz,*size_R*sizeof(float),cudaMemcpyDeviceToHost);
+
+ cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
+ cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
+ cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
+ cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
+ cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
+
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("after transfer_fields_att_from_device");
+#endif
+}
+
+
/* ----------------------------------------------------------------------------------------------- */
+extern "C"
+void FC_FUNC_(transfer_sensitivity_kernels_to_host,
+ TRANSFER_SENSITIVITY_KERNELS_TO_HOST)(long* Mesh_pointer,
+ float* h_rho_kl,
+ float* h_mu_kl,
+ float* h_kappa_kl,
+ int* NSPEC_AB) {
+TRACE("transfer_sensitivity_kernels_to_host");
+ //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer);
+
+ print_CUDA_error_if_any(cudaMemcpy(h_rho_kl,mp->d_rho_kl,*NSPEC_AB*125*sizeof(float),
+ cudaMemcpyDeviceToHost),40101);
+ print_CUDA_error_if_any(cudaMemcpy(h_mu_kl,mp->d_mu_kl,*NSPEC_AB*125*sizeof(float),
+ cudaMemcpyDeviceToHost),40102);
+ print_CUDA_error_if_any(cudaMemcpy(h_kappa_kl,mp->d_kappa_kl,*NSPEC_AB*125*sizeof(float),
+ cudaMemcpyDeviceToHost),40103);
+
+}
+
/* ----------------------------------------------------------------------------------------------- */
+// for NOISE simulations
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_sensitivity_kernels_noise_to_host,
+ TRANSFER_SENSITIVITY_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer,
+ float* h_Sigma_kl,
+ int* NSPEC_AB) {
+TRACE("transfer_sensitivity_kernels_noise_to_host");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+ print_CUDA_error_if_any(cudaMemcpy(h_Sigma_kl,mp->d_Sigma_kl,125*(*NSPEC_AB)*sizeof(float),
+ cudaMemcpyHostToDevice),40201);
+
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// for ACOUSTIC simulations
+
+/* ----------------------------------------------------------------------------------------------- */
+
extern "C"
void FC_FUNC_(transfer_fields_acoustic_to_device,
TRANSFER_FIELDS_ACOUSTIC_TO_DEVICE)(
@@ -273,11 +386,11 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
print_CUDA_error_if_any(cudaMemcpy(mp->d_potential_acoustic,potential_acoustic,
- sizeof(float)*(*size),cudaMemcpyHostToDevice),110);
+ sizeof(float)*(*size),cudaMemcpyHostToDevice),50110);
print_CUDA_error_if_any(cudaMemcpy(mp->d_potential_dot_acoustic,potential_dot_acoustic,
- sizeof(float)*(*size),cudaMemcpyHostToDevice),120);
+ sizeof(float)*(*size),cudaMemcpyHostToDevice),50120);
print_CUDA_error_if_any(cudaMemcpy(mp->d_potential_dot_dot_acoustic,potential_dot_dot_acoustic,
- sizeof(float)*(*size),cudaMemcpyHostToDevice),130);
+ sizeof(float)*(*size),cudaMemcpyHostToDevice),50130);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("after transfer_fields_acoustic_to_device");
@@ -299,11 +412,11 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_potential_acoustic,b_potential_acoustic,
- sizeof(float)*(*size),cudaMemcpyHostToDevice),110);
+ sizeof(float)*(*size),cudaMemcpyHostToDevice),51110);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_potential_dot_acoustic,b_potential_dot_acoustic,
- sizeof(float)*(*size),cudaMemcpyHostToDevice),120);
+ sizeof(float)*(*size),cudaMemcpyHostToDevice),51120);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_potential_dot_dot_acoustic,b_potential_dot_dot_acoustic,
- sizeof(float)*(*size),cudaMemcpyHostToDevice),130);
+ sizeof(float)*(*size),cudaMemcpyHostToDevice),51130);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("after transfer_b_fields_acoustic_to_device");
@@ -325,11 +438,11 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
print_CUDA_error_if_any(cudaMemcpy(potential_acoustic,mp->d_potential_acoustic,
- sizeof(float)*(*size),cudaMemcpyDeviceToHost),111);
+ sizeof(float)*(*size),cudaMemcpyDeviceToHost),52111);
print_CUDA_error_if_any(cudaMemcpy(potential_dot_acoustic,mp->d_potential_dot_acoustic,
- sizeof(float)*(*size),cudaMemcpyDeviceToHost),121);
+ sizeof(float)*(*size),cudaMemcpyDeviceToHost),52121);
print_CUDA_error_if_any(cudaMemcpy(potential_dot_dot_acoustic,mp->d_potential_dot_dot_acoustic,
- sizeof(float)*(*size),cudaMemcpyDeviceToHost),131);
+ sizeof(float)*(*size),cudaMemcpyDeviceToHost),52131);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("after transfer_fields_acoustic_from_device");
@@ -351,11 +464,11 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
print_CUDA_error_if_any(cudaMemcpy(b_potential_acoustic,mp->d_b_potential_acoustic,
- sizeof(float)*(*size),cudaMemcpyDeviceToHost),111);
+ sizeof(float)*(*size),cudaMemcpyDeviceToHost),53111);
print_CUDA_error_if_any(cudaMemcpy(b_potential_dot_acoustic,mp->d_b_potential_dot_acoustic,
- sizeof(float)*(*size),cudaMemcpyDeviceToHost),121);
+ sizeof(float)*(*size),cudaMemcpyDeviceToHost),53121);
print_CUDA_error_if_any(cudaMemcpy(b_potential_dot_dot_acoustic,mp->d_b_potential_dot_dot_acoustic,
- sizeof(float)*(*size),cudaMemcpyDeviceToHost),131);
+ sizeof(float)*(*size),cudaMemcpyDeviceToHost),53131);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("after transfer_b_fields_acoustic_from_device");
@@ -363,3 +476,25 @@
}
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(transfer_sensitivity_kernels_acoustic_to_host,
+ TRANSFER_SENSITIVITY_KERNELS_ACOUSTIC_TO_HOST)(long* Mesh_pointer,
+ float* h_rho_ac_kl,
+ float* h_kappa_ac_kl,
+ int* NSPEC_AB) {
+
+ TRACE("transfer_sensitivity_kernels_acoustic_to_host");
+
+ //get mesh pointer out of fortran integer container
+ Mesh* mp = (Mesh*)(*Mesh_pointer);
+ int size = *NSPEC_AB*125;
+
+ // copies kernel values over to CPU host
+ print_CUDA_error_if_any(cudaMemcpy(h_rho_ac_kl,mp->d_rho_ac_kl,size*sizeof(float),
+ cudaMemcpyDeviceToHost),54101);
+ print_CUDA_error_if_any(cudaMemcpy(h_kappa_ac_kl,mp->d_kappa_ac_kl,size*sizeof(float),
+ cudaMemcpyDeviceToHost),54102);
+}
+
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90 2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90 2011-10-07 01:29:29 UTC (rev 19039)
@@ -81,7 +81,7 @@
else ! GPU_MODE==.true.
! contains both forward SIM_TYPE==1 and backward SIM_TYPE==3 simulations
call compute_forces_elastic_cuda(Mesh_pointer, iphase, nspec_outer_elastic, &
- nspec_inner_elastic,COMPUTE_AND_STORE_STRAIN,SIMULATION_TYPE)
+ nspec_inner_elastic,COMPUTE_AND_STORE_STRAIN,SIMULATION_TYPE,ATTENUATION)
endif
else if (NGLLX == 6) then
call compute_forces_elastic_Dev_6points(iphase, NSPEC_AB,NGLOB_AB,displ,accel, &
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_kernels.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_kernels.f90 2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_kernels.f90 2011-10-07 01:29:29 UTC (rev 19039)
@@ -63,7 +63,6 @@
use specfem_par
use specfem_par_elastic
- use specfem_par_movie,only: nfaces_surface_ext_mesh
implicit none
! local parameters
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/initialize_simulation.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/initialize_simulation.f90 2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/initialize_simulation.f90 2011-10-07 01:29:29 UTC (rev 19039)
@@ -205,9 +205,20 @@
stop 'Deville et al. (2002) routines can only be used if NGLLX = NGLLY = NGLLZ is in [5-10]'
endif
+ ! check for GPU runs
if( GPU_MODE ) then
if( NGLLX /= 5 .or. NGLLY /= 5 .or. NGLLZ /= 5 ) &
stop 'GPU mode can only be used if NGLLX == NGLLY == NGLLZ == 5'
+ if( CUSTOM_REAL /= 4 ) &
+ stop 'GPU mode runs only with CUSTOM_REAL == 4'
+ if( SAVE_MOHO_MESH ) &
+ stop 'GPU mode does not support SAVE_MOHO_MESH yet'
+ if( ATTENUATION ) then
+ if( N_SLS /= 3 ) &
+ stop 'GPU mode does not support N_SLS /= 3 yet'
+ endif
+ if( ANISOTROPY ) &
+ stop 'GPU mode does not support ANISOTROPY yet'
endif
! absorbing surfaces
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90 2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90 2011-10-07 01:29:29 UTC (rev 19039)
@@ -468,10 +468,11 @@
read(27) b_epsilondev_yz
! puts elastic attenuation arrays to GPU
- ! daniel: TODO transfer R_xx,... and epsilondev_xx,... as well
if(GPU_MODE) &
- call exit_MPI(myrank,'read forward arrays: not fully implemented yet for elastic domains with attenuation')
-
+ call transfer_b_fields_att_to_device(Mesh_pointer, &
+ b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz,size(b_R_xx), &
+ b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy,b_epsilondev_xz,b_epsilondev_yz, &
+ size(b_epsilondev_xx))
endif
endif
@@ -523,10 +524,13 @@
! puts elastic fields onto GPU
if(GPU_MODE) then
+ ! wavefields
call transfer_b_fields_to_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel, Mesh_pointer)
-
- ! daniel: TODO transfer R_xx,... and epsilondev_xx,... as well
- call exit_MPI(myrank,'store attenuation arrays: not fully implemented yet for elastic domains')
+ ! attenuation arrays
+ call transfer_b_fields_att_to_device(Mesh_pointer, &
+ b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz,size(b_R_xx), &
+ b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy,b_epsilondev_xz,b_epsilondev_yz, &
+ size(b_epsilondev_xx))
endif
endif
@@ -552,9 +556,12 @@
! gets elastic fields from GPU onto CPU
if(GPU_MODE) then
call transfer_fields_from_device(NDIM*NGLOB_AB,displ,veloc, accel, Mesh_pointer)
-
- ! daniel: TODO transfer R_xx,... and epsilondev_xx,... as well
- call exit_MPI(myrank,'store attenuation arrays: not fully implemented yet for elastic domains')
+
+ ! attenuation arrays
+ call transfer_fields_att_from_device(Mesh_pointer, &
+ R_xx,R_yy,R_xy,R_xz,R_yz,size(R_xx), &
+ epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz, &
+ size(epsilondev_xx))
endif
! writes to disk file
@@ -603,25 +610,58 @@
implicit none
- ! acoustic potentials
- if( ACOUSTIC_SIMULATION ) then
- call transfer_fields_acoustic_from_device(NGLOB_AB,potential_acoustic, &
+ ! to store forward wave fields
+ if (SIMULATION_TYPE == 1 .and. SAVE_FORWARD) then
+
+ ! acoustic potentials
+ if( ACOUSTIC_SIMULATION ) &
+ call transfer_fields_acoustic_from_device(NGLOB_AB,potential_acoustic, &
potential_dot_acoustic, potential_dot_dot_acoustic, Mesh_pointer)
- if( SIMULATION_TYPE == 3 ) then
- call transfer_b_fields_acoustic_from_device(NGLOB_AB,b_potential_acoustic, &
- b_potential_dot_acoustic, b_potential_dot_dot_acoustic, Mesh_pointer)
+
+ ! elastic wavefield
+ if( ELASTIC_SIMULATION ) then
+ call transfer_fields_from_device(NDIM*NGLOB_AB,displ,veloc, accel, Mesh_pointer)
+
+ if (ATTENUATION) &
+ call transfer_fields_att_from_device(Mesh_pointer, &
+ R_xx,R_yy,R_xy,R_xz,R_yz,size(R_xx), &
+ epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz, &
+ size(epsilondev_xx))
+
+ endif
+ else if (SIMULATION_TYPE == 3) then
+
+ ! to store kernels
+
+ if( ACOUSTIC_SIMULATION ) then
+ ! only in case needed...
+ !call transfer_b_fields_acoustic_from_device(NGLOB_AB,b_potential_acoustic, &
+ ! b_potential_dot_acoustic, b_potential_dot_dot_acoustic, Mesh_pointer)
+
+ ! acoustic kernels
call transfer_sensitivity_kernels_acoustic_to_host(Mesh_pointer,rho_ac_kl,kappa_ac_kl,NSPEC_AB)
endif
+
+ if( ELASTIC_SIMULATION ) then
+ ! only in case needed...
+ !call transfer_b_fields_from_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel, Mesh_pointer)
+
+ ! elastic kernels
+ call transfer_sensitivity_kernels_to_host(Mesh_pointer,rho_kl,mu_kl,kappa_kl,NSPEC_AB)
+ endif
+
+ ! specific noise strength kernel
+ if( NOISE_TOMOGRAPHY == 3 ) then
+ call transfer_sensitivity_kernels_to_host(Mesh_pointer,Sigma_kl,NSPEC_AB)
+ endif
+
endif
- ! elastic wavefield
- if( ELASTIC_SIMULATION ) then
- call transfer_fields_from_device(NDIM*NGLOB_AB,displ,veloc, accel, Mesh_pointer)
- if( SIMULATION_TYPE == 3 ) then
- call transfer_b_fields_from_device(NDIM*NGLOB_AB,b_displ,b_veloc,b_accel, Mesh_pointer)
- call transfer_sensitivity_kernels_to_host(Mesh_pointer, rho_kl, mu_kl, kappa_kl,Sigma_kl, &
- NSPEC_AB)
- endif
- endif
+ ! frees allocated memory on GPU
+ call prepare_cleanup_device(Mesh_pointer,num_abs_boundary_faces, &
+ SIMULATION_TYPE,ACOUSTIC_SIMULATION,ELASTIC_SIMULATION, &
+ ABSORBING_CONDITIONS,NOISE_TOMOGRAPHY,COMPUTE_AND_STORE_STRAIN, &
+ ATTENUATION)
+
end subroutine it_transfer_from_GPU
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90 2011-10-07 00:32:19 UTC (rev 19038)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90 2011-10-07 01:29:29 UTC (rev 19039)
@@ -732,30 +732,28 @@
! prepares general fields on GPU
call prepare_constants_device(Mesh_pointer, &
- NGLLX, NSPEC_AB, NGLOB_AB, &
- xix, xiy, xiz, etax,etay,etaz, gammax, gammay, gammaz, &
- kappastore, mustore,ibool, &
- num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, &
- nibool_interfaces_ext_mesh, ibool_interfaces_ext_mesh, &
- hprime_xx, hprime_yy, hprime_zz, &
- hprimewgll_xx, wgllwgll_xy, wgllwgll_xz, wgllwgll_yz, &
- abs_boundary_ispec, abs_boundary_ijk, &
- abs_boundary_normal, &
- abs_boundary_jacobian2Dw, &
- b_absorb_field, num_abs_boundary_faces, b_num_abs_boundary_faces, &
- ispec_is_inner, &
- NSOURCES, sourcearrays, islice_selected_source, ispec_selected_source, &
- number_receiver_global, ispec_selected_rec, nrec, nrec_local, &
- SIMULATION_TYPE)
+ NGLLX, NSPEC_AB, NGLOB_AB, &
+ xix, xiy, xiz, etax,etay,etaz, gammax, gammay, gammaz, &
+ kappastore, mustore,ibool, &
+ num_interfaces_ext_mesh, max_nibool_interfaces_ext_mesh, &
+ nibool_interfaces_ext_mesh, ibool_interfaces_ext_mesh, &
+ hprime_xx, hprime_yy, hprime_zz, &
+ hprimewgll_xx, wgllwgll_xy, wgllwgll_xz, wgllwgll_yz, &
+ abs_boundary_ispec, abs_boundary_ijk, &
+ abs_boundary_normal, &
+ abs_boundary_jacobian2Dw, &
+ b_absorb_field, num_abs_boundary_faces, b_num_abs_boundary_faces, &
+ ispec_is_inner, &
+ NSOURCES, sourcearrays, islice_selected_source, ispec_selected_source, &
+ number_receiver_global, ispec_selected_rec, nrec, nrec_local, &
+ SIMULATION_TYPE)
-! call prepare_fields_device(Mesh_pointer, NDIM*NGLOB_AB);
-
! prepares fields on GPU for acoustic simulations
if( ACOUSTIC_SIMULATION ) &
call prepare_fields_acoustic_device(Mesh_pointer,rmass_acoustic,rhostore,kappastore, &
num_phase_ispec_acoustic,phase_ispec_inner_acoustic, &
ispec_is_acoustic, &
- num_free_surface_faces,free_surface_ispec,free_surface_ijk, &
+ NOISE_TOMOGRAPHY,num_free_surface_faces,free_surface_ispec,free_surface_ijk, &
ABSORBING_CONDITIONS,b_reclen_potential,b_absorb_potential, &
SIMULATION_TYPE,rho_ac_kl,kappa_ac_kl)
@@ -765,59 +763,38 @@
rmass,rho_vp,rho_vs, &
num_phase_ispec_elastic,phase_ispec_inner_elastic, &
ispec_is_elastic, &
- ABSORBING_CONDITIONS,b_absorb_field,b_num_abs_boundary_faces)
+ ABSORBING_CONDITIONS,b_absorb_field,b_num_abs_boundary_faces, &
+ SIMULATION_TYPE,rho_kl,mu_kl,kappa_kl, &
+ COMPUTE_AND_STORE_STRAIN, &
+ epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz, &
+ epsilon_trace_over_3, &
+ b_epsilondev_xx,b_epsilondev_yy,b_epsilondev_xy,b_epsilondev_xz,b_epsilondev_yz, &
+ b_epsilon_trace_over_3, &
+ ATTENUATION,size(R_xx), &
+ R_xx,R_yy,R_xy,R_xz,R_yz, &
+ b_R_xx,b_R_yy,b_R_xy,b_R_xz,b_R_yz, &
+ one_minus_sum_beta,factor_common, &
+ alphaval,betaval,gammaval, &
+ b_alphaval,b_betaval,b_gammaval)
- ! prepares receiver arrays for adjoint runs
- if( SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3 ) then
+ ! prepares needed receiver array for adjoint runs
+ if( SIMULATION_TYPE == 2 .or. SIMULATION_TYPE == 3 ) &
call prepare_adjoint_sim2_or_3_constants_device(Mesh_pointer, &
- islice_selected_rec, &
- size(islice_selected_rec))
+ islice_selected_rec,size(islice_selected_rec))
- endif
-
! prepares fields on GPU for noise simulations
if ( NOISE_TOMOGRAPHY > 0 ) then
! note: noise tomography is only supported for elastic domains so far.
- ! copies noise (free) surface arrays to GPU
- call prepare_noise_constants_device(Mesh_pointer, NGLLX, NSPEC_AB, NGLOB_AB, &
- free_surface_ispec,free_surface_ijk,num_free_surface_faces,size(free_surface_ijk), &
- SIMULATION_TYPE)
+ ! copies noise arrays to GPU
+ call prepare_fields_noise_device(Mesh_pointer, NSPEC_AB, NGLOB_AB, &
+ free_surface_ispec,free_surface_ijk,num_free_surface_faces,size(free_surface_ijk), &
+ SIMULATION_TYPE,NOISE_TOMOGRAPHY, &
+ NSTEP,noise_sourcearray, &
+ normal_x_noise,normal_y_noise,normal_z_noise, &
+ mask_noise,free_surface_jacobian2Dw, &
+ Sigma_kl)
- call prepare_adjoint_constants_device(Mesh_pointer, &
- !ispec_selected_rec,islice_selected_rec,nrec,size(islice_selected_rec),&
- noise_sourcearray, NSTEP,&
- epsilondev_xx,&
- epsilondev_yy,&
- epsilondev_xy,&
- epsilondev_xz,&
- epsilondev_yz,&
- NSPEC_STRAIN_ONLY)
-
- if(NOISE_TOMOGRAPHY > 1) &
- call prepare_and_transfer_noise_backward_constants(Mesh_pointer,&
- normal_x_noise,&
- normal_y_noise,&
- normal_z_noise,&
- mask_noise,&
- free_surface_jacobian2Dw,&
- nfaces_surface_ext_mesh)
-
- if( SIMULATION_TYPE == 3) then
- ! now have backward fields in addition to standard fields
- call prepare_and_transfer_noise_backward_fields(Mesh_pointer, NDIM*NGLOB_AB, &
- b_displ, b_veloc, b_accel,&
- b_epsilondev_xx, b_epsilondev_yy, b_epsilondev_xy,&
- b_epsilondev_xz, b_epsilondev_yz, &
- NSPEC_STRAIN_ONLY)
-
- call prepare_sensitivity_kernels(Mesh_pointer,&
- rho_kl,mu_kl,kappa_kl,&
- epsilon_trace_over_3,b_epsilon_trace_over_3,&
- Sigma_kl,NSPEC_AB)
-
-
- endif
endif ! NOISE_TOMOGRAPHY
! sends initial data to device
@@ -839,9 +816,6 @@
call transfer_b_fields_to_device(NDIM*NGLOB_AB,b_displ,b_veloc, b_accel,Mesh_pointer)
endif
- ! outputs GPU usage to files for all processes
- call output_free_device_memory(myrank)
-
! outputs usage
if( myrank == 0 ) then
call get_free_device_memory(free_mb,used_mb,total_mb)
@@ -850,5 +824,8 @@
write(IMAIN,*)" total =",total_mb," MB"
write(IMAIN,*)
endif
+
+ ! outputs GPU usage to files for all processes
+ call output_free_device_memory(myrank)
end subroutine prepare_timerun_GPU
More information about the CIG-COMMITS
mailing list