[cig-commits] r20536 - in seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER: UTILS src/cuda src/shared src/specfem3D
danielpeter at geodynamics.org
danielpeter at geodynamics.org
Mon Jul 23 14:58:26 PDT 2012
Author: danielpeter
Date: 2012-07-23 14:58:26 -0700 (Mon, 23 Jul 2012)
New Revision: 20536
Modified:
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/UTILS/update_headers_change_word_f90.pl
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_constants_cuda.h
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/exit_mpi.f90
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/check_simulation_stability.f90
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core_Dev.F90
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/get_attenuation.f90
seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90
Log:
updates texture usage for cuda routines
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/UTILS/update_headers_change_word_f90.pl
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/UTILS/update_headers_change_word_f90.pl 2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/UTILS/update_headers_change_word_f90.pl 2012-07-23 21:58:26 UTC (rev 20536)
@@ -22,7 +22,7 @@
#
- at objects = `ls src/*/*.f90 src/*/*.F90 src/*/*.h.in src/*/*.h src/*/*.c src/*/*.cu`;
+ at objects = `ls src/*/*.f90 src/*/*.F90 src/*/*.h.in src/*/*.h src/*/*.c src/*/*.cu setup/*.h.in`;
foreach $name (@objects) {
chop $name;
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu 2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/check_fields_cuda.cu 2012-07-23 21:58:26 UTC (rev 20536)
@@ -39,9 +39,7 @@
#include "config.h"
#include "mesh_constants_cuda.h"
-#include "prepare_constants_cuda.h"
-
/* ----------------------------------------------------------------------------------------------- */
// Helper functions
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu 2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_crust_mantle_cuda.cu 2012-07-23 21:58:26 UTC (rev 20536)
@@ -38,226 +38,18 @@
#include "mesh_constants_cuda.h"
-// cuda constant arrays
-__device__ realw d_hprime_xx[NGLL2];
-__device__ realw d_hprime_yy[NGLL2];
-__device__ realw d_hprime_zz[NGLL2];
-__device__ realw d_hprimewgll_xx[NGLL2];
-__device__ realw d_hprimewgll_yy[NGLL2];
-__device__ realw d_hprimewgll_zz[NGLL2];
-__device__ realw d_wgllwgll_xy[NGLL2];
-__device__ realw d_wgllwgll_xz[NGLL2];
-__device__ realw d_wgllwgll_yz[NGLL2];
+#ifdef USE_TEXTURES_FIELDS
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_displ_cm_tex;
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_accel_cm_tex;
+#endif
-__constant__ realw d_wgll_cube[NGLL3]; // needed only for gravity case
+#ifdef USE_TEXTURES_CONSTANTS
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_hprime_xx_cm_tex;
+#endif
-/* ----------------------------------------------------------------------------------------------- */
-// CONSTANT arrays setup
-
/* ----------------------------------------------------------------------------------------------- */
-/* note:
- constant arrays when used in other compute_forces_***_cuda.cu routines stay zero,
- constant declaration and cudaMemcpyToSymbol would have to be in the same file...
-
- extern keyword doesn't work for __constant__ declarations.
-
- also:
- cudaMemcpyToSymbol("deviceCaseParams", caseParams, sizeof(CaseParams));
- ..
- and compile with -arch=sm_20
-
- see also: http://stackoverflow.com/questions/4008031/how-to-use-cuda-constant-memory-in-a-programmer-pleasant-way
- doesn't seem to work.
-
- we could keep arrays separated for acoustic and elastic routines...
-
- workaround:
-
- for now, we store pointers with cudaGetSymbolAddress() function calls.
- we pass those pointers in all other compute_forces_..() routines
-
- in this file, we can use the above constant array declarations without need of the pointers.
-
- */
-
-// constant arrays
-
-void setConst_hprime_xx(realw* array,Mesh* mp)
-{
-
- cudaError_t err = cudaMemcpyToSymbol(d_hprime_xx, array, NGLL2*sizeof(realw));
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Error in setConst_hprime_xx: %s\n", cudaGetErrorString(err));
- fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
- exit(1);
- }
-
- err = cudaGetSymbolAddress((void**)&(mp->d_hprime_xx),"d_hprime_xx");
- if(err != cudaSuccess) {
- fprintf(stderr, "Error with d_hprime_xx: %s\n", cudaGetErrorString(err));
- exit(1);
- }
-}
-
-void setConst_hprime_yy(realw* array,Mesh* mp)
-{
-
- cudaError_t err = cudaMemcpyToSymbol(d_hprime_yy, array, NGLL2*sizeof(realw));
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Error in setConst_hprime_yy: %s\n", cudaGetErrorString(err));
- fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
- exit(1);
- }
-
- err = cudaGetSymbolAddress((void**)&(mp->d_hprime_yy),"d_hprime_yy");
- if(err != cudaSuccess) {
- fprintf(stderr, "Error with d_hprime_yy: %s\n", cudaGetErrorString(err));
- exit(1);
- }
-}
-
-void setConst_hprime_zz(realw* array,Mesh* mp)
-{
-
- cudaError_t err = cudaMemcpyToSymbol(d_hprime_zz, array, NGLL2*sizeof(realw));
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Error in setConst_hprime_zz: %s\n", cudaGetErrorString(err));
- fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
- exit(1);
- }
-
- err = cudaGetSymbolAddress((void**)&(mp->d_hprime_zz),"d_hprime_zz");
- if(err != cudaSuccess) {
- fprintf(stderr, "Error with d_hprime_zz: %s\n", cudaGetErrorString(err));
- exit(1);
- }
-}
-
-
-void setConst_hprimewgll_xx(realw* array,Mesh* mp)
-{
- cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_xx, array, NGLL2*sizeof(realw));
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Error in setConst_hprimewgll_xx: %s\n", cudaGetErrorString(err));
- exit(1);
- }
-
- err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_xx),"d_hprimewgll_xx");
- if(err != cudaSuccess) {
- fprintf(stderr, "Error with d_hprimewgll_xx: %s\n", cudaGetErrorString(err));
- exit(1);
- }
-}
-
-void setConst_hprimewgll_yy(realw* array,Mesh* mp)
-{
- cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_yy, array, NGLL2*sizeof(realw));
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Error in setConst_hprimewgll_yy: %s\n", cudaGetErrorString(err));
- exit(1);
- }
-
- err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_yy),"d_hprimewgll_yy");
- if(err != cudaSuccess) {
- fprintf(stderr, "Error with d_hprimewgll_yy: %s\n", cudaGetErrorString(err));
- exit(1);
- }
-}
-
-void setConst_hprimewgll_zz(realw* array,Mesh* mp)
-{
- cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_zz, array, NGLL2*sizeof(realw));
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Error in setConst_hprimewgll_zz: %s\n", cudaGetErrorString(err));
- exit(1);
- }
-
- err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_zz),"d_hprimewgll_zz");
- if(err != cudaSuccess) {
- fprintf(stderr, "Error with d_hprimewgll_zz: %s\n", cudaGetErrorString(err));
- exit(1);
- }
-}
-
-void setConst_wgllwgll_xy(realw* array,Mesh* mp)
-{
- cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_xy, array, NGLL2*sizeof(realw));
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Error in setConst_wgllwgll_xy: %s\n", cudaGetErrorString(err));
- exit(1);
- }
- //mp->d_wgllwgll_xy = d_wgllwgll_xy;
- err = cudaGetSymbolAddress((void**)&(mp->d_wgllwgll_xy),"d_wgllwgll_xy");
- if(err != cudaSuccess) {
- fprintf(stderr, "Error with d_wgllwgll_xy: %s\n", cudaGetErrorString(err));
- exit(1);
- }
-
-}
-
-void setConst_wgllwgll_xz(realw* array,Mesh* mp)
-{
- cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_xz, array, NGLL2*sizeof(realw));
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Error in setConst_wgllwgll_xz: %s\n", cudaGetErrorString(err));
- exit(1);
- }
- //mp->d_wgllwgll_xz = d_wgllwgll_xz;
- err = cudaGetSymbolAddress((void**)&(mp->d_wgllwgll_xz),"d_wgllwgll_xz");
- if(err != cudaSuccess) {
- fprintf(stderr, "Error with d_wgllwgll_xz: %s\n", cudaGetErrorString(err));
- exit(1);
- }
-
-}
-
-void setConst_wgllwgll_yz(realw* array,Mesh* mp)
-{
- cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_yz, array, NGLL2*sizeof(realw));
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Error in setConst_wgllwgll_yz: %s\n", cudaGetErrorString(err));
- exit(1);
- }
- //mp->d_wgllwgll_yz = d_wgllwgll_yz;
- err = cudaGetSymbolAddress((void**)&(mp->d_wgllwgll_yz),"d_wgllwgll_yz");
- if(err != cudaSuccess) {
- fprintf(stderr, "Error with d_wgllwgll_yz: %s\n", cudaGetErrorString(err));
- exit(1);
- }
-
-}
-
-void setConst_wgll_cube(realw* array,Mesh* mp)
-{
- cudaError_t err = cudaMemcpyToSymbol(d_wgll_cube, array, NGLL3*sizeof(realw));
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Error in setConst_wgll_cube: %s\n", cudaGetErrorString(err));
- exit(1);
- }
- //mp->d_wgll_cube = d_wgll_cube;
- err = cudaGetSymbolAddress((void**)&(mp->d_wgll_cube),"d_wgll_cube");
- if(err != cudaSuccess) {
- fprintf(stderr, "Error with d_wgll_cube: %s\n", cudaGetErrorString(err));
- exit(1);
- }
-
-}
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
// elemental routines
/* ----------------------------------------------------------------------------------------------- */
@@ -270,31 +62,29 @@
realw* R_xy,
realw* R_xz,
realw* R_yz,
- reald* sigma_xx,
- reald* sigma_yy,
- reald* sigma_zz,
- reald* sigma_xy,
- reald* sigma_xz,
- reald* sigma_yz) {
+ realw* sigma_xx,
+ realw* sigma_yy,
+ realw* sigma_zz,
+ realw* sigma_xy,
+ realw* sigma_xz,
+ realw* sigma_yz) {
- int i_sls,offset;
- reald R_xx_val,R_yy_val;
+ int i_sls;
+ realw R_xx_val,R_yy_val;
for(i_sls = 0; i_sls < N_SLS; i_sls++){
// index
// note: index for R_xx,.. here is (i_sls,i,j,k,ispec) and not (i,j,k,ispec,i_sls) as in local version
// local version: offset_sls = tx + NGLL3*(working_element + NSPEC*i_sls);
- offset = i_sls + N_SLS*(tx + NGLL3*working_element);
+ R_xx_val = R_xx[i_sls + N_SLS*(tx + NGLL3*working_element)];
+ R_yy_val = R_yy[i_sls + N_SLS*(tx + NGLL3*working_element)];
- R_xx_val = R_xx[offset];
- R_yy_val = R_yy[offset];
-
*sigma_xx = *sigma_xx - R_xx_val;
*sigma_yy = *sigma_yy - R_yy_val;
*sigma_zz = *sigma_zz + R_xx_val + R_yy_val;
- *sigma_xy = *sigma_xy - R_xy[offset];
- *sigma_xz = *sigma_xz - R_xz[offset];
- *sigma_yz = *sigma_yz - R_yz[offset];
+ *sigma_xy = *sigma_xy - R_xy[i_sls + N_SLS*(tx + NGLL3*working_element)];
+ *sigma_xz = *sigma_xz - R_xz[i_sls + N_SLS*(tx + NGLL3*working_element)];
+ *sigma_yz = *sigma_yz - R_yz[i_sls + N_SLS*(tx + NGLL3*working_element)];
}
return;
}
@@ -310,29 +100,23 @@
realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
realw* epsilondev_xz,realw* epsilondev_yz,
- reald epsilondev_xx_loc,reald epsilondev_yy_loc,reald epsilondev_xy_loc,
- reald epsilondev_xz_loc,reald epsilondev_yz_loc,
+ realw epsilondev_xx_loc,realw epsilondev_yy_loc,realw epsilondev_xy_loc,
+ realw epsilondev_xz_loc,realw epsilondev_yz_loc,
int ANISOTROPY,
realw* d_c44store,
int ATTENUATION_3D
){
int i_sls;
- int ijk_ispec;
- int offset_align,offset;
- reald fac;
- reald alphaval_loc,betaval_loc,gammaval_loc;
- reald factor_loc,Sn,Snp1;
+ realw fac;
+ realw alphaval_loc,betaval_loc,gammaval_loc;
+ realw factor_loc,Sn,Snp1;
- // indices
- offset_align = tx + NGLL3_PADDED * working_element;
- ijk_ispec = tx + NGLL3 * working_element;
-
// shear moduli for common factor (only Q_mu attenuation)
if( ANISOTROPY ){
- fac = d_c44store[offset_align];
+ fac = d_c44store[tx + NGLL3_PADDED * working_element];
}else{
- fac = d_muvstore[offset_align];
+ fac = d_muvstore[tx + NGLL3_PADDED * working_element];
}
// use Runge-Kutta scheme to march in time
@@ -340,48 +124,49 @@
// indices
// note: index for R_xx,... here is (i_sls,i,j,k,ispec) and not (i,j,k,ispec,i_sls) as in local version
// local version: offset_sls = tx + NGLL3*(working_element + NSPEC*i_sls);
- // index for (i_sls,i,j,k,ispec)
- offset = i_sls + N_SLS*(tx + NGLL3*working_element);
- // index for (i,j,k,ispec,i_sls)
- //offset_sls = tx + NGLL3*(working_element + NSPEC*i_sls);
-
+ //
// either mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
// or factor_common(i_sls,:,:,:,ispec) * c44store(:,:,:,ispec)
if( ATTENUATION_3D ){
- factor_loc = fac * factor_common[offset];
+ factor_loc = fac * factor_common[i_sls + N_SLS*(tx + NGLL3*working_element)];
}else{
- factor_loc = fac * factor_common[i_sls + N_SLS*working_element];
+ factor_loc = fac * factor_common[i_sls + N_SLS*working_element];
}
-
+
alphaval_loc = alphaval[i_sls]; // (i_sls)
betaval_loc = betaval[i_sls];
gammaval_loc = gammaval[i_sls];
// term in xx
- Sn = factor_loc * epsilondev_xx[ijk_ispec]; //(i,j,k,ispec)
+ Sn = factor_loc * epsilondev_xx[tx + NGLL3 * working_element]; //(i,j,k,ispec)
Snp1 = factor_loc * epsilondev_xx_loc; //(i,j,k)
- R_xx[offset] = alphaval_loc * R_xx[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
+ R_xx[i_sls + N_SLS*(tx + NGLL3*working_element)] =
+ alphaval_loc * R_xx[i_sls + N_SLS*(tx + NGLL3*working_element)] + betaval_loc * Sn + gammaval_loc * Snp1;
// term in yy
- Sn = factor_loc * epsilondev_yy[ijk_ispec];
+ Sn = factor_loc * epsilondev_yy[tx + NGLL3 * working_element];
Snp1 = factor_loc * epsilondev_yy_loc;
- R_yy[offset] = alphaval_loc * R_yy[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
+ R_yy[i_sls + N_SLS*(tx + NGLL3*working_element)] =
+ alphaval_loc * R_yy[i_sls + N_SLS*(tx + NGLL3*working_element)] + betaval_loc * Sn + gammaval_loc * Snp1;
// term in zz not computed since zero trace
// term in xy
- Sn = factor_loc * epsilondev_xy[ijk_ispec];
+ Sn = factor_loc * epsilondev_xy[tx + NGLL3 * working_element];
Snp1 = factor_loc * epsilondev_xy_loc;
- R_xy[offset] = alphaval_loc * R_xy[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
+ R_xy[i_sls + N_SLS*(tx + NGLL3*working_element)] =
+ alphaval_loc * R_xy[i_sls + N_SLS*(tx + NGLL3*working_element)] + betaval_loc * Sn + gammaval_loc * Snp1;
// term in xz
- Sn = factor_loc * epsilondev_xz[ijk_ispec];
+ Sn = factor_loc * epsilondev_xz[tx + NGLL3 * working_element];
Snp1 = factor_loc * epsilondev_xz_loc;
- R_xz[offset] = alphaval_loc * R_xz[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
+ R_xz[i_sls + N_SLS*(tx + NGLL3*working_element)] =
+ alphaval_loc * R_xz[i_sls + N_SLS*(tx + NGLL3*working_element)] + betaval_loc * Sn + gammaval_loc * Snp1;
// term in yz
- Sn = factor_loc * epsilondev_yz[ijk_ispec];
+ Sn = factor_loc * epsilondev_yz[tx + NGLL3 * working_element];
Snp1 = factor_loc * epsilondev_yz_loc;
- R_yz[offset] = alphaval_loc * R_yz[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
+ R_yz[i_sls + N_SLS*(tx + NGLL3*working_element)] =
+ alphaval_loc * R_yz[i_sls + N_SLS*(tx + NGLL3*working_element)] + betaval_loc * Sn + gammaval_loc * Snp1;
}
return;
}
@@ -397,38 +182,38 @@
realw* d_minus_deriv_gravity_table,
realw* d_density_table,
realw* wgll_cube,
- reald jacobianl,
- reald* s_dummyx_loc,
- reald* s_dummyy_loc,
- reald* s_dummyz_loc,
- reald* sigma_xx,
- reald* sigma_yy,
- reald* sigma_zz,
- reald* sigma_xy,
- reald* sigma_yx,
- reald* sigma_xz,
- reald* sigma_zx,
- reald* sigma_yz,
- reald* sigma_zy,
- reald* rho_s_H1,
- reald* rho_s_H2,
- reald* rho_s_H3){
+ realw jacobianl,
+ realw* s_dummyx_loc,
+ realw* s_dummyy_loc,
+ realw* s_dummyz_loc,
+ realw* sigma_xx,
+ realw* sigma_yy,
+ realw* sigma_zz,
+ realw* sigma_xy,
+ realw* sigma_yx,
+ realw* sigma_xz,
+ realw* sigma_zx,
+ realw* sigma_yz,
+ realw* sigma_zy,
+ realw* rho_s_H1,
+ realw* rho_s_H2,
+ realw* rho_s_H3){
- reald radius,theta,phi;
- reald cos_theta,sin_theta,cos_phi,sin_phi;
- reald minus_g,minus_dg;
- reald rho;
- reald gxl,gyl,gzl;
- reald minus_g_over_radius,minus_dg_plus_g_over_radius;
- reald cos_theta_sq,sin_theta_sq,cos_phi_sq,sin_phi_sq;
- reald Hxxl,Hyyl,Hzzl,Hxyl,Hxzl,Hyzl;
- reald sx_l,sy_l,sz_l;
- reald factor;
+ realw radius,theta,phi;
+ realw cos_theta,sin_theta,cos_phi,sin_phi;
+ realw minus_g,minus_dg;
+ realw rho;
+ realw gxl,gyl,gzl;
+ realw minus_g_over_radius,minus_dg_plus_g_over_radius;
+ realw cos_theta_sq,sin_theta_sq,cos_phi_sq,sin_phi_sq;
+ realw Hxxl,Hyyl,Hzzl,Hxyl,Hxzl,Hyzl;
+ realw sx_l,sy_l,sz_l;
+ realw factor;
// R_EARTH_KM is the radius of the bottom of the oceans (radius of Earth in km)
- const reald R_EARTH_KM = 6371.0f;
+ //const realw R_EARTH_KM = 6371.0f;
// uncomment line below for PREM with oceans
- //const reald R_EARTH_KM = 6368.0f;
+ //const realw R_EARTH_KM = 6368.0f;
// compute non-symmetric terms for gravity
@@ -521,18 +306,17 @@
realw* d_c44store,realw* d_c45store,realw* d_c46store,
realw* d_c55store,realw* d_c56store,realw* d_c66store,
int ATTENUATION,
- reald minus_sum_beta,
- reald duxdxl,reald duxdyl,reald duxdzl,
- reald duydxl,reald duydyl,reald duydzl,
- reald duzdxl,reald duzdyl,reald duzdzl,
- reald duxdxl_plus_duydyl,reald duxdxl_plus_duzdzl,reald duydyl_plus_duzdzl,
- reald duxdyl_plus_duydxl,reald duzdxl_plus_duxdzl,reald duzdyl_plus_duydzl,
- reald* sigma_xx,reald* sigma_yy,reald* sigma_zz,
- reald* sigma_xy,reald* sigma_xz,reald* sigma_yz
+ realw minus_sum_beta,
+ realw duxdxl,realw duxdyl,realw duxdzl,
+ realw duydxl,realw duydyl,realw duydzl,
+ realw duzdxl,realw duzdyl,realw duzdzl,
+ realw duxdyl_plus_duydxl,realw duzdxl_plus_duxdzl,realw duzdyl_plus_duydzl,
+ realw* sigma_xx,realw* sigma_yy,realw* sigma_zz,
+ realw* sigma_xy,realw* sigma_xz,realw* sigma_yz
){
- reald c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66;
- reald mul;
+ realw c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66;
+ realw mul;
c11 = d_c11store[offset];
c12 = d_c12store[offset];
@@ -592,14 +376,14 @@
__device__ void compute_element_cm_iso(int offset,
realw* d_kappavstore,realw* d_muvstore,
int ATTENUATION,
- reald one_minus_sum_beta_use,
- reald duxdxl,reald duydyl,reald duzdzl,
- reald duxdxl_plus_duydyl,reald duxdxl_plus_duzdzl,reald duydyl_plus_duzdzl,
- reald duxdyl_plus_duydxl,reald duzdxl_plus_duxdzl,reald duzdyl_plus_duydzl,
- reald* sigma_xx,reald* sigma_yy,reald* sigma_zz,
- reald* sigma_xy,reald* sigma_xz,reald* sigma_yz){
+ realw one_minus_sum_beta_use,
+ realw duxdxl,realw duydyl,realw duzdzl,
+ realw duxdxl_plus_duydyl,realw duxdxl_plus_duzdzl,realw duydyl_plus_duzdzl,
+ realw duxdyl_plus_duydxl,realw duzdxl_plus_duxdzl,realw duzdyl_plus_duydzl,
+ realw* sigma_xx,realw* sigma_yy,realw* sigma_zz,
+ realw* sigma_xy,realw* sigma_xz,realw* sigma_yz){
- reald lambdal,mul,lambdalplus2mul,kappal;
+ realw lambdal,mul,lambdalplus2mul,kappal;
// compute elements with an elastic isotropic rheology
kappal = d_kappavstore[offset];
@@ -630,31 +414,30 @@
realw* d_kappavstore,realw* d_muvstore,
realw* d_kappahstore,realw* d_muhstore,realw* d_eta_anisostore,
int ATTENUATION,
- reald one_minus_sum_beta_use,
- reald duxdxl,reald duxdyl,reald duxdzl,
- reald duydxl,reald duydyl,reald duydzl,
- reald duzdxl,reald duzdyl,reald duzdzl,
- reald duxdxl_plus_duydyl,reald duxdxl_plus_duzdzl,reald duydyl_plus_duzdzl,
- reald duxdyl_plus_duydxl,reald duzdxl_plus_duxdzl,reald duzdyl_plus_duydzl,
+ realw one_minus_sum_beta_use,
+ realw duxdxl,realw duxdyl,realw duxdzl,
+ realw duydxl,realw duydyl,realw duydzl,
+ realw duzdxl,realw duzdyl,realw duzdzl,
+ realw duxdyl_plus_duydxl,realw duzdxl_plus_duxdzl,realw duzdyl_plus_duydzl,
int iglob,int NGLOB,
realw* d_ystore, realw* d_zstore,
- reald* sigma_xx,reald* sigma_yy,reald* sigma_zz,
- reald* sigma_xy,reald* sigma_xz,reald* sigma_yz){
+ realw* sigma_xx,realw* sigma_yy,realw* sigma_zz,
+ realw* sigma_xy,realw* sigma_xz,realw* sigma_yz){
- reald kappavl,muvl,kappahl,muhl;
- reald rhovpvsq,rhovphsq,rhovsvsq,rhovshsq,eta_aniso;
- reald costheta,sintheta,cosphi,sinphi;
- reald costhetasq,sinthetasq,cosphisq,sinphisq,costhetafour,sinthetafour,cosphifour,sinphifour;
- reald costwotheta,sintwotheta,costwophi,sintwophi,cosfourtheta,cosfourphi;
- reald costwothetasq,costwophisq,sintwophisq;
- reald etaminone,twoetaminone;
- reald two_eta_aniso,four_eta_aniso,six_eta_aniso;
- reald two_rhovsvsq,two_rhovshsq; // two_rhovpvsq,two_rhovphsq
- reald four_rhovsvsq,four_rhovshsq; // four_rhovpvsq,four_rhovphsq
- reald c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66;
+ realw kappavl,muvl,kappahl,muhl;
+ realw rhovpvsq,rhovphsq,rhovsvsq,rhovshsq,eta_aniso;
+ realw costheta,sintheta,cosphi,sinphi;
+ realw costhetasq,sinthetasq,cosphisq,sinphisq,costhetafour,sinthetafour,cosphifour,sinphifour;
+ realw costwotheta,sintwotheta,costwophi,sintwophi,cosfourtheta,cosfourphi;
+ realw costwothetasq,costwophisq,sintwophisq;
+ realw etaminone,twoetaminone;
+ realw two_eta_aniso,four_eta_aniso,six_eta_aniso;
+ realw two_rhovsvsq,two_rhovshsq; // two_rhovpvsq,two_rhovphsq
+ realw four_rhovsvsq,four_rhovshsq; // four_rhovpvsq,four_rhovphsq
+ realw c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66;
// cosine and sine function in CUDA only supported for float
- reald theta,phi;
+ realw theta,phi;
// use Kappa and mu from transversely isotropic model
kappavl = d_kappavstore[offset];
@@ -699,10 +482,14 @@
//sinphi = sinf(phi);
sincosf(phi, &sinphi, &cosphi);
- costwotheta = cosf(2.0f * theta);
- sintwotheta = sinf(2.0f * theta);
- costwophi = cosf(2.0f * phi);
- sintwophi = sinf(2.0f * phi);
+ //costwotheta = cosf(2.0f * theta);
+ //sintwotheta = sinf(2.0f * theta);
+ sincosf(2.0f * theta, &sintwotheta, &costwotheta);
+
+ //costwophi = cosf(2.0f * phi);
+ //sintwophi = sinf(2.0f * phi);
+ sincosf(2.0f * phi, &sintwophi, &costwophi);
+
cosfourtheta = cosf(4.0f * theta);
cosfourphi = cosf(4.0f * phi);
}else{
@@ -911,6 +698,9 @@
realw* d_xix, realw* d_xiy, realw* d_xiz,
realw* d_etax, realw* d_etay, realw* d_etaz,
realw* d_gammax, realw* d_gammay, realw* d_gammaz,
+ realw* d_hprime_xx,
+ realw* d_hprimewgll_xx, realw* d_hprimewgll_yy, realw* d_hprimewgll_zz,
+ realw* d_wgllwgll_xy,realw* d_wgllwgll_xz,realw* d_wgllwgll_yz,
realw* d_kappavstore, realw* d_muvstore,
realw* d_kappahstore, realw* d_muhstore,
realw* d_eta_anisostore,
@@ -947,11 +737,6 @@
/* int bx = blockIdx.x; */
int tx = threadIdx.x;
- //const int NGLLX = 5;
- // const int NGLL2 = 25;
- //const int NGLL3 = NGLL3;
- const int NGLL3_ALIGN = NGLL3_PADDED;
-
int K = (tx/NGLL2);
int J = ((tx-K*NGLL2)/NGLLX);
int I = (tx-K*NGLL2-J*NGLLX);
@@ -960,680 +745,670 @@
int iglob = 0;
int working_element;
- reald tempx1l,tempx2l,tempx3l,tempy1l,tempy2l,tempy3l,tempz1l,tempz2l,tempz3l;
- reald xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
- reald duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl;
- reald duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl;
- reald duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl;
+ realw tempx1l,tempx2l,tempx3l,tempy1l,tempy2l,tempy3l,tempz1l,tempz2l,tempz3l;
+ realw xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
+ realw duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl;
+ realw duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl;
+ realw duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl;
- reald tempx1l_att,tempx2l_att,tempx3l_att,tempy1l_att,tempy2l_att,tempy3l_att,tempz1l_att,tempz2l_att,tempz3l_att;
- reald duxdxl_att,duxdyl_att,duxdzl_att,duydxl_att,duydyl_att,duydzl_att,duzdxl_att,duzdyl_att,duzdzl_att;
- reald duxdyl_plus_duydxl_att,duzdxl_plus_duxdzl_att,duzdyl_plus_duydzl_att;
+ realw tempx1l_att,tempx2l_att,tempx3l_att,tempy1l_att,tempy2l_att,tempy3l_att,tempz1l_att,tempz2l_att,tempz3l_att;
+ realw duxdxl_att,duxdyl_att,duxdzl_att,duydxl_att,duydyl_att,duydzl_att,duzdxl_att,duzdyl_att,duzdzl_att;
+ realw duxdyl_plus_duydxl_att,duzdxl_plus_duxdzl_att,duzdyl_plus_duydzl_att;
- reald fac1,fac2,fac3;
- reald minus_sum_beta,one_minus_sum_beta_use;
+ realw fac1,fac2,fac3;
+ realw minus_sum_beta,one_minus_sum_beta_use;
- reald sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz;
- reald epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc;
- //reald c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66;
- reald sum_terms1,sum_terms2,sum_terms3;
+ realw sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz;
+ realw epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc;
+ realw sum_terms1,sum_terms2,sum_terms3;
// gravity variables
- reald sigma_yx,sigma_zx,sigma_zy;
- reald rho_s_H1,rho_s_H2,rho_s_H3;
+ realw sigma_yx,sigma_zx,sigma_zy;
+ realw rho_s_H1,rho_s_H2,rho_s_H3;
#ifndef MANUALLY_UNROLLED_LOOPS
int l;
- realw hp1,hp2,hp3;
#endif
- __shared__ reald s_dummyx_loc[NGLL3];
- __shared__ reald s_dummyy_loc[NGLL3];
- __shared__ reald s_dummyz_loc[NGLL3];
+ __shared__ realw s_dummyx_loc[NGLL3];
+ __shared__ realw s_dummyy_loc[NGLL3];
+ __shared__ realw s_dummyz_loc[NGLL3];
- __shared__ reald s_dummyx_loc_att[NGLL3];
- __shared__ reald s_dummyy_loc_att[NGLL3];
- __shared__ reald s_dummyz_loc_att[NGLL3];
+ __shared__ realw s_dummyx_loc_att[NGLL3];
+ __shared__ realw s_dummyy_loc_att[NGLL3];
+ __shared__ realw s_dummyz_loc_att[NGLL3];
- __shared__ reald s_tempx1[NGLL3];
- __shared__ reald s_tempx2[NGLL3];
- __shared__ reald s_tempx3[NGLL3];
- __shared__ reald s_tempy1[NGLL3];
- __shared__ reald s_tempy2[NGLL3];
- __shared__ reald s_tempy3[NGLL3];
- __shared__ reald s_tempz1[NGLL3];
- __shared__ reald s_tempz2[NGLL3];
- __shared__ reald s_tempz3[NGLL3];
+ __shared__ realw s_tempx1[NGLL3];
+ __shared__ realw s_tempx2[NGLL3];
+ __shared__ realw s_tempx3[NGLL3];
+ __shared__ realw s_tempy1[NGLL3];
+ __shared__ realw s_tempy2[NGLL3];
+ __shared__ realw s_tempy3[NGLL3];
+ __shared__ realw s_tempz1[NGLL3];
+ __shared__ realw s_tempz2[NGLL3];
+ __shared__ realw s_tempz3[NGLL3];
+ __shared__ realw sh_hprime_xx[NGLL2];
+
// use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads,
// because we used memory padding from NGLL^3 = 125 to 128 to get coalescent memory accesses
- active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0;
+ active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0;
// copy from global memory to shared memory
// each thread writes one of the NGLL^3 = 125 data points
- if (active) {
+ if (active) {
#ifdef USE_MESH_COLORING_GPU
- working_element = bx;
+ working_element = bx;
#else
- //mesh coloring
- if( use_mesh_coloring_gpu ){
- working_element = bx;
- }else{
- // iphase-1 and working_element-1 for Fortran->C array conventions
- working_element = d_phase_ispec_inner[bx + num_phase_ispec*(d_iphase-1)]-1;
- }
+ //mesh coloring
+ if( use_mesh_coloring_gpu ){
+ working_element = bx;
+ }else{
+ // iphase-1 and working_element-1 for Fortran->C array conventions
+ working_element = d_phase_ispec_inner[bx + num_phase_ispec*(d_iphase-1)]-1;
+ }
#endif
- // iglob = d_ibool[working_element*NGLL3_ALIGN + tx]-1;
- iglob = d_ibool[working_element*NGLL3 + tx]-1;
+ // iglob = d_ibool[working_element*NGLL3_PADDED + tx]-1;
+ iglob = d_ibool[working_element*NGLL3 + tx]-1;
-#ifdef USE_TEXTURES
- s_dummyx_loc[tx] = tex1Dfetch(tex_displ, iglob);
- s_dummyy_loc[tx] = tex1Dfetch(tex_displ, iglob + NGLOB);
- s_dummyz_loc[tx] = tex1Dfetch(tex_displ, iglob + 2*NGLOB);
+#ifdef USE_TEXTURES_FIELDS
+ s_dummyx_loc[tx] = tex1Dfetch(d_displ_cm_tex, iglob*3);
+ s_dummyy_loc[tx] = tex1Dfetch(d_displ_cm_tex, iglob*3 + 1);
+ s_dummyz_loc[tx] = tex1Dfetch(d_displ_cm_tex, iglob*3 + 2);
#else
- // changing iglob indexing to match fortran row changes fast style
- s_dummyx_loc[tx] = d_displ[iglob*3];
- s_dummyy_loc[tx] = d_displ[iglob*3 + 1];
- s_dummyz_loc[tx] = d_displ[iglob*3 + 2];
+ // changing iglob indexing to match fortran row changes fast style
+ s_dummyx_loc[tx] = d_displ[iglob*3];
+ s_dummyy_loc[tx] = d_displ[iglob*3 + 1];
+ s_dummyz_loc[tx] = d_displ[iglob*3 + 2];
#endif
- if(ATTENUATION){
- if(ATTENUATION_NEW){
- // takes new routines
- // use first order Taylor expansion of displacement for local storage of stresses
- // at this current time step, to fix attenuation in a consistent way
-#ifdef USE_TEXTURES
- s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * tex1Dfetch(tex_veloc, iglob);
- s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * tex1Dfetch(tex_veloc, iglob + NGLOB);
- s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * tex1Dfetch(tex_veloc, iglob + 2*NGLOB);
+ if(ATTENUATION){
+ if(ATTENUATION_NEW){
+ // takes new routines
+ // use first order Taylor expansion of displacement for local storage of stresses
+ // at this current time step, to fix attenuation in a consistent way
+#ifdef USE_TEXTURES_FIELDS
+ s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * tex1Dfetch(d_displ_cm_tex, iglob*3);
+ s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * tex1Dfetch(d_displ_cm_tex, iglob*3 + 1);
+ s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * tex1Dfetch(d_displ_cm_tex, iglob*3 + 2);
#else
- s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * d_veloc[iglob*3];
- s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * d_veloc[iglob*3 + 1];
- s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * d_veloc[iglob*3 + 2];
+ s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * d_veloc[iglob*3];
+ s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * d_veloc[iglob*3 + 1];
+ s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * d_veloc[iglob*3 + 2];
#endif
- }
- else{
+ }
+ else{
// takes old routines
s_dummyx_loc_att[tx] = s_dummyx_loc[tx];
s_dummyy_loc_att[tx] = s_dummyy_loc[tx];
s_dummyz_loc_att[tx] = s_dummyz_loc[tx];
- }
}
}
+ }
+ if (tx < NGLL2) {
+#ifdef USE_TEXTURES_CONSTANTS
+ sh_hprime_xx[tx] = tex1Dfetch(d_hprime_xx_cm_tex,tx);
+#else
+ sh_hprime_xx[tx] = d_hprime_xx[tx];
+#endif
+ }
+
// synchronize all the threads (one thread for each of the NGLL grid points of the
// current spectral element) because we need the whole element to be ready in order
// to be able to compute the matrix products along cut planes of the 3D element below
- __syncthreads();
+ __syncthreads();
-#ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
+ if (active) {
- if (active) {
-
#ifndef MANUALLY_UNROLLED_LOOPS
- tempx1l = 0.f;
- tempx2l = 0.f;
- tempx3l = 0.f;
+ tempx1l = 0.f;
+ tempx2l = 0.f;
+ tempx3l = 0.f;
- tempy1l = 0.f;
- tempy2l = 0.f;
- tempy3l = 0.f;
+ tempy1l = 0.f;
+ tempy2l = 0.f;
+ tempy3l = 0.f;
- tempz1l = 0.f;
- tempz2l = 0.f;
- tempz3l = 0.f;
+ tempz1l = 0.f;
+ tempz2l = 0.f;
+ tempz3l = 0.f;
- for (l=0;l<NGLLX;l++) {
- hp1 = d_hprime_xx[l*NGLLX+I];
- offset = K*NGLL2+J*NGLLX+l;
- tempx1l += s_dummyx_loc[offset]*hp1;
- tempy1l += s_dummyy_loc[offset]*hp1;
- tempz1l += s_dummyz_loc[offset]*hp1;
+ for (l=0;l<NGLLX;l++) {
+ fac1 = sh_hprime_xx[l*NGLLX+I];
+ tempx1l += s_dummyx_loc[K*NGLL2+J*NGLLX+l]*fac1;
+ tempy1l += s_dummyy_loc[K*NGLL2+J*NGLLX+l]*fac1;
+ tempz1l += s_dummyz_loc[K*NGLL2+J*NGLLX+l]*fac1;
- hp2 = d_hprime_xx[l*NGLLX+J];
- offset = K*NGLL2+l*NGLLX+I;
- tempx2l += s_dummyx_loc[offset]*hp2;
- tempy2l += s_dummyy_loc[offset]*hp2;
- tempz2l += s_dummyz_loc[offset]*hp2;
+ fac2 = sh_hprime_xx[l*NGLLX+J];
+ tempx2l += s_dummyx_loc[K*NGLL2+l*NGLLX+I]*fac2;
+ tempy2l += s_dummyy_loc[K*NGLL2+l*NGLLX+I]*fac2;
+ tempz2l += s_dummyz_loc[K*NGLL2+l*NGLLX+I]*fac2;
- hp3 = d_hprime_xx[l*NGLLX+K];
- offset = l*NGLL2+J*NGLLX+I;
- tempx3l += s_dummyx_loc[offset]*hp3;
- tempy3l += s_dummyy_loc[offset]*hp3;
- tempz3l += s_dummyz_loc[offset]*hp3;
+ fac3 = sh_hprime_xx[l*NGLLX+K];
+ tempx3l += s_dummyx_loc[l*NGLL2+J*NGLLX+I]*fac3;
+ tempy3l += s_dummyy_loc[l*NGLL2+J*NGLLX+I]*fac3;
+ tempz3l += s_dummyz_loc[l*NGLL2+J*NGLLX+I]*fac3;
+ }
- }
+ if( ATTENUATION){
+ // temporary variables used for fixing attenuation in a consistent way
+ tempx1l_att = 0.f;
+ tempx2l_att = 0.f;
+ tempx3l_att = 0.f;
- if( ATTENUATION){
- // temporary variables used for fixing attenuation in a consistent way
- tempx1l_att = 0.f;
- tempx2l_att = 0.f;
- tempx3l_att = 0.f;
+ tempy1l_att = 0.f;
+ tempy2l_att = 0.f;
+ tempy3l_att = 0.f;
- tempy1l_att = 0.f;
- tempy2l_att = 0.f;
- tempy3l_att = 0.f;
+ tempz1l_att = 0.f;
+ tempz2l_att = 0.f;
+ tempz3l_att = 0.f;
- tempz1l_att = 0.f;
- tempz2l_att = 0.f;
- tempz3l_att = 0.f;
+ for (l=0;l<NGLLX;l++) {
+ fac1 = sh_hprime_xx[l*NGLLX+I];
+ tempx1l_att += s_dummyx_loc_att[K*NGLL2+J*NGLLX+l]*fac1;
+ tempy1l_att += s_dummyy_loc_att[K*NGLL2+J*NGLLX+l]*fac1;
+ tempz1l_att += s_dummyz_loc_att[K*NGLL2+J*NGLLX+l]*fac1;
- for (l=0;l<NGLLX;l++) {
- hp1 = d_hprime_xx[l*NGLLX+I];
- offset = K*NGLL2+J*NGLLX+l;
- tempx1l_att += s_dummyx_loc_att[offset]*hp1;
- tempy1l_att += s_dummyy_loc_att[offset]*hp1;
- tempz1l_att += s_dummyz_loc_att[offset]*hp1;
+ fac2 = sh_hprime_xx[l*NGLLX+J];
+ tempx2l_att += s_dummyx_loc_att[K*NGLL2+l*NGLLX+I]*fac2;
+ tempy2l_att += s_dummyy_loc_att[K*NGLL2+l*NGLLX+I]*fac2;
+ tempz2l_att += s_dummyz_loc_att[K*NGLL2+l*NGLLX+I]*fac2;
- hp2 = d_hprime_xx[l*NGLLX+J];
- offset = K*NGLL2+l*NGLLX+I;
- tempx2l_att += s_dummyx_loc_att[offset]*hp2;
- tempy2l_att += s_dummyy_loc_att[offset]*hp2;
- tempz2l_att += s_dummyz_loc_att[offset]*hp2;
-
- hp3 = d_hprime_xx[l*NGLLX+K];
- offset = l*NGLL2+J*NGLLX+I;
- tempx3l_att += s_dummyx_loc_att[offset]*hp3;
- tempy3l_att += s_dummyy_loc_att[offset]*hp3;
- tempz3l_att += s_dummyz_loc_att[offset]*hp3;
-
- }
+ fac3 = sh_hprime_xx[l*NGLLX+K];
+ tempx3l_att += s_dummyx_loc_att[l*NGLL2+J*NGLLX+I]*fac3;
+ tempy3l_att += s_dummyy_loc_att[l*NGLL2+J*NGLLX+I]*fac3;
+ tempz3l_att += s_dummyz_loc_att[l*NGLL2+J*NGLLX+I]*fac3;
}
+ }
#else
- tempx1l = s_dummyx_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
- + s_dummyx_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
- + s_dummyx_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
- + s_dummyx_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
- + s_dummyx_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+ tempx1l = s_dummyx_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyx_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyx_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyx_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyx_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
- tempy1l = s_dummyy_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
- + s_dummyy_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
- + s_dummyy_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
- + s_dummyy_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
- + s_dummyy_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+ tempy1l = s_dummyy_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyy_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyy_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyy_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyy_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
- tempz1l = s_dummyz_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
- + s_dummyz_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
- + s_dummyz_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
- + s_dummyz_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
- + s_dummyz_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+ tempz1l = s_dummyz_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyz_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyz_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyz_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyz_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
- tempx2l = s_dummyx_loc[K*NGLL2+I]*d_hprime_xx[J]
- + s_dummyx_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
- + s_dummyx_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
- + s_dummyx_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
- + s_dummyx_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+ tempx2l = s_dummyx_loc[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyx_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyx_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyx_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyx_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
- tempy2l = s_dummyy_loc[K*NGLL2+I]*d_hprime_xx[J]
- + s_dummyy_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
- + s_dummyy_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
- + s_dummyy_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
- + s_dummyy_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+ tempy2l = s_dummyy_loc[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyy_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyy_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyy_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyy_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
- tempz2l = s_dummyz_loc[K*NGLL2+I]*d_hprime_xx[J]
- + s_dummyz_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
- + s_dummyz_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
- + s_dummyz_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
- + s_dummyz_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+ tempz2l = s_dummyz_loc[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyz_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyz_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyz_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyz_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
- tempx3l = s_dummyx_loc[J*NGLLX+I]*d_hprime_xx[K]
- + s_dummyx_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
- + s_dummyx_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
- + s_dummyx_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
- + s_dummyx_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+ tempx3l = s_dummyx_loc[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyx_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyx_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyx_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyx_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
- tempy3l = s_dummyy_loc[J*NGLLX+I]*d_hprime_xx[K]
- + s_dummyy_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
- + s_dummyy_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
- + s_dummyy_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
- + s_dummyy_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+ tempy3l = s_dummyy_loc[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyy_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyy_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyy_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyy_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
- tempz3l = s_dummyz_loc[J*NGLLX+I]*d_hprime_xx[K]
- + s_dummyz_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
- + s_dummyz_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
- + s_dummyz_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
- + s_dummyz_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+ tempz3l = s_dummyz_loc[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyz_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyz_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyz_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyz_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
- if( ATTENUATION){
- // temporary variables used for fixing attenuation in a consistent way
- tempx1l_att = s_dummyx_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
- + s_dummyx_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
- + s_dummyx_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
- + s_dummyx_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
- + s_dummyx_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+ if( ATTENUATION){
+ // temporary variables used for fixing attenuation in a consistent way
+ tempx1l_att = s_dummyx_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyx_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyx_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyx_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyx_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
- tempy1l_att = s_dummyy_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
- + s_dummyy_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
- + s_dummyy_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
- + s_dummyy_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
- + s_dummyy_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+ tempy1l_att = s_dummyy_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyy_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyy_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyy_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyy_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
- tempz1l_att = s_dummyz_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
- + s_dummyz_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
- + s_dummyz_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
- + s_dummyz_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
- + s_dummyz_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+ tempz1l_att = s_dummyz_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyz_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyz_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyz_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyz_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
- tempx2l_att = s_dummyx_loc_att[K*NGLL2+I]*d_hprime_xx[J]
- + s_dummyx_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
- + s_dummyx_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
- + s_dummyx_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
- + s_dummyx_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+ tempx2l_att = s_dummyx_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyx_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyx_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyx_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyx_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
- tempy2l_att = s_dummyy_loc_att[K*NGLL2+I]*d_hprime_xx[J]
- + s_dummyy_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
- + s_dummyy_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
- + s_dummyy_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
- + s_dummyy_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+ tempy2l_att = s_dummyy_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyy_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyy_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyy_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyy_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
- tempz2l_att = s_dummyz_loc_att[K*NGLL2+I]*d_hprime_xx[J]
- + s_dummyz_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
- + s_dummyz_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
- + s_dummyz_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
- + s_dummyz_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+ tempz2l_att = s_dummyz_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyz_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyz_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyz_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyz_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
- tempx3l_att = s_dummyx_loc_att[J*NGLLX+I]*d_hprime_xx[K]
- + s_dummyx_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
- + s_dummyx_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
- + s_dummyx_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
- + s_dummyx_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+ tempx3l_att = s_dummyx_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyx_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyx_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyx_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyx_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
- tempy3l_att = s_dummyy_loc_att[J*NGLLX+I]*d_hprime_xx[K]
- + s_dummyy_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
- + s_dummyy_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
- + s_dummyy_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
- + s_dummyy_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+ tempy3l_att = s_dummyy_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyy_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyy_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyy_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyy_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
- tempz3l_att = s_dummyz_loc_att[J*NGLLX+I]*d_hprime_xx[K]
- + s_dummyz_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
- + s_dummyz_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
- + s_dummyz_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
- + s_dummyz_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
- }
+ tempz3l_att = s_dummyz_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyz_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyz_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyz_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyz_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+ }
#endif
- // compute derivatives of ux, uy and uz with respect to x, y and z
- offset = working_element*NGLL3_ALIGN + tx;
+ // compute derivatives of ux, uy and uz with respect to x, y and z
+ offset = working_element*NGLL3_PADDED + tx;
- xixl = d_xix[offset];
- xiyl = d_xiy[offset];
- xizl = d_xiz[offset];
- etaxl = d_etax[offset];
- etayl = d_etay[offset];
- etazl = d_etaz[offset];
- gammaxl = d_gammax[offset];
- gammayl = d_gammay[offset];
- gammazl = d_gammaz[offset];
+ xixl = d_xix[offset];
+ xiyl = d_xiy[offset];
+ xizl = d_xiz[offset];
+ etaxl = d_etax[offset];
+ etayl = d_etay[offset];
+ etazl = d_etaz[offset];
+ gammaxl = d_gammax[offset];
+ gammayl = d_gammay[offset];
+ gammazl = d_gammaz[offset];
- duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l;
- duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l;
- duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l;
+ duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l;
+ duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l;
+ duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l;
- duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l;
- duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l;
- duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l;
+ duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l;
+ duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l;
+ duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l;
- duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l;
- duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l;
- duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l;
+ duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l;
+ duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l;
+ duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l;
- // precompute some sums to save CPU time
- duxdxl_plus_duydyl = duxdxl + duydyl;
- duxdxl_plus_duzdzl = duxdxl + duzdzl;
- duydyl_plus_duzdzl = duydyl + duzdzl;
- duxdyl_plus_duydxl = duxdyl + duydxl;
- duzdxl_plus_duxdzl = duzdxl + duxdzl;
- duzdyl_plus_duydzl = duzdyl + duydzl;
+ // precompute some sums to save CPU time
+ duxdxl_plus_duydyl = duxdxl + duydyl;
+ duxdxl_plus_duzdzl = duxdxl + duzdzl;
+ duydyl_plus_duzdzl = duydyl + duzdzl;
+ duxdyl_plus_duydxl = duxdyl + duydxl;
+ duzdxl_plus_duxdzl = duzdxl + duxdzl;
+ duzdyl_plus_duydzl = duzdyl + duydzl;
- if( ATTENUATION){
- // temporary variables used for fixing attenuation in a consistent way
- duxdxl_att = xixl*tempx1l_att + etaxl*tempx2l_att + gammaxl*tempx3l_att;
- duxdyl_att = xiyl*tempx1l_att + etayl*tempx2l_att + gammayl*tempx3l_att;
- duxdzl_att = xizl*tempx1l_att + etazl*tempx2l_att + gammazl*tempx3l_att;
+ if( ATTENUATION){
+ // temporary variables used for fixing attenuation in a consistent way
+ duxdxl_att = xixl*tempx1l_att + etaxl*tempx2l_att + gammaxl*tempx3l_att;
+ duxdyl_att = xiyl*tempx1l_att + etayl*tempx2l_att + gammayl*tempx3l_att;
+ duxdzl_att = xizl*tempx1l_att + etazl*tempx2l_att + gammazl*tempx3l_att;
- duydxl_att = xixl*tempy1l_att + etaxl*tempy2l_att + gammaxl*tempy3l_att;
- duydyl_att = xiyl*tempy1l_att + etayl*tempy2l_att + gammayl*tempy3l_att;
- duydzl_att = xizl*tempy1l_att + etazl*tempy2l_att + gammazl*tempy3l_att;
+ duydxl_att = xixl*tempy1l_att + etaxl*tempy2l_att + gammaxl*tempy3l_att;
+ duydyl_att = xiyl*tempy1l_att + etayl*tempy2l_att + gammayl*tempy3l_att;
+ duydzl_att = xizl*tempy1l_att + etazl*tempy2l_att + gammazl*tempy3l_att;
- duzdxl_att = xixl*tempz1l_att + etaxl*tempz2l_att + gammaxl*tempz3l_att;
- duzdyl_att = xiyl*tempz1l_att + etayl*tempz2l_att + gammayl*tempz3l_att;
- duzdzl_att = xizl*tempz1l_att + etazl*tempz2l_att + gammazl*tempz3l_att;
+ duzdxl_att = xixl*tempz1l_att + etaxl*tempz2l_att + gammaxl*tempz3l_att;
+ duzdyl_att = xiyl*tempz1l_att + etayl*tempz2l_att + gammayl*tempz3l_att;
+ duzdzl_att = xizl*tempz1l_att + etazl*tempz2l_att + gammazl*tempz3l_att;
- // precompute some sums to save CPU time
- duxdyl_plus_duydxl_att = duxdyl_att + duydxl_att;
- duzdxl_plus_duxdzl_att = duzdxl_att + duxdzl_att;
- duzdyl_plus_duydzl_att = duzdyl_att + duydzl_att;
+ // precompute some sums to save CPU time
+ duxdyl_plus_duydxl_att = duxdyl_att + duydxl_att;
+ duzdxl_plus_duxdzl_att = duzdxl_att + duxdzl_att;
+ duzdyl_plus_duydzl_att = duzdyl_att + duydzl_att;
- // computes deviatoric strain attenuation and/or for kernel calculations
- if(COMPUTE_AND_STORE_STRAIN) {
- realw templ = 0.33333333333333333333f * (duxdxl_att + duydyl_att + duzdzl_att); // 1./3. = 0.33333
+ // computes deviatoric strain attenuation and/or for kernel calculations
+ if(COMPUTE_AND_STORE_STRAIN) {
+ realw templ = 0.33333333333333333333f * (duxdxl_att + duydyl_att + duzdzl_att); // 1./3. = 0.33333
- // local storage: stresses at this current time step
- epsilondev_xx_loc = duxdxl_att - templ;
- epsilondev_yy_loc = duydyl_att - templ;
- epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl_att;
- epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl_att;
- epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl_att;
+ // local storage: stresses at this current time step
+ epsilondev_xx_loc = duxdxl_att - templ;
+ epsilondev_yy_loc = duydyl_att - templ;
+ epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl_att;
+ epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl_att;
+ epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl_att;
- if(NSPEC_CRUST_MANTLE_STRAIN_ONLY == 1) {
- epsilon_trace_over_3[tx] = templ;
- }else{
- epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
- }
+ if(NSPEC_CRUST_MANTLE_STRAIN_ONLY == 1) {
+ epsilon_trace_over_3[tx] = templ;
+ }else{
+ epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
}
- }else{
- // computes deviatoric strain attenuation and/or for kernel calculations
- if(COMPUTE_AND_STORE_STRAIN) {
- realw templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
+ }
+ }else{
+ // computes deviatoric strain attenuation and/or for kernel calculations
+ if(COMPUTE_AND_STORE_STRAIN) {
+ realw templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
- // local storage: stresses at this current time step
- epsilondev_xx_loc = duxdxl - templ;
- epsilondev_yy_loc = duydyl - templ;
- epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl;
- epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl;
- epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl;
+ // local storage: stresses at this current time step
+ epsilondev_xx_loc = duxdxl - templ;
+ epsilondev_yy_loc = duydyl - templ;
+ epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl;
+ epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl;
+ epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl;
- if(NSPEC_CRUST_MANTLE_STRAIN_ONLY == 1) {
- epsilon_trace_over_3[tx] = templ;
- }else{
- epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
- }
+ if(NSPEC_CRUST_MANTLE_STRAIN_ONLY == 1) {
+ epsilon_trace_over_3[tx] = templ;
+ }else{
+ epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
}
}
+ }
- // attenuation
- if(ATTENUATION){
- // use unrelaxed parameters if attenuation
- if( ATTENUATION_3D ){
- one_minus_sum_beta_use = one_minus_sum_beta[tx+working_element*NGLL3]; // (i,j,k,ispec)
- }else{
- one_minus_sum_beta_use = one_minus_sum_beta[working_element]; // (1,1,1,ispec)
- }
- minus_sum_beta = one_minus_sum_beta_use - 1.0f;
+ // attenuation
+ if(ATTENUATION){
+ // use unrelaxed parameters if attenuation
+ if( ATTENUATION_3D ){
+ one_minus_sum_beta_use = one_minus_sum_beta[tx+working_element*NGLL3]; // (i,j,k,ispec)
+ }else{
+ one_minus_sum_beta_use = one_minus_sum_beta[working_element]; // (1,1,1,ispec)
}
+ minus_sum_beta = one_minus_sum_beta_use - 1.0f;
+ }
- // computes stresses
- if(ANISOTROPY){
- // full anisotropic case, stress calculations
- compute_element_cm_aniso(offset,
- d_c11store,d_c12store,d_c13store,d_c14store,d_c15store,d_c16store,d_c22store,
- d_c23store,d_c24store,d_c25store,d_c26store,d_c33store,d_c34store,d_c35store,
- d_c36store,d_c44store,d_c45store,d_c46store,d_c55store,d_c56store,d_c66store,
- ATTENUATION,
- minus_sum_beta,
- duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl,
- duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl,
- duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl,
- &sigma_xx,&sigma_yy,&sigma_zz,
- &sigma_xy,&sigma_xz,&sigma_yz);
+ // computes stresses
+ if(ANISOTROPY){
+ // full anisotropic case, stress calculations
+ compute_element_cm_aniso(offset,
+ d_c11store,d_c12store,d_c13store,d_c14store,d_c15store,d_c16store,d_c22store,
+ d_c23store,d_c24store,d_c25store,d_c26store,d_c33store,d_c34store,d_c35store,
+ d_c36store,d_c44store,d_c45store,d_c46store,d_c55store,d_c56store,d_c66store,
+ ATTENUATION,
+ minus_sum_beta,
+ duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl,
+ duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl,
+ &sigma_xx,&sigma_yy,&sigma_zz,
+ &sigma_xy,&sigma_xz,&sigma_yz);
+ }else{
+ if( ! d_ispec_is_tiso[working_element] ){
+ // isotropic case
+ compute_element_cm_iso(offset,
+ d_kappavstore,d_muvstore,
+ ATTENUATION,
+ one_minus_sum_beta_use,
+ duxdxl,duydyl,duzdzl,
+ duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl,
+ duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl,
+ &sigma_xx,&sigma_yy,&sigma_zz,
+ &sigma_xy,&sigma_xz,&sigma_yz);
}else{
- if( ! d_ispec_is_tiso[working_element] ){
- // isotropic case
- compute_element_cm_iso(offset,
+ // transverse isotropy
+ compute_element_cm_tiso(offset,
d_kappavstore,d_muvstore,
+ d_kappahstore,d_muhstore,d_eta_anisostore,
ATTENUATION,
one_minus_sum_beta_use,
- duxdxl,duydyl,duzdzl,
- duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl,
+ duxdxl,duxdyl,duxdzl,
+ duydxl,duydyl,duydzl,
+ duzdxl,duzdyl,duzdzl,
duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl,
+ iglob, NGLOB,
+ d_ystore,d_zstore,
&sigma_xx,&sigma_yy,&sigma_zz,
&sigma_xy,&sigma_xz,&sigma_yz);
- }else{
- // transverse isotropy
- compute_element_cm_tiso(offset,
- d_kappavstore,d_muvstore,
- d_kappahstore,d_muhstore,d_eta_anisostore,
- ATTENUATION,
- one_minus_sum_beta_use,
- duxdxl,duxdyl,duxdzl,
- duydxl,duydyl,duydzl,
- duzdxl,duzdyl,duzdzl,
- duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl,
- duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl,
- iglob, NGLOB,
- d_ystore,d_zstore,
- &sigma_xx,&sigma_yy,&sigma_zz,
- &sigma_xy,&sigma_xz,&sigma_yz);
- }
- } // ! end of test whether isotropic or anisotropic element
+ }
+ } // ! end of test whether isotropic or anisotropic element
- if(ATTENUATION && (! USE_ATTENUATION_MIMIC ) ){
- // subtracts memory variables if attenuation
- compute_element_cm_att_stress(tx,working_element,
- R_xx,R_yy,R_xy,R_xz,R_yz,
- &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_xz,&sigma_yz);
- }
+ if(ATTENUATION && (! USE_ATTENUATION_MIMIC ) ){
+ // subtracts memory variables if attenuation
+ compute_element_cm_att_stress(tx,working_element,
+ R_xx,R_yy,R_xy,R_xz,R_yz,
+ &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_xz,&sigma_yz);
+ }
- // define symmetric components (needed for non-symmetric dot product and sigma for gravity)
- sigma_yx = sigma_xy;
- sigma_zx = sigma_xz;
- sigma_zy = sigma_yz;
+ // define symmetric components (needed for non-symmetric dot product and sigma for gravity)
+ sigma_yx = sigma_xy;
+ sigma_zx = sigma_xz;
+ sigma_zy = sigma_yz;
- // jacobian
- jacobianl = 1.0f / (xixl*(etayl*gammazl-etazl*gammayl)
- -xiyl*(etaxl*gammazl-etazl*gammaxl)
- +xizl*(etaxl*gammayl-etayl*gammaxl));
+ // jacobian
+ jacobianl = 1.0f / (xixl*(etayl*gammazl-etazl*gammayl)
+ -xiyl*(etaxl*gammazl-etazl*gammaxl)
+ +xizl*(etaxl*gammayl-etayl*gammaxl));
- if( GRAVITY ){
- // computes non-symmetric terms for gravity
- compute_element_cm_gravity(tx,working_element,
- d_ibool,d_xstore,d_ystore,d_zstore,
- d_minus_gravity_table,d_minus_deriv_gravity_table,d_density_table,
- wgll_cube,jacobianl,
- s_dummyx_loc,s_dummyy_loc,s_dummyz_loc,
- &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_yx,
- &sigma_xz,&sigma_zx,&sigma_yz,&sigma_zy,
- &rho_s_H1,&rho_s_H2,&rho_s_H3);
- }
+ if( GRAVITY ){
+ // computes non-symmetric terms for gravity
+ compute_element_cm_gravity(tx,working_element,
+ d_ibool,d_xstore,d_ystore,d_zstore,
+ d_minus_gravity_table,d_minus_deriv_gravity_table,d_density_table,
+ wgll_cube,jacobianl,
+ s_dummyx_loc,s_dummyy_loc,s_dummyz_loc,
+ &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_yx,
+ &sigma_xz,&sigma_zx,&sigma_yz,&sigma_zy,
+ &rho_s_H1,&rho_s_H2,&rho_s_H3);
+ }
- // form dot product with test vector, non-symmetric form
- s_tempx1[tx] = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl);
- s_tempy1[tx] = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl);
- s_tempz1[tx] = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl);
+ // form dot product with test vector, non-symmetric form
+ s_tempx1[tx] = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl);
+ s_tempy1[tx] = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl);
+ s_tempz1[tx] = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl);
- s_tempx2[tx] = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl);
- s_tempy2[tx] = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl);
- s_tempz2[tx] = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl);
+ s_tempx2[tx] = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl);
+ s_tempy2[tx] = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl);
+ s_tempz2[tx] = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl);
- s_tempx3[tx] = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl);
- s_tempy3[tx] = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl);
- s_tempz3[tx] = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl);
+ s_tempx3[tx] = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl);
+ s_tempy3[tx] = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl);
+ s_tempz3[tx] = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl);
- }
+ }
// synchronize all the threads (one thread for each of the NGLL grid points of the
// current spectral element) because we need the whole element to be ready in order
// to be able to compute the matrix products along cut planes of the 3D element below
- __syncthreads();
+ __syncthreads();
- if (active) {
+ if (active) {
#ifndef MANUALLY_UNROLLED_LOOPS
- tempx1l = 0.f;
- tempy1l = 0.f;
- tempz1l = 0.f;
+ tempx1l = 0.f;
+ tempy1l = 0.f;
+ tempz1l = 0.f;
- tempx2l = 0.f;
- tempy2l = 0.f;
- tempz2l = 0.f;
+ tempx2l = 0.f;
+ tempy2l = 0.f;
+ tempz2l = 0.f;
- tempx3l = 0.f;
- tempy3l = 0.f;
- tempz3l = 0.f;
+ tempx3l = 0.f;
+ tempy3l = 0.f;
+ tempz3l = 0.f;
- for (l=0;l<NGLLX;l++) {
+ for (l=0;l<NGLLX;l++) {
- fac1 = d_hprimewgll_xx[I*NGLLX+l];
- offset = K*NGLL2+J*NGLLX+l;
- tempx1l += s_tempx1[offset]*fac1;
- tempy1l += s_tempy1[offset]*fac1;
- tempz1l += s_tempz1[offset]*fac1;
+ fac1 = d_hprimewgll_xx[I*NGLLX+l];
+ tempx1l += s_tempx1[K*NGLL2+J*NGLLX+l]*fac1;
+ tempy1l += s_tempy1[K*NGLL2+J*NGLLX+l]*fac1;
+ tempz1l += s_tempz1[K*NGLL2+J*NGLLX+l]*fac1;
- fac2 = d_hprimewgll_yy[J*NGLLX+l];
- offset = K*NGLL2+l*NGLLX+I;
- tempx2l += s_tempx2[offset]*fac2;
- tempy2l += s_tempy2[offset]*fac2;
- tempz2l += s_tempz2[offset]*fac2;
+ fac2 = d_hprimewgll_yy[J*NGLLX+l];
+ tempx2l += s_tempx2[K*NGLL2+l*NGLLX+I]*fac2;
+ tempy2l += s_tempy2[K*NGLL2+l*NGLLX+I]*fac2;
+ tempz2l += s_tempz2[K*NGLL2+l*NGLLX+I]*fac2;
- fac3 = d_hprimewgll_zz[K*NGLLX+l];
- offset = l*NGLL2+J*NGLLX+I;
- tempx3l += s_tempx3[offset]*fac3;
- tempy3l += s_tempy3[offset]*fac3;
- tempz3l += s_tempz3[offset]*fac3;
+ fac3 = d_hprimewgll_zz[K*NGLLX+l];
+ tempx3l += s_tempx3[l*NGLL2+J*NGLLX+I]*fac3;
+ tempy3l += s_tempy3[l*NGLL2+J*NGLLX+I]*fac3;
+ tempz3l += s_tempz3[l*NGLL2+J*NGLLX+I]*fac3;
- }
+ }
#else
- tempx1l = s_tempx1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
- + s_tempx1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
- + s_tempx1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
- + s_tempx1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
- + s_tempx1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+ tempx1l = s_tempx1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+ + s_tempx1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+ + s_tempx1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+ + s_tempx1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+ + s_tempx1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
- tempy1l = s_tempy1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
- + s_tempy1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
- + s_tempy1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
- + s_tempy1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
- + s_tempy1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+ tempy1l = s_tempy1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+ + s_tempy1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+ + s_tempy1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+ + s_tempy1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+ + s_tempy1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
- tempz1l = s_tempz1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
- + s_tempz1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
- + s_tempz1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
- + s_tempz1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
- + s_tempz1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+ tempz1l = s_tempz1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+ + s_tempz1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+ + s_tempz1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+ + s_tempz1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+ + s_tempz1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
- tempx2l = s_tempx2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
- + s_tempx2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
- + s_tempx2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
- + s_tempx2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
- + s_tempx2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
+ tempx2l = s_tempx2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
+ + s_tempx2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
+ + s_tempx2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
+ + s_tempx2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
+ + s_tempx2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
- tempy2l = s_tempy2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
- + s_tempy2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
- + s_tempy2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
- + s_tempy2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
- + s_tempy2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
+ tempy2l = s_tempy2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
+ + s_tempy2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
+ + s_tempy2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
+ + s_tempy2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
+ + s_tempy2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
- tempz2l = s_tempz2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
- + s_tempz2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
- + s_tempz2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
- + s_tempz2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
- + s_tempz2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
+ tempz2l = s_tempz2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
+ + s_tempz2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
+ + s_tempz2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
+ + s_tempz2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
+ + s_tempz2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
- tempx3l = s_tempx3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
- + s_tempx3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
- + s_tempx3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
- + s_tempx3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
- + s_tempx3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
+ tempx3l = s_tempx3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
+ + s_tempx3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
+ + s_tempx3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
+ + s_tempx3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
+ + s_tempx3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
- tempy3l = s_tempy3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
- + s_tempy3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
- + s_tempy3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
- + s_tempy3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
- + s_tempy3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
+ tempy3l = s_tempy3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
+ + s_tempy3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
+ + s_tempy3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
+ + s_tempy3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
+ + s_tempy3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
- tempz3l = s_tempz3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
- + s_tempz3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
- + s_tempz3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
- + s_tempz3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
- + s_tempz3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
+ tempz3l = s_tempz3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
+ + s_tempz3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
+ + s_tempz3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
+ + s_tempz3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
+ + s_tempz3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
#endif
- fac1 = d_wgllwgll_yz[K*NGLLX+J];
- fac2 = d_wgllwgll_xz[K*NGLLX+I];
- fac3 = d_wgllwgll_xy[J*NGLLX+I];
+ fac1 = d_wgllwgll_yz[K*NGLLX+J];
+ fac2 = d_wgllwgll_xz[K*NGLLX+I];
+ fac3 = d_wgllwgll_xy[J*NGLLX+I];
- sum_terms1 = - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
- sum_terms2 = - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
- sum_terms3 = - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
+ sum_terms1 = - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
+ sum_terms2 = - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
+ sum_terms3 = - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
- // adds gravity term
- if( GRAVITY ){
- sum_terms1 += rho_s_H1;
- sum_terms2 += rho_s_H2;
- sum_terms3 += rho_s_H3;
- }
+ // adds gravity term
+ if( GRAVITY ){
+ sum_terms1 += rho_s_H1;
+ sum_terms2 += rho_s_H2;
+ sum_terms3 += rho_s_H3;
+ }
-#ifdef USE_TEXTURES
- d_accel[iglob] = tex1Dfetch(tex_accel, iglob) + sum_terms1 ;
- d_accel[iglob + NGLOB] = tex1Dfetch(tex_accel, iglob + NGLOB) + sum_terms2 ;
- d_accel[iglob + 2*NGLOB] = tex1Dfetch(tex_accel, iglob + 2*NGLOB) + sum_terms3 ;
+
+#ifdef USE_MESH_COLORING_GPU
+ // no atomic operation needed, colors don't share global points between elements
+
+#ifdef USE_TEXTURES_FIELDS
+ d_accel[iglob*3] = tex1Dfetch(d_accel_cm_tex, iglob*3) + sum_terms1;
+ d_accel[iglob*3 + 1] = tex1Dfetch(d_accel_cm_tex, iglob*3 + 1) + sum_terms2;
+ d_accel[iglob*3 + 2] = tex1Dfetch(d_accel_cm_tex, iglob*3 + 2) + sum_terms3;
#else
- /* OLD/To be implemented version that uses coloring to get around race condition. About 1.6x faster */
+ d_accel[iglob*3] += sum_terms1;
+ d_accel[iglob*3 + 1] += sum_terms2;
+ d_accel[iglob*3 + 2] += sum_terms3;
+#endif // USE_TEXTURES_FIELDS
+#else // MESH_COLORING
-#ifdef USE_MESH_COLORING_GPU
+ //mesh coloring
+ if( use_mesh_coloring_gpu ){
+
// no atomic operation needed, colors don't share global points between elements
+#ifdef USE_TEXTURES_FIELDS
+ d_accel[iglob*3] = tex1Dfetch(d_accel_cm_tex, iglob*3) + sum_terms1;
+ d_accel[iglob*3 + 1] = tex1Dfetch(d_accel_cm_tex, iglob*3 + 1) + sum_terms2;
+ d_accel[iglob*3 + 2] = tex1Dfetch(d_accel_cm_tex, iglob*3 + 2) + sum_terms3;
+#else
d_accel[iglob*3] += sum_terms1;
d_accel[iglob*3 + 1] += sum_terms2;
d_accel[iglob*3 + 2] += sum_terms3;
-#else
- //mesh coloring
- if( use_mesh_coloring_gpu ){
+#endif // USE_TEXTURES_FIELDS
- // no atomic operation needed, colors don't share global points between elements
- d_accel[iglob*3] += sum_terms1;
- d_accel[iglob*3 + 1] += sum_terms2;
- d_accel[iglob*3 + 2] += sum_terms3;
+ }else{
- }else{
+ // for testing purposes only: w/out atomic updates
+ //d_accel[iglob*3] -= (0.00000001f*tempx1l + 0.00000001f*tempx2l + 0.00000001f*tempx3l);
+ //d_accel[iglob*3 + 1] -= (0.00000001f*tempy1l + 0.00000001f*tempy2l + 0.00000001f*tempy3l);
+ //d_accel[iglob*3 + 2] -= (0.00000001f*tempz1l + 0.00000001f*tempz2l + 0.00000001f*tempz3l);
- // for testing purposes only: w/out atomic updates
- //d_accel[iglob*3] -= (0.00000001f*tempx1l + 0.00000001f*tempx2l + 0.00000001f*tempx3l);
- //d_accel[iglob*3 + 1] -= (0.00000001f*tempy1l + 0.00000001f*tempy2l + 0.00000001f*tempy3l);
- //d_accel[iglob*3 + 2] -= (0.00000001f*tempz1l + 0.00000001f*tempz2l + 0.00000001f*tempz3l);
+ atomicAdd(&d_accel[iglob*3], sum_terms1);
+ atomicAdd(&d_accel[iglob*3+1], sum_terms2);
+ atomicAdd(&d_accel[iglob*3+2], sum_terms3);
- atomicAdd(&d_accel[iglob*3], sum_terms1);
- atomicAdd(&d_accel[iglob*3+1], sum_terms2);
- atomicAdd(&d_accel[iglob*3+2], sum_terms3);
+ }
+#endif // MESH_COLORING
- }
-#endif
-
-#endif
-
- // update memory variables based upon the Runge-Kutta scheme
- if( ATTENUATION && ( ! USE_ATTENUATION_MIMIC ) ){
- compute_element_cm_att_memory(tx,working_element,
- d_muvstore,
- factor_common,alphaval,betaval,gammaval,
- R_xx,R_yy,R_xy,R_xz,R_yz,
- epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz,
- epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc,
- ANISOTROPY,d_c44store,ATTENUATION_3D);
- }
-
- // save deviatoric strain for Runge-Kutta scheme
- if( COMPUTE_AND_STORE_STRAIN ){
- int ijk_ispec = tx + working_element*NGLL3;
-
- // fortran: epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
- epsilondev_xx[ijk_ispec] = epsilondev_xx_loc;
- epsilondev_yy[ijk_ispec] = epsilondev_yy_loc;
- epsilondev_xy[ijk_ispec] = epsilondev_xy_loc;
- epsilondev_xz[ijk_ispec] = epsilondev_xz_loc;
- epsilondev_yz[ijk_ispec] = epsilondev_yz_loc;
- }
-
+ // update memory variables based upon the Runge-Kutta scheme
+ if( ATTENUATION && ( ! USE_ATTENUATION_MIMIC ) ){
+ compute_element_cm_att_memory(tx,working_element,
+ d_muvstore,
+ factor_common,alphaval,betaval,gammaval,
+ R_xx,R_yy,R_xy,R_xz,R_yz,
+ epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz,
+ epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc,
+ ANISOTROPY,d_c44store,ATTENUATION_3D);
}
-#else // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
- d_accel[iglob] -= 0.00000001f;
- d_accel[iglob + NGLOB] -= 0.00000001f;
- d_accel[iglob + 2*NGLOB] -= 0.00000001f;
-#endif // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
-
+ // save deviatoric strain for Runge-Kutta scheme
+ if( COMPUTE_AND_STORE_STRAIN ){
+ // fortran: epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+ epsilondev_xx[tx + working_element*NGLL3] = epsilondev_xx_loc;
+ epsilondev_yy[tx + working_element*NGLL3] = epsilondev_yy_loc;
+ epsilondev_xy[tx + working_element*NGLL3] = epsilondev_xy_loc;
+ epsilondev_xz[tx + working_element*NGLL3] = epsilondev_xz_loc;
+ epsilondev_yz[tx + working_element*NGLL3] = epsilondev_yz_loc;
+ }
+ }
}
+
/* ----------------------------------------------------------------------------------------------- */
void Kernel_2_crust_mantle(int nb_blocks_to_compute,Mesh* mp,
@@ -1707,98 +1482,104 @@
// cudaEventRecord( start, 0 );
Kernel_2_crust_mantle_impl<<<grid,threads>>>(nb_blocks_to_compute,
- mp->NGLOB_CRUST_MANTLE,
- d_ibool,
- d_ispec_is_tiso,
- mp->d_phase_ispec_inner_crust_mantle,
- mp->num_phase_ispec_crust_mantle,
- d_iphase,
- d_deltat,
- mp->use_mesh_coloring_gpu,
- mp->d_displ_crust_mantle,
- mp->d_veloc_crust_mantle,
- mp->d_accel_crust_mantle,
- d_xix, d_xiy, d_xiz,
- d_etax, d_etay, d_etaz,
- d_gammax, d_gammay, d_gammaz,
- d_kappavstore, d_muvstore,
- d_kappahstore, d_muhstore,
- d_eta_anisostore,
- mp->compute_and_store_strain,
- d_epsilondev_xx,d_epsilondev_yy,d_epsilondev_xy,
- d_epsilondev_xz,d_epsilondev_yz,
- d_epsilon_trace_over_3,
- mp->simulation_type,
- mp->attenuation,
- mp->attenuation_new,
- mp->use_attenuation_mimic,
- mp->attenuation_3D,
- d_one_minus_sum_beta,d_factor_common,
- d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
- mp->d_alphaval,mp->d_betaval,mp->d_gammaval,
- mp->anisotropic_3D_mantle,
- d_c11store,d_c12store,d_c13store,
- d_c14store,d_c15store,d_c16store,
- d_c22store,d_c23store,d_c24store,
- d_c25store,d_c26store,d_c33store,
- d_c34store,d_c35store,d_c36store,
- d_c44store,d_c45store,d_c46store,
- d_c55store,d_c56store,d_c66store,
- mp->gravity,
- mp->d_xstore_crust_mantle,mp->d_ystore_crust_mantle,mp->d_zstore_crust_mantle,
- mp->d_minus_gravity_table,
- mp->d_minus_deriv_gravity_table,
- mp->d_density_table,
- mp->d_wgll_cube,
- mp->NSPEC_CRUST_MANTLE_STRAIN_ONLY);
+ mp->NGLOB_CRUST_MANTLE,
+ d_ibool,
+ d_ispec_is_tiso,
+ mp->d_phase_ispec_inner_crust_mantle,
+ mp->num_phase_ispec_crust_mantle,
+ d_iphase,
+ d_deltat,
+ mp->use_mesh_coloring_gpu,
+ mp->d_displ_crust_mantle,
+ mp->d_veloc_crust_mantle,
+ mp->d_accel_crust_mantle,
+ d_xix, d_xiy, d_xiz,
+ d_etax, d_etay, d_etaz,
+ d_gammax, d_gammay, d_gammaz,
+ mp->d_hprime_xx,
+ mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
+ mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+ d_kappavstore, d_muvstore,
+ d_kappahstore, d_muhstore,
+ d_eta_anisostore,
+ mp->compute_and_store_strain,
+ d_epsilondev_xx,d_epsilondev_yy,d_epsilondev_xy,
+ d_epsilondev_xz,d_epsilondev_yz,
+ d_epsilon_trace_over_3,
+ mp->simulation_type,
+ mp->attenuation,
+ mp->attenuation_new,
+ mp->use_attenuation_mimic,
+ mp->attenuation_3D,
+ d_one_minus_sum_beta,d_factor_common,
+ d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
+ mp->d_alphaval,mp->d_betaval,mp->d_gammaval,
+ mp->anisotropic_3D_mantle,
+ d_c11store,d_c12store,d_c13store,
+ d_c14store,d_c15store,d_c16store,
+ d_c22store,d_c23store,d_c24store,
+ d_c25store,d_c26store,d_c33store,
+ d_c34store,d_c35store,d_c36store,
+ d_c44store,d_c45store,d_c46store,
+ d_c55store,d_c56store,d_c66store,
+ mp->gravity,
+ mp->d_xstore_crust_mantle,mp->d_ystore_crust_mantle,mp->d_zstore_crust_mantle,
+ mp->d_minus_gravity_table,
+ mp->d_minus_deriv_gravity_table,
+ mp->d_density_table,
+ mp->d_wgll_cube,
+ mp->NSPEC_CRUST_MANTLE_STRAIN_ONLY);
if(mp->simulation_type == 3) {
Kernel_2_crust_mantle_impl<<< grid,threads>>>(nb_blocks_to_compute,
- mp->NGLOB_CRUST_MANTLE,
- d_ibool,
- d_ispec_is_tiso,
- mp->d_phase_ispec_inner_crust_mantle,
- mp->num_phase_ispec_crust_mantle,
- d_iphase,
- d_deltat,
- mp->use_mesh_coloring_gpu,
- mp->d_b_displ_crust_mantle,
- mp->d_b_veloc_crust_mantle,
- mp->d_b_accel_crust_mantle,
- d_xix, d_xiy, d_xiz,
- d_etax, d_etay, d_etaz,
- d_gammax, d_gammay, d_gammaz,
- d_kappavstore, d_muvstore,
- d_kappahstore, d_muhstore,
- d_eta_anisostore,
- mp->compute_and_store_strain,
- d_b_epsilondev_xx,d_b_epsilondev_yy,d_b_epsilondev_xy,
- d_b_epsilondev_xz,d_b_epsilondev_yz,
- d_b_epsilon_trace_over_3,
- mp->simulation_type,
- mp->attenuation,
- mp->attenuation_new,
- mp->use_attenuation_mimic,
- mp->attenuation_3D,
- d_one_minus_sum_beta,d_factor_common,
- d_b_R_xx,d_b_R_yy,d_b_R_xy,d_b_R_xz,d_b_R_yz,
- mp->d_b_alphaval,mp->d_b_betaval,mp->d_b_gammaval,
- mp->anisotropic_3D_mantle,
- d_c11store,d_c12store,d_c13store,
- d_c14store,d_c15store,d_c16store,
- d_c22store,d_c23store,d_c24store,
- d_c25store,d_c26store,d_c33store,
- d_c34store,d_c35store,d_c36store,
- d_c44store,d_c45store,d_c46store,
- d_c55store,d_c56store,d_c66store,
- mp->gravity,
- mp->d_xstore_crust_mantle,mp->d_ystore_crust_mantle,mp->d_zstore_crust_mantle,
- mp->d_minus_gravity_table,
- mp->d_minus_deriv_gravity_table,
- mp->d_density_table,
- mp->d_wgll_cube,
- mp->NSPEC_CRUST_MANTLE_STRAIN_ONLY);
+ mp->NGLOB_CRUST_MANTLE,
+ d_ibool,
+ d_ispec_is_tiso,
+ mp->d_phase_ispec_inner_crust_mantle,
+ mp->num_phase_ispec_crust_mantle,
+ d_iphase,
+ d_deltat,
+ mp->use_mesh_coloring_gpu,
+ mp->d_b_displ_crust_mantle,
+ mp->d_b_veloc_crust_mantle,
+ mp->d_b_accel_crust_mantle,
+ d_xix, d_xiy, d_xiz,
+ d_etax, d_etay, d_etaz,
+ d_gammax, d_gammay, d_gammaz,
+ mp->d_hprime_xx,
+ mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
+ mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+ d_kappavstore, d_muvstore,
+ d_kappahstore, d_muhstore,
+ d_eta_anisostore,
+ mp->compute_and_store_strain,
+ d_b_epsilondev_xx,d_b_epsilondev_yy,d_b_epsilondev_xy,
+ d_b_epsilondev_xz,d_b_epsilondev_yz,
+ d_b_epsilon_trace_over_3,
+ mp->simulation_type,
+ mp->attenuation,
+ mp->attenuation_new,
+ mp->use_attenuation_mimic,
+ mp->attenuation_3D,
+ d_one_minus_sum_beta,d_factor_common,
+ d_b_R_xx,d_b_R_yy,d_b_R_xy,d_b_R_xz,d_b_R_yz,
+ mp->d_b_alphaval,mp->d_b_betaval,mp->d_b_gammaval,
+ mp->anisotropic_3D_mantle,
+ d_c11store,d_c12store,d_c13store,
+ d_c14store,d_c15store,d_c16store,
+ d_c22store,d_c23store,d_c24store,
+ d_c25store,d_c26store,d_c33store,
+ d_c34store,d_c35store,d_c36store,
+ d_c44store,d_c45store,d_c46store,
+ d_c55store,d_c56store,d_c66store,
+ mp->gravity,
+ mp->d_xstore_crust_mantle,mp->d_ystore_crust_mantle,mp->d_zstore_crust_mantle,
+ mp->d_minus_gravity_table,
+ mp->d_minus_deriv_gravity_table,
+ mp->d_density_table,
+ mp->d_wgll_cube,
+ mp->NSPEC_CRUST_MANTLE_STRAIN_ONLY);
}
// cudaEventRecord( stop, 0 );
@@ -1876,7 +1657,7 @@
if( mp->attenuation_3D ){
color_offset_nonpadded_att2 = (mp->nspec_outer_crust_mantle) * NGLL3 * N_SLS;
}else{
- color_offset_nonpadded_att2 = (mp->nspec_outer_crust_mantle) * 1 * N_SLS;
+ color_offset_nonpadded_att2 = (mp->nspec_outer_crust_mantle) * 1 * N_SLS;
}
color_offset_ispec = mp->nspec_outer_crust_mantle;
}
@@ -1966,7 +1747,7 @@
if( mp->attenuation_3D ){
color_offset_nonpadded_att2 += nb_blocks_to_compute * NGLL3 * N_SLS;
}else{
- color_offset_nonpadded_att2 += nb_blocks_to_compute * 1 * N_SLS;
+ color_offset_nonpadded_att2 += nb_blocks_to_compute * 1 * N_SLS;
}
// for array(ispec)
color_offset_ispec += nb_blocks_to_compute;
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu 2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_inner_core_cuda.cu 2012-07-23 21:58:26 UTC (rev 20536)
@@ -37,6 +37,16 @@
#include "config.h"
#include "mesh_constants_cuda.h"
+#ifdef USE_TEXTURES_FIELDS
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_displ_ic_tex;
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_accel_ic_tex;
+#endif
+
+#ifdef USE_TEXTURES_CONSTANTS
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_hprime_xx_ic_tex;
+#endif
+
+
/* ----------------------------------------------------------------------------------------------- */
// elemental routines
@@ -51,15 +61,15 @@
realw* R_xy,
realw* R_xz,
realw* R_yz,
- reald* sigma_xx,
- reald* sigma_yy,
- reald* sigma_zz,
- reald* sigma_xy,
- reald* sigma_xz,
- reald* sigma_yz) {
+ realw* sigma_xx,
+ realw* sigma_yy,
+ realw* sigma_zz,
+ realw* sigma_xy,
+ realw* sigma_xz,
+ realw* sigma_yz) {
int i_sls,offset;
- reald R_xx_val,R_yy_val;
+ realw R_xx_val,R_yy_val;
for(i_sls = 0; i_sls < N_SLS; i_sls++){
// index
@@ -91,24 +101,19 @@
realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
realw* epsilondev_xz,realw* epsilondev_yz,
- reald epsilondev_xx_loc,reald epsilondev_yy_loc,reald epsilondev_xy_loc,
- reald epsilondev_xz_loc,reald epsilondev_yz_loc,
+ realw epsilondev_xx_loc,realw epsilondev_yy_loc,realw epsilondev_xy_loc,
+ realw epsilondev_xz_loc,realw epsilondev_yz_loc,
int ATTENUATION_3D
){
int i_sls;
- int ijk_ispec;
- int offset_align,offset;
- reald mul;
- reald alphaval_loc,betaval_loc,gammaval_loc;
- reald factor_loc,Sn,Snp1;
+ int offset;
+ realw mul;
+ realw alphaval_loc,betaval_loc,gammaval_loc;
+ realw factor_loc,Sn,Snp1;
- // indices
- offset_align = tx + NGLL3_PADDED * working_element;
- ijk_ispec = tx + NGLL3 * working_element;
+ mul = d_muv[tx + NGLL3_PADDED * working_element];
- mul = d_muv[offset_align];
-
// use Runge-Kutta scheme to march in time
for(i_sls = 0; i_sls < N_SLS; i_sls++){
@@ -128,28 +133,28 @@
gammaval_loc = gammaval[i_sls];
// term in xx
- Sn = factor_loc * epsilondev_xx[ijk_ispec]; //(i,j,k,ispec)
+ Sn = factor_loc * epsilondev_xx[tx + NGLL3 * working_element]; //(i,j,k,ispec)
Snp1 = factor_loc * epsilondev_xx_loc; //(i,j,k)
R_xx[offset] = alphaval_loc * R_xx[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
// term in yy
- Sn = factor_loc * epsilondev_yy[ijk_ispec];
+ Sn = factor_loc * epsilondev_yy[tx + NGLL3 * working_element];
Snp1 = factor_loc * epsilondev_yy_loc;
R_yy[offset] = alphaval_loc * R_yy[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
// term in zz not computed since zero trace
// term in xy
- Sn = factor_loc * epsilondev_xy[ijk_ispec];
+ Sn = factor_loc * epsilondev_xy[tx + NGLL3 * working_element];
Snp1 = factor_loc * epsilondev_xy_loc;
R_xy[offset] = alphaval_loc * R_xy[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
// term in xz
- Sn = factor_loc * epsilondev_xz[ijk_ispec];
+ Sn = factor_loc * epsilondev_xz[tx + NGLL3 * working_element];
Snp1 = factor_loc * epsilondev_xz_loc;
R_xz[offset] = alphaval_loc * R_xz[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
// term in yz
- Sn = factor_loc * epsilondev_yz[ijk_ispec];
+ Sn = factor_loc * epsilondev_yz[tx + NGLL3 * working_element];
Snp1 = factor_loc * epsilondev_yz_loc;
R_yz[offset] = alphaval_loc * R_yz[offset] + betaval_loc * Sn + gammaval_loc * Snp1;
}
@@ -167,40 +172,40 @@
realw* d_minus_deriv_gravity_table,
realw* d_density_table,
realw* wgll_cube,
- reald jacobianl,
- reald* s_dummyx_loc,
- reald* s_dummyy_loc,
- reald* s_dummyz_loc,
- reald* sigma_xx,
- reald* sigma_yy,
- reald* sigma_zz,
- reald* sigma_xy,
- reald* sigma_yx,
- reald* sigma_xz,
- reald* sigma_zx,
- reald* sigma_yz,
- reald* sigma_zy,
- reald* rho_s_H1,
- reald* rho_s_H2,
- reald* rho_s_H3){
+ realw jacobianl,
+ realw* s_dummyx_loc,
+ realw* s_dummyy_loc,
+ realw* s_dummyz_loc,
+ realw* sigma_xx,
+ realw* sigma_yy,
+ realw* sigma_zz,
+ realw* sigma_xy,
+ realw* sigma_yx,
+ realw* sigma_xz,
+ realw* sigma_zx,
+ realw* sigma_yz,
+ realw* sigma_zy,
+ realw* rho_s_H1,
+ realw* rho_s_H2,
+ realw* rho_s_H3){
- reald radius,theta,phi;
- reald cos_theta,sin_theta,cos_phi,sin_phi;
- reald minus_g,minus_dg;
- reald rho;
- reald gxl,gyl,gzl;
- reald minus_g_over_radius,minus_dg_plus_g_over_radius;
- reald cos_theta_sq,sin_theta_sq,cos_phi_sq,sin_phi_sq;
- reald Hxxl,Hyyl,Hzzl,Hxyl,Hxzl,Hyzl;
- reald sx_l,sy_l,sz_l;
- reald factor;
+ realw radius,theta,phi;
+ realw cos_theta,sin_theta,cos_phi,sin_phi;
+ realw minus_g,minus_dg;
+ realw rho;
+ realw gxl,gyl,gzl;
+ realw minus_g_over_radius,minus_dg_plus_g_over_radius;
+ realw cos_theta_sq,sin_theta_sq,cos_phi_sq,sin_phi_sq;
+ realw Hxxl,Hyyl,Hzzl,Hxyl,Hxzl,Hyzl;
+ realw sx_l,sy_l,sz_l;
+ realw factor;
// R_EARTH_KM is the radius of the bottom of the oceans
- const reald R_EARTH = 6371000.0f; // in m
- const reald R_EARTH_KM = 6371.0f; // in km
+ //const realw R_EARTH = 6371000.0f; // in m
+ //const realw R_EARTH_KM = 6371.0f; // in km
// uncomment line below for PREM with oceans
- //const reald R_EARTH = 6368000.0f;
- //const reald R_EARTH_KM = 6368.0f;
+ //const realw R_EARTH = 6368000.0f;
+ //const realw R_EARTH_KM = 6368.0f;
// compute non-symmetric terms for gravity
@@ -211,7 +216,7 @@
radius = d_xstore[iglob];
// make sure radius is never zero even for points at center of cube
// because we later divide by radius
- if(radius < 100.f / R_EARTH){ radius = 100.f / R_EARTH; }
+ if(radius < 100.f / (R_EARTH_KM*1000.0f)){ radius = 100.f / (R_EARTH_KM*1000.0f); }
theta = d_ystore[iglob];
phi = d_zstore[iglob];
@@ -310,7 +315,7 @@
realw* d_xix, realw* d_xiy, realw* d_xiz,
realw* d_etax, realw* d_etay, realw* d_etaz,
realw* d_gammax, realw* d_gammay, realw* d_gammaz,
- realw* d_hprime_xx, realw* d_hprime_yy, realw* d_hprime_zz,
+ realw* d_hprime_xx,
realw* d_hprimewgll_xx, realw* d_hprimewgll_yy, realw* d_hprimewgll_zz,
realw* d_wgllwgll_xy,realw* d_wgllwgll_xz,realw* d_wgllwgll_yz,
realw* d_kappav,
@@ -342,10 +347,6 @@
/* int bx = blockIdx.x; */
int tx = threadIdx.x;
- //const int NGLLX = 5;
- // const int NGLL2 = 25;
- //const int NGLL3 = NGLL3;
- const int NGLL3_ALIGN = NGLL3_PADDED;
const int IFLAG_IN_FICTITIOUS_CUBE = 11; // from constants.h
int K = (tx/NGLL2);
@@ -356,697 +357,700 @@
int iglob = 0;
int working_element;
- reald tempx1l,tempx2l,tempx3l,tempy1l,tempy2l,tempy3l,tempz1l,tempz2l,tempz3l;
- reald xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
- reald duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl;
- reald duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl;
- reald duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl;
+ realw tempx1l,tempx2l,tempx3l,tempy1l,tempy2l,tempy3l,tempz1l,tempz2l,tempz3l;
+ realw xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
+ realw duxdxl,duxdyl,duxdzl,duydxl,duydyl,duydzl,duzdxl,duzdyl,duzdzl;
+ realw duxdxl_plus_duydyl,duxdxl_plus_duzdzl,duydyl_plus_duzdzl;
+ realw duxdyl_plus_duydxl,duzdxl_plus_duxdzl,duzdyl_plus_duydzl;
- reald tempx1l_att,tempx2l_att,tempx3l_att,tempy1l_att,tempy2l_att,tempy3l_att,tempz1l_att,tempz2l_att,tempz3l_att;
- reald duxdxl_att,duxdyl_att,duxdzl_att,duydxl_att,duydyl_att,duydzl_att,duzdxl_att,duzdyl_att,duzdzl_att;
- reald duxdyl_plus_duydxl_att,duzdxl_plus_duxdzl_att,duzdyl_plus_duydzl_att;
+ realw tempx1l_att,tempx2l_att,tempx3l_att,tempy1l_att,tempy2l_att,tempy3l_att,tempz1l_att,tempz2l_att,tempz3l_att;
+ realw duxdxl_att,duxdyl_att,duxdzl_att,duydxl_att,duydyl_att,duydzl_att,duzdxl_att,duzdyl_att,duzdzl_att;
+ realw duxdyl_plus_duydxl_att,duzdxl_plus_duxdzl_att,duzdyl_plus_duydzl_att;
- reald fac1,fac2,fac3;
- reald lambdal,mul,lambdalplus2mul,kappal;
- reald mul_iso,mul_aniso;
- reald sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz;
- reald epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc;
- reald c11,c12,c13,c33,c44;
- reald sum_terms1,sum_terms2,sum_terms3;
+ realw fac1,fac2,fac3;
+ realw lambdal,mul,lambdalplus2mul,kappal;
+ realw mul_iso,mul_aniso;
+ realw sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz;
+ realw epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc;
+ realw c11,c12,c13,c33,c44;
+ realw sum_terms1,sum_terms2,sum_terms3;
// gravity variables
- reald sigma_yx,sigma_zx,sigma_zy;
- reald rho_s_H1,rho_s_H2,rho_s_H3;
+ realw sigma_yx,sigma_zx,sigma_zy;
+ realw rho_s_H1,rho_s_H2,rho_s_H3;
#ifndef MANUALLY_UNROLLED_LOOPS
- int l;
- realw hp1,hp2,hp3;
+ int l;
#endif
- __shared__ reald s_dummyx_loc[NGLL3];
- __shared__ reald s_dummyy_loc[NGLL3];
- __shared__ reald s_dummyz_loc[NGLL3];
+ __shared__ realw s_dummyx_loc[NGLL3];
+ __shared__ realw s_dummyy_loc[NGLL3];
+ __shared__ realw s_dummyz_loc[NGLL3];
- __shared__ reald s_dummyx_loc_att[NGLL3];
- __shared__ reald s_dummyy_loc_att[NGLL3];
- __shared__ reald s_dummyz_loc_att[NGLL3];
+ __shared__ realw s_dummyx_loc_att[NGLL3];
+ __shared__ realw s_dummyy_loc_att[NGLL3];
+ __shared__ realw s_dummyz_loc_att[NGLL3];
- __shared__ reald s_tempx1[NGLL3];
- __shared__ reald s_tempx2[NGLL3];
- __shared__ reald s_tempx3[NGLL3];
- __shared__ reald s_tempy1[NGLL3];
- __shared__ reald s_tempy2[NGLL3];
- __shared__ reald s_tempy3[NGLL3];
- __shared__ reald s_tempz1[NGLL3];
- __shared__ reald s_tempz2[NGLL3];
- __shared__ reald s_tempz3[NGLL3];
+ __shared__ realw s_tempx1[NGLL3];
+ __shared__ realw s_tempx2[NGLL3];
+ __shared__ realw s_tempx3[NGLL3];
+ __shared__ realw s_tempy1[NGLL3];
+ __shared__ realw s_tempy2[NGLL3];
+ __shared__ realw s_tempy3[NGLL3];
+ __shared__ realw s_tempz1[NGLL3];
+ __shared__ realw s_tempz2[NGLL3];
+ __shared__ realw s_tempz3[NGLL3];
+ __shared__ realw sh_hprime_xx[NGLL2];
+
// use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads,
// because we used memory padding from NGLL^3 = 125 to 128 to get coalescent memory accesses
- active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0;
+ active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0;
// copy from global memory to shared memory
// each thread writes one of the NGLL^3 = 125 data points
- if (active) {
+ if (active) {
#ifdef USE_MESH_COLORING_GPU
+ working_element = bx;
+#else
+ //mesh coloring
+ if( use_mesh_coloring_gpu ){
working_element = bx;
-#else
- //mesh coloring
- if( use_mesh_coloring_gpu ){
- working_element = bx;
- }else{
- // iphase-1 and working_element-1 for Fortran->C array conventions
- working_element = d_phase_ispec_inner[bx + num_phase_ispec*(d_iphase-1)]-1;
- }
+ }else{
+ // iphase-1 and working_element-1 for Fortran->C array conventions
+ working_element = d_phase_ispec_inner[bx + num_phase_ispec*(d_iphase-1)]-1;
+ }
#endif
- // exclude fictitious elements in central cube
- if( d_idoubling[working_element] == IFLAG_IN_FICTITIOUS_CUBE ){
- active = 0;
- }else{
- // iglob = d_ibool[working_element*NGLL3_ALIGN + tx]-1;
- iglob = d_ibool[working_element*NGLL3 + tx]-1;
+ // exclude fictitious elements in central cube
+ if( d_idoubling[working_element] == IFLAG_IN_FICTITIOUS_CUBE ){
+ active = 0;
+ }else{
+ // iglob = d_ibool[working_element*NGLL3_PADDED + tx]-1;
+ iglob = d_ibool[working_element*NGLL3 + tx]-1;
-#ifdef USE_TEXTURES
- s_dummyx_loc[tx] = tex1Dfetch(tex_displ, iglob);
- s_dummyy_loc[tx] = tex1Dfetch(tex_displ, iglob + NGLOB);
- s_dummyz_loc[tx] = tex1Dfetch(tex_displ, iglob + 2*NGLOB);
+#ifdef USE_TEXTURES_FIELDS
+ s_dummyx_loc[tx] = tex1Dfetch(d_displ_ic_tex, iglob*3);
+ s_dummyy_loc[tx] = tex1Dfetch(d_displ_ic_tex, iglob*3 + 1);
+ s_dummyz_loc[tx] = tex1Dfetch(d_displ_ic_tex, iglob*3 + 2);
#else
- // changing iglob indexing to match fortran row changes fast style
- s_dummyx_loc[tx] = d_displ[iglob*3];
- s_dummyy_loc[tx] = d_displ[iglob*3 + 1];
- s_dummyz_loc[tx] = d_displ[iglob*3 + 2];
+ // changing iglob indexing to match fortran row changes fast style
+ s_dummyx_loc[tx] = d_displ[iglob*3];
+ s_dummyy_loc[tx] = d_displ[iglob*3 + 1];
+ s_dummyz_loc[tx] = d_displ[iglob*3 + 2];
#endif
- if(ATTENUATION){
- if(ATTENUATION_NEW){
- // takes new routines
- // use first order Taylor expansion of displacement for local storage of stresses
- // at this current time step, to fix attenuation in a consistent way
-#ifdef USE_TEXTURES
- s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * tex1Dfetch(tex_veloc, iglob);
- s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * tex1Dfetch(tex_veloc, iglob + NGLOB);
- s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * tex1Dfetch(tex_veloc, iglob + 2*NGLOB);
+ if(ATTENUATION){
+ if(ATTENUATION_NEW){
+ // takes new routines
+ // use first order Taylor expansion of displacement for local storage of stresses
+ // at this current time step, to fix attenuation in a consistent way
+#ifdef USE_TEXTURES_FIELDS
+ s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * tex1Dfetch(d_displ_ic_tex, iglob*3);
+ s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * tex1Dfetch(d_displ_ic_tex, iglob*3 + 1);
+ s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * tex1Dfetch(d_displ_ic_tex, iglob*3 + 2);
#else
- s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * d_veloc[iglob*3];
- s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * d_veloc[iglob*3 + 1];
- s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * d_veloc[iglob*3 + 2];
+ s_dummyx_loc_att[tx] = s_dummyx_loc[tx] + d_deltat * d_veloc[iglob*3];
+ s_dummyy_loc_att[tx] = s_dummyy_loc[tx] + d_deltat * d_veloc[iglob*3 + 1];
+ s_dummyz_loc_att[tx] = s_dummyz_loc[tx] + d_deltat * d_veloc[iglob*3 + 2];
#endif
- }
- else{
- // takes old routines
- s_dummyx_loc_att[tx] = s_dummyx_loc[tx];
- s_dummyy_loc_att[tx] = s_dummyy_loc[tx];
- s_dummyz_loc_att[tx] = s_dummyz_loc[tx];
- }
}
+ else{
+ // takes old routines
+ s_dummyx_loc_att[tx] = s_dummyx_loc[tx];
+ s_dummyy_loc_att[tx] = s_dummyy_loc[tx];
+ s_dummyz_loc_att[tx] = s_dummyz_loc[tx];
+ }
}
}
+ }
+ if (tx < NGLL2) {
+#ifdef USE_TEXTURES_CONSTANTS
+ sh_hprime_xx[tx] = tex1Dfetch(d_hprime_xx_ic_tex,tx);
+#else
+ sh_hprime_xx[tx] = d_hprime_xx[tx];
+#endif
+ }
+
// synchronize all the threads (one thread for each of the NGLL grid points of the
// current spectral element) because we need the whole element to be ready in order
// to be able to compute the matrix products along cut planes of the 3D element below
- __syncthreads();
+ __syncthreads();
-#ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
+ if (active) {
- if (active) {
-
#ifndef MANUALLY_UNROLLED_LOOPS
- tempx1l = 0.f;
- tempx2l = 0.f;
- tempx3l = 0.f;
+ tempx1l = 0.f;
+ tempx2l = 0.f;
+ tempx3l = 0.f;
- tempy1l = 0.f;
- tempy2l = 0.f;
- tempy3l = 0.f;
+ tempy1l = 0.f;
+ tempy2l = 0.f;
+ tempy3l = 0.f;
- tempz1l = 0.f;
- tempz2l = 0.f;
- tempz3l = 0.f;
+ tempz1l = 0.f;
+ tempz2l = 0.f;
+ tempz3l = 0.f;
- for (l=0;l<NGLLX;l++) {
- hp1 = d_hprime_xx[l*NGLLX+I];
- offset = K*NGLL2+J*NGLLX+l;
- tempx1l += s_dummyx_loc[offset]*hp1;
- tempy1l += s_dummyy_loc[offset]*hp1;
- tempz1l += s_dummyz_loc[offset]*hp1;
+ for (l=0;l<NGLLX;l++) {
+ fac1 = sh_hprime_xx[l*NGLLX+I];
+ offset = K*NGLL2+J*NGLLX+l;
+ tempx1l += s_dummyx_loc[offset]*fac1;
+ tempy1l += s_dummyy_loc[offset]*fac1;
+ tempz1l += s_dummyz_loc[offset]*fac1;
- hp2 = d_hprime_xx[l*NGLLX+J];
- offset = K*NGLL2+l*NGLLX+I;
- tempx2l += s_dummyx_loc[offset]*hp2;
- tempy2l += s_dummyy_loc[offset]*hp2;
- tempz2l += s_dummyz_loc[offset]*hp2;
+ fac2 = sh_hprime_xx[l*NGLLX+J];
+ offset = K*NGLL2+l*NGLLX+I;
+ tempx2l += s_dummyx_loc[offset]*fac2;
+ tempy2l += s_dummyy_loc[offset]*fac2;
+ tempz2l += s_dummyz_loc[offset]*fac2;
- hp3 = d_hprime_xx[l*NGLLX+K];
- offset = l*NGLL2+J*NGLLX+I;
- tempx3l += s_dummyx_loc[offset]*hp3;
- tempy3l += s_dummyy_loc[offset]*hp3;
- tempz3l += s_dummyz_loc[offset]*hp3;
+ fac3 = sh_hprime_xx[l*NGLLX+K];
+ offset = l*NGLL2+J*NGLLX+I;
+ tempx3l += s_dummyx_loc[offset]*fac3;
+ tempy3l += s_dummyy_loc[offset]*fac3;
+ tempz3l += s_dummyz_loc[offset]*fac3;
- }
+ }
- if( ATTENUATION ){
- // temporary variables used for fixing attenuation in a consistent way
- tempx1l_att = 0.f;
- tempx2l_att = 0.f;
- tempx3l_att = 0.f;
+ if( ATTENUATION ){
+ // temporary variables used for fixing attenuation in a consistent way
+ tempx1l_att = 0.f;
+ tempx2l_att = 0.f;
+ tempx3l_att = 0.f;
- tempy1l_att = 0.f;
- tempy2l_att = 0.f;
- tempy3l_att = 0.f;
+ tempy1l_att = 0.f;
+ tempy2l_att = 0.f;
+ tempy3l_att = 0.f;
- tempz1l_att = 0.f;
- tempz2l_att = 0.f;
- tempz3l_att = 0.f;
+ tempz1l_att = 0.f;
+ tempz2l_att = 0.f;
+ tempz3l_att = 0.f;
- for (l=0;l<NGLLX;l++) {
- hp1 = d_hprime_xx[l*NGLLX+I];
- offset = K*NGLL2+J*NGLLX+l;
- tempx1l_att += s_dummyx_loc_att[offset]*hp1;
- tempy1l_att += s_dummyy_loc_att[offset]*hp1;
- tempz1l_att += s_dummyz_loc_att[offset]*hp1;
+ for (l=0;l<NGLLX;l++) {
+ fac1 = d_hprime_xx[l*NGLLX+I];
+ offset = K*NGLL2+J*NGLLX+l;
+ tempx1l_att += s_dummyx_loc_att[offset]*fac1;
+ tempy1l_att += s_dummyy_loc_att[offset]*fac1;
+ tempz1l_att += s_dummyz_loc_att[offset]*fac1;
- hp2 = d_hprime_xx[l*NGLLX+J];
- offset = K*NGLL2+l*NGLLX+I;
- tempx2l_att += s_dummyx_loc_att[offset]*hp2;
- tempy2l_att += s_dummyy_loc_att[offset]*hp2;
- tempz2l_att += s_dummyz_loc_att[offset]*hp2;
+ fac2 = d_hprime_xx[l*NGLLX+J];
+ offset = K*NGLL2+l*NGLLX+I;
+ tempx2l_att += s_dummyx_loc_att[offset]*fac2;
+ tempy2l_att += s_dummyy_loc_att[offset]*fac2;
+ tempz2l_att += s_dummyz_loc_att[offset]*fac2;
- hp3 = d_hprime_xx[l*NGLLX+K];
- offset = l*NGLL2+J*NGLLX+I;
- tempx3l_att += s_dummyx_loc_att[offset]*hp3;
- tempy3l_att += s_dummyy_loc_att[offset]*hp3;
- tempz3l_att += s_dummyz_loc_att[offset]*hp3;
-
- }
+ fac3 = d_hprime_xx[l*NGLLX+K];
+ offset = l*NGLL2+J*NGLLX+I;
+ tempx3l_att += s_dummyx_loc_att[offset]*fac3;
+ tempy3l_att += s_dummyy_loc_att[offset]*fac3;
+ tempz3l_att += s_dummyz_loc_att[offset]*fac3;
}
+ }
#else
- tempx1l = s_dummyx_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
- + s_dummyx_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
- + s_dummyx_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
- + s_dummyx_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
- + s_dummyx_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+ tempx1l = s_dummyx_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyx_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyx_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyx_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyx_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
- tempy1l = s_dummyy_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
- + s_dummyy_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
- + s_dummyy_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
- + s_dummyy_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
- + s_dummyy_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+ tempy1l = s_dummyy_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyy_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyy_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyy_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyy_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
- tempz1l = s_dummyz_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
- + s_dummyz_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
- + s_dummyz_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
- + s_dummyz_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
- + s_dummyz_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+ tempz1l = s_dummyz_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyz_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyz_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyz_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyz_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
- tempx2l = s_dummyx_loc[K*NGLL2+I]*d_hprime_xx[J]
- + s_dummyx_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
- + s_dummyx_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
- + s_dummyx_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
- + s_dummyx_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+ tempx2l = s_dummyx_loc[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyx_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyx_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyx_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyx_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
- tempy2l = s_dummyy_loc[K*NGLL2+I]*d_hprime_xx[J]
- + s_dummyy_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
- + s_dummyy_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
- + s_dummyy_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
- + s_dummyy_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+ tempy2l = s_dummyy_loc[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyy_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyy_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyy_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyy_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
- tempz2l = s_dummyz_loc[K*NGLL2+I]*d_hprime_xx[J]
- + s_dummyz_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
- + s_dummyz_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
- + s_dummyz_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
- + s_dummyz_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+ tempz2l = s_dummyz_loc[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyz_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyz_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyz_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyz_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
- tempx3l = s_dummyx_loc[J*NGLLX+I]*d_hprime_xx[K]
- + s_dummyx_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
- + s_dummyx_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
- + s_dummyx_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
- + s_dummyx_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+ tempx3l = s_dummyx_loc[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyx_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyx_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyx_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyx_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
- tempy3l = s_dummyy_loc[J*NGLLX+I]*d_hprime_xx[K]
- + s_dummyy_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
- + s_dummyy_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
- + s_dummyy_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
- + s_dummyy_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+ tempy3l = s_dummyy_loc[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyy_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyy_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyy_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyy_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
- tempz3l = s_dummyz_loc[J*NGLLX+I]*d_hprime_xx[K]
- + s_dummyz_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
- + s_dummyz_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
- + s_dummyz_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
- + s_dummyz_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+ tempz3l = s_dummyz_loc[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyz_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyz_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyz_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyz_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
- if( ATTENUATION ){
- // temporary variables used for fixing attenuation in a consistent way
- tempx1l_att = s_dummyx_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
- + s_dummyx_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
- + s_dummyx_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
- + s_dummyx_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
- + s_dummyx_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+ if( ATTENUATION ){
+ // temporary variables used for fixing attenuation in a consistent way
+ tempx1l_att = s_dummyx_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyx_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyx_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyx_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyx_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
- tempy1l_att = s_dummyy_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
- + s_dummyy_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
- + s_dummyy_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
- + s_dummyy_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
- + s_dummyy_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+ tempy1l_att = s_dummyy_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyy_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyy_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyy_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyy_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
- tempz1l_att = s_dummyz_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
- + s_dummyz_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
- + s_dummyz_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
- + s_dummyz_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
- + s_dummyz_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
+ tempz1l_att = s_dummyz_loc_att[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummyz_loc_att[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummyz_loc_att[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummyz_loc_att[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummyz_loc_att[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
- tempx2l_att = s_dummyx_loc_att[K*NGLL2+I]*d_hprime_xx[J]
- + s_dummyx_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
- + s_dummyx_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
- + s_dummyx_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
- + s_dummyx_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+ tempx2l_att = s_dummyx_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyx_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyx_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyx_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyx_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
- tempy2l_att = s_dummyy_loc_att[K*NGLL2+I]*d_hprime_xx[J]
- + s_dummyy_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
- + s_dummyy_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
- + s_dummyy_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
- + s_dummyy_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+ tempy2l_att = s_dummyy_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyy_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyy_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyy_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyy_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
- tempz2l_att = s_dummyz_loc_att[K*NGLL2+I]*d_hprime_xx[J]
- + s_dummyz_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
- + s_dummyz_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
- + s_dummyz_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
- + s_dummyz_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
+ tempz2l_att = s_dummyz_loc_att[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummyz_loc_att[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummyz_loc_att[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummyz_loc_att[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummyz_loc_att[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
- tempx3l_att = s_dummyx_loc_att[J*NGLLX+I]*d_hprime_xx[K]
- + s_dummyx_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
- + s_dummyx_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
- + s_dummyx_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
- + s_dummyx_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+ tempx3l_att = s_dummyx_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyx_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyx_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyx_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyx_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
- tempy3l_att = s_dummyy_loc_att[J*NGLLX+I]*d_hprime_xx[K]
- + s_dummyy_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
- + s_dummyy_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
- + s_dummyy_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
- + s_dummyy_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+ tempy3l_att = s_dummyy_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyy_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyy_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyy_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyy_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
- tempz3l_att = s_dummyz_loc_att[J*NGLLX+I]*d_hprime_xx[K]
- + s_dummyz_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
- + s_dummyz_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
- + s_dummyz_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
- + s_dummyz_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
- }
+ tempz3l_att = s_dummyz_loc_att[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummyz_loc_att[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummyz_loc_att[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummyz_loc_att[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummyz_loc_att[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
+ }
#endif
// compute derivatives of ux, uy and uz with respect to x, y and z
- offset = working_element*NGLL3_ALIGN + tx;
+ offset = working_element*NGLL3_PADDED + tx;
- xixl = d_xix[offset];
- xiyl = d_xiy[offset];
- xizl = d_xiz[offset];
- etaxl = d_etax[offset];
- etayl = d_etay[offset];
- etazl = d_etaz[offset];
- gammaxl = d_gammax[offset];
- gammayl = d_gammay[offset];
- gammazl = d_gammaz[offset];
+ xixl = d_xix[offset];
+ xiyl = d_xiy[offset];
+ xizl = d_xiz[offset];
+ etaxl = d_etax[offset];
+ etayl = d_etay[offset];
+ etazl = d_etaz[offset];
+ gammaxl = d_gammax[offset];
+ gammayl = d_gammay[offset];
+ gammazl = d_gammaz[offset];
- duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l;
- duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l;
- duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l;
+ duxdxl = xixl*tempx1l + etaxl*tempx2l + gammaxl*tempx3l;
+ duxdyl = xiyl*tempx1l + etayl*tempx2l + gammayl*tempx3l;
+ duxdzl = xizl*tempx1l + etazl*tempx2l + gammazl*tempx3l;
- duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l;
- duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l;
- duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l;
+ duydxl = xixl*tempy1l + etaxl*tempy2l + gammaxl*tempy3l;
+ duydyl = xiyl*tempy1l + etayl*tempy2l + gammayl*tempy3l;
+ duydzl = xizl*tempy1l + etazl*tempy2l + gammazl*tempy3l;
- duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l;
- duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l;
- duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l;
+ duzdxl = xixl*tempz1l + etaxl*tempz2l + gammaxl*tempz3l;
+ duzdyl = xiyl*tempz1l + etayl*tempz2l + gammayl*tempz3l;
+ duzdzl = xizl*tempz1l + etazl*tempz2l + gammazl*tempz3l;
- // precompute some sums to save CPU time
- duxdxl_plus_duydyl = duxdxl + duydyl;
- duxdxl_plus_duzdzl = duxdxl + duzdzl;
- duydyl_plus_duzdzl = duydyl + duzdzl;
- duxdyl_plus_duydxl = duxdyl + duydxl;
- duzdxl_plus_duxdzl = duzdxl + duxdzl;
- duzdyl_plus_duydzl = duzdyl + duydzl;
+ // precompute some sums to save CPU time
+ duxdxl_plus_duydyl = duxdxl + duydyl;
+ duxdxl_plus_duzdzl = duxdxl + duzdzl;
+ duydyl_plus_duzdzl = duydyl + duzdzl;
+ duxdyl_plus_duydxl = duxdyl + duydxl;
+ duzdxl_plus_duxdzl = duzdxl + duxdzl;
+ duzdyl_plus_duydzl = duzdyl + duydzl;
- if(ATTENUATION){
- // temporary variables used for fixing attenuation in a consistent way
- duxdxl_att = xixl*tempx1l_att + etaxl*tempx2l_att + gammaxl*tempx3l_att;
- duxdyl_att = xiyl*tempx1l_att + etayl*tempx2l_att + gammayl*tempx3l_att;
- duxdzl_att = xizl*tempx1l_att + etazl*tempx2l_att + gammazl*tempx3l_att;
+ if(ATTENUATION){
+ // temporary variables used for fixing attenuation in a consistent way
+ duxdxl_att = xixl*tempx1l_att + etaxl*tempx2l_att + gammaxl*tempx3l_att;
+ duxdyl_att = xiyl*tempx1l_att + etayl*tempx2l_att + gammayl*tempx3l_att;
+ duxdzl_att = xizl*tempx1l_att + etazl*tempx2l_att + gammazl*tempx3l_att;
- duydxl_att = xixl*tempy1l_att + etaxl*tempy2l_att + gammaxl*tempy3l_att;
- duydyl_att = xiyl*tempy1l_att + etayl*tempy2l_att + gammayl*tempy3l_att;
- duydzl_att = xizl*tempy1l_att + etazl*tempy2l_att + gammazl*tempy3l_att;
+ duydxl_att = xixl*tempy1l_att + etaxl*tempy2l_att + gammaxl*tempy3l_att;
+ duydyl_att = xiyl*tempy1l_att + etayl*tempy2l_att + gammayl*tempy3l_att;
+ duydzl_att = xizl*tempy1l_att + etazl*tempy2l_att + gammazl*tempy3l_att;
- duzdxl_att = xixl*tempz1l_att + etaxl*tempz2l_att + gammaxl*tempz3l_att;
- duzdyl_att = xiyl*tempz1l_att + etayl*tempz2l_att + gammayl*tempz3l_att;
- duzdzl_att = xizl*tempz1l_att + etazl*tempz2l_att + gammazl*tempz3l_att;
+ duzdxl_att = xixl*tempz1l_att + etaxl*tempz2l_att + gammaxl*tempz3l_att;
+ duzdyl_att = xiyl*tempz1l_att + etayl*tempz2l_att + gammayl*tempz3l_att;
+ duzdzl_att = xizl*tempz1l_att + etazl*tempz2l_att + gammazl*tempz3l_att;
- // precompute some sums to save CPU time
- duxdyl_plus_duydxl_att = duxdyl_att + duydxl_att;
- duzdxl_plus_duxdzl_att = duzdxl_att + duxdzl_att;
- duzdyl_plus_duydzl_att = duzdyl_att + duydzl_att;
+ // precompute some sums to save CPU time
+ duxdyl_plus_duydxl_att = duxdyl_att + duydxl_att;
+ duzdxl_plus_duxdzl_att = duzdxl_att + duxdzl_att;
+ duzdyl_plus_duydzl_att = duzdyl_att + duydzl_att;
- // computes deviatoric strain attenuation and/or for kernel calculations
- if(COMPUTE_AND_STORE_STRAIN) {
- realw templ = 0.33333333333333333333f * (duxdxl_att + duydyl_att + duzdzl_att); // 1./3. = 0.33333
+ // computes deviatoric strain attenuation and/or for kernel calculations
+ if(COMPUTE_AND_STORE_STRAIN) {
+ realw templ = 0.33333333333333333333f * (duxdxl_att + duydyl_att + duzdzl_att); // 1./3. = 0.33333
- // local storage: stresses at this current time step
- epsilondev_xx_loc = duxdxl_att - templ;
- epsilondev_yy_loc = duydyl_att - templ;
- epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl_att;
- epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl_att;
- epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl_att;
+ // local storage: stresses at this current time step
+ epsilondev_xx_loc = duxdxl_att - templ;
+ epsilondev_yy_loc = duydyl_att - templ;
+ epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl_att;
+ epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl_att;
+ epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl_att;
- if(SIMULATION_TYPE == 3) {
- epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
- }
+ if(SIMULATION_TYPE == 3) {
+ epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
}
- }else{
- // computes deviatoric strain attenuation and/or for kernel calculations
- if(COMPUTE_AND_STORE_STRAIN) {
- realw templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
+ }
+ }else{
+ // computes deviatoric strain attenuation and/or for kernel calculations
+ if(COMPUTE_AND_STORE_STRAIN) {
+ realw templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
- // local storage: stresses at this current time step
- epsilondev_xx_loc = duxdxl - templ;
- epsilondev_yy_loc = duydyl - templ;
- epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl;
- epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl;
- epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl;
+ // local storage: stresses at this current time step
+ epsilondev_xx_loc = duxdxl - templ;
+ epsilondev_yy_loc = duydyl - templ;
+ epsilondev_xy_loc = 0.5f * duxdyl_plus_duydxl;
+ epsilondev_xz_loc = 0.5f * duzdxl_plus_duxdzl;
+ epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl;
- if(SIMULATION_TYPE == 3) {
- epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
- }
+ if(SIMULATION_TYPE == 3) {
+ epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
}
}
+ }
- // compute elements with an elastic isotropic rheology
- kappal = d_kappav[offset];
- mul = d_muv[offset];
+ // compute elements with an elastic isotropic rheology
+ kappal = d_kappav[offset];
+ mul = d_muv[offset];
- // attenuation
- if(ATTENUATION){
- // use unrelaxed parameters if attenuation
- if( ATTENUATION_3D ){
- mul_iso = mul * one_minus_sum_beta[tx+working_element*NGLL3]; // (i,j,k,ispec)
- mul_aniso = mul *( one_minus_sum_beta[tx+working_element*NGLL3] - 1.0f );
- }else{
- mul_iso = mul * one_minus_sum_beta[working_element]; // (1,1,1,ispec)
- mul_aniso = mul *( one_minus_sum_beta[working_element] - 1.0f );
- }
+ // attenuation
+ if(ATTENUATION){
+ // use unrelaxed parameters if attenuation
+ if( ATTENUATION_3D ){
+ mul_iso = mul * one_minus_sum_beta[tx+working_element*NGLL3]; // (i,j,k,ispec)
+ mul_aniso = mul *( one_minus_sum_beta[tx+working_element*NGLL3] - 1.0f );
}else{
- mul_iso = mul;
+ mul_iso = mul * one_minus_sum_beta[working_element]; // (1,1,1,ispec)
+ mul_aniso = mul *( one_minus_sum_beta[working_element] - 1.0f );
}
+ }else{
+ mul_iso = mul;
+ }
- // full anisotropic case, stress calculations
- if(ANISOTROPY){
+ // full anisotropic case, stress calculations
+ if(ANISOTROPY){
- // elastic tensor for hexagonal symmetry in reduced notation:
- //
- // c11 c12 c13 0 0 0
- // c12 c11 c13 0 0 0
- // c13 c13 c33 0 0 0
- // 0 0 0 c44 0 0
- // 0 0 0 0 c44 0
- // 0 0 0 0 0 (c11-c12)/2
- //
- // in terms of the A, C, L, N and F of Love (1927):
- //
- // c11 = A
- // c12 = A-2N
- // c13 = F
- // c33 = C
- // c44 = L
+ // elastic tensor for hexagonal symmetry in reduced notation:
+ //
+ // c11 c12 c13 0 0 0
+ // c12 c11 c13 0 0 0
+ // c13 c13 c33 0 0 0
+ // 0 0 0 c44 0 0
+ // 0 0 0 0 c44 0
+ // 0 0 0 0 0 (c11-c12)/2
+ //
+ // in terms of the A, C, L, N and F of Love (1927):
+ //
+ // c11 = A
+ // c12 = A-2N
+ // c13 = F
+ // c33 = C
+ // c44 = L
- c11 = d_c11store[offset];
- c12 = d_c12store[offset];
- c13 = d_c13store[offset];
- c33 = d_c33store[offset];
- c44 = d_c44store[offset];
+ c11 = d_c11store[offset];
+ c12 = d_c12store[offset];
+ c13 = d_c13store[offset];
+ c33 = d_c33store[offset];
+ c44 = d_c44store[offset];
- // use unrelaxed parameters if attenuation
- if( ATTENUATION){
- c11 = c11 + 1.33333333333333333333f * mul_aniso; // FOUR_THIRDS = 1.33333
- c12 = c12 - 0.66666666666666666666f * mul_aniso; // TWO_THIRDS = 0.66666666666666666666f
- c13 = c13 - 0.66666666666666666666f * mul_aniso;
- c33 = c33 + 1.33333333333333333333f * mul_aniso;
- c44 = c44 + mul_aniso;
- }
+ // use unrelaxed parameters if attenuation
+ if( ATTENUATION){
+ c11 = c11 + 1.33333333333333333333f * mul_aniso; // FOUR_THIRDS = 1.33333
+ c12 = c12 - 0.66666666666666666666f * mul_aniso; // TWO_THIRDS = 0.66666666666666666666f
+ c13 = c13 - 0.66666666666666666666f * mul_aniso;
+ c33 = c33 + 1.33333333333333333333f * mul_aniso;
+ c44 = c44 + mul_aniso;
+ }
- sigma_xx = c11*duxdxl + c12*duydyl + c13*duzdzl;
- sigma_yy = c12*duxdxl + c11*duydyl + c13*duzdzl;
- sigma_zz = c13*duxdxl + c13*duydyl + c33*duzdzl;
- sigma_xy = 0.5f*(c11-c12)*duxdyl_plus_duydxl;
- sigma_xz = c44*duzdxl_plus_duxdzl;
- sigma_yz = c44*duzdyl_plus_duydzl;
+ sigma_xx = c11*duxdxl + c12*duydyl + c13*duzdzl;
+ sigma_yy = c12*duxdxl + c11*duydyl + c13*duzdzl;
+ sigma_zz = c13*duxdxl + c13*duydyl + c33*duzdzl;
+ sigma_xy = 0.5f*(c11-c12)*duxdyl_plus_duydxl;
+ sigma_xz = c44*duzdxl_plus_duxdzl;
+ sigma_yz = c44*duzdyl_plus_duydzl;
- }else{
+ }else{
- // isotropic case
+ // isotropic case
- lambdalplus2mul = kappal + 1.33333333333333333333f * mul_iso; // 4./3. = 1.3333333
- lambdal = lambdalplus2mul - 2.0f * mul_iso;
+ lambdalplus2mul = kappal + 1.33333333333333333333f * mul_iso; // 4./3. = 1.3333333
+ lambdal = lambdalplus2mul - 2.0f * mul_iso;
- // compute the six components of the stress tensor sigma
- sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl;
- sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl;
- sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl;
+ // compute the six components of the stress tensor sigma
+ sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl;
+ sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl;
+ sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl;
- sigma_xy = mul*duxdyl_plus_duydxl;
- sigma_xz = mul*duzdxl_plus_duxdzl;
- sigma_yz = mul*duzdyl_plus_duydzl;
- }
+ sigma_xy = mul*duxdyl_plus_duydxl;
+ sigma_xz = mul*duzdxl_plus_duxdzl;
+ sigma_yz = mul*duzdyl_plus_duydzl;
+ }
- if(ATTENUATION && ( ! USE_ATTENUATION_MIMIC ) ){
- // subtracts memory variables if attenuation
- compute_element_ic_att_stress(tx,working_element,
- R_xx,R_yy,R_xy,R_xz,R_yz,
- &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_xz,&sigma_yz);
- }
+ if(ATTENUATION && ( ! USE_ATTENUATION_MIMIC ) ){
+ // subtracts memory variables if attenuation
+ compute_element_ic_att_stress(tx,working_element,
+ R_xx,R_yy,R_xy,R_xz,R_yz,
+ &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_xz,&sigma_yz);
+ }
- // define symmetric components (needed for non-symmetric dot product and sigma for gravity)
- sigma_yx = sigma_xy;
- sigma_zx = sigma_xz;
- sigma_zy = sigma_yz;
+ // define symmetric components (needed for non-symmetric dot product and sigma for gravity)
+ sigma_yx = sigma_xy;
+ sigma_zx = sigma_xz;
+ sigma_zy = sigma_yz;
- // jacobian
- jacobianl = 1.0f / (xixl*(etayl*gammazl-etazl*gammayl)
- -xiyl*(etaxl*gammazl-etazl*gammaxl)
- +xizl*(etaxl*gammayl-etayl*gammaxl));
+ // jacobian
+ jacobianl = 1.0f / (xixl*(etayl*gammazl-etazl*gammayl)
+ -xiyl*(etaxl*gammazl-etazl*gammaxl)
+ +xizl*(etaxl*gammayl-etayl*gammaxl));
- if( GRAVITY ){
- // computes non-symmetric terms for gravity
- compute_element_ic_gravity(tx,working_element,
- d_ibool,d_xstore,d_ystore,d_zstore,
- d_minus_gravity_table,d_minus_deriv_gravity_table,d_density_table,
- wgll_cube,jacobianl,
- s_dummyx_loc,s_dummyy_loc,s_dummyz_loc,
- &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_yx,
- &sigma_xz,&sigma_zx,&sigma_yz,&sigma_zy,
- &rho_s_H1,&rho_s_H2,&rho_s_H3);
- }
+ if( GRAVITY ){
+ // computes non-symmetric terms for gravity
+ compute_element_ic_gravity(tx,working_element,
+ d_ibool,d_xstore,d_ystore,d_zstore,
+ d_minus_gravity_table,d_minus_deriv_gravity_table,d_density_table,
+ wgll_cube,jacobianl,
+ s_dummyx_loc,s_dummyy_loc,s_dummyz_loc,
+ &sigma_xx,&sigma_yy,&sigma_zz,&sigma_xy,&sigma_yx,
+ &sigma_xz,&sigma_zx,&sigma_yz,&sigma_zy,
+ &rho_s_H1,&rho_s_H2,&rho_s_H3);
+ }
- // form dot product with test vector, non-symmetric form
- s_tempx1[tx] = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl);
- s_tempy1[tx] = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl);
- s_tempz1[tx] = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl);
+ // form dot product with test vector, non-symmetric form
+ s_tempx1[tx] = jacobianl * (sigma_xx*xixl + sigma_yx*xiyl + sigma_zx*xizl);
+ s_tempy1[tx] = jacobianl * (sigma_xy*xixl + sigma_yy*xiyl + sigma_zy*xizl);
+ s_tempz1[tx] = jacobianl * (sigma_xz*xixl + sigma_yz*xiyl + sigma_zz*xizl);
- s_tempx2[tx] = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl);
- s_tempy2[tx] = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl);
- s_tempz2[tx] = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl);
+ s_tempx2[tx] = jacobianl * (sigma_xx*etaxl + sigma_yx*etayl + sigma_zx*etazl);
+ s_tempy2[tx] = jacobianl * (sigma_xy*etaxl + sigma_yy*etayl + sigma_zy*etazl);
+ s_tempz2[tx] = jacobianl * (sigma_xz*etaxl + sigma_yz*etayl + sigma_zz*etazl);
- s_tempx3[tx] = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl);
- s_tempy3[tx] = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl);
- s_tempz3[tx] = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl);
+ s_tempx3[tx] = jacobianl * (sigma_xx*gammaxl + sigma_yx*gammayl + sigma_zx*gammazl);
+ s_tempy3[tx] = jacobianl * (sigma_xy*gammaxl + sigma_yy*gammayl + sigma_zy*gammazl);
+ s_tempz3[tx] = jacobianl * (sigma_xz*gammaxl + sigma_yz*gammayl + sigma_zz*gammazl);
- }
+ }
// synchronize all the threads (one thread for each of the NGLL grid points of the
// current spectral element) because we need the whole element to be ready in order
// to be able to compute the matrix products along cut planes of the 3D element below
- __syncthreads();
+ __syncthreads();
- if (active) {
+ if (active) {
#ifndef MANUALLY_UNROLLED_LOOPS
- tempx1l = 0.f;
- tempy1l = 0.f;
- tempz1l = 0.f;
+ tempx1l = 0.f;
+ tempy1l = 0.f;
+ tempz1l = 0.f;
- tempx2l = 0.f;
- tempy2l = 0.f;
- tempz2l = 0.f;
+ tempx2l = 0.f;
+ tempy2l = 0.f;
+ tempz2l = 0.f;
- tempx3l = 0.f;
- tempy3l = 0.f;
- tempz3l = 0.f;
+ tempx3l = 0.f;
+ tempy3l = 0.f;
+ tempz3l = 0.f;
- for (l=0;l<NGLLX;l++) {
+ for (l=0;l<NGLLX;l++) {
- fac1 = d_hprimewgll_xx[I*NGLLX+l];
- offset = K*NGLL2+J*NGLLX+l;
- tempx1l += s_tempx1[offset]*fac1;
- tempy1l += s_tempy1[offset]*fac1;
- tempz1l += s_tempz1[offset]*fac1;
+ fac1 = d_hprimewgll_xx[I*NGLLX+l];
+ offset = K*NGLL2+J*NGLLX+l;
+ tempx1l += s_tempx1[offset]*fac1;
+ tempy1l += s_tempy1[offset]*fac1;
+ tempz1l += s_tempz1[offset]*fac1;
- fac2 = d_hprimewgll_yy[J*NGLLX+l];
- offset = K*NGLL2+l*NGLLX+I;
- tempx2l += s_tempx2[offset]*fac2;
- tempy2l += s_tempy2[offset]*fac2;
- tempz2l += s_tempz2[offset]*fac2;
+ fac2 = d_hprimewgll_yy[J*NGLLX+l];
+ offset = K*NGLL2+l*NGLLX+I;
+ tempx2l += s_tempx2[offset]*fac2;
+ tempy2l += s_tempy2[offset]*fac2;
+ tempz2l += s_tempz2[offset]*fac2;
- fac3 = d_hprimewgll_zz[K*NGLLX+l];
- offset = l*NGLL2+J*NGLLX+I;
- tempx3l += s_tempx3[offset]*fac3;
- tempy3l += s_tempy3[offset]*fac3;
- tempz3l += s_tempz3[offset]*fac3;
+ fac3 = d_hprimewgll_zz[K*NGLLX+l];
+ offset = l*NGLL2+J*NGLLX+I;
+ tempx3l += s_tempx3[offset]*fac3;
+ tempy3l += s_tempy3[offset]*fac3;
+ tempz3l += s_tempz3[offset]*fac3;
- }
+ }
#else
- tempx1l = s_tempx1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
- + s_tempx1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
- + s_tempx1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
- + s_tempx1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
- + s_tempx1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+ tempx1l = s_tempx1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+ + s_tempx1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+ + s_tempx1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+ + s_tempx1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+ + s_tempx1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
- tempy1l = s_tempy1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
- + s_tempy1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
- + s_tempy1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
- + s_tempy1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
- + s_tempy1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+ tempy1l = s_tempy1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+ + s_tempy1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+ + s_tempy1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+ + s_tempy1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+ + s_tempy1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
- tempz1l = s_tempz1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
- + s_tempz1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
- + s_tempz1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
- + s_tempz1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
- + s_tempz1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
+ tempz1l = s_tempz1[K*NGLL2+J*NGLLX]*d_hprimewgll_xx[I*NGLLX]
+ + s_tempz1[K*NGLL2+J*NGLLX+1]*d_hprimewgll_xx[I*NGLLX+1]
+ + s_tempz1[K*NGLL2+J*NGLLX+2]*d_hprimewgll_xx[I*NGLLX+2]
+ + s_tempz1[K*NGLL2+J*NGLLX+3]*d_hprimewgll_xx[I*NGLLX+3]
+ + s_tempz1[K*NGLL2+J*NGLLX+4]*d_hprimewgll_xx[I*NGLLX+4];
- tempx2l = s_tempx2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
- + s_tempx2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
- + s_tempx2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
- + s_tempx2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
- + s_tempx2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
+ tempx2l = s_tempx2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
+ + s_tempx2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
+ + s_tempx2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
+ + s_tempx2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
+ + s_tempx2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
- tempy2l = s_tempy2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
- + s_tempy2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
- + s_tempy2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
- + s_tempy2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
- + s_tempy2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
+ tempy2l = s_tempy2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
+ + s_tempy2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
+ + s_tempy2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
+ + s_tempy2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
+ + s_tempy2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
- tempz2l = s_tempz2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
- + s_tempz2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
- + s_tempz2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
- + s_tempz2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
- + s_tempz2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
+ tempz2l = s_tempz2[K*NGLL2+I]*d_hprimewgll_yy[J*NGLLX]
+ + s_tempz2[K*NGLL2+NGLLX+I]*d_hprimewgll_yy[J*NGLLX+1]
+ + s_tempz2[K*NGLL2+2*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+2]
+ + s_tempz2[K*NGLL2+3*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+3]
+ + s_tempz2[K*NGLL2+4*NGLLX+I]*d_hprimewgll_yy[J*NGLLX+4];
- tempx3l = s_tempx3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
- + s_tempx3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
- + s_tempx3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
- + s_tempx3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
- + s_tempx3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
+ tempx3l = s_tempx3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
+ + s_tempx3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
+ + s_tempx3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
+ + s_tempx3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
+ + s_tempx3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
- tempy3l = s_tempy3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
- + s_tempy3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
- + s_tempy3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
- + s_tempy3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
- + s_tempy3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
+ tempy3l = s_tempy3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
+ + s_tempy3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
+ + s_tempy3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
+ + s_tempy3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
+ + s_tempy3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
- tempz3l = s_tempz3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
- + s_tempz3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
- + s_tempz3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
- + s_tempz3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
- + s_tempz3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
+ tempz3l = s_tempz3[J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX]
+ + s_tempz3[NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+1]
+ + s_tempz3[2*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+2]
+ + s_tempz3[3*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+3]
+ + s_tempz3[4*NGLL2+J*NGLLX+I]*d_hprimewgll_zz[K*NGLLX+4];
#endif
- fac1 = d_wgllwgll_yz[K*NGLLX+J];
- fac2 = d_wgllwgll_xz[K*NGLLX+I];
- fac3 = d_wgllwgll_xy[J*NGLLX+I];
+ fac1 = d_wgllwgll_yz[K*NGLLX+J];
+ fac2 = d_wgllwgll_xz[K*NGLLX+I];
+ fac3 = d_wgllwgll_xy[J*NGLLX+I];
- sum_terms1 = - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
- sum_terms2 = - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
- sum_terms3 = - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
+ sum_terms1 = - (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
+ sum_terms2 = - (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
+ sum_terms3 = - (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
- // adds gravity term
- if( GRAVITY ){
- sum_terms1 += rho_s_H1;
- sum_terms2 += rho_s_H2;
- sum_terms3 += rho_s_H3;
- }
+ // adds gravity term
+ if( GRAVITY ){
+ sum_terms1 += rho_s_H1;
+ sum_terms2 += rho_s_H2;
+ sum_terms3 += rho_s_H3;
+ }
-#ifdef USE_TEXTURES
- d_accel[iglob] = tex1Dfetch(tex_accel, iglob) + sum_terms1);
- d_accel[iglob + NGLOB] = tex1Dfetch(tex_accel, iglob + NGLOB) + sum_terms2);
- d_accel[iglob + 2*NGLOB] = tex1Dfetch(tex_accel, iglob + 2*NGLOB) + sum_terms3);
+
+#ifdef USE_MESH_COLORING_GPU
+ // no atomic operation needed, colors don't share global points between elements
+
+#ifdef USE_TEXTURES_FIELDS
+ d_accel[iglob*3] = tex1Dfetch(d_accel_ic_tex, iglob*3) + sum_terms1;
+ d_accel[iglob*3 + 1] = tex1Dfetch(d_accel_ic_tex, iglob*3 + 1) + sum_terms2;
+ d_accel[iglob*3 + 2] = tex1Dfetch(d_accel_ic_tex, iglob*3 + 2) + sum_terms3;
#else
- /* OLD/To be implemented version that uses coloring to get around race condition. About 1.6x faster */
+ d_accel[iglob*3] += sum_terms1;
+ d_accel[iglob*3 + 1] += sum_terms2;
+ d_accel[iglob*3 + 2] += sum_terms3;
+#endif // USE_TEXTURES_FIELDS
+#else // MESH_COLORING
-#ifdef USE_MESH_COLORING_GPU
- // no atomic operation needed, colors don't share global points between elements
+ //mesh coloring
+ if( use_mesh_coloring_gpu ){
+
+ // no atomic operation needed, colors don't share global points between elements
+#ifdef USE_TEXTURES_FIELDS
+ d_accel[iglob*3] = tex1Dfetch(d_accel_ic_tex, iglob*3) + sum_terms1;
+ d_accel[iglob*3 + 1] = tex1Dfetch(d_accel_ic_tex, iglob*3 + 1) + sum_terms2;
+ d_accel[iglob*3 + 2] = tex1Dfetch(d_accel_ic_tex, iglob*3 + 2) + sum_terms3;
+#else
d_accel[iglob*3] += sum_terms1;
d_accel[iglob*3 + 1] += sum_terms2;
d_accel[iglob*3 + 2] += sum_terms3;
-#else
- //mesh coloring
- if( use_mesh_coloring_gpu ){
+#endif // USE_TEXTURES_FIELDS
- // no atomic operation needed, colors don't share global points between elements
- d_accel[iglob*3] += sum_terms1;
- d_accel[iglob*3 + 1] += sum_terms2;
- d_accel[iglob*3 + 2] += sum_terms3;
+ }else{
- }else{
+ // for testing purposes only: w/out atomic updates
+ //d_accel[iglob*3] -= (0.00000001f*tempx1l + 0.00000001f*tempx2l + 0.00000001f*tempx3l);
+ //d_accel[iglob*3 + 1] -= (0.00000001f*tempy1l + 0.00000001f*tempy2l + 0.00000001f*tempy3l);
+ //d_accel[iglob*3 + 2] -= (0.00000001f*tempz1l + 0.00000001f*tempz2l + 0.00000001f*tempz3l);
- // for testing purposes only: w/out atomic updates
- //d_accel[iglob*3] -= (0.00000001f*tempx1l + 0.00000001f*tempx2l + 0.00000001f*tempx3l);
- //d_accel[iglob*3 + 1] -= (0.00000001f*tempy1l + 0.00000001f*tempy2l + 0.00000001f*tempy3l);
- //d_accel[iglob*3 + 2] -= (0.00000001f*tempz1l + 0.00000001f*tempz2l + 0.00000001f*tempz3l);
+ atomicAdd(&d_accel[iglob*3], sum_terms1);
+ atomicAdd(&d_accel[iglob*3+1], sum_terms2);
+ atomicAdd(&d_accel[iglob*3+2], sum_terms3);
- atomicAdd(&d_accel[iglob*3], sum_terms1);
- atomicAdd(&d_accel[iglob*3+1], sum_terms2);
- atomicAdd(&d_accel[iglob*3+2], sum_terms3);
+ }
+#endif // MESH_COLORING
- }
-#endif
+ // update memory variables based upon the Runge-Kutta scheme
+ if( ATTENUATION && ! USE_ATTENUATION_MIMIC ){
+ compute_element_ic_att_memory(tx,working_element,
+ d_muv,
+ factor_common,alphaval,betaval,gammaval,
+ R_xx,R_yy,R_xy,R_xz,R_yz,
+ epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz,
+ epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc,
+ ATTENUATION_3D);
+ }
-#endif
-
- // update memory variables based upon the Runge-Kutta scheme
- if( ATTENUATION && ! USE_ATTENUATION_MIMIC ){
- compute_element_ic_att_memory(tx,working_element,
- d_muv,
- factor_common,alphaval,betaval,gammaval,
- R_xx,R_yy,R_xy,R_xz,R_yz,
- epsilondev_xx,epsilondev_yy,epsilondev_xy,epsilondev_xz,epsilondev_yz,
- epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc,
- ATTENUATION_3D);
- }
-
- // save deviatoric strain for Runge-Kutta scheme
- if( COMPUTE_AND_STORE_STRAIN ){
- int ijk_ispec = tx + working_element*NGLL3;
-
- // fortran: epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
- epsilondev_xx[ijk_ispec] = epsilondev_xx_loc;
- epsilondev_yy[ijk_ispec] = epsilondev_yy_loc;
- epsilondev_xy[ijk_ispec] = epsilondev_xy_loc;
- epsilondev_xz[ijk_ispec] = epsilondev_xz_loc;
- epsilondev_yz[ijk_ispec] = epsilondev_yz_loc;
- }
-
+ // save deviatoric strain for Runge-Kutta scheme
+ if( COMPUTE_AND_STORE_STRAIN ){
+ // fortran: epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
+ epsilondev_xx[tx + working_element*NGLL3] = epsilondev_xx_loc;
+ epsilondev_yy[tx + working_element*NGLL3] = epsilondev_yy_loc;
+ epsilondev_xy[tx + working_element*NGLL3] = epsilondev_xy_loc;
+ epsilondev_xz[tx + working_element*NGLL3] = epsilondev_xz_loc;
+ epsilondev_yz[tx + working_element*NGLL3] = epsilondev_yz_loc;
}
-
-#else // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
- d_accel[iglob] -= 0.00000001f;
- d_accel[iglob + NGLOB] -= 0.00000001f;
- d_accel[iglob + 2*NGLOB] -= 0.00000001f;
-#endif // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
+ }
}
/* ----------------------------------------------------------------------------------------------- */
@@ -1130,7 +1134,7 @@
d_xix, d_xiy, d_xiz,
d_etax, d_etay, d_etaz,
d_gammax, d_gammay, d_gammaz,
- mp->d_hprime_xx, mp->d_hprime_yy, mp->d_hprime_zz,
+ mp->d_hprime_xx,
mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
d_kappav, d_muv,
@@ -1177,7 +1181,7 @@
d_xix, d_xiy, d_xiz,
d_etax, d_etay, d_etaz,
d_gammax, d_gammay, d_gammaz,
- mp->d_hprime_xx, mp->d_hprime_yy, mp->d_hprime_zz,
+ mp->d_hprime_xx,
mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
d_kappav, d_muv,
@@ -1283,7 +1287,7 @@
if( mp->attenuation_3D ){
color_offset_nonpadded_att2 = (mp->nspec_outer_inner_core) * NGLL3 * N_SLS;
}else{
- color_offset_nonpadded_att2 = (mp->nspec_outer_inner_core) * 1 * N_SLS;
+ color_offset_nonpadded_att2 = (mp->nspec_outer_inner_core) * 1 * N_SLS;
}
color_offset_ispec = mp->nspec_outer_inner_core;
}
@@ -1354,7 +1358,7 @@
if( mp->attenuation_3D ){
color_offset_nonpadded_att2 += nb_blocks_to_compute * NGLL3 * N_SLS;
}else{
- color_offset_nonpadded_att2 += nb_blocks_to_compute * 1 * N_SLS;
+ color_offset_nonpadded_att2 += nb_blocks_to_compute * 1 * N_SLS;
}
// for array(ispec)
color_offset_ispec += nb_blocks_to_compute;
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu 2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_forces_outer_core_cuda.cu 2012-07-23 21:58:26 UTC (rev 20536)
@@ -37,6 +37,15 @@
#include "config.h"
#include "mesh_constants_cuda.h"
+#ifdef USE_TEXTURES_FIELDS
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_displ_oc_tex;
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_accel_oc_tex;
+#endif
+
+#ifdef USE_TEXTURES_CONSTANTS
+texture<realw, cudaTextureType1D, cudaReadModeElementType> d_hprime_xx_oc_tex;
+#endif
+
/* ----------------------------------------------------------------------------------------------- */
// elemental routines
@@ -51,14 +60,14 @@
realw deltat,
realw* d_A_array_rotation,
realw* d_B_array_rotation,
- reald dpotentialdxl, reald dpotentialdyl,
- reald* dpotentialdx_with_rot,
- reald* dpotentialdy_with_rot) {
+ realw dpotentialdxl, realw dpotentialdyl,
+ realw* dpotentialdx_with_rot,
+ realw* dpotentialdy_with_rot) {
- reald two_omega_deltat,cos_two_omega_t,sin_two_omega_t;
- reald A_rotation,B_rotation;
- reald ux_rotation,uy_rotation;
- reald source_euler_A,source_euler_B;
+ realw two_omega_deltat,cos_two_omega_t,sin_two_omega_t;
+ realw A_rotation,B_rotation;
+ realw ux_rotation,uy_rotation;
+ realw source_euler_A,source_euler_B;
// non-padded offset
int offset_nonpadded = tx + working_element*NGLL3;
@@ -108,7 +117,7 @@
realw* d_xix, realw* d_xiy, realw* d_xiz,
realw* d_etax, realw* d_etay, realw* d_etaz,
realw* d_gammax, realw* d_gammay, realw* d_gammaz,
- realw* hprime_xx, realw* hprime_yy, realw* hprime_zz,
+ realw* d_hprime_xx,
realw* hprimewgll_xx, realw* hprimewgll_yy, realw* hprimewgll_zz,
realw* wgllwgll_xy,realw* wgllwgll_xz,realw* wgllwgll_yz,
int GRAVITY,
@@ -125,12 +134,10 @@
int bx = blockIdx.y*gridDim.x+blockIdx.x;
int tx = threadIdx.x;
- //const int NGLL3 = NGLL3;
- const int NGLL3_ALIGN = NGLL3_PADDED;
// R_EARTH_KM is the radius of the bottom of the oceans (radius of Earth in km)
- const reald R_EARTH_KM = 6371.0f;
+ //const realw R_EARTH_KM = 6371.0f;
// uncomment line below for PREM with oceans
- //const reald R_EARTH_KM = 6368.0f;
+ //const realw R_EARTH_KM = 6368.0f;
int K = (tx/NGLL2);
int J = ((tx-K*NGLL2)/NGLLX);
@@ -139,32 +146,34 @@
int active,offset;
int iglob = 0;
int working_element;
- reald temp1l,temp2l,temp3l;
- reald xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
- reald dpotentialdxl,dpotentialdyl,dpotentialdzl;
- reald dpotentialdx_with_rot,dpotentialdy_with_rot;
- reald fac1,fac2,fac3;
- reald sum_terms;
- reald gravity_term;
- reald gxl,gyl,gzl;
- reald radius,theta,phi;
- reald cos_theta,sin_theta,cos_phi,sin_phi;
- reald grad_x_ln_rho,grad_y_ln_rho,grad_z_ln_rho;
+
+ realw temp1l,temp2l,temp3l;
+ realw xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl,jacobianl;
+ realw dpotentialdxl,dpotentialdyl,dpotentialdzl;
+ realw dpotentialdx_with_rot,dpotentialdy_with_rot;
+ realw fac1,fac2,fac3;
+ realw sum_terms;
+ realw gravity_term;
+ realw gxl,gyl,gzl;
+ realw radius,theta,phi;
+ realw cos_theta,sin_theta,cos_phi,sin_phi;
+ realw grad_x_ln_rho,grad_y_ln_rho,grad_z_ln_rho;
int int_radius;
#ifndef MANUALLY_UNROLLED_LOOPS
int l;
int offset1,offset2,offset3;
- realw hp1,hp2,hp3;
#endif
- __shared__ reald s_dummy_loc[NGLL3];
+ __shared__ realw s_dummy_loc[NGLL3];
- __shared__ reald s_temp1[NGLL3];
- __shared__ reald s_temp2[NGLL3];
- __shared__ reald s_temp3[NGLL3];
+ __shared__ realw s_temp1[NGLL3];
+ __shared__ realw s_temp2[NGLL3];
+ __shared__ realw s_temp3[NGLL3];
+ __shared__ realw sh_hprime_xx[NGLL2];
+
// use only NGLL^3 = 125 active threads, plus 3 inactive/ghost threads,
// because we used memory padding from NGLL^3 = 125 to 128 to get coalescent memory accesses
active = (tx < NGLL3 && bx < nb_blocks_to_compute) ? 1:0;
@@ -185,31 +194,32 @@
}
#endif
- // iglob = d_ibool[working_element*NGLL3_ALIGN + tx]-1;
+ // iglob = d_ibool[working_element*NGLL3_PADDED + tx]-1;
iglob = d_ibool[working_element*NGLL3 + tx]-1;
-#ifdef USE_TEXTURES
- s_dummy_loc[tx] = tex1Dfetch(tex_potential, iglob);
+#ifdef USE_TEXTURES_FIELDS
+ s_dummy_loc[tx] = tex1Dfetch(d_displ_oc_tex, iglob);
#else
// changing iglob indexing to match fortran row changes fast style
s_dummy_loc[tx] = d_potential[iglob];
#endif
}
+ if (tx < NGLL2) {
+#ifdef USE_TEXTURES_CONSTANTS
+ sh_hprime_xx[tx] = tex1Dfetch(d_hprime_xx_oc_tex,tx);
+#else
+ sh_hprime_xx[tx] = d_hprime_xx[tx];
+#endif
+ }
+
// synchronize all the threads (one thread for each of the NGLL grid points of the
// current spectral element) because we need the whole element to be ready in order
// to be able to compute the matrix products along cut planes of the 3D element below
__syncthreads();
-#ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
-
if (active) {
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-// if(iglob == 0 )printf("kernel 2: iglob %i hprime_xx %f %f %f \n",iglob,hprime_xx[0],hprime_xx[1],hprime_xx[2]);
-#endif
-
-
#ifndef MANUALLY_UNROLLED_LOOPS
temp1l = 0.f;
@@ -217,43 +227,43 @@
temp3l = 0.f;
for (l=0;l<NGLLX;l++) {
- hp1 = hprime_xx[l*NGLLX+I];
- offset1 = K*NGLL2+J*NGLLX+l;
- temp1l += s_dummy_loc[offset1]*hp1;
+ fac1 = sh_hprime_xx[l*NGLLX+I];
+ offset1 = K*NGLL2+J*NGLLX+l;
+ temp1l += s_dummy_loc[offset1]*fac1;
- //no more assumes that hprime_xx = hprime_yy = hprime_zz
- hp2 = hprime_yy[l*NGLLX+J];
- offset2 = K*NGLL2+l*NGLLX+I;
- temp2l += s_dummy_loc[offset2]*hp2;
+ //assumes that hprime_xx = hprime_yy = hprime_zz
+ fac2 = sh_hprime_xx[l*NGLLX+J];
+ offset2 = K*NGLL2+l*NGLLX+I;
+ temp2l += s_dummy_loc[offset2]*fac2;
- hp3 = hprime_zz[l*NGLLX+K];
- offset3 = l*NGLL2+J*NGLLX+I;
- temp3l += s_dummy_loc[offset3]*hp3;
+ fac3 = sh_hprime_xx[l*NGLLX+K];
+ offset3 = l*NGLL2+J*NGLLX+I;
+ temp3l += s_dummy_loc[offset3]*fac3;
}
#else
- temp1l = s_dummy_loc[K*NGLL2+J*NGLLX]*hprime_xx[I]
- + s_dummy_loc[K*NGLL2+J*NGLLX+1]*hprime_xx[NGLLX+I]
- + s_dummy_loc[K*NGLL2+J*NGLLX+2]*hprime_xx[2*NGLLX+I]
- + s_dummy_loc[K*NGLL2+J*NGLLX+3]*hprime_xx[3*NGLLX+I]
- + s_dummy_loc[K*NGLL2+J*NGLLX+4]*hprime_xx[4*NGLLX+I];
+ temp1l = s_dummy_loc[K*NGLL2+J*NGLLX]*d_hprime_xx[I]
+ + s_dummy_loc[K*NGLL2+J*NGLLX+1]*d_hprime_xx[NGLLX+I]
+ + s_dummy_loc[K*NGLL2+J*NGLLX+2]*d_hprime_xx[2*NGLLX+I]
+ + s_dummy_loc[K*NGLL2+J*NGLLX+3]*d_hprime_xx[3*NGLLX+I]
+ + s_dummy_loc[K*NGLL2+J*NGLLX+4]*d_hprime_xx[4*NGLLX+I];
- temp2l = s_dummy_loc[K*NGLL2+I]*hprime_yy[J]
- + s_dummy_loc[K*NGLL2+NGLLX+I]*hprime_yy[NGLLX+J]
- + s_dummy_loc[K*NGLL2+2*NGLLX+I]*hprime_yy[2*NGLLX+J]
- + s_dummy_loc[K*NGLL2+3*NGLLX+I]*hprime_yy[3*NGLLX+J]
- + s_dummy_loc[K*NGLL2+4*NGLLX+I]*hprime_yy[4*NGLLX+J];
+ temp2l = s_dummy_loc[K*NGLL2+I]*d_hprime_xx[J]
+ + s_dummy_loc[K*NGLL2+NGLLX+I]*d_hprime_xx[NGLLX+J]
+ + s_dummy_loc[K*NGLL2+2*NGLLX+I]*d_hprime_xx[2*NGLLX+J]
+ + s_dummy_loc[K*NGLL2+3*NGLLX+I]*d_hprime_xx[3*NGLLX+J]
+ + s_dummy_loc[K*NGLL2+4*NGLLX+I]*d_hprime_xx[4*NGLLX+J];
- temp3l = s_dummy_loc[J*NGLLX+I]*hprime_zz[K]
- + s_dummy_loc[NGLL2+J*NGLLX+I]*hprime_zz[NGLLX+K]
- + s_dummy_loc[2*NGLL2+J*NGLLX+I]*hprime_zz[2*NGLLX+K]
- + s_dummy_loc[3*NGLL2+J*NGLLX+I]*hprime_zz[3*NGLLX+K]
- + s_dummy_loc[4*NGLL2+J*NGLLX+I]*hprime_zz[4*NGLLX+K];
+ temp3l = s_dummy_loc[J*NGLLX+I]*d_hprime_xx[K]
+ + s_dummy_loc[NGLL2+J*NGLLX+I]*d_hprime_xx[NGLLX+K]
+ + s_dummy_loc[2*NGLL2+J*NGLLX+I]*d_hprime_xx[2*NGLLX+K]
+ + s_dummy_loc[3*NGLL2+J*NGLLX+I]*d_hprime_xx[3*NGLLX+K]
+ + s_dummy_loc[4*NGLL2+J*NGLLX+I]*d_hprime_xx[4*NGLLX+K];
#endif
// compute derivatives of ux, uy and uz with respect to x, y and z
- offset = working_element*NGLL3_ALIGN + tx;
+ offset = working_element*NGLL3_PADDED + tx;
xixl = d_xix[offset];
xiyl = d_xiy[offset];
@@ -414,41 +424,41 @@
fac3 = wgllwgll_xy[J*NGLLX+I];
sum_terms = -(fac1*temp1l + fac2*temp2l + fac3*temp3l);
+
if( GRAVITY ) sum_terms += gravity_term;
- iglob = d_ibool[working_element*NGLL3 + tx]-1;
+ //iglob = d_ibool[working_element*NGLL3 + tx]-1;
-#ifdef USE_TEXTURES
- d_potential_dot_dot[iglob] = tex1Dfetch(tex_potential_dot_dot, iglob)
- + sum_terms;
-#else
-
#ifdef USE_MESH_COLORING_GPU
// no atomic operation needed, colors don't share global points between elements
+
+#ifdef USE_TEXTURES_FIELDS
+ d_potential_dot_dot[iglob] = tex1Dfetch(d_accel_oc_tex, iglob) + sum_terms;
+#else
d_potential_dot_dot[iglob] += sum_terms;
-#else
+#endif // USE_TEXTURES_FIELDS
+
+#else // MESH_COLORING
+
//mesh coloring
if( use_mesh_coloring_gpu ){
// no atomic operation needed, colors don't share global points between elements
+#ifdef USE_TEXTURES_FIELDS
+ d_potential_dot_dot[iglob] = tex1Dfetch(d_accel_oc_tex, iglob) + sum_terms;
+#else
d_potential_dot_dot[iglob] += sum_terms;
+#endif // USE_TEXTURES_FIELDS
}else{
atomicAdd(&d_potential_dot_dot[iglob],sum_terms);
}
-#endif
-
-#endif
+#endif // MESH_COLORING
}
-
-#else // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
- d_potential_dot_dot[iglob] = 123.123f;
-#endif // of #ifndef MAKE_KERNEL2_BECOME_STUPID_FOR_TESTS
}
-
/* ----------------------------------------------------------------------------------------------- */
void Kernel_2_outer_core(int nb_blocks_to_compute, Mesh* mp,
@@ -488,45 +498,18 @@
// cudaEventRecord( start, 0 );
Kernel_2_outer_core_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,
- mp->NGLOB_OUTER_CORE,
- d_ibool,
- mp->d_phase_ispec_inner_outer_core,
- mp->num_phase_ispec_outer_core,
- d_iphase,
- mp->use_mesh_coloring_gpu,
- mp->d_displ_outer_core,
- mp->d_accel_outer_core,
- d_xix, d_xiy, d_xiz,
- d_etax, d_etay, d_etaz,
- d_gammax, d_gammay, d_gammaz,
- mp->d_hprime_xx, mp->d_hprime_yy, mp->d_hprime_zz,
- mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
- mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
- mp->gravity,
- mp->d_xstore_outer_core,mp->d_ystore_outer_core,mp->d_zstore_outer_core,
- mp->d_d_ln_density_dr_table,
- mp->d_minus_rho_g_over_kappa_fluid,
- mp->d_wgll_cube,
- mp->rotation,
- time,
- mp->d_two_omega_earth,
- mp->d_deltat,
- d_A_array_rotation,d_B_array_rotation);
-
- if(mp->simulation_type == 3) {
- Kernel_2_outer_core_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,
mp->NGLOB_OUTER_CORE,
d_ibool,
mp->d_phase_ispec_inner_outer_core,
mp->num_phase_ispec_outer_core,
d_iphase,
mp->use_mesh_coloring_gpu,
- mp->d_b_displ_outer_core,
- mp->d_b_accel_outer_core,
+ mp->d_displ_outer_core,
+ mp->d_accel_outer_core,
d_xix, d_xiy, d_xiz,
d_etax, d_etay, d_etaz,
d_gammax, d_gammay, d_gammaz,
- mp->d_hprime_xx, mp->d_hprime_yy, mp->d_hprime_zz,
+ mp->d_hprime_xx,
mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
mp->gravity,
@@ -535,10 +518,37 @@
mp->d_minus_rho_g_over_kappa_fluid,
mp->d_wgll_cube,
mp->rotation,
- b_time,
- mp->d_b_two_omega_earth,
- mp->d_b_deltat,
- d_b_A_array_rotation,d_b_B_array_rotation);
+ time,
+ mp->d_two_omega_earth,
+ mp->d_deltat,
+ d_A_array_rotation,d_B_array_rotation);
+
+ if(mp->simulation_type == 3) {
+ Kernel_2_outer_core_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,
+ mp->NGLOB_OUTER_CORE,
+ d_ibool,
+ mp->d_phase_ispec_inner_outer_core,
+ mp->num_phase_ispec_outer_core,
+ d_iphase,
+ mp->use_mesh_coloring_gpu,
+ mp->d_b_displ_outer_core,
+ mp->d_b_accel_outer_core,
+ d_xix, d_xiy, d_xiz,
+ d_etax, d_etay, d_etaz,
+ d_gammax, d_gammay, d_gammaz,
+ mp->d_hprime_xx,
+ mp->d_hprimewgll_xx, mp->d_hprimewgll_yy, mp->d_hprimewgll_zz,
+ mp->d_wgllwgll_xy, mp->d_wgllwgll_xz, mp->d_wgllwgll_yz,
+ mp->gravity,
+ mp->d_xstore_outer_core,mp->d_ystore_outer_core,mp->d_zstore_outer_core,
+ mp->d_d_ln_density_dr_table,
+ mp->d_minus_rho_g_over_kappa_fluid,
+ mp->d_wgll_cube,
+ mp->rotation,
+ b_time,
+ mp->d_b_two_omega_earth,
+ mp->d_b_deltat,
+ d_b_A_array_rotation,d_b_B_array_rotation);
}
// cudaEventRecord( stop, 0 );
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu 2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/compute_kernels_cuda.cu 2012-07-23 21:58:26 UTC (rev 20536)
@@ -352,8 +352,6 @@
realw* scalar_field,
realw* vector_field_element,
realw* hprime_xx,
- realw* hprime_yy,
- realw* hprime_zz,
realw* d_xix,
realw* d_xiy,
realw* d_xiz,
@@ -370,7 +368,7 @@
int l,offset,offset1,offset2,offset3;
//const int NGLLX = 5;
- const int NGLL3_ALIGN = NGLL3_PADDED;
+ //const int NGLL3_ALIGN = NGLL3_PADDED;
int K = (ijk/NGLL2);
int J = ((ijk-K*NGLL2)/NGLLX);
@@ -387,7 +385,8 @@
// derivative along y
temp2l = 0.f;
for( l=0; l<NGLLX;l++){
- hp2 = hprime_yy[l*NGLLX+J];
+ //assumes that hprime_xx = hprime_yy = hprime_zz
+ hp2 = hprime_xx[l*NGLLX+J];
offset2 = K*NGLL2+l*NGLLX+I;
temp2l += scalar_field[offset2]*hp2;
}
@@ -395,12 +394,13 @@
// derivative along z
temp3l = 0.f;
for( l=0; l<NGLLX;l++){
- hp3 = hprime_zz[l*NGLLX+K];
+ //assumes that hprime_xx = hprime_yy = hprime_zz
+ hp3 = hprime_xx[l*NGLLX+K];
offset3 = l*NGLL2+J*NGLLX+I;
temp3l += scalar_field[offset3]*hp3;
}
- offset = ispec*NGLL3_ALIGN + ijk;
+ offset = ispec*NGLL3_PADDED + ijk;
xixl = d_xix[offset];
xiyl = d_xiy[offset];
@@ -429,8 +429,6 @@
realw* rhostore,
realw* kappastore,
realw* hprime_xx,
- realw* hprime_yy,
- realw* hprime_zz,
realw* d_xix,
realw* d_xiy,
realw* d_xiz,
@@ -476,12 +474,12 @@
// displacement vector from backward field
compute_gradient_kernel(ijk,ispec,scalar_field_displ,b_displ_elm,
- hprime_xx,hprime_yy,hprime_zz,
+ hprime_xx,
d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz);
// acceleration vector
compute_gradient_kernel(ijk,ispec,scalar_field_accel,accel_elm,
- hprime_xx,hprime_yy,hprime_zz,
+ hprime_xx,
d_xix,d_xiy,d_xiz,d_etax,d_etay,d_etaz,d_gammax,d_gammay,d_gammaz);
// gets material parameter
@@ -530,8 +528,6 @@
mp->d_rhostore_outer_core,
mp->d_kappavstore_outer_core,
mp->d_hprime_xx,
- mp->d_hprime_yy,
- mp->d_hprime_zz,
mp->d_xix_outer_core,
mp->d_xiy_outer_core,
mp->d_xiz_outer_core,
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu 2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/initialize_cuda.cu 2012-07-23 21:58:26 UTC (rev 20536)
@@ -39,7 +39,6 @@
#include "config.h"
#include "mesh_constants_cuda.h"
-#include "prepare_constants_cuda.h"
/* ----------------------------------------------------------------------------------------------- */
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h 2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/mesh_constants_cuda.h 2012-07-23 21:58:26 UTC (rev 20536)
@@ -112,23 +112,58 @@
#define IREGION_CRUST_MANTLE 1
#define IREGION_INNER_CORE 3
+// R_EARTH_KM is the radius of the bottom of the oceans (radius of Earth in km)
+#define R_EARTH_KM 6371.0f
+// uncomment line below for PREM with oceans
+//#define R_EARTH_KM 6368.0f
+
+
/* ----------------------------------------------------------------------------------------------- */
-//typedef float real; // type of variables passed into function
-typedef float realw; // type of "working" variables
+// type of "working" variables: see also CUSTOM_REAL
+// double precision temporary variables leads to 10% performance decrease
+// in Kernel_2_impl (not very much..)
+typedef float realw;
-// double precision temporary variables leads to 10% performance
-// decrease in Kernel_2_impl (not very much..)
-typedef float reald;
+/* ----------------------------------------------------------------------------------------------- */
+
// (optional) pre-processing directive used in kernels: if defined check that it is also set in src/shared/constants.h:
// leads up to ~ 5% performance increase
//#define USE_MESH_COLORING_GPU
+/* ----------------------------------------------------------------------------------------------- */
+
+// Texture memory usage:
+// requires CUDA version >= 4.0, see check below
+// Use textures for d_displ and d_accel -- 10% performance boost
+#define USE_TEXTURES_FIELDS
+
+// Using texture memory for the hprime-style constants is slower on
+// Fermi generation hardware, but *may* be faster on Kepler
+// generation.
+// Use textures for hprime_xx
+#define USE_TEXTURES_CONSTANTS
+
+// CUDA version >= 4.0 needed for cudaTextureType1D and cudaDeviceSynchronize()
+#if CUDA_VERSION < 4000
+#undef USE_TEXTURES_FIELDS
+#undef USE_TEXTURES_CONSTANTS
+#endif
+
+#ifdef USE_TEXTURES_FIELDS
+#pragma message ("\nCompiling with: USE_TEXTURES_FIELDS enabled\n")
+#endif
+#ifdef USE_TEXTURES_CONSTANTS
+#pragma message ("\nCompiling with: USE_TEXTURES_CONSTANTS enabled\n")
+#endif
+
// (optional) unrolling loops
// leads up to ~1% performance increase
//#define MANUALLY_UNROLLED_LOOPS
+/* ----------------------------------------------------------------------------------------------- */
+
// cuda kernel block size for updating displacements/potential (newmark time scheme)
// current hardware: 128 is slightly faster than 256 ( ~ 4%)
#define BLOCKSIZE_KERNEL1 128
@@ -221,6 +256,12 @@
// backward/reconstructed elastic wavefield
realw* d_b_displ_crust_mantle; realw* d_b_veloc_crust_mantle; realw* d_b_accel_crust_mantle;
+#ifdef USE_TEXTURES_FIELDS
+ // Texture references for fast non-coalesced scattered access
+ const textureReference* d_displ_cm_tex_ref_ptr;
+ const textureReference* d_accel_cm_tex_ref_ptr;
+#endif
+
// attenuation
realw* d_R_xx_crust_mantle;
realw* d_R_yy_crust_mantle;
@@ -305,6 +346,12 @@
// backward/reconstructed elastic wavefield
realw* d_b_displ_outer_core; realw* d_b_veloc_outer_core; realw* d_b_accel_outer_core;
+#ifdef USE_TEXTURES_FIELDS
+ // Texture references for fast non-coalesced scattered access
+ const textureReference* d_displ_oc_tex_ref_ptr;
+ const textureReference* d_accel_oc_tex_ref_ptr;
+#endif
+
// kernels
realw* d_rho_kl_outer_core;
realw* d_alpha_kl_outer_core;
@@ -369,6 +416,12 @@
// backward/reconstructed elastic wavefield
realw* d_b_displ_inner_core; realw* d_b_veloc_inner_core; realw* d_b_accel_inner_core;
+#ifdef USE_TEXTURES_FIELDS
+ // Texture references for fast non-coalesced scattered access
+ const textureReference* d_displ_ic_tex_ref_ptr;
+ const textureReference* d_accel_ic_tex_ref_ptr;
+#endif
+
// attenuation
realw* d_R_xx_inner_core;
realw* d_R_yy_inner_core;
@@ -447,7 +500,15 @@
// ------------------------------------------------------------------ //
// pointers to constant memory arrays
- realw* d_hprime_xx; realw* d_hprime_yy; realw* d_hprime_zz;
+ realw* d_hprime_xx;
+ //realw* d_hprime_yy; // only needed if NGLLX != NGLLY != NGLLZ
+ //realw* d_hprime_zz; // only needed if NGLLX != NGLLY != NGLLZ
+
+#ifdef USE_TEXTURES_CONSTANTS
+ const textureReference* d_hprime_xx_tex_ptr;
+ realw* d_hprime_xx_tex;
+#endif
+
realw* d_hprimewgll_xx; realw* d_hprimewgll_yy; realw* d_hprimewgll_zz;
realw* d_wgllwgll_xy; realw* d_wgllwgll_xz; realw* d_wgllwgll_yz;
realw* d_wgll_cube;
@@ -461,12 +522,12 @@
// simulation flags
int save_forward;
int absorbing_conditions;
-
+
int attenuation;
int attenuation_new;
int use_attenuation_mimic;
int attenuation_3D;
-
+
int compute_and_store_strain;
int anisotropic_3D_mantle;
int gravity;
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_constants_cuda.h 2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_constants_cuda.h 2012-07-23 21:58:26 UTC (rev 20536)
@@ -33,111 +33,231 @@
/* ----------------------------------------------------------------------------------------------- */
-// setters for these const arrays (very ugly hack, but will have to do)
+// CONSTANT arrays setup
-// elastic
-void setConst_hprime_xx(realw* array,Mesh* mp);
-void setConst_hprime_yy(realw* array,Mesh* mp);
-void setConst_hprime_zz(realw* array,Mesh* mp);
+/* ----------------------------------------------------------------------------------------------- */
-void setConst_hprimewgll_xx(realw* array,Mesh* mp);
-void setConst_hprimewgll_yy(realw* array,Mesh* mp);
-void setConst_hprimewgll_zz(realw* array,Mesh* mp);
+/* note:
+ constant arrays when used in other compute_forces_***_cuda.cu routines stay zero,
+ constant declaration and cudaMemcpyToSymbol would have to be in the same file...
-void setConst_wgllwgll_xy(realw* array,Mesh* mp);
-void setConst_wgllwgll_xz(realw* array, Mesh* mp);
-void setConst_wgllwgll_yz(realw* array, Mesh* mp);
+ extern keyword doesn't work for __constant__ declarations.
-void setConst_wgll_cube(realw* array, Mesh* mp);
+ also:
+ cudaMemcpyToSymbol("deviceCaseParams", caseParams, sizeof(CaseParams));
+ ..
+ and compile with -arch=sm_20
-/* ----------------------------------------------------------------------------------------------- */
+ see also: http://stackoverflow.com/questions/4008031/how-to-use-cuda-constant-memory-in-a-programmer-pleasant-way
+ doesn't seem to work.
-/* CUDA specific things from specfem3D_kernels.cu */
+ we could keep arrays separated for acoustic and elastic routines...
-#ifdef USE_TEXTURES
- // declaration of textures
- texture<realw, 1, cudaReadModeElementType> tex_displ;
- texture<realw, 1, cudaReadModeElementType> tex_veloc;
- texture<realw, 1, cudaReadModeElementType> tex_accel;
+ workaround:
- texture<realw, 1, cudaReadModeElementType> tex_potential;
- texture<realw, 1, cudaReadModeElementType> tex_potential_dot_dot;
+ for now, we store pointers with cudaGetSymbolAddress() function calls.
+ we pass those pointers in all other compute_forces_..() routines
- // for binding the textures
+ in this file, we can use the above constant array declarations without need of the pointers.
- void bindTexturesDispl(realw* d_displ)
- {
- cudaError_t err;
+ */
- cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
+// cuda constant arrays
+//
+// note: we use the __device__ qualifier to place these arrays in global memory rather than
+//          in constant memory, to avoid over-loading registers; this should help increase the occupancy on the GPU
- err = cudaBindTexture(NULL,tex_displ, d_displ, channelDescFloat, NDIM*NGLOB*sizeof(realw));
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Error in bindTexturesDispl for displ: %s\n", cudaGetErrorString(err));
- exit(1);
- }
- }
+__device__ realw d_hprime_xx[NGLL2];
+//__device__ realw d_hprime_yy[NGLL2]; // only needed if NGLLX != NGLLY != NGLLZ
+//__device__ realw d_hprime_zz[NGLL2]; // only needed if NGLLX != NGLLY != NGLLZ
- void bindTexturesVeloc(realw* d_veloc)
+__device__ realw d_hprimewgll_xx[NGLL2];
+__device__ realw d_hprimewgll_yy[NGLL2];
+__device__ realw d_hprimewgll_zz[NGLL2];
+
+__device__ realw d_wgllwgll_xy[NGLL2];
+__device__ realw d_wgllwgll_xz[NGLL2];
+__device__ realw d_wgllwgll_yz[NGLL2];
+
+__device__ realw d_wgll_cube[NGLL3]; // needed only for gravity case
+
+
+// setup functions
+void setConst_hprime_xx(realw* array,Mesh* mp)
+{
+
+ cudaError_t err = cudaMemcpyToSymbol(d_hprime_xx, array, NGLL2*sizeof(realw));
+ if (err != cudaSuccess)
{
- cudaError_t err;
+ fprintf(stderr, "Error in setConst_hprime_xx: %s\n", cudaGetErrorString(err));
+ fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
+ exit(1);
+ }
- cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
+ err = cudaGetSymbolAddress((void**)&(mp->d_hprime_xx),"d_hprime_xx");
+ if(err != cudaSuccess) {
+ fprintf(stderr, "Error with d_hprime_xx: %s\n", cudaGetErrorString(err));
+ exit(1);
+ }
+}
- err = cudaBindTexture(NULL,tex_veloc, d_veloc, channelDescFloat, NDIM*NGLOB*sizeof(realw));
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Error in bindTexturesVeloc for veloc: %s\n", cudaGetErrorString(err));
- exit(1);
- }
+/*
+ // only needed if NGLLX != NGLLY != NGLLZ
+ void setConst_hprime_yy(realw* array,Mesh* mp)
+ {
+
+ cudaError_t err = cudaMemcpyToSymbol(d_hprime_yy, array, NGLL2*sizeof(realw));
+ if (err != cudaSuccess)
+ {
+ fprintf(stderr, "Error in setConst_hprime_yy: %s\n", cudaGetErrorString(err));
+ fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
+ exit(1);
+ }
+
+ err = cudaGetSymbolAddress((void**)&(mp->d_hprime_yy),"d_hprime_yy");
+ if(err != cudaSuccess) {
+ fprintf(stderr, "Error with d_hprime_yy: %s\n", cudaGetErrorString(err));
+ exit(1);
+ }
+ }
+ */
+
+/*
+ // only needed if NGLLX != NGLLY != NGLLZ
+ void setConst_hprime_zz(realw* array,Mesh* mp)
+ {
+
+ cudaError_t err = cudaMemcpyToSymbol(d_hprime_zz, array, NGLL2*sizeof(realw));
+ if (err != cudaSuccess)
+ {
+ fprintf(stderr, "Error in setConst_hprime_zz: %s\n", cudaGetErrorString(err));
+ fprintf(stderr, "The problem is maybe -arch sm_13 instead of -arch sm_11 in the Makefile, please doublecheck\n");
+ exit(1);
+ }
+
+ err = cudaGetSymbolAddress((void**)&(mp->d_hprime_zz),"d_hprime_zz");
+ if(err != cudaSuccess) {
+ fprintf(stderr, "Error with d_hprime_zz: %s\n", cudaGetErrorString(err));
+ exit(1);
+ }
+ }
+ */
+
+void setConst_hprimewgll_xx(realw* array,Mesh* mp)
+{
+ cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_xx, array, NGLL2*sizeof(realw));
+ if (err != cudaSuccess)
+ {
+ fprintf(stderr, "Error in setConst_hprimewgll_xx: %s\n", cudaGetErrorString(err));
+ exit(1);
}
- void bindTexturesAccel(realw* d_accel)
+ err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_xx),"d_hprimewgll_xx");
+ if(err != cudaSuccess) {
+ fprintf(stderr, "Error with d_hprimewgll_xx: %s\n", cudaGetErrorString(err));
+ exit(1);
+ }
+}
+
+void setConst_hprimewgll_yy(realw* array,Mesh* mp)
+{
+ cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_yy, array, NGLL2*sizeof(realw));
+ if (err != cudaSuccess)
{
- cudaError_t err;
+ fprintf(stderr, "Error in setConst_hprimewgll_yy: %s\n", cudaGetErrorString(err));
+ exit(1);
+ }
- cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
+ err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_yy),"d_hprimewgll_yy");
+ if(err != cudaSuccess) {
+ fprintf(stderr, "Error with d_hprimewgll_yy: %s\n", cudaGetErrorString(err));
+ exit(1);
+ }
+}
- err = cudaBindTexture(NULL,tex_accel, d_accel, channelDescFloat, NDIM*NGLOB*sizeof(realw));
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Error in bindTexturesAccel for accel: %s\n", cudaGetErrorString(err));
- exit(1);
- }
+void setConst_hprimewgll_zz(realw* array,Mesh* mp)
+{
+ cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_zz, array, NGLL2*sizeof(realw));
+ if (err != cudaSuccess)
+ {
+ fprintf(stderr, "Error in setConst_hprimewgll_zz: %s\n", cudaGetErrorString(err));
+ exit(1);
}
- void bindTexturesPotential(realw* d_potential)
+ err = cudaGetSymbolAddress((void**)&(mp->d_hprimewgll_zz),"d_hprimewgll_zz");
+ if(err != cudaSuccess) {
+ fprintf(stderr, "Error with d_hprimewgll_zz: %s\n", cudaGetErrorString(err));
+ exit(1);
+ }
+}
+
+void setConst_wgllwgll_xy(realw* array,Mesh* mp)
+{
+ cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_xy, array, NGLL2*sizeof(realw));
+ if (err != cudaSuccess)
{
- cudaError_t err;
+ fprintf(stderr, "Error in setConst_wgllwgll_xy: %s\n", cudaGetErrorString(err));
+ exit(1);
+ }
- cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
+ err = cudaGetSymbolAddress((void**)&(mp->d_wgllwgll_xy),"d_wgllwgll_xy");
+ if(err != cudaSuccess) {
+ fprintf(stderr, "Error with d_wgllwgll_xy: %s\n", cudaGetErrorString(err));
+ exit(1);
+ }
- err = cudaBindTexture(NULL,tex_potential, d_potential,
- channelDescFloat, NGLOB*sizeof(realw));
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Error in bindTexturesPotential for potential: %s\n", cudaGetErrorString(err));
- exit(1);
- }
+}
+
+void setConst_wgllwgll_xz(realw* array,Mesh* mp)
+{
+ cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_xz, array, NGLL2*sizeof(realw));
+ if (err != cudaSuccess)
+ {
+ fprintf(stderr, "Error in setConst_wgllwgll_xz: %s\n", cudaGetErrorString(err));
+ exit(1);
}
- void bindTexturesPotential_dot_dot(realw* d_potential_dot_dot)
+ err = cudaGetSymbolAddress((void**)&(mp->d_wgllwgll_xz),"d_wgllwgll_xz");
+ if(err != cudaSuccess) {
+ fprintf(stderr, "Error with d_wgllwgll_xz: %s\n", cudaGetErrorString(err));
+ exit(1);
+ }
+
+}
+
+void setConst_wgllwgll_yz(realw* array,Mesh* mp)
+{
+ cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_yz, array, NGLL2*sizeof(realw));
+ if (err != cudaSuccess)
{
- cudaError_t err;
+ fprintf(stderr, "Error in setConst_wgllwgll_yz: %s\n", cudaGetErrorString(err));
+ exit(1);
+ }
- cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
+ err = cudaGetSymbolAddress((void**)&(mp->d_wgllwgll_yz),"d_wgllwgll_yz");
+ if(err != cudaSuccess) {
+ fprintf(stderr, "Error with d_wgllwgll_yz: %s\n", cudaGetErrorString(err));
+ exit(1);
+ }
- err = cudaBindTexture(NULL,tex_potential_dot_dot, d_potential_dot_dot,
- channelDescFloat, NGLOB*sizeof(realw));
- if (err != cudaSuccess)
- {
- fprintf(stderr, "Error in bindTexturesPotential_dot_dot for potential_dot_dot: %s\n", cudaGetErrorString(err));
- exit(1);
- }
+}
+
+void setConst_wgll_cube(realw* array,Mesh* mp)
+{
+ cudaError_t err = cudaMemcpyToSymbol(d_wgll_cube, array, NGLL3*sizeof(realw));
+ if (err != cudaSuccess)
+ {
+ fprintf(stderr, "Error in setConst_wgll_cube: %s\n", cudaGetErrorString(err));
+ exit(1);
}
-#endif // USE_TEXTURES
+ err = cudaGetSymbolAddress((void**)&(mp->d_wgll_cube),"d_wgll_cube");
+ if(err != cudaSuccess) {
+ fprintf(stderr, "Error with d_wgll_cube: %s\n", cudaGetErrorString(err));
+ exit(1);
+ }
+}
+
#endif //CUDA_HEADER_H
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu 2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu 2012-07-23 21:58:26 UTC (rev 20536)
@@ -42,7 +42,6 @@
#include "prepare_constants_cuda.h"
-
/* ----------------------------------------------------------------------------------------------- */
// GPU preparation
@@ -101,15 +100,41 @@
// sets constant arrays
setConst_hprime_xx(h_hprime_xx,mp);
- setConst_hprime_yy(h_hprime_yy,mp);
- setConst_hprime_zz(h_hprime_zz,mp);
+ //setConst_hprime_yy(h_hprime_yy,mp); // only needed if NGLLX != NGLLY != NGLLZ
+ //setConst_hprime_zz(h_hprime_zz,mp); // only needed if NGLLX != NGLLY != NGLLZ
+
setConst_hprimewgll_xx(h_hprimewgll_xx,mp);
setConst_hprimewgll_yy(h_hprimewgll_yy,mp);
setConst_hprimewgll_zz(h_hprimewgll_zz,mp);
+
setConst_wgllwgll_xy(h_wgllwgll_xy,mp);
setConst_wgllwgll_xz(h_wgllwgll_xz,mp);
setConst_wgllwgll_yz(h_wgllwgll_yz,mp);
+ // Using texture memory for the hprime-style constants is slower on
+ // Fermi generation hardware, but *may* be faster on Kepler
+ // generation. We will reevaluate this again, so might as well leave
+  // in the code with USE_TEXTURES_CONSTANTS not defined.
+ #ifdef USE_TEXTURES_CONSTANTS
+ {
+ print_CUDA_error_if_any(cudaGetTextureReference(&(mp->d_hprime_xx_tex_ptr), "d_hprime_xx_cm_tex"), 1101);
+ cudaChannelFormatDesc channelDesc1 = cudaCreateChannelDesc<realw>();
+ print_CUDA_error_if_any(cudaBindTexture(0, mp->d_hprime_xx_tex_ptr, mp->d_hprime_xx,
+ &channelDesc1, sizeof(realw)*(NGLL2)), 1102);
+
+ print_CUDA_error_if_any(cudaGetTextureReference(&(mp->d_hprime_xx_tex_ptr), "d_hprime_xx_oc_tex"), 1103);
+ cudaChannelFormatDesc channelDesc2 = cudaCreateChannelDesc<realw>();
+ print_CUDA_error_if_any(cudaBindTexture(0, mp->d_hprime_xx_tex_ptr, mp->d_hprime_xx,
+ &channelDesc2, sizeof(realw)*(NGLL2)), 1104);
+
+ print_CUDA_error_if_any(cudaGetTextureReference(&(mp->d_hprime_xx_tex_ptr), "d_hprime_xx_ic_tex"), 1105);
+ cudaChannelFormatDesc channelDesc3 = cudaCreateChannelDesc<realw>();
+ print_CUDA_error_if_any(cudaBindTexture(0, mp->d_hprime_xx_tex_ptr, mp->d_hprime_xx,
+ &channelDesc3, sizeof(realw)*(NGLL2)), 1106);
+ }
+ #endif
+
+
// sets global parameters
mp->NSPEC_CRUST_MANTLE = *NSPEC_CRUST_MANTLE;
mp->NGLOB_CRUST_MANTLE = *NGLOB_CRUST_MANTLE;
@@ -130,12 +155,12 @@
mp->oceans = *OCEANS_f;
mp->gravity = *GRAVITY_f;
mp->rotation = *ROTATION_f;
-
+
mp->attenuation = *ATTENUATION_f;
mp->attenuation_new = *ATTENUATION_NEW_f;
mp->use_attenuation_mimic = *USE_ATTENUATION_MIMIC_f;
mp->attenuation_3D = *ATTENUATION_3D_VAL_f;
-
+
mp->compute_and_store_strain = *COMPUTE_AND_STORE_STRAIN_f;
mp->anisotropic_3D_mantle = *ANISOTROPIC_3D_MANTLE_f;
mp->anisotropic_inner_core = *ANISOTROPIC_INNER_CORE_f;
@@ -408,9 +433,9 @@
}else{
R_size1 = N_SLS*NGLL3*mp->NSPEC_CRUST_MANTLE;
R_size2 = 1*mp->NSPEC_CRUST_MANTLE;
- R_size3 = N_SLS*1*mp->NSPEC_CRUST_MANTLE;
+ R_size3 = N_SLS*1*mp->NSPEC_CRUST_MANTLE;
}
-
+
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_one_minus_sum_beta_crust_mantle,
R_size2*sizeof(realw)),4430);
print_CUDA_error_if_any(cudaMemcpy(mp->d_one_minus_sum_beta_crust_mantle,one_minus_sum_beta_crust_mantle,
@@ -455,9 +480,9 @@
}else{
R_size1 = N_SLS*NGLL3*mp->NSPEC_INNER_CORE;
R_size2 = 1*mp->NSPEC_INNER_CORE;
- R_size3 = N_SLS*1*mp->NSPEC_INNER_CORE;
+ R_size3 = N_SLS*1*mp->NSPEC_INNER_CORE;
}
-
+
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_one_minus_sum_beta_inner_core,
R_size2*sizeof(realw)),4430);
print_CUDA_error_if_any(cudaMemcpy(mp->d_one_minus_sum_beta_inner_core,one_minus_sum_beta_inner_core,
@@ -1497,6 +1522,23 @@
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel_crust_mantle),sizeof(realw)*size),4003);
}
+ #ifdef USE_TEXTURES_FIELDS
+ {
+ print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_displ_cm_tex_ref_ptr, "d_displ_cm_tex"), 4001);
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<realw>();
+ print_CUDA_error_if_any(cudaBindTexture(0, mp->d_displ_cm_tex_ref_ptr, mp->d_displ_crust_mantle,
+ &channelDesc, sizeof(realw)*size), 4001);
+ }
+
+ {
+ print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_accel_cm_tex_ref_ptr, "d_accel_cm_tex"), 4003);
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<realw>();
+ print_CUDA_error_if_any(cudaBindTexture(0, mp->d_accel_cm_tex_ref_ptr, mp->d_accel_crust_mantle,
+ &channelDesc, sizeof(realw)*size), 4003);
+ }
+ #endif
+
+
// mass matrices
if( *NCHUNKS_VAL != 6 && mp->absorbing_conditions){
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmassx_crust_mantle),sizeof(realw)*size_glob),2005);
@@ -1696,6 +1738,23 @@
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel_outer_core),sizeof(realw)*size),4003);
}
+ #ifdef USE_TEXTURES_FIELDS
+ {
+ print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_displ_oc_tex_ref_ptr, "d_displ_oc_tex"), 4001);
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<realw>();
+ print_CUDA_error_if_any(cudaBindTexture(0, mp->d_displ_oc_tex_ref_ptr, mp->d_displ_outer_core,
+ &channelDesc, sizeof(realw)*size), 4001);
+ }
+
+ {
+ print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_accel_oc_tex_ref_ptr, "d_accel_oc_tex"), 4003);
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<realw>();
+ print_CUDA_error_if_any(cudaBindTexture(0, mp->d_accel_oc_tex_ref_ptr, mp->d_accel_outer_core,
+ &channelDesc, sizeof(realw)*size), 4003);
+ }
+ #endif
+
+
// mass matrix
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_outer_core),sizeof(realw)*size_glob),2005);
print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_outer_core,h_rmass,
@@ -1892,6 +1951,23 @@
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel_inner_core),sizeof(realw)*size),4003);
}
+ #ifdef USE_TEXTURES_FIELDS
+ {
+ print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_displ_ic_tex_ref_ptr, "d_displ_ic_tex"), 4001);
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<realw>();
+ print_CUDA_error_if_any(cudaBindTexture(0, mp->d_displ_ic_tex_ref_ptr, mp->d_displ_inner_core,
+ &channelDesc, sizeof(realw)*size), 4001);
+ }
+
+ {
+ print_CUDA_error_if_any(cudaGetTextureReference(&mp->d_accel_ic_tex_ref_ptr, "d_accel_ic_tex"), 4003);
+ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<realw>();
+ print_CUDA_error_if_any(cudaBindTexture(0, mp->d_accel_ic_tex_ref_ptr, mp->d_accel_inner_core,
+ &channelDesc, sizeof(realw)*size), 4003);
+ }
+ #endif
+
+
// mass matrix
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_inner_core),sizeof(realw)*size_glob),2005);
print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_inner_core,h_rmass,
@@ -1953,362 +2029,10 @@
#endif
}
-/* ----------------------------------------------------------------------------------------------- */
-// for ELASTIC simulations
/* ----------------------------------------------------------------------------------------------- */
-/*
-extern "C"
-void FC_FUNC_(prepare_fields_elastic_device,
- PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer_f,
- int* size,
- realw* rmass,
- realw* rho_vp,
- realw* rho_vs,
- int* num_phase_ispec_elastic,
- int* phase_ispec_inner_elastic,
- int* ispec_is_elastic,
- int* ABSORBING_CONDITIONS,
- realw* h_b_absorb_field,
- int* h_b_reclen_field,
- int* SIMULATION_TYPE,int* SAVE_FORWARD,
- int* COMPUTE_AND_STORE_STRAIN,
- realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
- realw* epsilondev_xz,realw* epsilondev_yz,
- int* ATTENUATION,
- int* R_size,
- realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
- realw* one_minus_sum_beta,realw* factor_common,
- realw* alphaval,realw* betaval,realw* gammaval,
- int* OCEANS,
- realw* rmass_ocean_load,
- int* NOISE_TOMOGRAPHY,
- realw* free_surface_normal,
- int* free_surface_ispec,
- int* free_surface_ijk,
- int* num_free_surface_faces,
- int* ACOUSTIC_SIMULATION,
- int* num_colors_outer_elastic,
- int* num_colors_inner_elastic,
- int* num_elem_colors_elastic,
- int* ANISOTROPY,
- realw *c11store,
- realw *c12store,
- realw *c13store,
- realw *c14store,
- realw *c15store,
- realw *c16store,
- realw *c22store,
- realw *c23store,
- realw *c24store,
- realw *c25store,
- realw *c26store,
- realw *c33store,
- realw *c34store,
- realw *c35store,
- realw *c36store,
- realw *c44store,
- realw *c45store,
- realw *c46store,
- realw *c55store,
- realw *c56store,
- realw *c66store){
-
-TRACE("prepare_fields_elastic_device");
-
- Mesh* mp = (Mesh*)(*Mesh_pointer_f);
- // Assuming NGLLX==5. Padded is then 128 (5^3+3)
- int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
- int size_nonpadded = NGLL3 * (mp->NSPEC_AB);
-
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ),sizeof(realw)*(*size)),4001);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(realw)*(*size)),4002);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel),sizeof(realw)*(*size)),4003);
-
- // mpi buffer
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),
- 3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),4004);
-
- // mass matrix
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass),sizeof(realw)*mp->NGLOB_AB),4005);
- // transfer element data
- print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass,rmass,
- sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),4010);
-
-
- // element indices
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_ispec_is_elastic),mp->NSPEC_AB*sizeof(int)),4009);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_ispec_is_elastic,ispec_is_elastic,
- mp->NSPEC_AB*sizeof(int),cudaMemcpyHostToDevice),4012);
-
- // phase elements
- mp->num_phase_ispec_elastic = *num_phase_ispec_elastic;
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_elastic),
- mp->num_phase_ispec_elastic*2*sizeof(int)),4008);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_phase_ispec_inner_elastic,phase_ispec_inner_elastic,
- mp->num_phase_ispec_elastic*2*sizeof(int),cudaMemcpyHostToDevice),4011);
-
- // for seismograms
- if( mp->nrec_local > 0 ){
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_field),
- 3*NGLL3*(mp->nrec_local)*sizeof(realw)),4015);
-
- mp->h_station_seismo_field = (realw*) malloc( 3*NGLL3*(mp->nrec_local)*sizeof(realw) );
- if( mp->h_station_seismo_field == NULL) exit_on_error("h_station_seismo_field not allocated \n");
- }
-
- // absorbing conditions
- if( *ABSORBING_CONDITIONS && mp->d_num_abs_boundary_faces > 0){
- // non-padded arrays
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vp),size_nonpadded*sizeof(realw)),4006);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vs),size_nonpadded*sizeof(realw)),4007);
-
- // rho_vp, rho_vs non-padded; they are needed for stacey boundary condition
- print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vp, rho_vp,
- NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4013);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vs, rho_vs,
- NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4014);
-
- // absorb_field array used for file i/o
- if(*SIMULATION_TYPE == 3 || ( *SIMULATION_TYPE == 1 && *SAVE_FORWARD )){
- mp->d_b_reclen_field = *h_b_reclen_field;
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_absorb_field),
- mp->d_b_reclen_field),4016);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field, h_b_absorb_field,
- mp->d_b_reclen_field,cudaMemcpyHostToDevice),4017);
- }
- }
-
- // strains used for attenuation and kernel simulations
- if( *COMPUTE_AND_STORE_STRAIN ){
- // strains
- int epsilondev_size = NGLL3*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
-
- print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xx,
- epsilondev_size*sizeof(realw)),4301);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xx,epsilondev_xx,epsilondev_size*sizeof(realw),
- cudaMemcpyHostToDevice),4302);
- print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yy,
- epsilondev_size*sizeof(realw)),4302);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yy,epsilondev_yy,epsilondev_size*sizeof(realw),
- cudaMemcpyHostToDevice),4303);
- print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xy,
- epsilondev_size*sizeof(realw)),4304);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xy,epsilondev_xy,epsilondev_size*sizeof(realw),
- cudaMemcpyHostToDevice),4305);
- print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xz,
- epsilondev_size*sizeof(realw)),4306);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xz,epsilondev_xz,epsilondev_size*sizeof(realw),
- cudaMemcpyHostToDevice),4307);
- print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yz,
- epsilondev_size*sizeof(realw)),4308);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yz,epsilondev_yz,epsilondev_size*sizeof(realw),
- cudaMemcpyHostToDevice),4309);
-
- }
-
- // attenuation memory variables
- if( *ATTENUATION ){
- // memory arrays
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xx),
- (*R_size)*sizeof(realw)),4401);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xx,R_xx,(*R_size)*sizeof(realw),
- cudaMemcpyHostToDevice),4402);
-
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yy),
- (*R_size)*sizeof(realw)),4403);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yy,R_yy,(*R_size)*sizeof(realw),
- cudaMemcpyHostToDevice),4404);
-
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xy),
- (*R_size)*sizeof(realw)),4405);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xy,R_xy,(*R_size)*sizeof(realw),
- cudaMemcpyHostToDevice),4406);
-
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xz),
- (*R_size)*sizeof(realw)),4407);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xz,R_xz,(*R_size)*sizeof(realw),
- cudaMemcpyHostToDevice),4408);
-
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yz),
- (*R_size)*sizeof(realw)),4409);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yz,R_yz,(*R_size)*sizeof(realw),
- cudaMemcpyHostToDevice),4410);
-
- // attenuation factors
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_one_minus_sum_beta),
- NGLL3*mp->NSPEC_AB*sizeof(realw)),4430);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_one_minus_sum_beta ,one_minus_sum_beta,
- NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4431);
-
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_factor_common),
- N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw)),4432);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_factor_common ,factor_common,
- N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),4433);
-
- // alpha,beta,gamma factors
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_alphaval),
- N_SLS*sizeof(realw)),4434);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_alphaval ,alphaval,
- N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4435);
-
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_betaval),
- N_SLS*sizeof(realw)),4436);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_betaval ,betaval,
- N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4437);
-
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_gammaval),
- N_SLS*sizeof(realw)),4438);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaval ,gammaval,
- N_SLS*sizeof(realw),cudaMemcpyHostToDevice),4439);
-
- }
-
- // anisotropy
- if( *ANISOTROPY ){
- // allocates memory on GPU
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c11store),
- size_padded*sizeof(realw)),4700);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c12store),
- size_padded*sizeof(realw)),4701);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c13store),
- size_padded*sizeof(realw)),4702);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c14store),
- size_padded*sizeof(realw)),4703);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c15store),
- size_padded*sizeof(realw)),4704);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c16store),
- size_padded*sizeof(realw)),4705);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c22store),
- size_padded*sizeof(realw)),4706);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c23store),
- size_padded*sizeof(realw)),4707);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c24store),
- size_padded*sizeof(realw)),4708);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c25store),
- size_padded*sizeof(realw)),4709);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c26store),
- size_padded*sizeof(realw)),4710);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c33store),
- size_padded*sizeof(realw)),4711);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c34store),
- size_padded*sizeof(realw)),4712);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c35store),
- size_padded*sizeof(realw)),4713);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c36store),
- size_padded*sizeof(realw)),4714);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c44store),
- size_padded*sizeof(realw)),4715);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c45store),
- size_padded*sizeof(realw)),4716);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c46store),
- size_padded*sizeof(realw)),4717);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c55store),
- size_padded*sizeof(realw)),4718);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c56store),
- size_padded*sizeof(realw)),4719);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c66store),
- size_padded*sizeof(realw)),4720);
-
- // transfer constant element data with padding
- for(int i=0;i < mp->NSPEC_AB;i++) {
- print_CUDA_error_if_any(cudaMemcpy(mp->d_c11store + i*NGLL3_PADDED, &c11store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4800);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_c12store + i*NGLL3_PADDED, &c12store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4801);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_c13store + i*NGLL3_PADDED, &c13store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4802);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_c14store + i*NGLL3_PADDED, &c14store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4803);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_c15store + i*NGLL3_PADDED, &c15store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4804);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_c16store + i*NGLL3_PADDED, &c16store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4805);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_c22store + i*NGLL3_PADDED, &c22store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4806);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_c23store + i*NGLL3_PADDED, &c23store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4807);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_c24store + i*NGLL3_PADDED, &c24store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4808);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_c25store + i*NGLL3_PADDED, &c25store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4809);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_c26store + i*NGLL3_PADDED, &c26store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4810);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_c33store + i*NGLL3_PADDED, &c33store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4811);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_c34store + i*NGLL3_PADDED, &c34store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4812);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_c35store + i*NGLL3_PADDED, &c35store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4813);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_c36store + i*NGLL3_PADDED, &c36store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4814);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_c44store + i*NGLL3_PADDED, &c44store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4815);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_c45store + i*NGLL3_PADDED, &c45store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4816);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_c46store + i*NGLL3_PADDED, &c46store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4817);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_c55store + i*NGLL3_PADDED, &c55store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4818);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_c56store + i*NGLL3_PADDED, &c56store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4819);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_c66store + i*NGLL3_PADDED, &c66store[i*NGLL3],
- NGLL3*sizeof(realw),cudaMemcpyHostToDevice),4820);
- }
- }
-
- // ocean load approximation
- if( *OCEANS ){
- // oceans needs a free surface
- mp->num_free_surface_faces = *num_free_surface_faces;
- if( mp->num_free_surface_faces > 0 ){
- // mass matrix
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_ocean_load),
- sizeof(realw)*mp->NGLOB_AB),4501);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_ocean_load,rmass_ocean_load,
- sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),4502);
- // surface normal
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_normal),
- 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw)),4503);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_normal,free_surface_normal,
- 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice),4504);
-
- // temporary global array: used to synchronize updates on global accel array
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_updated_dof_ocean_load),
- sizeof(int)*mp->NGLOB_AB),4505);
-
- if( *NOISE_TOMOGRAPHY == 0 && *ACOUSTIC_SIMULATION == 0 ){
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec),
- mp->num_free_surface_faces*sizeof(int)),4601);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec,
- mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4603);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk),
- 3*NGLL2*mp->num_free_surface_faces*sizeof(int)),4602);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
- 3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4604);
- }
- }
- }
-
- // mesh coloring
- if( mp->use_mesh_coloring_gpu ){
- mp->num_colors_outer_elastic = *num_colors_outer_elastic;
- mp->num_colors_inner_elastic = *num_colors_inner_elastic;
- mp->h_num_elem_colors_elastic = (int*) num_elem_colors_elastic;
- }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("prepare_fields_elastic_device");
-#endif
-}
-*/
-
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
// cleanup
/* ----------------------------------------------------------------------------------------------- */
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c 2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c 2012-07-23 21:58:26 UTC (rev 20536)
@@ -1,4 +1,4 @@
-/*
+/*
!=====================================================================
!
! S p e c f e m 3 D G l o b e V e r s i o n 5 . 1
@@ -34,8 +34,8 @@
typedef float realw;
+
-
//
// src/cuda/assemble_MPI_scalar_cuda.cu
//
@@ -43,12 +43,12 @@
void FC_FUNC_(transfer_boun_pot_from_device,
TRANSFER_BOUN_POT_FROM_DEVICE)(long* Mesh_pointer_f,
realw* send_potential_dot_dot_buffer,
- int* FORWARD_OR_ADJOINT){}
+ int* FORWARD_OR_ADJOINT){}
void FC_FUNC_(transfer_asmbl_pot_to_device,
TRANSFER_ASMBL_POT_TO_DEVICE)(long* Mesh_pointer,
realw* buffer_recv_scalar,
- int* FORWARD_OR_ADJOINT) {}
+ int* FORWARD_OR_ADJOINT) {}
//
@@ -59,13 +59,13 @@
TRANSFER_BOUN_ACCEL_FROM_DEVICE)(long* Mesh_pointer_f,
realw* send_accel_buffer,
int* IREGION,
- int* FORWARD_OR_ADJOINT){}
+ int* FORWARD_OR_ADJOINT){}
void FC_FUNC_(transfer_asmbl_accel_to_device,
TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer,
realw* buffer_recv_vector,
int* IREGION,
- int* FORWARD_OR_ADJOINT) {}
+ int* FORWARD_OR_ADJOINT) {}
//
@@ -73,58 +73,58 @@
//
void FC_FUNC_(pause_for_debug,
- PAUSE_FOR_DEBUG)() {}
+ PAUSE_FOR_DEBUG)() {}
void FC_FUNC_(output_free_device_memory,
- OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {}
+ OUTPUT_FREE_DEVICE_MEMORY)(int* myrank) {}
void FC_FUNC_(get_free_device_memory,
- get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {}
+ get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {}
void FC_FUNC_(check_max_norm_displ_gpu,
- CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {}
+ CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {}
void FC_FUNC_(check_max_norm_vector,
- CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {}
+ CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {}
void FC_FUNC_(check_max_norm_displ,
- CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {}
+ CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {}
void FC_FUNC_(check_max_norm_b_displ_gpu,
- CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {}
+ CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {}
void FC_FUNC_(check_max_norm_b_accel_gpu,
- CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {}
+ CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {}
void FC_FUNC_(check_max_norm_b_veloc_gpu,
- CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {}
+ CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {}
void FC_FUNC_(check_max_norm_b_displ,
- CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {}
+ CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {}
void FC_FUNC_(check_max_norm_b_accel,
- CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {}
+ CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {}
void FC_FUNC_(check_error_vectors,
- CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {}
+ CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {}
void FC_FUNC_(get_max_accel,
- GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {}
+ GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer) {}
void FC_FUNC_(check_norm_acoustic_from_device,
CHECK_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
long* Mesh_pointer_f,
- int* SIMULATION_TYPE) {}
+ int* SIMULATION_TYPE) {}
void FC_FUNC_(check_norm_elastic_from_device,
CHECK_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
long* Mesh_pointer_f,
- int* SIMULATION_TYPE) {}
+ int* SIMULATION_TYPE) {}
void FC_FUNC_(check_norm_strain_from_device,
CHECK_NORM_STRAIN_FROM_DEVICE)(realw* strain_norm,
realw* strain_norm2,
- long* Mesh_pointer_f) {}
+ long* Mesh_pointer_f) {}
//
@@ -134,12 +134,12 @@
void FC_FUNC_(compute_add_sources_el_cuda,
COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f,
int* NSOURCESf,
- double* h_stf_pre_compute) {}
+ double* h_stf_pre_compute) {}
void FC_FUNC_(compute_add_sources_el_s3_cuda,
COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer_f,
int* NSOURCESf,
- double* h_stf_pre_compute) {}
+ double* h_stf_pre_compute) {}
void FC_FUNC_(add_sources_el_sim_type_2_or_3,
ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
@@ -147,7 +147,7 @@
realw* h_adj_sourcearrays,
int* h_islice_selected_rec,
int* h_ispec_selected_rec,
- int* time_index) {}
+ int* time_index) {}
//
@@ -155,26 +155,26 @@
//
void FC_FUNC_(compute_coupling_fluid_cmb_cuda,
- COMPUTE_COUPLING_FLUID_CMB_CUDA)(long* Mesh_pointer_f) {}
+ COMPUTE_COUPLING_FLUID_CMB_CUDA)(long* Mesh_pointer_f) {}
void FC_FUNC_(compute_coupling_fluid_icb_cuda,
- COMPUTE_COUPLING_FLUID_ICB_CUDA)(long* Mesh_pointer_f) {}
+ COMPUTE_COUPLING_FLUID_ICB_CUDA)(long* Mesh_pointer_f) {}
void FC_FUNC_(compute_coupling_cmb_fluid_cuda,
COMPUTE_COUPLING_CMB_FLUID_CUDA)(long* Mesh_pointer_f,
double RHO_TOP_OC,
realw minus_g_cmb,
- int GRAVITY_VAL) {}
+ int GRAVITY_VAL) {}
void FC_FUNC_(compute_coupling_icb_fluid_cuda,
COMPUTE_COUPLING_ICB_FLUID_CUDA)(long* Mesh_pointer_f,
double RHO_BOTTOM_OC,
realw minus_g_icb,
- int GRAVITY_VAL) {}
+ int GRAVITY_VAL) {}
void FC_FUNC_(compute_coupling_ocean_cuda,
COMPUTE_COUPLING_OCEAN_CUDA)(long* Mesh_pointer_f,
- int* NCHUNKS_VAL) {}
+ int* NCHUNKS_VAL) {}
//
@@ -184,7 +184,7 @@
void FC_FUNC_(compute_forces_crust_mantle_cuda,
COMPUTE_FORCES_CRUST_MANTLE_CUDA)(long* Mesh_pointer_f,
realw* deltat,
- int* iphase) {}
+ int* iphase) {}
//
@@ -193,8 +193,8 @@
void FC_FUNC_(compute_forces_inner_core_cuda,
COMPUTE_FORCES_INNER_CORE_CUDA)(long* Mesh_pointer_f,
- realw* deltat,
- int* iphase) {}
+ realw* deltat,
+ int* iphase) {}
//
@@ -205,7 +205,7 @@
COMPUTE_FORCES_OUTER_CORE_CUDA)(long* Mesh_pointer_f,
int* iphase,
realw* time_f,
- realw* b_time_f) {}
+ realw* b_time_f) {}
//
@@ -213,22 +213,22 @@
//
void FC_FUNC_(compute_kernels_cm_cuda,
- COMPUTE_KERNELS_CM_CUDA)(long* Mesh_pointer,realw* deltat_f) {}
+ COMPUTE_KERNELS_CM_CUDA)(long* Mesh_pointer,realw* deltat_f) {}
void FC_FUNC_(compute_kernels_ic_cuda,
- COMPUTE_KERNELS_IC_CUDA)(long* Mesh_pointer,realw* deltat_f) {}
+ COMPUTE_KERNELS_IC_CUDA)(long* Mesh_pointer,realw* deltat_f) {}
void FC_FUNC_(compute_kernels_oc_cuda,
- COMPUTE_KERNELS_OC_CUDA)(long* Mesh_pointer,realw* deltat_f) {}
+ COMPUTE_KERNELS_OC_CUDA)(long* Mesh_pointer,realw* deltat_f) {}
void FC_FUNC_(compute_kernels_strgth_noise_cu,
COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer,
realw* h_noise_surface_movie,
- realw* deltat_f) {}
+ realw* deltat_f) {}
void FC_FUNC_(compute_kernels_hess_cuda,
COMPUTE_KERNELS_HESS_CUDA)(long* Mesh_pointer,
- realw* deltat_f) {}
+ realw* deltat_f) {}
//
@@ -238,7 +238,7 @@
void FC_FUNC_(compute_stacey_acoustic_cuda,
COMPUTE_STACEY_ACOUSTIC_CUDA)(long* Mesh_pointer_f,
realw* absorb_potential,
- int* itype) {}
+ int* itype) {}
//
@@ -248,7 +248,7 @@
void FC_FUNC_(compute_stacey_elastic_cuda,
COMPUTE_STACEY_ELASTIC_CUDA)(long* Mesh_pointer_f,
realw* absorb_field,
- int* itype) {}
+ int* itype) {}
//
@@ -256,10 +256,10 @@
//
void FC_FUNC_(initialize_cuda_device,
- INITIALIZE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) {
+ INITIALIZE_CUDA_DEVICE)(int* myrank_f,int* ncuda_devices) {
fprintf(stderr,"ERROR: GPU_MODE enabled without GPU/CUDA Support. To enable GPU support, reconfigure with --with-cuda flag.\n");
exit(1);
-}
+}
//
@@ -273,7 +273,7 @@
realw* deltatover2_F,
realw* b_deltat_F,
realw* b_deltatsqover2_F,
- realw* b_deltatover2_F) {}
+ realw* b_deltatover2_F) {}
void FC_FUNC_(it_update_displacement_cm_cuda,
IT_UPDATE_DISPLACMENT_CM_CUDA)(long* Mesh_pointer_f,
@@ -282,7 +282,7 @@
realw* deltatover2_F,
realw* b_deltat_F,
realw* b_deltatsqover2_F,
- realw* b_deltatover2_F) {}
+ realw* b_deltatover2_F) {}
void FC_FUNC_(it_update_displacement_oc_cuda,
IT_UPDATE_DISPLACEMENT_OC_cuda)(long* Mesh_pointer_f,
@@ -291,7 +291,7 @@
realw* deltatover2_F,
realw* b_deltat_F,
realw* b_deltatsqover2_F,
- realw* b_deltatover2_F) {}
+ realw* b_deltatover2_F) {}
void FC_FUNC_(kernel_3_a_cuda,
KERNEL_3_A_CUDA)(long* Mesh_pointer,
@@ -299,49 +299,49 @@
int* SIMULATION_TYPE_f,
realw* b_deltatover2_F,
int* OCEANS,
- int* NCHUNKS_VAL) {}
+ int* NCHUNKS_VAL) {}
void FC_FUNC_(kernel_3_b_cuda,
KERNEL_3_B_CUDA)(long* Mesh_pointer,
realw* deltatover2_F,
int* SIMULATION_TYPE_f,
realw* b_deltatover2_F,
- int* OCEANS) {}
+ int* OCEANS) {}
void FC_FUNC_(kernel_3_outer_core_cuda,
KERNEL_3_OUTER_CORE_CUDA)(long* Mesh_pointer,
realw* deltatover2_F,
int* SIMULATION_TYPE_f,
- realw* b_deltatover2_F) {}
+ realw* b_deltatover2_F) {}
//
// src/cuda/noise_tomography_cuda.cu
//
-void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){}
+void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){}
-void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id) {}
+void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id) {}
-void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) {}
+void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) {}
-void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val) {}
+void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val) {}
-void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) {}
+void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) {}
void FC_FUNC_(noise_transfer_surface_to_host,
NOISE_TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer_f,
- realw* h_noise_surface_movie) {}
+ realw* h_noise_surface_movie) {}
void FC_FUNC_(noise_add_source_master_rec_cu,
NOISE_ADD_SOURCE_MASTER_REC_CU)(long* Mesh_pointer_f,
int* it_f,
int* irec_master_noise_f,
- int* islice_selected_rec) {}
+ int* islice_selected_rec) {}
void FC_FUNC_(noise_add_surface_movie_cuda,
NOISE_ADD_SURFACE_MOVIE_CUDA)(long* Mesh_pointer_f,
- realw* h_noise_surface_movie) {}
+ realw* h_noise_surface_movie) {}
//
@@ -376,13 +376,14 @@
int* ATTENUATION_f,
int* ATTENUATION_NEW_f,
int* USE_ATTENUATION_MIMIC_f,
+ int* ATTENUATION_3D_VAL_f,
int* COMPUTE_AND_STORE_STRAIN_f,
int* ANISOTROPIC_3D_MANTLE_f,
int* ANISOTROPIC_INNER_CORE_f,
int* SAVE_BOUNDARY_MESH_f,
int* USE_MESH_COLORING_GPU_f,
int* ANISOTROPIC_KL_f,
- int* APPROXIMATE_HESS_KL_f) {}
+ int* APPROXIMATE_HESS_KL_f) {}
void FC_FUNC_(prepare_fields_rotation_device,
PREPARE_FIELDS_ROTATION_DEVICE)(long* Mesh_pointer_f,
@@ -395,7 +396,7 @@
realw* b_A_array_rotation,
realw* b_B_array_rotation,
int* NSPEC_OUTER_CORE_ROTATION
- ) {}
+ ) {}
void FC_FUNC_(prepare_fields_gravity_device,
PREPARE_FIELDS_gravity_DEVICE)(long* Mesh_pointer_f,
@@ -406,7 +407,7 @@
realw* density_table,
realw* h_wgll_cube,
int* NRAD_GRAVITY
- ) {}
+ ) {}
void FC_FUNC_(prepare_fields_attenuat_device,
PREPARE_FIELDS_ATTENUAT_DEVICE)(long* Mesh_pointer_f,
@@ -426,7 +427,7 @@
realw* one_minus_sum_beta_inner_core,
realw* alphaval,realw* betaval,realw* gammaval,
realw* b_alphaval,realw* b_betaval,realw* b_gammaval
- ) {}
+ ) {}
void FC_FUNC_(prepare_fields_strain_device,
PREPARE_FIELDS_STRAIN_DEVICE)(long* Mesh_pointer_f,
@@ -454,7 +455,7 @@
realw* b_epsilondev_yz_inner_core,
realw* eps_trace_over_3_inner_core,
realw* b_eps_trace_over_3_inner_core
- ) {}
+ ) {}
void FC_FUNC_(prepare_fields_absorb_device,
PREPARE_FIELDS_ABSORB_DEVICE)(long* Mesh_pointer_f,
@@ -486,7 +487,7 @@
realw* jacobian2D_ymin_outer_core, realw* jacobian2D_ymax_outer_core,
realw* jacobian2D_bottom_outer_core,
realw* vp_outer_core
- ) {}
+ ) {}
void FC_FUNC_(prepare_mpi_buffers_device,
PREPARE_MPI_BUFFERS_DEVICE)(long* Mesh_pointer_f,
@@ -502,7 +503,7 @@
int* max_nibool_interfaces_outer_core,
int* nibool_interfaces_outer_core,
int* ibool_interfaces_outer_core
- ){}
+ ){}
void FC_FUNC_(prepare_fields_noise_device,
PREPARE_FIELDS_NOISE_DEVICE)(long* Mesh_pointer_f,
@@ -514,7 +515,7 @@
realw* normal_y_noise,
realw* normal_z_noise,
realw* mask_noise,
- realw* jacobian2D_top_crust_mantle) {}
+ realw* jacobian2D_top_crust_mantle) {}
void FC_FUNC_(prepare_crust_mantle_device,
PREPARE_CRUST_MANTLE_DEVICE)(long* Mesh_pointer_f,
@@ -548,7 +549,7 @@
int* NSPEC2D_TOP_CM,
int* NSPEC2D_BOTTOM_CM,
int* NCHUNKS_VAL
- ) {}
+ ) {}
void FC_FUNC_(prepare_outer_core_device,
PREPARE_OUTER_CORE_DEVICE)(long* Mesh_pointer_f,
@@ -571,7 +572,7 @@
int* nspec_inner,
int* NSPEC2D_TOP_OC,
int* NSPEC2D_BOTTOM_OC
- ) {}
+ ) {}
void FC_FUNC_(prepare_inner_core_device,
PREPARE_INNER_CORE_DEVICE)(long* Mesh_pointer_f,
@@ -590,70 +591,15 @@
int* phase_ispec_inner,
int* nspec_outer,
int* nspec_inner,
- int* NSPEC2D_TOP_IC) {}
+ int* NSPEC2D_TOP_IC) {}
void FC_FUNC_(prepare_oceans_device,
PREPARE_OCEANS_DEVICE)(long* Mesh_pointer_f,
- realw* h_rmass_ocean_load) {}
+ realw* h_rmass_ocean_load) {}
-void FC_FUNC_(prepare_fields_elastic_device,
- PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer_f,
- int* size,
- realw* rmass,
- realw* rho_vp,
- realw* rho_vs,
- int* num_phase_ispec_elastic,
- int* phase_ispec_inner_elastic,
- int* ispec_is_elastic,
- int* ABSORBING_CONDITIONS,
- realw* h_b_absorb_field,
- int* h_b_reclen_field,
- int* SIMULATION_TYPE,int* SAVE_FORWARD,
- int* COMPUTE_AND_STORE_STRAIN,
- realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
- realw* epsilondev_xz,realw* epsilondev_yz,
- int* ATTENUATION,
- int* R_size,
- realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
- realw* one_minus_sum_beta,realw* factor_common,
- realw* alphaval,realw* betaval,realw* gammaval,
- int* OCEANS,
- realw* rmass_ocean_load,
- int* NOISE_TOMOGRAPHY,
- realw* free_surface_normal,
- int* free_surface_ispec,
- int* free_surface_ijk,
- int* num_free_surface_faces,
- int* ACOUSTIC_SIMULATION,
- int* num_colors_outer_elastic,
- int* num_colors_inner_elastic,
- int* num_elem_colors_elastic,
- int* ANISOTROPY,
- realw *c11store,
- realw *c12store,
- realw *c13store,
- realw *c14store,
- realw *c15store,
- realw *c16store,
- realw *c22store,
- realw *c23store,
- realw *c24store,
- realw *c25store,
- realw *c26store,
- realw *c33store,
- realw *c34store,
- realw *c35store,
- realw *c36store,
- realw *c44store,
- realw *c45store,
- realw *c46store,
- realw *c55store,
- realw *c56store,
- realw *c66store){}
-
void FC_FUNC_(prepare_cleanup_device,
PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f,
- int* NCHUNKS_VAL) {}
+ int* NCHUNKS_VAL) {}
//
@@ -661,82 +607,82 @@
//
void FC_FUNC_(transfer_fields_cm_to_device,
- TRANSFER_FIELDS_CM_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+ TRANSFER_FIELDS_CM_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_fields_ic_to_device,
- TRANSFER_FIELDS_IC_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+ TRANSFER_FIELDS_IC_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_fields_oc_to_device,
- TRANSFER_FIELDS_OC_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+ TRANSFER_FIELDS_OC_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_b_fields_cm_to_device,
TRANSFER_FIELDS_B_CM_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
- long* Mesh_pointer_f) {}
+ long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_b_fields_ic_to_device,
TRANSFER_FIELDS_B_IC_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
- long* Mesh_pointer_f) {}
+ long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_b_fields_oc_to_device,
TRANSFER_FIELDS_B_OC_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
- long* Mesh_pointer_f) {}
+ long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_fields_cm_from_device,
- TRANSFER_FIELDS_CM_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+ TRANSFER_FIELDS_CM_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_fields_ic_from_device,
- TRANSFER_FIELDS_IC_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+ TRANSFER_FIELDS_IC_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_fields_oc_from_device,
- TRANSFER_FIELDS_OC_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
+ TRANSFER_FIELDS_OC_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_b_fields_cm_from_device,
TRANSFER_B_FIELDS_CM_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
- long* Mesh_pointer_f) {}
+ long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_b_fields_ic_from_device,
TRANSFER_B_FIELDS_IC_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
- long* Mesh_pointer_f) {}
+ long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_b_fields_oc_from_device,
TRANSFER_B_FIELDS_OC_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
- long* Mesh_pointer_f) {}
+ long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_accel_cm_to_device,
- TRANSFER_ACCEL_CM_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+ TRANSFER_ACCEL_CM_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_displ_cm_from_device,
- TRANSFER_DISPL_CM_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
+ TRANSFER_DISPL_CM_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_b_displ_cm_from_device,
- TRANSFER_B_DISPL_CM_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
+ TRANSFER_B_DISPL_CM_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_displ_ic_from_device,
- TRANSFER_DISPL_IC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
+ TRANSFER_DISPL_IC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_b_displ_ic_from_device,
- TRANSFER_B_DISPL_IC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
+ TRANSFER_B_DISPL_IC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_displ_oc_from_device,
- TRANSFER_DISPL_OC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
+ TRANSFER_DISPL_OC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_b_displ_oc_from_device,
- TRANSFER_B_DISPL_OC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
+ TRANSFER_B_DISPL_OC_FROM_DEVICE)(int* size, realw* displ, long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_veloc_cm_from_device,
- TRANSFER_DISPL_CM_FROM_DEVICE)(int* size, realw* veloc, long* Mesh_pointer_f) {}
+ TRANSFER_DISPL_CM_FROM_DEVICE)(int* size, realw* veloc, long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_accel_cm_from_device,
- TRANSFER_ACCEL_CM_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+ TRANSFER_ACCEL_CM_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_b_accel_cm_from_device,
- TRANSFER_B_ACCEL_CM_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {}
+ TRANSFER_B_ACCEL_CM_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_accel_ic_from_device,
- TRANSFER_ACCEL_IC_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+ TRANSFER_ACCEL_IC_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_accel_oc_from_device,
- TRANSFER_ACCEL_OC_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
+ TRANSFER_ACCEL_OC_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {}
void FC_FUNC_(transfer_strain_cm_from_device,
TRANSFER_STRAIN_CM_FROM_DEVICE)(long* Mesh_pointer,
@@ -745,7 +691,7 @@
realw* epsilondev_yy,
realw* epsilondev_xy,
realw* epsilondev_xz,
- realw* epsilondev_yz) {}
+ realw* epsilondev_yz) {}
void FC_FUNC_(transfer_b_strain_cm_to_device,
TRANSFER_B_STRAIN_CM_TO_DEVICE)(long* Mesh_pointer,
@@ -753,7 +699,7 @@
realw* epsilondev_yy,
realw* epsilondev_xy,
realw* epsilondev_xz,
- realw* epsilondev_yz) {}
+ realw* epsilondev_yz) {}
void FC_FUNC_(transfer_strain_ic_from_device,
TRANSFER_STRAIN_IC_FROM_DEVICE)(long* Mesh_pointer,
@@ -762,7 +708,7 @@
realw* epsilondev_yy,
realw* epsilondev_xy,
realw* epsilondev_xz,
- realw* epsilondev_yz) {}
+ realw* epsilondev_yz) {}
void FC_FUNC_(transfer_b_strain_ic_to_device,
TRANSFER_B_STRAIN_IC_TO_DEVICE)(long* Mesh_pointer,
@@ -770,17 +716,17 @@
realw* epsilondev_yy,
realw* epsilondev_xy,
realw* epsilondev_xz,
- realw* epsilondev_yz) {}
+ realw* epsilondev_yz) {}
void FC_FUNC_(transfer_rotation_from_device,
TRANSFER_ROTATION_FROM_DEVICE)(long* Mesh_pointer,
realw* A_array_rotation,
- realw* B_array_rotation) {}
+ realw* B_array_rotation) {}
void FC_FUNC_(transfer_b_rotation_to_device,
TRANSFER_B_ROTATION_TO_DEVICE)(long* Mesh_pointer,
realw* A_array_rotation,
- realw* B_array_rotation) {}
+ realw* B_array_rotation) {}
void FC_FUNC_(transfer_kernels_cm_to_host,
TRANSFER_KERNELS_CM_TO_HOST)(long* Mesh_pointer,
@@ -788,30 +734,30 @@
realw* h_alpha_kl,
realw* h_beta_kl,
realw* h_cijkl_kl,
- int* NSPEC) {}
+ int* NSPEC) {}
void FC_FUNC_(transfer_kernels_ic_to_host,
TRANSFER_KERNELS_IC_TO_HOST)(long* Mesh_pointer,
realw* h_rho_kl,
realw* h_alpha_kl,
realw* h_beta_kl,
- int* NSPEC) {}
+ int* NSPEC) {}
void FC_FUNC_(transfer_kernels_oc_to_host,
TRANSFER_KERNELS_OC_TO_HOST)(long* Mesh_pointer,
realw* h_rho_kl,
realw* h_alpha_kl,
- int* NSPEC) {}
+ int* NSPEC) {}
void FC_FUNC_(transfer_kernels_noise_to_host,
TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer,
realw* h_Sigma_kl,
- int* NSPEC) {}
+ int* NSPEC) {}
void FC_FUNC_(transfer_kernels_hess_cm_tohost,
TRANSFER_KERNELS_HESS_CM_TOHOST)(long* Mesh_pointer,
realw* h_hess_kl,
- int* NSPEC) {}
+ int* NSPEC) {}
//
@@ -831,7 +777,7 @@
int* number_receiver_global,
int* ispec_selected_rec,
int* ispec_selected_source,
- int* ibool) {}
+ int* ibool) {}
void FC_FUNC_(transfer_station_ac_from_device,
TRANSFER_STATION_AC_FROM_DEVICE)(
@@ -846,5 +792,5 @@
int* ispec_selected_rec,
int* ispec_selected_source,
int* ibool,
- int* SIMULATION_TYPEf) {}
+ int* SIMULATION_TYPEf) {}
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/transfer_fields_cuda.cu 2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/cuda/transfer_fields_cuda.cu 2012-07-23 21:58:26 UTC (rev 20536)
@@ -35,7 +35,6 @@
#include "config.h"
#include "mesh_constants_cuda.h"
-#include "prepare_constants_cuda.h"
/* ----------------------------------------------------------------------------------------------- */
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/exit_mpi.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/exit_mpi.f90 2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/shared/exit_mpi.f90 2012-07-23 21:58:26 UTC (rev 20536)
@@ -263,6 +263,28 @@
!-------------------------------------------------------------------------------------------------
!
+ subroutine max_all_cr(sendbuf, recvbuf)
+
+ implicit none
+
+! standard include of the MPI library
+ include 'mpif.h'
+
+ include "constants.h"
+ include "precision.h"
+
+ real(kind=CUSTOM_REAL) :: sendbuf, recvbuf
+ integer :: ier
+
+ call MPI_REDUCE(sendbuf,recvbuf,1,CUSTOM_MPI_TYPE, &
+ MPI_MAX,0,MPI_COMM_WORLD,ier)
+
+ end subroutine max_all_cr
+
+!
+!-------------------------------------------------------------------------------------------------
+!
+
subroutine sum_all_dp(sendbuf, recvbuf)
implicit none
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/check_simulation_stability.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/check_simulation_stability.f90 2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/check_simulation_stability.f90 2012-07-23 21:58:26 UTC (rev 20536)
@@ -38,9 +38,6 @@
implicit none
- include 'mpif.h'
- include "precision.h"
-
! time step
integer it,NSTEP,myrank
@@ -56,8 +53,6 @@
real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STRAIN_ONLY) :: &
eps_trace_over_3_crust_mantle
-! real(kind=CUSTOM_REAL), dimension(5,NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_OR_ATT) :: &
-! epsilondev_crust_mantle
real(kind=CUSTOM_REAL), dimension(NGLLX,NGLLY,NGLLZ,NSPEC_CRUST_MANTLE_STR_OR_ATT) :: &
epsilondev_xx_crust_mantle,epsilondev_yy_crust_mantle,epsilondev_xy_crust_mantle, &
epsilondev_xz_crust_mantle,epsilondev_yz_crust_mantle
@@ -67,8 +62,6 @@
double precision :: time_start,DT,t0
-! logical COMPUTE_AND_STORE_STRAIN
-
! local parameters
! maximum of the norm of the displacement and of the potential in the fluid
real(kind=CUSTOM_REAL) Usolidnorm,Usolidnorm_all,Ufluidnorm,Ufluidnorm_all
@@ -129,10 +122,8 @@
call exit_MPI(myrank,'forward simulation became unstable in fluid and blew up')
! compute the maximum of the maxima for all the slices using an MPI reduction
- call MPI_REDUCE(Usolidnorm,Usolidnorm_all,1,CUSTOM_MPI_TYPE,MPI_MAX,0, &
- MPI_COMM_WORLD,ier)
- call MPI_REDUCE(Ufluidnorm,Ufluidnorm_all,1,CUSTOM_MPI_TYPE,MPI_MAX,0, &
- MPI_COMM_WORLD,ier)
+ call max_all_cr(Usolidnorm,Usolidnorm_all)
+ call max_all_cr(Ufluidnorm,Ufluidnorm_all)
if (SIMULATION_TYPE == 3) then
if( .not. GPU_MODE) then
@@ -157,10 +148,8 @@
call exit_MPI(myrank,'backward simulation became unstable and blew up in the fluid')
! compute the maximum of the maxima for all the slices using an MPI reduction
- call MPI_REDUCE(b_Usolidnorm,b_Usolidnorm_all,1,CUSTOM_MPI_TYPE,MPI_MAX,0, &
- MPI_COMM_WORLD,ier)
- call MPI_REDUCE(b_Ufluidnorm,b_Ufluidnorm_all,1,CUSTOM_MPI_TYPE,MPI_MAX,0, &
- MPI_COMM_WORLD,ier)
+ call max_all_cr(b_Usolidnorm,b_Usolidnorm_all)
+ call max_all_cr(b_Ufluidnorm,b_Ufluidnorm_all)
endif
if (COMPUTE_AND_STORE_STRAIN) then
@@ -177,10 +166,8 @@
call check_norm_strain_from_device(Strain_norm,Strain2_norm,Mesh_pointer)
endif
- call MPI_REDUCE(Strain_norm,Strain_norm_all,1,CUSTOM_MPI_TYPE,MPI_MAX,0, &
- MPI_COMM_WORLD,ier)
- call MPI_REDUCE(Strain2_norm,Strain2_norm_all,1,CUSTOM_MPI_TYPE,MPI_MAX,0, &
- MPI_COMM_WORLD,ier)
+ call max_all_cr(Strain_norm,Strain_norm_all)
+ call max_all_cr(Strain2_norm,Strain2_norm_all)
endif
if(myrank == 0) then
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core_Dev.F90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core_Dev.F90 2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/compute_forces_inner_core_Dev.F90 2012-07-23 21:58:26 UTC (rev 20536)
@@ -671,7 +671,7 @@
if( ATTENUATION_VAL ) then
if( ATTENUATION_3D_VAL ) then
mul = mul * one_minus_sum_beta(i,j,k,ispec)
- else
+ else
mul = mul * one_minus_sum_beta(1,1,1,ispec)
endif
endif
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/get_attenuation.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/get_attenuation.f90 2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/get_attenuation.f90 2012-07-23 21:58:26 UTC (rev 20536)
@@ -46,7 +46,7 @@
integer :: i,j,k,ispec,ier
double precision, dimension(N_SLS) :: tau_e, fc
double precision :: omsb, Q_mu, sf, T_c_source, scale_t
-
+
! checks if attenuation is on and anything to do
if( .not. ATTENUATION_VAL) return
Modified: seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90 2012-07-23 12:25:30 UTC (rev 20535)
+++ seismo/3D/SPECFEM3D_GLOBE/branches/SPECFEM3D_GLOBE_SUNFLOWER/src/specfem3D/prepare_timerun.f90 2012-07-23 21:58:26 UTC (rev 20536)
@@ -714,7 +714,7 @@
factor_common_crust_mantle_dble = 0.d0
factor_scale_crust_mantle_dble = 0.d0
tau_sigma_dble = 0.d0
-
+
call get_attenuation_model_3D_or_1D(myrank, prnamel, omsb_crust_mantle_dble, &
factor_common_crust_mantle_dble,factor_scale_crust_mantle_dble,tau_sigma_dble, &
ATT1,ATT2,ATT3,ATT4)
@@ -727,7 +727,7 @@
factor_common_inner_core_dble = 0.d0
factor_scale_inner_core_dble = 0.d0
tau_sigma_dble = 0.d0
-
+
call get_attenuation_model_3D_or_1D(myrank, prnamel, omsb_inner_core_dble, &
factor_common_inner_core_dble,factor_scale_inner_core_dble,tau_sigma_dble, &
ATT1,ATT2,ATT3,ATT5)
More information about the CIG-COMMITS
mailing list