[cig-commits] r19152 - in seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src: cuda generate_databases meshfem3D specfem3D
danielpeter at geodynamics.org
danielpeter at geodynamics.org
Sat Nov 5 19:02:37 PDT 2011
Author: danielpeter
Date: 2011-11-05 19:02:36 -0700 (Sat, 05 Nov 2011)
New Revision: 19152
Added:
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_acoustic_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu
Removed:
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu
Modified:
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/generate_databases.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/model_aniso.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/meshfem3D/save_databases.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/Makefile.in
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_acoustic.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_elastic.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/initialize_simulation.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90
seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/read_mesh_databases.f90
Log:
updates declarations; re-adds anisotropy
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/check_fields_cuda.cu 2011-11-06 02:02:36 UTC (rev 19152)
@@ -47,14 +47,14 @@
extern "C"
void FC_FUNC_(check_max_norm_displ_gpu,
- CHECK_MAX_NORM_DISPL_GPU)(int* size, float* displ,long* Mesh_pointer_f,int* announceID) {
+ CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID) {
TRACE("check_max_norm_displ_gpu");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- cudaMemcpy(displ, mp->d_displ,*size*sizeof(float),cudaMemcpyDeviceToHost);
- float maxnorm=0;
+ cudaMemcpy(displ, mp->d_displ,*size*sizeof(realw),cudaMemcpyDeviceToHost);
+ realw maxnorm=0;
for(int i=0;i<*size;i++) {
maxnorm = MAX(maxnorm,fabsf(displ[i]));
@@ -66,13 +66,13 @@
extern "C"
void FC_FUNC_(check_max_norm_vector,
- CHECK_MAX_NORM_VECTOR)(int* size, float* vector1, int* announceID) {
+ CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID) {
TRACE("check_max_norm_vector");
int procid;
MPI_Comm_rank(MPI_COMM_WORLD,&procid);
- float maxnorm=0;
+ realw maxnorm=0;
int maxloc;
for(int i=0;i<*size;i++) {
if(maxnorm<fabsf(vector1[i])) {
@@ -87,11 +87,11 @@
extern "C"
void FC_FUNC_(check_max_norm_displ,
- CHECK_MAX_NORM_DISPL)(int* size, float* displ, int* announceID) {
+ CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID) {
TRACE("check_max_norm_displ");
- float maxnorm=0;
+ realw maxnorm=0;
for(int i=0;i<*size;i++) {
maxnorm = MAX(maxnorm,fabsf(displ[i]));
@@ -103,19 +103,19 @@
extern "C"
void FC_FUNC_(check_max_norm_b_displ_gpu,
- CHECK_MAX_NORM_B_DISPL_GPU)(int* size, float* b_displ,long* Mesh_pointer_f,int* announceID) {
+ CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID) {
TRACE("check_max_norm_b_displ_gpu");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- float* b_accel = (float*)malloc(*size*sizeof(float));
+ realw* b_accel = (realw*)malloc(*size*sizeof(realw));
- cudaMemcpy(b_displ, mp->d_b_displ,*size*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(b_accel, mp->d_b_accel,*size*sizeof(float),cudaMemcpyDeviceToHost);
+ cudaMemcpy(b_displ, mp->d_b_displ,*size*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(b_accel, mp->d_b_accel,*size*sizeof(realw),cudaMemcpyDeviceToHost);
- float maxnorm=0;
- float maxnorm_accel=0;
+ realw maxnorm=0;
+ realw maxnorm_accel=0;
for(int i=0;i<*size;i++) {
maxnorm = MAX(maxnorm,fabsf(b_displ[i]));
@@ -130,15 +130,15 @@
extern "C"
void FC_FUNC_(check_max_norm_b_accel_gpu,
- CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, float* b_accel,long* Mesh_pointer_f,int* announceID) {
+ CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID) {
TRACE("check_max_norm_b_accel_gpu");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- cudaMemcpy(b_accel, mp->d_b_accel,*size*sizeof(float),cudaMemcpyDeviceToHost);
+ cudaMemcpy(b_accel, mp->d_b_accel,*size*sizeof(realw),cudaMemcpyDeviceToHost);
- float maxnorm=0;
+ realw maxnorm=0;
for(int i=0;i<*size;i++) {
maxnorm = MAX(maxnorm,fabsf(b_accel[i]));
@@ -150,15 +150,15 @@
extern "C"
void FC_FUNC_(check_max_norm_b_veloc_gpu,
- CHECK_MAX_NORM_B_VELOC_GPU)(int* size, float* b_veloc,long* Mesh_pointer_f,int* announceID) {
+ CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID) {
TRACE("check_max_norm_b_veloc_gpu");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- cudaMemcpy(b_veloc, mp->d_b_veloc,*size*sizeof(float),cudaMemcpyDeviceToHost);
+ cudaMemcpy(b_veloc, mp->d_b_veloc,*size*sizeof(realw),cudaMemcpyDeviceToHost);
- float maxnorm=0;
+ realw maxnorm=0;
for(int i=0;i<*size;i++) {
maxnorm = MAX(maxnorm,fabsf(b_veloc[i]));
@@ -170,11 +170,11 @@
extern "C"
void FC_FUNC_(check_max_norm_b_displ,
- CHECK_MAX_NORM_B_DISPL)(int* size, float* b_displ,int* announceID) {
+ CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID) {
TRACE("check_max_norm_b_displ");
- float maxnorm=0;
+ realw maxnorm=0;
for(int i=0;i<*size;i++) {
maxnorm = MAX(maxnorm,fabsf(b_displ[i]));
@@ -186,11 +186,11 @@
extern "C"
void FC_FUNC_(check_max_norm_b_accel,
- CHECK_MAX_NORM_B_ACCEL)(int* size, float* b_accel,int* announceID) {
+ CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID) {
TRACE("check_max_norm_b_accel");
- float maxnorm=0;
+ realw maxnorm=0;
for(int i=0;i<*size;i++) {
maxnorm = MAX(maxnorm,fabsf(b_accel[i]));
@@ -202,7 +202,7 @@
extern "C"
void FC_FUNC_(check_error_vectors,
- CHECK_ERROR_VECTORS)(int* sizef, float* vector1,float* vector2) {
+ CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2) {
TRACE("check_error_vectors");
@@ -256,9 +256,9 @@
MPI_Comm_rank(MPI_COMM_WORLD,&procid);
int size = *sizef;
int it = *itf;
- float* accel_cpy = (float*)malloc(size*sizeof(float));
- cudaMemcpy(accel_cpy,mp->d_accel,size*sizeof(float),cudaMemcpyDeviceToHost);
- float maxval=0;
+ realw* accel_cpy = (realw*)malloc(size*sizeof(realw));
+ cudaMemcpy(accel_cpy,mp->d_accel,size*sizeof(realw),cudaMemcpyDeviceToHost);
+ realw maxval=0;
for(int i=0;i<size;++i) {
maxval = MAX(maxval,accel_cpy[i]);
}
@@ -272,10 +272,10 @@
/* ----------------------------------------------------------------------------------------------- */
-__global__ void get_maximum_kernel(float* array, int size, float* d_max){
+__global__ void get_maximum_kernel(realw* array, int size, realw* d_max){
/* simplest version: uses only 1 thread
- float max;
+ realw max;
max = 0;
// finds maximum value in array
if( size > 0 ){
@@ -288,7 +288,7 @@
*/
// reduction example:
- __shared__ float sdata[256] ;
+ __shared__ realw sdata[256] ;
// load shared mem
unsigned int tid = threadIdx.x;
@@ -320,7 +320,7 @@
extern "C"
void FC_FUNC_(get_norm_acoustic_from_device,
- GET_NORM_ACOUSTIC_FROM_DEVICE)(float* norm,
+ GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
long* Mesh_pointer_f,
int* SIMULATION_TYPE) {
@@ -328,19 +328,17 @@
//double start_time = get_time();
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- float max;
- float *d_max;
+ realw max;
+ realw *d_max;
-
-
max = 0;
/* way 1 : timing Elapsed time: 8.464813e-03
- float* h_array;
- h_array = (float*)calloc(mp->NGLOB_AB,sizeof(float));
+ realw* h_array;
+ h_array = (realw*)calloc(mp->NGLOB_AB,sizeof(realw));
print_CUDA_error_if_any(cudaMemcpy(h_array,mp->d_potential_dot_dot_acoustic,
- sizeof(float)*(mp->NGLOB_AB),cudaMemcpyDeviceToHost),131);
+ sizeof(realw)*(mp->NGLOB_AB),cudaMemcpyDeviceToHost),131);
// finds maximum value in array
max = h_array[0];
@@ -352,7 +350,7 @@
/* way 2: timing Elapsed time: 8.818102e-02
// launch simple kernel
- cudaMalloc((void**)&d_max,sizeof(float));
+ cudaMalloc((void**)&d_max,sizeof(realw));
dim3 grid(1,1);
dim3 threads(1,1,1);
@@ -360,21 +358,21 @@
get_maximum_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
mp->NGLOB_AB,
d_max);
- print_CUDA_error_if_any(cudaMemcpy(&max,d_max, sizeof(float), cudaMemcpyDeviceToHost),222);
+ print_CUDA_error_if_any(cudaMemcpy(&max,d_max, sizeof(realw), cudaMemcpyDeviceToHost),222);
cudaFree(d_max);
*/
// way 2 b: timing Elapsed time: 1.236916e-03
// launch simple reduction kernel
- float* h_max;
+ realw* h_max;
int blocksize = 256;
int num_blocks_x = ceil(mp->NGLOB_AB/blocksize);
//printf("num_blocks_x %i \n",num_blocks_x);
- h_max = (float*) calloc(num_blocks_x,sizeof(float));
- cudaMalloc((void**)&d_max,num_blocks_x*sizeof(float));
+ h_max = (realw*) calloc(num_blocks_x,sizeof(realw));
+ cudaMalloc((void**)&d_max,num_blocks_x*sizeof(realw));
dim3 grid(num_blocks_x,1);
dim3 threads(blocksize,1,1);
@@ -391,7 +389,7 @@
d_max);
}
- print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(float),cudaMemcpyDeviceToHost),222);
+ print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(realw),cudaMemcpyDeviceToHost),222);
// determines max for all blocks
max = h_max[0];
@@ -417,14 +415,15 @@
// precision vector x
int incr = 1;
int imax = 0;
- imax = cublasIsamax(mp->NGLOB_AB,(float*)mp->d_potential_dot_dot_acoustic, incr);
+ imax = cublasIsamax(mp->NGLOB_AB,(realw*)mp->d_potential_dot_dot_acoustic, incr);
status= cublasGetError();
if (status != CUBLAS_STATUS_SUCCESS) {
fprintf (stderr, "!!!! CUBLAS error in cublasIsamax\n");
exit(1);
}
- print_CUDA_error_if_any(cudaMemcpy(&max,&(mp->d_potential_dot_dot_acoustic[imax]), sizeof(float), cudaMemcpyDeviceToHost),222);
+ print_CUDA_error_if_any(cudaMemcpy(&max,&(mp->d_potential_dot_dot_acoustic[imax]),
+ sizeof(realw), cudaMemcpyDeviceToHost),222);
printf("maximum %i %i %f \n",mp->NGLOB_AB,imax,max);
@@ -453,10 +452,10 @@
/* ----------------------------------------------------------------------------------------------- */
-__global__ void get_maximum_vector_kernel(float* array, int size, float* d_max){
+__global__ void get_maximum_vector_kernel(realw* array, int size, realw* d_max){
// reduction example:
- __shared__ float sdata[256] ;
+ __shared__ realw sdata[256] ;
// load shared mem
unsigned int tid = threadIdx.x;
@@ -490,7 +489,7 @@
extern "C"
void FC_FUNC_(get_norm_elastic_from_device,
- GET_NORM_ELASTIC_FROM_DEVICE)(float* norm,
+ GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
long* Mesh_pointer_f,
int* SIMULATION_TYPE) {
@@ -498,20 +497,20 @@
//double start_time = get_time();
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- float max;
- float *d_max;
+ realw max;
+ realw *d_max;
max = 0;
// launch simple reduction kernel
- float* h_max;
+ realw* h_max;
int blocksize = 256;
int num_blocks_x = ceil(mp->NGLOB_AB/blocksize);
//printf("num_blocks_x %i \n",num_blocks_x);
- h_max = (float*) calloc(num_blocks_x,sizeof(float));
- cudaMalloc((void**)&d_max,num_blocks_x*sizeof(float));
+ h_max = (realw*) calloc(num_blocks_x,sizeof(realw));
+ cudaMalloc((void**)&d_max,num_blocks_x*sizeof(realw));
dim3 grid(num_blocks_x,1);
dim3 threads(blocksize,1,1);
@@ -528,7 +527,7 @@
d_max);
}
- print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(float),cudaMemcpyDeviceToHost),222);
+ print_CUDA_error_if_any(cudaMemcpy(h_max,d_max,num_blocks_x*sizeof(realw),cudaMemcpyDeviceToHost),222);
// determines max for all blocks
max = h_max[0];
Added: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_acoustic_cuda.cu (rev 0)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_acoustic_cuda.cu 2011-11-06 02:02:36 UTC (rev 19152)
@@ -0,0 +1,370 @@
+/*
+ !=====================================================================
+ !
+ ! S p e c f e m 3 D V e r s i o n 2 . 0
+ ! ---------------------------------------
+ !
+ ! Main authors: Dimitri Komatitsch and Jeroen Tromp
+ ! Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ ! April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+// #include "epik_user.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// acoustic sources
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void compute_add_sources_acoustic_kernel(realw* potential_dot_dot_acoustic,
+                                                    int* ibool,
+                                                    int* ispec_is_inner,
+                                                    int phase_is_inner,
+                                                    realw* sourcearrays,
+                                                    double* stf_pre_compute,
+                                                    int myrank,
+                                                    int* islice_selected_source,
+                                                    int* ispec_selected_source,
+                                                    int* ispec_is_acoustic,
+                                                    realw* kappastore,
+                                                    int NSOURCES) {
+  // Adds the contribution of each source to potential_dot_dot_acoustic.
+  // One block handles one source; the three thread indices select the
+  // (i,j,k) point inside the source element.
+
+  // linear block index over the (x,y) grid
+  const int source_id = blockIdx.x + gridDim.x*blockIdx.y;
+
+  // the grid may hold more blocks than there are sources
+  if( source_id >= NSOURCES ) return;
+
+  // only sources whose slice entry equals myrank are handled here
+  if( myrank != islice_selected_source[source_id] ) return;
+
+  // 1 is subtracted from the stored element index before use
+  const int elem = ispec_selected_source[source_id]-1;
+
+  // element must match the current phase_is_inner value and be flagged acoustic
+  if( ispec_is_inner[elem] != phase_is_inner ) return;
+  if( !ispec_is_acoustic[elem] ) return;
+
+  const int i = threadIdx.x;
+  const int j = threadIdx.y;
+  const int k = threadIdx.z;
+
+  const realw stf = (realw) stf_pre_compute[source_id];
+  const int iglob = ibool[INDEX4(5,5,5,i,j,k,elem)]-1;
+  const realw kappal = kappastore[INDEX4(5,5,5,i,j,k,elem)];
+
+  // entry 0 of the size-3 dimension of sourcearrays, negated and scaled by stf/kappal
+  const realw contribution = -sourcearrays[INDEX5(NSOURCES, 3, 5, 5,source_id, 0, i,j,k)]*stf/kappal;
+  atomicAdd(&potential_dot_dot_acoustic[iglob],contribution);
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(compute_add_sources_ac_cuda,
+              COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f,
+                                           int* phase_is_innerf,
+                                           int* NSOURCESf,
+                                           int* SIMULATION_TYPEf,
+                                           double* h_stf_pre_compute,
+                                           int* myrankf) {
+  // Fortran-callable wrapper: uploads the source time factors, then launches the acoustic
+  // source kernel on mp->d_potential_dot_dot_acoustic. SIMULATION_TYPEf is not read here.
+
+  TRACE("compute_add_sources_ac_cuda");
+
+  // the Mesh pointer arrives inside a Fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  // nothing to do when mp->nsources_local is zero
+  if( mp->nsources_local == 0 ) return;
+
+  const int nsources = *NSOURCESf;
+
+  // copies pre-computed source time factors onto GPU
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
+                                     nsources*sizeof(double),cudaMemcpyHostToDevice),18);
+
+  // one block per source; halve x (rounding up) and double y until x is at most 65535
+  int blocks_x = nsources;
+  int blocks_y = 1;
+  for( ; blocks_x > 65535; blocks_y *= 2 ) {
+    blocks_x = ceil(blocks_x/2.0);
+  }
+  dim3 grid(blocks_x,blocks_y);
+  dim3 threads(5,5,5);
+
+  compute_add_sources_acoustic_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
+                                                        mp->d_ibool,
+                                                        mp->d_ispec_is_inner,
+                                                        *phase_is_innerf,
+                                                        mp->d_sourcearrays,
+                                                        mp->d_stf_pre_compute,
+                                                        *myrankf,
+                                                        mp->d_islice_selected_source,
+                                                        mp->d_ispec_selected_source,
+                                                        mp->d_ispec_is_acoustic,
+                                                        mp->d_kappastore,
+                                                        nsources);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_add_sources_ac_cuda");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(compute_add_sources_ac_s3_cuda,
+              COMPUTE_ADD_SOURCES_AC_S3_CUDA)(long* Mesh_pointer_f,
+                                              int* phase_is_innerf,
+                                              int* NSOURCESf,
+                                              int* SIMULATION_TYPEf,
+                                              double* h_stf_pre_compute,
+                                              int* myrankf) {
+  // Fortran-callable wrapper: uploads the source time factors, then launches the acoustic
+  // source kernel on mp->d_b_potential_dot_dot_acoustic. SIMULATION_TYPEf is not read here.
+  // note: FC_FUNC_ takes the lower-case and the fully upper-case spelling of the same name.
+
+  TRACE("compute_add_sources_ac_s3_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
+
+  // check if anything to do
+  if( mp->nsources_local == 0 ) return;
+
+  int NSOURCES = *NSOURCESf;
+  // one block per source; halve x (rounding up) and double y until x is at most 65535
+  int num_blocks_x = NSOURCES;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  // copies source time factors onto GPU
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
+                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(5,5,5);
+
+  compute_add_sources_acoustic_kernel<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
+                                                        mp->d_ibool,
+                                                        mp->d_ispec_is_inner,
+                                                        *phase_is_innerf,
+                                                        mp->d_sourcearrays,
+                                                        mp->d_stf_pre_compute,
+                                                        *myrankf,
+                                                        mp->d_islice_selected_source,
+                                                        mp->d_ispec_selected_source,
+                                                        mp->d_ispec_is_acoustic,
+                                                        mp->d_kappastore,
+                                                        NSOURCES);
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_add_sources_ac_s3_cuda");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// acoustic adjoint sources
+
+/* ----------------------------------------------------------------------------------------------- */
+
+__global__ void add_sources_ac_SIM_TYPE_2_OR_3_kernel(realw* potential_dot_dot_acoustic,
+                                                      int nrec,
+                                                      realw* adj_sourcearrays,
+                                                      int* ibool,
+                                                      int* ispec_is_inner,
+                                                      int* ispec_is_acoustic,
+                                                      int* ispec_selected_rec,
+                                                      int phase_is_inner,
+                                                      int* pre_computed_irec,
+                                                      int nadj_rec_local,
+                                                      realw* kappastore) {
+  // Adds adjoint source values to potential_dot_dot_acoustic.
+  // One block handles one local adjoint receiver; the three thread indices
+  // select the (i,j,k) point inside the receiver's element.
+  // nrec and kappastore are part of the signature but are not read here.
+  //
+  // potential_dot_dot_acoustic : array updated in place (atomicAdd)
+  // adj_sourcearrays           : slice indexed as INDEX5(5,5,5,3,i,j,k,comp,irec_local)
+  // pre_computed_irec          : maps the local receiver index to the receiver index
+  // ispec_selected_rec         : element index per receiver; 1 is subtracted before use
+  // ispec_is_inner / phase_is_inner, ispec_is_acoustic : per-element selection flags
+  // nadj_rec_local             : upper bound (exclusive) on the block index that does work
+
+  // linear block index over the (x,y) grid
+  const int rec_local = blockIdx.x + gridDim.x*blockIdx.y;
+
+  // because of grid shape, the block index can be too big
+  if( rec_local >= nadj_rec_local ) return;
+
+  // maps the local index to the receiver index, then to its element
+  const int rec = pre_computed_irec[rec_local];
+  const int elem = ispec_selected_rec[rec]-1;
+
+  // skips elements that are not flagged acoustic
+  if( !ispec_is_acoustic[elem] ) return;
+
+  // skips elements that are not in the current phase_is_inner run
+  if( ispec_is_inner[elem] != phase_is_inner ) return;
+
+  const int i = threadIdx.x;
+  const int j = threadIdx.y;
+  const int k = threadIdx.z;
+
+  const int iglob = ibool[INDEX4(5,5,5,i,j,k,elem)]-1;
+
+  // beware, for acoustic medium, a pressure source would be taking the negative
+  // and divide by Kappa of the fluid;
+  // this would have to be done when constructing the adjoint source.
+  //
+  // note: we take the first component of the adj_sourcearrays
+  //       the idea is to have e.g. a pressure source, where all 3 components would be the same
+  const realw stf = adj_sourcearrays[INDEX5(5,5,5,3,i,j,k,0,rec_local)];
+
+  // accumulates atomically into the global array
+  atomicAdd(&potential_dot_dot_acoustic[iglob],stf);
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+extern "C"
+void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
+              ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer,
+                                              realw* h_adj_sourcearrays,
+                                              int* phase_is_inner,
+                                              int* h_ispec_is_inner,
+                                              int* h_ispec_is_acoustic,
+                                              int* h_ispec_selected_rec,
+                                              int* myrank,
+                                              int* nrec,
+                                              int* time_index,
+                                              int* h_islice_selected_rec,
+                                              int* nadj_rec_local,
+                                              int* NTSTEP_BETWEEN_READ_ADJSRC) {
+  // Fortran-callable wrapper for the acoustic adjoint sources:
+  //   1. extracts the values of all local adjoint sources at time step *time_index
+  //      from h_adj_sourcearrays into mp->h_adj_sourcearrays_slice,
+  //   2. copies that slice into mp->d_adj_sourcearrays,
+  //   3. launches add_sources_ac_SIM_TYPE_2_OR_3_kernel,
+  //      which updates mp->d_potential_dot_dot_acoustic.
+  //
+  // h_adj_sourcearrays is indexed as INDEX6(nadj_rec_local,NTSTEP_BETWEEN_READ_ADJSRC,3,5,5, ...);
+  // *time_index and the entries of h_ispec_selected_rec have 1 subtracted before use.
+  // only receivers whose h_islice_selected_rec entry equals *myrank are taken.
+  // *nadj_rec_local must equal mp->nadj_rec_local, otherwise exit_on_error is called.
+
+  TRACE("add_sources_ac_sim_2_or_3_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  // checks
+  // note: this is a host-side consistency check rather than a CUDA failure, so it is
+  //       reported with exit_on_error, like the irec_local check further down
+  if( *nadj_rec_local != mp->nadj_rec_local) exit_on_error("add_sources_ac_sim_type_2_or_3: nadj_rec_local not equal\n");
+
+  // make sure grid dimension is less than 65535 in x dimension
+  int num_blocks_x = mp->nadj_rec_local;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y,1);
+  dim3 threads(5,5,5);
+
+  // build slice of adj_sourcearrays because full array is *very* large.
+  // note: this extracts array values for local adjoint sources at given time step "time_index"
+  //       from large adj_sourcearrays array into h_adj_sourcearrays_slice
+  int ispec,i,j,k,icomp;
+  int irec_local = 0;
+  for(int irec = 0; irec < *nrec; irec++) {
+    // skips receivers that are not in this slice
+    if(*myrank != h_islice_selected_rec[irec]) continue;
+    irec_local++;
+
+    // takes only acoustic sources in the current phase_is_inner run
+    ispec = h_ispec_selected_rec[irec]-1;
+    if( !h_ispec_is_acoustic[ispec] ) continue;
+    if( h_ispec_is_inner[ispec] != *phase_is_inner ) continue;
+
+    // copies all 3 components of every (i,j,k) point of this receiver
+    for(k=0;k<5;k++) {
+      for(j=0;j<5;j++) {
+        for(i=0;i<5;i++) {
+          for(icomp=0;icomp<3;icomp++) {
+            mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,icomp,irec_local-1)]
+              = h_adj_sourcearrays[INDEX6(mp->nadj_rec_local,
+                                          *NTSTEP_BETWEEN_READ_ADJSRC,
+                                          3,5,5,
+                                          irec_local-1,(*time_index)-1,
+                                          icomp,i,j,k)];
+          }
+        }
+      }
+    }
+  }
+  // check all local sources were added
+  if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
+
+  // copies extracted array values onto GPU
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
+                                     (mp->nadj_rec_local)*3*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),99099);
+
+  // launches cuda kernel for acoustic adjoint sources
+  add_sources_ac_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
+                                                          *nrec,
+                                                          mp->d_adj_sourcearrays,
+                                                          mp->d_ibool,
+                                                          mp->d_ispec_is_inner,
+                                                          mp->d_ispec_is_acoustic,
+                                                          mp->d_ispec_selected_rec,
+                                                          *phase_is_inner,
+                                                          mp->d_pre_computed_irec,
+                                                          mp->nadj_rec_local,
+                                                          mp->d_kappastore);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("add_sources_acoustic_SIM_TYPE_2_OR_3_kernel");
+#endif
+}
Deleted: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu 2011-11-06 02:02:36 UTC (rev 19152)
@@ -1,923 +0,0 @@
-/*
- !=====================================================================
- !
- ! S p e c f e m 3 D V e r s i o n 2 . 0
- ! ---------------------------------------
- !
- ! Main authors: Dimitri Komatitsch and Jeroen Tromp
- ! Princeton University, USA and University of Pau / CNRS / INRIA
- ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
- ! April 2011
- !
- ! This program is free software; you can redistribute it and/or modify
- ! it under the terms of the GNU General Public License as published by
- ! the Free Software Foundation; either version 2 of the License, or
- ! (at your option) any later version.
- !
- ! This program is distributed in the hope that it will be useful,
- ! but WITHOUT ANY WARRANTY; without even the implied warranty of
- ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- ! GNU General Public License for more details.
- !
- ! You should have received a copy of the GNU General Public License along
- ! with this program; if not, write to the Free Software Foundation, Inc.,
- ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
- !
- !=====================================================================
- */
-
-#include <stdio.h>
-#include <cuda.h>
-#include <cublas.h>
-#include <mpi.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <sys/time.h>
-#include <sys/resource.h>
-
-#include "config.h"
-#include "mesh_constants_cuda.h"
-// #include "epik_user.h"
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// elastic domain sources
-
-/* ----------------------------------------------------------------------------------------------- */
-
-
-// crashes if the CMTSOLUTION does not match the mesh properly
-__global__ void compute_add_sources_kernel(float* accel,
- int* ibool,
- int* ispec_is_inner,
- int phase_is_inner,
- float* sourcearrays,
- double* stf_pre_compute,
- int myrank,
- int* islice_selected_source,
- int* ispec_selected_source,
- int* ispec_is_elastic,
- int NSOURCES //,float* d_debug
- ) {
- int i = threadIdx.x;
- int j = threadIdx.y;
- int k = threadIdx.z;
-
- int isource = blockIdx.x + gridDim.x*blockIdx.y; // bx
- int ispec;
- int iglob;
- float stf;
-
- if(isource < NSOURCES) { // when NSOURCES > 65535, but mod(nspec_top,2) > 0, we end up with an extra block.
-
- if(myrank == islice_selected_source[isource]) {
-
- ispec = ispec_selected_source[isource]-1;
-
- if(ispec_is_inner[ispec] == phase_is_inner && ispec_is_elastic[ispec] ) {
-
- stf = (float) stf_pre_compute[isource];
-
- //if(i==0 && j==0 && k==0) printf("add sources kernel: stf = %e\n",stf);
-
- iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
-
- atomicAdd(&accel[iglob*3],
- sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf);
- atomicAdd(&accel[iglob*3+1],
- sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 1, i,j,k)]*stf);
-
- // if((iglob*3+2 == 304598)) {
- // atomicAdd(&d_debug[0],1.0f);
- // d_debug[1] = accel[iglob*3+2];
- // d_debug[2] = sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 2, i,j,k)];
- // d_debug[3] = stf;
- // }
- // d_debug[4] = 42.0f;
-
- atomicAdd(&accel[iglob*3+2],
- sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 2, i,j,k)]*stf);
- }
- }
- }
-
-}
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(compute_add_sources_el_cuda,
- COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f,
- //int* NSPEC_ABf, int* NGLOB_ABf,
- int* phase_is_innerf,
- int* NSOURCESf,
- //int* itf, float* dtf, float* t0f,
- //int* SIMULATION_TYPEf,int* NSTEPf,
- //int* NOISE_TOMOGRAPHYf,
- //int* USE_FORCE_POINT_SOURCEf,
- double* h_stf_pre_compute,
- int* myrankf) {
-
-TRACE("compute_add_sources_el_cuda");
-
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
- // check if anything to do
- if( mp->nsources_local == 0 ) return;
-
- //int NSPEC_AB = *NSPEC_ABf;
- //int NGLOB_AB = *NGLOB_ABf;
- int phase_is_inner = *phase_is_innerf;
- //int it = *itf;
- //float dt = *dtf;
- //float t0 = *t0f;
- //int SIMULATION_TYPE = *SIMULATION_TYPEf;
- //int NSTEP = *NSTEPf;
- //int NOISE_TOMOGRAPHY = *NOISE_TOMOGRAPHYf;
- int NSOURCES = *NSOURCESf;
- //int USE_FORCE_POINT_SOURCE = *USE_FORCE_POINT_SOURCEf;
- int myrank = *myrankf;
-
-
- int num_blocks_x = NSOURCES;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
- num_blocks_y = num_blocks_y*2;
- }
-
- //double* d_stf_pre_compute;
- print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
- NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
-
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(5,5,5);
-
- //float* d_debug;
- // (float* accel, int* ibool, int* ispec_is_inner, int phase_is_inner,
- // float* sourcearrays, double* stf_pre_compute,int myrank,
- // int* islice_selected_source, int* ispec_selected_source,
- // int* ispec_is_elastic, int NSOURCES)
- //printf("add sources : nsources_local = %d\n",mp->nsources_local);
- //printf("add sources : stf = %e\n",h_stf_pre_compute[0]);
-
- compute_add_sources_kernel<<<grid,threads>>>(mp->d_accel,
- mp->d_ibool,
- mp->d_ispec_is_inner,
- phase_is_inner,
- mp->d_sourcearrays,
- mp->d_stf_pre_compute,
- myrank,
- mp->d_islice_selected_source,
- mp->d_ispec_selected_source,
- mp->d_ispec_is_elastic,
- NSOURCES //,d_debug
- );
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("compute_add_sources_kernel");
-#endif
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(compute_add_sources_el_s3_cuda,
- COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer,
- int* USE_FORCE_POINT_SOURCE,
- double* h_stf_pre_compute,
- int* NSOURCESf,
- int* phase_is_inner,int* myrank) {
- TRACE("compute_add_sources_el_s3_cuda");
- // EPIK_TRACER("compute_add_sources_el_s3_cuda");
-
- Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
-
- int NSOURCES = *NSOURCESf;
-
- if(*USE_FORCE_POINT_SOURCE) {
- printf("USE FORCE POINT SOURCE not implemented for GPU_MODE");
- MPI_Abort(MPI_COMM_WORLD, 1);
- }
-
- print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
- NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("compute_add_sources_el_s3_cuda");
-#endif
-
- int num_blocks_x = NSOURCES;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
- num_blocks_y = num_blocks_y*2;
- }
-
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(5,5,5);
-
- //float* d_debug;
- // float* h_debug = (float*)calloc(128,sizeof(float));
- // cudaMalloc((void**)&d_debug,128*sizeof(float));
- // cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
-
- compute_add_sources_kernel<<<grid,threads>>>(mp->d_b_accel,mp->d_ibool,
- mp->d_ispec_is_inner, *phase_is_inner,
- mp->d_sourcearrays,
- mp->d_stf_pre_compute,
- *myrank,
- mp->d_islice_selected_source,mp->d_ispec_selected_source,
- mp->d_ispec_is_elastic,
- NSOURCES //,d_debug
- );
-
- // cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
- // for(int i=0;i<10;i++) {
- // printf("debug[%d] = %e \n",i,h_debug[i]);
- // }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("compute_add_sources_el_s3_cuda");
-#endif
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// NOISE sources
-
-/* ----------------------------------------------------------------------------------------------- */
-
-__global__ void add_source_master_rec_noise_cuda_kernel(int* ibool,
- int* ispec_selected_rec,
- int irec_master_noise,
- realw* accel,
- realw* noise_sourcearray,
- int it) {
- int tx = threadIdx.x;
- int iglob = ibool[tx + 125*(ispec_selected_rec[irec_master_noise-1]-1)]-1;
-
- // not sure if we need atomic operations but just in case...
- // accel[3*iglob] += noise_sourcearray[3*tx + 3*125*it];
- // accel[1+3*iglob] += noise_sourcearray[1+3*tx + 3*125*it];
- // accel[2+3*iglob] += noise_sourcearray[2+3*tx + 3*125*it];
-
- atomicAdd(&accel[iglob*3],noise_sourcearray[3*tx + 3*125*it]);
- atomicAdd(&accel[iglob*3+1],noise_sourcearray[1+3*tx + 3*125*it]);
- atomicAdd(&accel[iglob*3+2],noise_sourcearray[2+3*tx + 3*125*it]);
-
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(add_source_master_rec_noise_cu,
- ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer_f,
- int* myrank_f,
- int* it_f,
- int* irec_master_noise_f,
- int* islice_selected_rec) {
-
-TRACE("add_source_master_rec_noise_cu");
-
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
- int it = *it_f-1; // -1 for Fortran -> C indexing differences
- int irec_master_noise = *irec_master_noise_f;
- int myrank = *myrank_f;
-
- dim3 grid(1,1,1);
- dim3 threads(125,1,1);
-
- if(myrank == islice_selected_rec[irec_master_noise-1]) {
- add_source_master_rec_noise_cuda_kernel<<<grid,threads>>>(mp->d_ibool,
- mp->d_ispec_selected_rec,
- irec_master_noise,
- mp->d_accel,
- mp->d_noise_sourcearray,
- it);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("add_source_master_rec_noise_cuda_kernel");
-#endif
- }
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// ADJOINT sources
-
-/* ----------------------------------------------------------------------------------------------- */
-
-__global__ void add_sources_el_SIM_TYPE_2_OR_3_kernel(float* accel,
- int nrec,
- float* adj_sourcearrays,
- int* ibool,
- int* ispec_is_inner,
- int* ispec_is_elastic,
- int* ispec_selected_rec,
- int phase_is_inner,
- int* islice_selected_rec,
- int* pre_computed_irec,
- int nadj_rec_local //,int myrank //,int* debugi,float* debugf
- ) {
-
- int irec_local = blockIdx.x + gridDim.x*blockIdx.y;
-
- if(irec_local < nadj_rec_local) { // when nrec > 65535, but mod(nspec_top,2) > 0, we end up with an extra block.
-
- int irec = pre_computed_irec[irec_local];
-
- int ispec = ispec_selected_rec[irec]-1;
- if( ispec_is_elastic[ispec] ){
-
- if(ispec_is_inner[ispec] == phase_is_inner) {
- int i = threadIdx.x;
- int j = threadIdx.y;
- int k = threadIdx.z;
- //int iglob = ibool[i+5*(j+5*(k+5*ispec))]-1;
- int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
-
- // atomic operations are absolutely necessary for correctness!
- atomicAdd(&(accel[0+3*iglob]),adj_sourcearrays[INDEX5(5,5,5,3,
- i,j,k,
- 0,
- irec_local)]);
-
- atomicAdd(&accel[1+3*iglob], adj_sourcearrays[INDEX5(5,5,5,3,
- i,j,k,
- 1,
- irec_local)]);
-
- atomicAdd(&accel[2+3*iglob],adj_sourcearrays[INDEX5(5,5,5,3,
- i,j,k,
- 2,
- irec_local)]);
- }
- } // ispec_is_elastic
- }
-
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(add_sources_el_sim_type_2_or_3,
- ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
- float* h_adj_sourcearrays,
- int* phase_is_inner,
- int* h_ispec_is_inner,
- int* h_ispec_is_elastic,
- int* h_ispec_selected_rec,
- int* myrank,
- int* nrec,
- int* time_index,
- int* h_islice_selected_rec,
- int* nadj_rec_local,
- int* NTSTEP_BETWEEN_READ_ADJSRC) {
-
-TRACE("add_sources_el_sim_type_2_or_3");
-
- Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
-
- // checks
- if( *nadj_rec_local != mp->nadj_rec_local) exit_on_error("add_sources_el_sim_type_2_or_3: nadj_rec_local not equal\n");
-
- //int rank;
- //MPI_Comm_rank(MPI_COMM_WORLD,&rank);
-
- // make sure grid dimension is less than 65535 in x dimension
- int num_blocks_x = mp->nadj_rec_local;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
- num_blocks_y = num_blocks_y*2;
- }
-
- dim3 grid(num_blocks_x,num_blocks_y,1);
- dim3 threads(5,5,5);
-
- //float* d_adj_sourcearrays;
- //print_CUDA_error_if_any(cudaMalloc((void**)&d_adj_sourcearrays,
- // (*nadj_rec_local)*3*125*sizeof(float)),1);
-
- //float* h_adj_sourcearrays_slice = (float*)malloc((*nadj_rec_local)*3*125*sizeof(float));
-
- //int* h_pre_computed_irec = new int[*nadj_rec_local];
-
- //int* d_pre_computed_irec;
- //cudaMalloc((void**)&d_pre_computed_irec,(*nadj_rec_local)*sizeof(int));
-
- // build slice of adj_sourcearrays because full array is *very* large.
- // note: this extracts array values for local adjoint sources at given time step "time_index"
- // from large adj_sourcearrays array into h_adj_sourcearrays_slice
- int ispec,i,j,k;
- int irec_local = 0;
- for(int irec = 0; irec < *nrec; irec++) {
- if(*myrank == h_islice_selected_rec[irec]) {
- irec_local++;
- //h_pre_computed_irec[irec_local-1] = irec;
-
- // takes only elastic sources
- ispec = h_ispec_selected_rec[irec]-1;
- if( h_ispec_is_elastic[ispec] ){
-
- if( h_ispec_is_inner[ispec] == *phase_is_inner) {
- for(k=0;k<5;k++) {
- for(j=0;j<5;j++) {
- for(i=0;i<5;i++) {
-
- mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
- i,j,k,0,
- irec_local-1)]
- = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
- *NTSTEP_BETWEEN_READ_ADJSRC,
- 3,5,5,
- irec_local-1,
- *time_index-1,
- 0,i,j,k)];
-
- mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
- i,j,k,1,
- irec_local-1)]
- = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
- *NTSTEP_BETWEEN_READ_ADJSRC,
- 3,5,5,
- irec_local-1,
- *time_index-1,
- 1,i,j,k)];
-
- mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
- i,j,k,2,
- irec_local-1)]
- = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
- *NTSTEP_BETWEEN_READ_ADJSRC,
- 3,5,5,
- irec_local-1,
- *time_index-1,
- 2,i,j,k)];
- }
- }
- }
- } // phase_is_inner
- } // h_ispec_is_elastic
- }
- }
- // check all local sources were added
- if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
-
- // printf("irec_local vs. *nadj_rec_local -> %d vs. %d\n",irec_local,*nadj_rec_local);
- // for(int ispec=0;ispec<(*nadj_rec_local);ispec++) {
- // for(int i=0;i<5;i++)
- // for(int j=0;j<5;j++)
- // for(int k=0;k<5;k++) {
- // h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,0,ispec)] =
- // h_adj_sourcearrays[INDEX6(*nadj_rec_local,*NTSTEP_BETWEEN_READ_ADJSRC,3,5,5,
- // ispec,
- // *time_index-1,
- // 0,
- // i,j,k)];
- // h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,1,ispec)] =
- // h_adj_sourcearrays[INDEX6(*nadj_rec_local,*NTSTEP_BETWEEN_READ_ADJSRC,3,5,5,
- // ispec,
- // *time_index-1,
- // 1,
- // i,j,k)];
- // h_adj_sourcearrays_slice[INDEX5(5,5,5,3,i,j,k,2,ispec)] =
- // h_adj_sourcearrays[INDEX6(*nadj_rec_local,*NTSTEP_BETWEEN_ADJSRC,3,5,5,
- // ispec,
- // *time_index-1,
- // 2,
- // i,j,k)];
- // }
-
- // }
-
- // copies extracted array values onto GPU
- cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
- (mp->nadj_rec_local)*3*125*sizeof(float),cudaMemcpyHostToDevice);
-
-
- // the irec_local variable needs to be precomputed (as
- // h_pre_comp..), because normally it is in the loop updating accel,
- // and due to how it's incremented, it cannot be parallelized
-
- // int irec_local=0;
- // for(int irec=0;irec<*nrec;irec++) {
- // if(*myrank == h_islice_selected_rec[irec]) {
- // h_pre_computed_irec_local_index[irec] = irec_local;
- // irec_local++;
- // if(irec_local==1) {
- // // printf("%d:first useful irec==%d\n",rank,irec);
- // }
- // }
- // else h_pre_computed_irec_local_index[irec] = 0;
- // }
- //cudaMemcpy(mp->d_pre_computed_irec,mp->h_pre_computed_irec,
- // (mp->nadj_rec_local)*sizeof(int),cudaMemcpyHostToDevice);
-
- // pause_for_debugger(1);
- //int* d_debugi, *h_debugi;
- //float* d_debugf, *h_debugf;
- //h_debugi = (int*)calloc(num_blocks_x,sizeof(int));
- //cudaMalloc((void**)&d_debugi,num_blocks_x*sizeof(int));
- //cudaMemcpy(d_debugi,h_debugi,num_blocks_x*sizeof(int),cudaMemcpyHostToDevice);
- //h_debugf = (float*)calloc(num_blocks_x,sizeof(float));
- //cudaMalloc((void**)&d_debugf,num_blocks_x*sizeof(float));
- //cudaMemcpy(d_debugf,h_debugf,num_blocks_x*sizeof(float),cudaMemcpyHostToDevice);
-
- add_sources_el_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_accel,
- *nrec,
- mp->d_adj_sourcearrays,
- mp->d_ibool,
- mp->d_ispec_is_inner,
- mp->d_ispec_is_elastic,
- mp->d_ispec_selected_rec,
- *phase_is_inner,
- mp->d_islice_selected_rec,
- mp->d_pre_computed_irec,
- mp->nadj_rec_local //,*myrank //,d_debugi,d_debugf
- );
-
- //cudaMemcpy(h_debugi,d_debugi,num_blocks_x*sizeof(int),cudaMemcpyDeviceToHost);
- //cudaMemcpy(h_debugf,d_debugf,num_blocks_x*sizeof(float),cudaMemcpyDeviceToHost);
-
- // printf("%d: pre_com0:%d\n",rank,h_pre_computed_irec_local_index[0]);
- // printf("%d: pre_com1:%d\n",rank,h_pre_computed_irec_local_index[1]);
- // printf("%d: pre_com2:%d\n",rank,h_pre_computed_irec_local_index[2]);
- // for(int i=156;i<(156+30);i++) {
- // if(rank==0) printf("%d:debug[%d] = i/f = %d / %e\n",rank,i,h_debugi[i],h_debugf[i]);
- // }
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- // MPI_Barrier(MPI_COMM_WORLD);
- exit_on_cuda_error("add_sources_SIM_TYPE_2_OR_3_kernel");
-
- // printf("Proc %d exiting with successful kernel\n",rank);
- // exit(1);
-#endif
- //cudaFree(d_adj_sourcearrays);
- //cudaFree(d_pre_computed_irec);
- //free(h_adj_sourcearrays_slice);
- //delete h_pre_computed_irec;
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// acoustic sources
-
-/* ----------------------------------------------------------------------------------------------- */
-
-__global__ void compute_add_sources_acoustic_kernel(float* potential_dot_dot_acoustic,
- int* ibool,
- int* ispec_is_inner,
- int phase_is_inner,
- float* sourcearrays,
- double* stf_pre_compute,
- int myrank,
- int* islice_selected_source,
- int* ispec_selected_source,
- int* ispec_is_acoustic,
- float* kappastore,
- int NSOURCES) {
- int i = threadIdx.x;
- int j = threadIdx.y;
- int k = threadIdx.z;
-
- int isource = blockIdx.x + gridDim.x*blockIdx.y; // bx
-
- int ispec;
- int iglob;
- float stf;
- float kappal;
-
- if( isource < NSOURCES ){
-
- //if(myrank == 0 && i== 0 && j == 0 && k == 0) printf("source isource = %i \n",isource);
-
- if(myrank == islice_selected_source[isource]) {
-
- ispec = ispec_selected_source[isource]-1;
-
- if(ispec_is_inner[ispec] == phase_is_inner && ispec_is_acoustic[ispec] ) {
-
- stf = (float) stf_pre_compute[isource];
- iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
- kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
-
- //printf("source ispec = %i %i %e %e \n",ispec,iglob,stf,kappal);
- //printf("source arr = %e %i %i %i %i %i\n", -sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf/kappal,i,j,k,iglob,ispec);
-
- atomicAdd(&potential_dot_dot_acoustic[iglob],
- -sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf/kappal);
-
- // potential_dot_dot_acoustic[iglob] +=
- // -sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, 0, i,j,k)]*stf/kappal;
-
- //printf("potential = %e %i %i %i %i %i\n", potential_dot_dot_acoustic[iglob],i,j,k,iglob,ispec);
-
-
- }
- }
- }
-}
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(compute_add_sources_ac_cuda,
- COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f,
- int* phase_is_innerf,
- int* NSOURCESf,
- int* SIMULATION_TYPEf,
- int* USE_FORCE_POINT_SOURCEf,
- double* h_stf_pre_compute,
- int* myrankf) {
-
-TRACE("compute_add_sources_ac_cuda");
-
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
- // check if anything to do
- if( mp->nsources_local == 0 ) return;
-
- int phase_is_inner = *phase_is_innerf;
- //int SIMULATION_TYPE = *SIMULATION_TYPEf;
- int NSOURCES = *NSOURCESf;
- //int USE_FORCE_POINT_SOURCE = *USE_FORCE_POINT_SOURCEf;
- int myrank = *myrankf;
-
- int num_blocks_x = NSOURCES;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
- num_blocks_y = num_blocks_y*2;
- }
-
- // copies pre-computed source time factors onto GPU
- print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
- NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
-
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(5,5,5);
-
- compute_add_sources_acoustic_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
- mp->d_ibool,
- mp->d_ispec_is_inner,
- phase_is_inner,
- mp->d_sourcearrays,
- mp->d_stf_pre_compute,
- myrank,
- mp->d_islice_selected_source,
- mp->d_ispec_selected_source,
- mp->d_ispec_is_acoustic,
- mp->d_kappastore,
- NSOURCES);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("compute_add_sources_ac_cuda");
-#endif
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(compute_add_sources_ac_s3_cuda,
- COMPUTE_ADD_SOURCES_AC_s3_CUDA)(long* Mesh_pointer_f,
- int* phase_is_innerf,
- int* NSOURCESf,
- int* SIMULATION_TYPEf,
- int* USE_FORCE_POINT_SOURCEf,
- double* h_stf_pre_compute,
- int* myrankf) {
-
-TRACE("compute_add_sources_ac_s3_cuda");
-
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
- // check if anything to do
- if( mp->nsources_local == 0 ) return;
-
- int phase_is_inner = *phase_is_innerf;
- //int SIMULATION_TYPE = *SIMULATION_TYPEf;
- int NSOURCES = *NSOURCESf;
- //int USE_FORCE_POINT_SOURCE = *USE_FORCE_POINT_SOURCEf;
- int myrank = *myrankf;
-
- int num_blocks_x = NSOURCES;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
- num_blocks_y = num_blocks_y*2;
- }
-
- // copies source time factors onto GPU
- print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
- NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
-
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(5,5,5);
-
- compute_add_sources_acoustic_kernel<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
- mp->d_ibool,
- mp->d_ispec_is_inner,
- phase_is_inner,
- mp->d_sourcearrays,
- mp->d_stf_pre_compute,
- myrank,
- mp->d_islice_selected_source,
- mp->d_ispec_selected_source,
- mp->d_ispec_is_acoustic,
- mp->d_kappastore,
- NSOURCES);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("compute_add_sources_ac_s3_cuda");
-#endif
-}
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// acoustic adjoint sources
-
-/* ----------------------------------------------------------------------------------------------- */
-
-__global__ void add_sources_ac_SIM_TYPE_2_OR_3_kernel(float* potential_dot_dot_acoustic,
- int nrec,
- float* adj_sourcearrays,
- int* ibool,
- int* ispec_is_inner,
- int* ispec_is_acoustic,
- int* ispec_selected_rec,
- int phase_is_inner,
- int* islice_selected_rec,
- int* pre_computed_irec,
- int nadj_rec_local,
- float* kappastore) {
-
- int irec_local = blockIdx.x + gridDim.x*blockIdx.y;
-
- // because of grid shape, irec_local can be too big
- if(irec_local < nadj_rec_local) {
-
- int irec = pre_computed_irec[irec_local];
-
- int ispec = ispec_selected_rec[irec]-1;
- if( ispec_is_acoustic[ispec] ){
-
- // checks if element is in phase_is_inner run
- if(ispec_is_inner[ispec] == phase_is_inner) {
- int i = threadIdx.x;
- int j = threadIdx.y;
- int k = threadIdx.z;
- int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
-
- //kappal = kappastore[INDEX4(5,5,5,i,j,k,ispec)];
-
- //potential_dot_dot_acoustic[iglob] += adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5,
- // pre_computed_irec_local_index[irec],
- // pre_computed_index,
- // 0,
- // i,j,k)]/kappal;
-
- // beware, for acoustic medium, a pressure source would be taking the negative
- // and divide by Kappa of the fluid;
- // this would have to be done when constructing the adjoint source.
- //
- // note: we take the first component of the adj_sourcearrays
- // the idea is to have e.g. a pressure source, where all 3 components would be the same
-
- atomicAdd(&potential_dot_dot_acoustic[iglob],adj_sourcearrays[INDEX5(5,5,5,3,
- i,j,k,
- 0,
- irec_local)] // / kappal
- );
-
- //+adj_sourcearrays[INDEX6(nadj_rec_local,NTSTEP_BETWEEN_ADJSRC,3,5,5,
- // pre_computed_irec_local_index[irec],pre_computed_index-1,
- // 0,i,j,k)] // / kappal
- // );
- }
- }
- }
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-
-extern "C"
-void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
- ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer,
- float* h_adj_sourcearrays,
- int* phase_is_inner,
- int* h_ispec_is_inner,
- int* h_ispec_is_acoustic,
- int* h_ispec_selected_rec,
- int* myrank,
- int* nrec,
- int* time_index,
- int* h_islice_selected_rec,
- int* nadj_rec_local,
- int* NTSTEP_BETWEEN_READ_ADJSRC) {
-
-TRACE("add_sources_ac_sim_2_or_3_cuda");
-
- Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
-
- // checks
- if( *nadj_rec_local != mp->nadj_rec_local) exit_on_cuda_error("add_sources_ac_sim_type_2_or_3: nadj_rec_local not equal\n");
-
- // make sure grid dimension is less than 65535 in x dimension
- int num_blocks_x = mp->nadj_rec_local;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
- num_blocks_y = num_blocks_y*2;
- }
-
- dim3 grid(num_blocks_x,num_blocks_y,1);
- dim3 threads(5,5,5);
-
- // build slice of adj_sourcearrays because full array is *very* large.
- // note: this extracts array values for local adjoint sources at given time step "time_index"
- // from large adj_sourcearrays array into h_adj_sourcearrays_slice
- int ispec,i,j,k;
- int irec_local = 0;
- for(int irec = 0; irec < *nrec; irec++) {
- if(*myrank == h_islice_selected_rec[irec]) {
- irec_local++;
-
- // takes only acoustic sources
- ispec = h_ispec_selected_rec[irec]-1;
- if( h_ispec_is_acoustic[ispec] ){
-
- if( h_ispec_is_inner[ispec] == *phase_is_inner) {
- for(k=0;k<5;k++) {
- for(j=0;j<5;j++) {
- for(i=0;i<5;i++) {
-
- mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
- i,j,k,0,
- irec_local-1)]
- = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
- *NTSTEP_BETWEEN_READ_ADJSRC,
- 3,5,5,
- irec_local-1,
- *time_index-1,
- 0,i,j,k)];
-
- mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
- i,j,k,1,
- irec_local-1)]
- = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
- *NTSTEP_BETWEEN_READ_ADJSRC,
- 3,5,5,
- irec_local-1,
- *time_index-1,
- 1,i,j,k)];
-
- mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
- i,j,k,2,
- irec_local-1)]
- = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
- *NTSTEP_BETWEEN_READ_ADJSRC,
- 3,5,5,
- irec_local-1,
- *time_index-1,
- 2,i,j,k)];
- }
- }
- }
- } // phase_is_inner
- } // h_ispec_is_acoustic
- }
- }
- // check all local sources were added
- if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
-
- // copies extracted array values onto GPU
- cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
- (mp->nadj_rec_local)*3*125*sizeof(float),cudaMemcpyHostToDevice);
-
- // launches cuda kernel for acoustic adjoint sources
- add_sources_ac_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
- *nrec,
- mp->d_adj_sourcearrays,
- mp->d_ibool,
- mp->d_ispec_is_inner,
- mp->d_ispec_is_acoustic,
- mp->d_ispec_selected_rec,
- *phase_is_inner,
- mp->d_islice_selected_rec,
- mp->d_pre_computed_irec,
- mp->nadj_rec_local,
- mp->d_kappastore);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("add_sources_acoustic_SIM_TYPE_2_OR_3_kernel");
-#endif
-}
Copied: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu (from rev 19151, seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_cuda.cu)
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu (rev 0)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_add_sources_elastic_cuda.cu 2011-11-06 02:02:36 UTC (rev 19152)
@@ -0,0 +1,422 @@
+/*
+ !=====================================================================
+ !
+ ! S p e c f e m 3 D V e r s i o n 2 . 0
+ ! ---------------------------------------
+ !
+ ! Main authors: Dimitri Komatitsch and Jeroen Tromp
+ ! Princeton University, USA and University of Pau / CNRS / INRIA
+ ! (c) Princeton University / California Institute of Technology and University of Pau / CNRS / INRIA
+ ! April 2011
+ !
+ ! This program is free software; you can redistribute it and/or modify
+ ! it under the terms of the GNU General Public License as published by
+ ! the Free Software Foundation; either version 2 of the License, or
+ ! (at your option) any later version.
+ !
+ ! This program is distributed in the hope that it will be useful,
+ ! but WITHOUT ANY WARRANTY; without even the implied warranty of
+ ! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ ! GNU General Public License for more details.
+ !
+ ! You should have received a copy of the GNU General Public License along
+ ! with this program; if not, write to the Free Software Foundation, Inc.,
+ ! 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ !
+ !=====================================================================
+ */
+
+#include <stdio.h>
+#include <cuda.h>
+#include <cublas.h>
+
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+#include "mesh_constants_cuda.h"
+// #include "epik_user.h"
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// elastic domain sources
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Adds the elastic point-source contributions to the acceleration field.
+//
+// Launch layout as used by the host wrappers in this file: one block per
+// source (block id flattened from a 2D grid) and 5x5x5 threads per block,
+// where threadIdx.(x,y,z) is the (i,j,k) point index inside the element.
+//
+// A source contributes only if it is assigned to this rank
+// (islice_selected_source), its element matches phase_is_inner and the
+// element is flagged in ispec_is_elastic.
+__global__ void compute_add_sources_kernel(realw* accel,
+                                           int* ibool,
+                                           int* ispec_is_inner,
+                                           int phase_is_inner,
+                                           realw* sourcearrays,
+                                           double* stf_pre_compute,
+                                           int myrank,
+                                           int* islice_selected_source,
+                                           int* ispec_selected_source,
+                                           int* ispec_is_elastic,
+                                           int NSOURCES) {
+
+  // flattened block id is the source index; the 2D grid built by the host
+  // can contain more blocks than there are sources
+  const int isource = blockIdx.x + gridDim.x*blockIdx.y;
+  if( isource >= NSOURCES ) return;
+
+  // skip sources that belong to another rank
+  if( myrank != islice_selected_source[isource] ) return;
+
+  // element index is shifted by -1 before being used as a C array index
+  const int ispec = ispec_selected_source[isource]-1;
+
+  // wrong phase (inner/outer) or non-elastic element: nothing to add
+  if( ispec_is_inner[ispec] != phase_is_inner ) return;
+  if( ! ispec_is_elastic[ispec] ) return;
+
+  const int i = threadIdx.x;
+  const int j = threadIdx.y;
+  const int k = threadIdx.z;
+
+  // source time function value, narrowed from double to realw
+  const realw stf = (realw) stf_pre_compute[isource];
+  const int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+
+  // accumulate the three components atomically into accel
+  for(int comp = 0; comp < 3; comp++) {
+    atomicAdd(&accel[iglob*3+comp],
+              sourcearrays[INDEX5(NSOURCES, 3, 5, 5,isource, comp, i,j,k)]*stf);
+  }
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Fortran-callable wrapper: uploads the source time function values and
+// launches compute_add_sources_kernel on the forward field mp->d_accel.
+//
+//   Mesh_pointer_f    - integer container holding the Mesh* address
+//   phase_is_innerf   - phase flag forwarded to the kernel
+//   NSOURCESf         - number of sources (entries in h_stf_pre_compute)
+//   h_stf_pre_compute - host array of NSOURCES doubles
+//   myrankf           - rank forwarded to the kernel
+extern "C"
+void FC_FUNC_(compute_add_sources_el_cuda,
+              COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f,
+                                           int* phase_is_innerf,
+                                           int* NSOURCESf,
+                                           double* h_stf_pre_compute,
+                                           int* myrankf) {
+
+  TRACE("compute_add_sources_el_cuda");
+
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  // check if anything to do
+  if( mp->nsources_local == 0 ) return;
+
+  const int nsources = *NSOURCESf;
+
+  // copies the source time function values onto the GPU
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
+                                     nsources*sizeof(double),cudaMemcpyHostToDevice),18);
+
+  // one block per source; halve x (rounding up) and double y until the
+  // x dimension does not exceed 65535
+  int nblocks_x = nsources;
+  int nblocks_y = 1;
+  for(; nblocks_x > 65535; nblocks_y *= 2) {
+    nblocks_x = (nblocks_x + 1)/2;
+  }
+
+  dim3 grid(nblocks_x,nblocks_y);
+  dim3 threads(5,5,5);
+
+  compute_add_sources_kernel<<<grid,threads>>>(mp->d_accel,
+                                               mp->d_ibool,
+                                               mp->d_ispec_is_inner,
+                                               *phase_is_innerf,
+                                               mp->d_sourcearrays,
+                                               mp->d_stf_pre_compute,
+                                               *myrankf,
+                                               mp->d_islice_selected_source,
+                                               mp->d_ispec_selected_source,
+                                               mp->d_ispec_is_elastic,
+                                               nsources);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_add_sources_kernel");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Fortran-callable wrapper: uploads the source time function values and
+// launches compute_add_sources_kernel on the field mp->d_b_accel.
+//
+//   Mesh_pointer      - integer container holding the Mesh* address
+//   h_stf_pre_compute - host array of NSOURCES doubles
+//   NSOURCESf         - number of sources (entries in h_stf_pre_compute)
+//   phase_is_inner    - phase flag forwarded to the kernel
+//   myrank            - rank forwarded to the kernel
+extern "C"
+void FC_FUNC_(compute_add_sources_el_s3_cuda,
+              COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer,
+                                              double* h_stf_pre_compute,
+                                              int* NSOURCESf,
+                                              int* phase_is_inner,
+                                              int* myrank) {
+  TRACE("compute_add_sources_el_s3_cuda");
+  // EPIK_TRACER("compute_add_sources_el_s3_cuda");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  // check if anything to do (same guard as compute_add_sources_el_cuda):
+  // without local sources the kernel adds nothing, so the host-to-device
+  // copy and the launch can be skipped
+  if( mp->nsources_local == 0 ) return;
+
+  int NSOURCES = *NSOURCESf;
+
+  // copies the source time function values onto the GPU
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_stf_pre_compute,h_stf_pre_compute,
+                                     NSOURCES*sizeof(double),cudaMemcpyHostToDevice),18);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_add_sources_el_s3_cuda");
+#endif
+
+  // one block per source; halve x (rounding up) and double y until the
+  // x dimension does not exceed 65535
+  int num_blocks_x = NSOURCES;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y);
+  dim3 threads(5,5,5);
+
+  compute_add_sources_kernel<<<grid,threads>>>(mp->d_b_accel,mp->d_ibool,
+                                               mp->d_ispec_is_inner, *phase_is_inner,
+                                               mp->d_sourcearrays,
+                                               mp->d_stf_pre_compute,
+                                               *myrank,
+                                               mp->d_islice_selected_source,mp->d_ispec_selected_source,
+                                               mp->d_ispec_is_elastic,
+                                               NSOURCES);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("compute_add_sources_el_s3_cuda");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// NOISE sources
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Adds the noise source array of the master receiver, for time index `it`,
+// to the acceleration field.
+//
+// Launched by the host wrapper in this file with a single block of NGLL3
+// threads; threadIdx.x selects the point inside the receiver's element.
+__global__ void add_source_master_rec_noise_cuda_kernel(int* ibool,
+                                                        int* ispec_selected_rec,
+                                                        int irec_master_noise,
+                                                        realw* accel,
+                                                        realw* noise_sourcearray,
+                                                        int it) {
+  const int tx = threadIdx.x;
+
+  // both the receiver index and the stored element index are shifted by -1
+  // before being used as C array indices
+  const int ispec = ispec_selected_rec[irec_master_noise-1]-1;
+  const int iglob = ibool[tx + NGLL3*(ispec)]-1;
+
+  // not sure if we need atomic operations but just in case...
+  for(int comp = 0; comp < 3; comp++) {
+    atomicAdd(&accel[iglob*3+comp],noise_sourcearray[comp+3*tx + 3*NGLL3*it]);
+  }
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Fortran-callable wrapper: launches add_source_master_rec_noise_cuda_kernel
+// on mp->d_accel, but only on the rank that owns the master receiver.
+//
+//   Mesh_pointer_f      - integer container holding the Mesh* address
+//   myrank_f            - rank of the calling process
+//   it_f                - time step, shifted by -1 before use
+//   irec_master_noise_f - master receiver index, shifted by -1 when indexing
+//   islice_selected_rec - host array giving the owning rank of each receiver
+extern "C"
+void FC_FUNC_(add_source_master_rec_noise_cu,
+              ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer_f,
+                                              int* myrank_f,
+                                              int* it_f,
+                                              int* irec_master_noise_f,
+                                              int* islice_selected_rec) {
+
+  TRACE("add_source_master_rec_noise_cu");
+
+  //get mesh pointer out of fortran integer container
+  Mesh* mp = (Mesh*)(*Mesh_pointer_f);
+
+  const int irec_master_noise = *irec_master_noise_f;
+
+  // only the rank holding the master receiver has something to add
+  if( *myrank_f != islice_selected_rec[irec_master_noise-1] ) return;
+
+  // -1 for Fortran -> C indexing differences
+  const int it = *it_f-1;
+
+  // a single block with one thread per point of the element
+  dim3 grid(1,1,1);
+  dim3 threads(NGLL3,1,1);
+
+  add_source_master_rec_noise_cuda_kernel<<<grid,threads>>>(mp->d_ibool,
+                                                            mp->d_ispec_selected_rec,
+                                                            irec_master_noise,
+                                                            mp->d_accel,
+                                                            mp->d_noise_sourcearray,
+                                                            it);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("add_source_master_rec_noise_cuda_kernel");
+#endif
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// ADJOINT sources
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Adds the adjoint source slice for the current time step to the
+// acceleration field, for receivers located in elastic elements.
+//
+// Launch layout as used by the host wrapper in this file: one block per
+// local adjoint receiver (block id flattened from a 2D grid) and 5x5x5
+// threads per block, where threadIdx.(x,y,z) is the (i,j,k) point index.
+//
+// adj_sourcearrays is indexed as INDEX5(5,5,5,3, i,j,k, component, irec_local).
+// The nrec argument is not read by the kernel.
+__global__ void add_sources_el_SIM_TYPE_2_OR_3_kernel(realw* accel,
+                                                      int nrec,
+                                                      realw* adj_sourcearrays,
+                                                      int* ibool,
+                                                      int* ispec_is_inner,
+                                                      int* ispec_is_elastic,
+                                                      int* ispec_selected_rec,
+                                                      int phase_is_inner,
+                                                      int* pre_computed_irec,
+                                                      int nadj_rec_local) {
+
+  // flattened block id is the local receiver index; the 2D grid built by
+  // the host can contain more blocks than there are local receivers
+  const int irec_local = blockIdx.x + gridDim.x*blockIdx.y;
+  if( irec_local >= nadj_rec_local ) return;
+
+  // local receiver index -> receiver index -> element index (shifted by -1)
+  const int irec = pre_computed_irec[irec_local];
+  const int ispec = ispec_selected_rec[irec]-1;
+
+  // takes only receivers in elastic elements of the requested phase
+  if( ! ispec_is_elastic[ispec] ) return;
+  if( ispec_is_inner[ispec] != phase_is_inner ) return;
+
+  const int i = threadIdx.x;
+  const int j = threadIdx.y;
+  const int k = threadIdx.z;
+  const int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)]-1;
+
+  // atomic operations are absolutely necessary for correctness!
+  for(int comp = 0; comp < 3; comp++) {
+    atomicAdd(&accel[comp+3*iglob],adj_sourcearrays[INDEX5(5,5,5,3,
+                                                            i,j,k,
+                                                            comp,
+                                                            irec_local)]);
+  }
+
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+// Fortran-callable wrapper: extracts the adjoint source values of the
+// current time step for the local receivers, uploads them to the GPU and
+// launches add_sources_el_SIM_TYPE_2_OR_3_kernel on mp->d_accel.
+//
+//   Mesh_pointer               - integer container holding the Mesh* address
+//   h_adj_sourcearrays         - host adjoint source array, read through
+//                                INDEX6(*nadj_rec_local,*NTSTEP_BETWEEN_READ_ADJSRC,3,5,5,
+//                                       irec_local-1,*time_index-1,component,i,j,k)
+//   phase_is_inner             - phase flag; only elements whose
+//                                h_ispec_is_inner entry equals it are handled
+//   h_ispec_is_inner           - host per-element phase flags
+//   h_ispec_is_elastic         - host per-element elastic flags
+//   h_ispec_selected_rec       - host element index of each receiver (shifted by -1 here)
+//   myrank                     - rank of the calling process
+//   nrec                       - total number of receivers
+//   time_index                 - time index into h_adj_sourcearrays (shifted by -1 here)
+//   h_islice_selected_rec      - host array giving the owning rank of each receiver
+//   nadj_rec_local             - number of local adjoint receivers; must equal
+//                                mp->nadj_rec_local
+//   NTSTEP_BETWEEN_READ_ADJSRC - second dimension of h_adj_sourcearrays
+//
+// Aborts through exit_on_error() if the receiver counts are inconsistent.
+extern "C"
+void FC_FUNC_(add_sources_el_sim_type_2_or_3,
+              ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
+                                              realw* h_adj_sourcearrays,
+                                              int* phase_is_inner,
+                                              int* h_ispec_is_inner,
+                                              int* h_ispec_is_elastic,
+                                              int* h_ispec_selected_rec,
+                                              int* myrank,
+                                              int* nrec,
+                                              int* time_index,
+                                              int* h_islice_selected_rec,
+                                              int* nadj_rec_local,
+                                              int* NTSTEP_BETWEEN_READ_ADJSRC) {
+
+  TRACE("add_sources_el_sim_type_2_or_3");
+
+  Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
+
+  // checks
+  if( *nadj_rec_local != mp->nadj_rec_local) exit_on_error("add_sources_el_sim_type_2_or_3: nadj_rec_local not equal\n");
+
+  // build slice of adj_sourcearrays because full array is *very* large.
+  // note: this extracts array values for local adjoint sources at given time step "time_index"
+  //          from large adj_sourcearrays array into h_adj_sourcearrays_slice
+  // Slice entries of receivers that fail the elastic / phase test are left
+  // untouched.
+  int ispec,i,j,k;
+  int irec_local = 0;
+  for(int irec = 0; irec < *nrec; irec++) {
+    // receivers owned by other ranks are skipped
+    if(*myrank != h_islice_selected_rec[irec]) continue;
+
+    // every local receiver is counted, even if it contributes nothing below
+    irec_local++;
+
+    // takes only elastic sources
+    ispec = h_ispec_selected_rec[irec]-1;
+    if( ! h_ispec_is_elastic[ispec] ) continue;
+
+    // takes only elements of the current phase
+    if( h_ispec_is_inner[ispec] != *phase_is_inner ) continue;
+
+    for(k=0;k<5;k++) {
+      for(j=0;j<5;j++) {
+        for(i=0;i<5;i++) {
+          // copy the three components of this point
+          for(int comp=0;comp<3;comp++) {
+            mp->h_adj_sourcearrays_slice[INDEX5(5,5,5,3,
+                                                i,j,k,comp,
+                                                irec_local-1)]
+              = h_adj_sourcearrays[INDEX6(*nadj_rec_local,
+                                          *NTSTEP_BETWEEN_READ_ADJSRC,
+                                          3,5,5,
+                                          irec_local-1,
+                                          *time_index-1,
+                                          comp,i,j,k)];
+          }
+        }
+      }
+    }
+  }
+  // check all local sources were added
+  if( irec_local != mp->nadj_rec_local) exit_on_error("irec_local not equal to nadj_rec_local\n");
+
+  // nothing to upload or launch without local adjoint receivers; a grid
+  // with zero blocks would be an invalid launch configuration
+  if( mp->nadj_rec_local == 0 ) return;
+
+  // make sure grid dimension is less than 65535 in x dimension
+  int num_blocks_x = mp->nadj_rec_local;
+  int num_blocks_y = 1;
+  while(num_blocks_x > 65535) {
+    num_blocks_x = ceil(num_blocks_x/2.0);
+    num_blocks_y = num_blocks_y*2;
+  }
+
+  dim3 grid(num_blocks_x,num_blocks_y,1);
+  dim3 threads(5,5,5);
+
+  // copies extracted array values onto GPU; the copy is now checked like the
+  // other host-to-device copies in this file
+  // NOTE(review): 98001 is an arbitrary tag for this call site - confirm it
+  // does not clash with the project's existing error codes
+  print_CUDA_error_if_any(cudaMemcpy(mp->d_adj_sourcearrays, mp->h_adj_sourcearrays_slice,
+                                     (mp->nadj_rec_local)*3*NGLL3*sizeof(realw),cudaMemcpyHostToDevice),98001);
+
+  // the irec_local variable needs to be precomputed (as
+  // h_pre_comp..), because normally it is in the loop updating accel,
+  // and due to how it's incremented, it cannot be parallelized
+
+  add_sources_el_SIM_TYPE_2_OR_3_kernel<<<grid,threads>>>(mp->d_accel,
+                                                          *nrec,
+                                                          mp->d_adj_sourcearrays,
+                                                          mp->d_ibool,
+                                                          mp->d_ispec_is_inner,
+                                                          mp->d_ispec_is_elastic,
+                                                          mp->d_ispec_selected_rec,
+                                                          *phase_is_inner,
+                                                          mp->d_pre_computed_irec,
+                                                          mp->nadj_rec_local);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+  exit_on_cuda_error("add_sources_SIM_TYPE_2_OR_3_kernel");
+#endif
+}
+
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_coupling_cuda.cu 2011-11-06 02:02:36 UTC (rev 19152)
@@ -29,7 +29,6 @@
#include <stdio.h>
#include <cuda.h>
#include <cublas.h>
-#include <mpi.h>
#include <sys/time.h>
#include <sys/resource.h>
@@ -44,13 +43,13 @@
/* ----------------------------------------------------------------------------------------------- */
-__global__ void compute_coupling_acoustic_el_kernel(float* displ,
- float* potential_dot_dot_acoustic,
+__global__ void compute_coupling_acoustic_el_kernel(realw* displ,
+ realw* potential_dot_dot_acoustic,
int num_coupling_ac_el_faces,
int* coupling_ac_el_ispec,
int* coupling_ac_el_ijk,
- float* coupling_ac_el_normal,
- float* coupling_ac_el_jacobian2Dw,
+ realw* coupling_ac_el_normal,
+ realw* coupling_ac_el_jacobian2Dw,
int* ibool,
int* ispec_is_inner,
int phase_is_inner) {
@@ -130,8 +129,7 @@
int SIMULATION_TYPE = *SIMULATION_TYPEf;
// way 1: exact blocksize to match NGLLSQUARE
- int blocksize = 25;
-
+ int blocksize = NGLL2;
int num_blocks_x = num_coupling_ac_el_faces;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
@@ -183,13 +181,13 @@
/* ----------------------------------------------------------------------------------------------- */
-__global__ void compute_coupling_elastic_ac_kernel(float* potential_dot_dot_acoustic,
- float* accel,
+__global__ void compute_coupling_elastic_ac_kernel(realw* potential_dot_dot_acoustic,
+ realw* accel,
int num_coupling_ac_el_faces,
int* coupling_ac_el_ispec,
int* coupling_ac_el_ijk,
- float* coupling_ac_el_normal,
- float* coupling_ac_el_jacobian2Dw,
+ realw* coupling_ac_el_normal,
+ realw* coupling_ac_el_jacobian2Dw,
int* ibool,
int* ispec_is_inner,
int phase_is_inner) {
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_acoustic_cuda.cu 2011-11-06 02:02:36 UTC (rev 19152)
@@ -29,7 +29,6 @@
#include <stdio.h>
#include <cuda.h>
#include <cublas.h>
-#include <mpi.h>
#include <sys/time.h>
#include <sys/resource.h>
@@ -42,8 +41,8 @@
// prepares a device array with with all inter-element edge-nodes -- this
// is followed by a memcpy and MPI operations
-__global__ void prepare_boundary_potential_on_device(float* d_potential_dot_dot_acoustic,
- float* d_send_potential_dot_dot_buffer,
+__global__ void prepare_boundary_potential_on_device(realw* d_potential_dot_dot_acoustic,
+ realw* d_send_potential_dot_dot_buffer,
int num_interfaces_ext_mesh,
int max_nibool_interfaces_ext_mesh,
int* d_nibool_interfaces_ext_mesh,
@@ -70,8 +69,8 @@
TRANSFER_BOUN_POT_FROM_DEVICE)(
int* size,
long* Mesh_pointer_f,
- float* potential_dot_dot_acoustic,
- float* send_potential_dot_dot_buffer,
+ realw* potential_dot_dot_acoustic,
+ realw* send_potential_dot_dot_buffer,
int* num_interfaces_ext_mesh,
int* max_nibool_interfaces_ext_mesh,
int* nibool_interfaces_ext_mesh,
@@ -84,8 +83,8 @@
if( *num_interfaces_ext_mesh == 0 ) return;
- int blocksize = 256;
- int size_padded = ((int)ceil(((double)*max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
+ int blocksize = BLOCKSIZE_TRANSFER;
+ int size_padded = ((int)ceil(((double)(mp->max_nibool_interfaces_ext_mesh))/((double)blocksize)))*blocksize;
int num_blocks_x = size_padded/blocksize;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
@@ -99,22 +98,22 @@
if(*FORWARD_OR_ADJOINT == 1) {
prepare_boundary_potential_on_device<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
mp->d_send_potential_dot_dot_buffer,
- *num_interfaces_ext_mesh,
- *max_nibool_interfaces_ext_mesh,
+ mp->num_interfaces_ext_mesh,
+ mp->max_nibool_interfaces_ext_mesh,
mp->d_nibool_interfaces_ext_mesh,
mp->d_ibool_interfaces_ext_mesh);
}
else if(*FORWARD_OR_ADJOINT == 3) {
prepare_boundary_potential_on_device<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
mp->d_send_potential_dot_dot_buffer,
- *num_interfaces_ext_mesh,
- *max_nibool_interfaces_ext_mesh,
+ mp->num_interfaces_ext_mesh,
+ mp->max_nibool_interfaces_ext_mesh,
mp->d_nibool_interfaces_ext_mesh,
mp->d_ibool_interfaces_ext_mesh);
}
- cudaMemcpy(send_potential_dot_dot_buffer,mp->d_send_potential_dot_dot_buffer,
- *max_nibool_interfaces_ext_mesh* *num_interfaces_ext_mesh*sizeof(realw),cudaMemcpyDeviceToHost);
+ print_CUDA_error_if_any(cudaMemcpy(send_potential_dot_dot_buffer,mp->d_send_potential_dot_dot_buffer,
+ (mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw),cudaMemcpyDeviceToHost),98000);
// finish timing of kernel+memcpy
// cudaEventRecord( stop, 0 );
@@ -132,8 +131,8 @@
/* ----------------------------------------------------------------------------------------------- */
-__global__ void assemble_boundary_potential_on_device(float* d_potential_dot_dot_acoustic,
- float* d_send_potential_dot_dot_buffer,
+__global__ void assemble_boundary_potential_on_device(realw* d_potential_dot_dot_acoustic,
+ realw* d_send_potential_dot_dot_buffer,
int num_interfaces_ext_mesh,
int max_nibool_interfaces_ext_mesh,
int* d_nibool_interfaces_ext_mesh,
@@ -182,18 +181,18 @@
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
//double start_time = get_time();
// cudaEvent_t start, stop;
- // float time;
+ // realw time;
// cudaEventCreate(&start);
// cudaEventCreate(&stop);
// cudaEventRecord( start, 0 );
// copies buffer onto GPU
cudaMemcpy(mp->d_send_potential_dot_dot_buffer, buffer_recv_scalar_ext_mesh,
- *max_nibool_interfaces_ext_mesh* *num_interfaces_ext_mesh*sizeof(realw), cudaMemcpyHostToDevice);
+ (mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw), cudaMemcpyHostToDevice);
// assembles on GPU
- int blocksize = 256;
- int size_padded = ((int)ceil(((double)*max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
+ int blocksize = BLOCKSIZE_TRANSFER;
+ int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
int num_blocks_x = size_padded/blocksize;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
@@ -208,8 +207,8 @@
//assemble forward field
assemble_boundary_potential_on_device<<<grid,threads>>>(mp->d_potential_dot_dot_acoustic,
mp->d_send_potential_dot_dot_buffer,
- *num_interfaces_ext_mesh,
- *max_nibool_interfaces_ext_mesh,
+ mp->num_interfaces_ext_mesh,
+ mp->max_nibool_interfaces_ext_mesh,
mp->d_nibool_interfaces_ext_mesh,
mp->d_ibool_interfaces_ext_mesh);
}
@@ -217,8 +216,8 @@
//assemble reconstructed/backward field
assemble_boundary_potential_on_device<<<grid,threads>>>(mp->d_b_potential_dot_dot_acoustic,
mp->d_send_potential_dot_dot_buffer,
- *num_interfaces_ext_mesh,
- *max_nibool_interfaces_ext_mesh,
+ mp->num_interfaces_ext_mesh,
+ mp->max_nibool_interfaces_ext_mesh,
mp->d_nibool_interfaces_ext_mesh,
mp->d_ibool_interfaces_ext_mesh);
}
@@ -239,23 +238,6 @@
/* ----------------------------------------------------------------------------------------------- */
-//void Kernel_2_acoustic(int nb_blocks_to_compute, Mesh* mp, int d_iphase, int SIMULATION_TYPE);
-
-//__global__ void Kernel_2_acoustic_impl(int nb_blocks_to_compute,int NGLOB, int* d_ibool,int* d_phase_ispec_inner_acoustic,
-// int num_phase_ispec_acoustic, int d_iphase,
-// float* d_potential_acoustic, float* d_potential_dot_dot_acoustic,
-// float* d_xix, float* d_xiy, float* d_xiz, float* d_etax, float* d_etay, float* d_etaz,
-// float* d_gammax, float* d_gammay, float* d_gammaz,
-// float* hprime_xx, float* hprime_yy, float* hprime_zz,
-// float* hprimewgll_xx, float* hprimewgll_yy, float* hprimewgll_zz,
-// float* wgllwgll_xy,float* wgllwgll_xz,float* wgllwgll_yz,
-// float* d_rhostore);
-
-
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
/* KERNEL 2 */
/* ----------------------------------------------------------------------------------------------- */
@@ -267,20 +249,20 @@
int num_phase_ispec_acoustic,
int d_iphase,
int use_mesh_coloring_gpu,
- float* d_potential_acoustic, float* d_potential_dot_dot_acoustic,
- float* d_xix, float* d_xiy, float* d_xiz,
- float* d_etax, float* d_etay, float* d_etaz,
- float* d_gammax, float* d_gammay, float* d_gammaz,
- float* hprime_xx, float* hprime_yy, float* hprime_zz,
- float* hprimewgll_xx, float* hprimewgll_yy, float* hprimewgll_zz,
- float* wgllwgll_xy,float* wgllwgll_xz,float* wgllwgll_yz,
- float* d_rhostore){
+ realw* d_potential_acoustic, realw* d_potential_dot_dot_acoustic,
+ realw* d_xix, realw* d_xiy, realw* d_xiz,
+ realw* d_etax, realw* d_etay, realw* d_etaz,
+ realw* d_gammax, realw* d_gammay, realw* d_gammaz,
+ realw* hprime_xx, realw* hprime_yy, realw* hprime_zz,
+ realw* hprimewgll_xx, realw* hprimewgll_yy, realw* hprimewgll_zz,
+ realw* wgllwgll_xy,realw* wgllwgll_xz,realw* wgllwgll_yz,
+ realw* d_rhostore){
int bx = blockIdx.y*gridDim.x+blockIdx.x;
int tx = threadIdx.x;
- const int NGLL3 = 125;
- const int NGLL3_ALIGN = 128;
+ //const int NGLL3 = NGLL3;
+ const int NGLL3_ALIGN = NGLL3_PADDED;
int K = (tx/NGLL2);
int J = ((tx-K*NGLL2)/NGLLX);
@@ -296,7 +278,7 @@
#ifndef MANUALLY_UNROLLED_LOOPS
int l;
- float hp1,hp2,hp3;
+ realw hp1,hp2,hp3;
#endif
__shared__ reald s_dummy_loc[NGLL3];
@@ -326,7 +308,7 @@
#endif
// iglob = d_ibool[working_element*NGLL3_ALIGN + tx]-1;
- iglob = d_ibool[working_element*125 + tx]-1;
+ iglob = d_ibool[working_element*NGLL3 + tx]-1;
#ifdef USE_TEXTURES
s_dummy_loc[tx] = tex1Dfetch(tex_potential_acoustic, iglob);
@@ -516,16 +498,16 @@
void Kernel_2_acoustic(int nb_blocks_to_compute, Mesh* mp, int d_iphase,
int SIMULATION_TYPE,
int* d_ibool,
- float* d_xix,
- float* d_xiy,
- float* d_xiz,
- float* d_etax,
- float* d_etay,
- float* d_etaz,
- float* d_gammax,
- float* d_gammay,
- float* d_gammaz,
- float* d_rhostore)
+ realw* d_xix,
+ realw* d_xiy,
+ realw* d_xiz,
+ realw* d_etax,
+ realw* d_etay,
+ realw* d_etaz,
+ realw* d_gammax,
+ realw* d_gammay,
+ realw* d_gammaz,
+ realw* d_rhostore)
{
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -543,13 +525,13 @@
num_blocks_y = num_blocks_y*2;
}
- int threads_2 = 128;//BLOCK_SIZE_K2;
+ int threads_2 = NGLL3_PADDED;//BLOCK_SIZE_K2;
dim3 grid_2(num_blocks_x,num_blocks_y);
// Cuda timing
// cudaEvent_t start, stop;
- // float time;
+ // realw time;
// cudaEventCreate(&start);
// cudaEventCreate(&stop);
// cudaEventRecord( start, 0 );
@@ -631,10 +613,6 @@
if( num_elements == 0 ) return;
- //int myrank;
- /* MPI_Comm_rank(MPI_COMM_WORLD,&myrank); */
- /* if(myrank==0) { */
-
// mesh coloring
if( mp->use_mesh_coloring_gpu ){
@@ -654,7 +632,7 @@
// array offsets (acoustic elements start after elastic ones)
color_offset = mp->nspec_elastic * NGLL3_PADDED;
- color_offset_nonpadded = mp->nspec_elastic * NGLL3_NONPADDED;
+ color_offset_nonpadded = mp->nspec_elastic * NGLL3;
}else{
// inner element colors (start after outer elements)
nb_colors = mp->num_colors_outer_acoustic + mp->num_colors_inner_acoustic;
@@ -662,7 +640,7 @@
// array offsets (inner elements start after outer ones)
color_offset = ( mp->nspec_elastic + (*nspec_outer_acoustic) ) * NGLL3_PADDED;
- color_offset_nonpadded = ( mp->nspec_elastic + (*nspec_outer_acoustic) ) * NGLL3_NONPADDED;
+ color_offset_nonpadded = ( mp->nspec_elastic + (*nspec_outer_acoustic) ) * NGLL3;
}
// loops over colors
@@ -670,12 +648,6 @@
nb_blocks_to_compute = mp->h_num_elem_colors_acoustic[icolor];
- // checks
- //if( nb_blocks_to_compute <= 0 ){
- // printf("error number of acoustic color blocks: %d -- color = %d \n",nb_blocks_to_compute,icolor);
- // exit(EXIT_FAILURE);
- //}
-
Kernel_2_acoustic(nb_blocks_to_compute,mp,*iphase,
*SIMULATION_TYPE,
mp->d_ibool + color_offset_nonpadded,
@@ -693,13 +665,12 @@
// for padded and aligned arrays
color_offset += nb_blocks_to_compute * NGLL3_PADDED;
// for no-aligned arrays
- color_offset_nonpadded += nb_blocks_to_compute * NGLL3_NONPADDED;
+ color_offset_nonpadded += nb_blocks_to_compute * NGLL3;
}
}else{
// no mesh coloring: uses atomic updates
-
Kernel_2_acoustic(num_elements, mp, *iphase,
*SIMULATION_TYPE,
mp->d_ibool,
@@ -715,14 +686,6 @@
mp->d_rhostore);
}
-
- //cudaThreadSynchronize();
-
- //#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- /* MPI_Barrier(MPI_COMM_WORLD); */
- //double end_time = get_time();
- //printf("Elapsed time: %e\n",end_time-start_time);
- //#endif
}
/* ----------------------------------------------------------------------------------------------- */
@@ -732,9 +695,9 @@
/* ----------------------------------------------------------------------------------------------- */
-__global__ void kernel_3_a_acoustic_cuda_device(float* potential_dot_dot_acoustic,
+__global__ void kernel_3_a_acoustic_cuda_device(realw* potential_dot_dot_acoustic,
int size,
- float* rmass_acoustic) {
+ realw* rmass_acoustic) {
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
/* because of block and grid sizing problems, there is a small */
@@ -747,11 +710,11 @@
/* ----------------------------------------------------------------------------------------------- */
-__global__ void kernel_3_b_acoustic_cuda_device(float* potential_dot_acoustic,
- float* potential_dot_dot_acoustic,
+__global__ void kernel_3_b_acoustic_cuda_device(realw* potential_dot_acoustic,
+ realw* potential_dot_dot_acoustic,
int size,
realw deltatover2,
- float* rmass_acoustic) {
+ realw* rmass_acoustic) {
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
/* because of block and grid sizing problems, there is a small */
@@ -775,7 +738,7 @@
Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
int size = *size_F;
- int blocksize=128;
+ int blocksize = BLOCKSIZE_KERNEL3;
int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
int num_blocks_x = size_padded/blocksize;
int num_blocks_y = 1;
@@ -808,9 +771,9 @@
void FC_FUNC_(kernel_3_b_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)(
long* Mesh_pointer,
int* size_F,
- float* deltatover2_F,
+ realw* deltatover2_F,
int* SIMULATION_TYPE,
- float* b_deltatover2_F) {
+ realw* b_deltatover2_F) {
TRACE("kernel_3_b_acoustic_cuda");
@@ -819,7 +782,7 @@
realw deltatover2 = *deltatover2_F;
realw b_deltatover2 = *b_deltatover2_F;
- int blocksize=128;
+ int blocksize = BLOCKSIZE_KERNEL3;
int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
int num_blocks_x = size_padded/blocksize;
int num_blocks_y = 1;
@@ -858,9 +821,9 @@
__global__ void enforce_free_surface_cuda_kernel(
- float* potential_acoustic,
- float* potential_dot_acoustic,
- float* potential_dot_dot_acoustic,
+ realw* potential_acoustic,
+ realw* potential_dot_acoustic,
+ realw* potential_dot_dot_acoustic,
int num_free_surface_faces,
int* free_surface_ispec,
int* free_surface_ijk,
@@ -874,20 +837,12 @@
int ispec = free_surface_ispec[iface]-1;
-//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-// if( iface > 648-1 ){printf("device iface: %i \n",iface);}
-//#endif
-
// checks if element is in acoustic domain
if( ispec_is_acoustic[ispec] ){
// gets global point index
int igll = threadIdx.x + threadIdx.y*blockDim.x;
-//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-// if( igll > 25-1 ){printf("device igll: %i \n",igll);}
-//#endif
-
int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1; // (1,igll,iface)
int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
@@ -898,10 +853,6 @@
potential_acoustic[iglob] = 0;
potential_dot_acoustic[iglob] = 0;
potential_dot_dot_acoustic[iglob] = 0;
-
-//#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
-// if( ispec == 160 && igll < 25 ){printf("device: %i %i %i %i %i \n",igll,i,j,k,iglob);}
-//#endif
}
}
}
@@ -931,26 +882,8 @@
num_blocks_y = num_blocks_y*2;
}
dim3 grid(num_blocks_x,num_blocks_y,1);
- dim3 threads(25,1,1);
+ dim3 threads(NGLL2,1,1);
- //#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- // debugging
- //int* d_debug;
- //printf("acoustic_enforce_free_surf_cuda ...\n");
- //print_CUDA_error_if_any(cudaMalloc((void**)&d_debug,128*sizeof(int)),999);
-
- //int* h_debug;
- //h_debug = (int*) calloc(128,sizeof(int));
- //for(int i=0;i<128;i++){h_debug[i] = 0;}
- //cudaMemcpy(d_debug,h_debug,128*sizeof(int),cudaMemcpyHostToDevice);
-
- //printf("acoustic_enforce_free_surf_cuda start...\n");
- //doesnt' work...: printf("free_surface_ispec: %i %i %i \n",mp->d_free_surface_ispec[0],mp->d_free_surface_ispec[1],mp->d_free_surface_ispec[2]);
- //printf("free_surface_ispec: %i \n",mp->num_free_surface_faces);
-
- //cudaThreadSynchronize();
- //#endif
-
// sets potentials to zero at free surface
enforce_free_surface_cuda_kernel<<<grid,threads>>>(mp->d_potential_acoustic,
mp->d_potential_dot_acoustic,
@@ -972,15 +905,6 @@
mp->d_ispec_is_acoustic);
}
- //#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- //cudaThreadSynchronize();
- //cudaMemcpy(h_debug,d_debug,128*sizeof(int),cudaMemcpyDeviceToHost);
- //for(int i=0;i<25;i++) {printf("ispec d_debug = %d \n",h_debug[i]);}
- //cudaFree(d_debug);
- //free(h_debug);
- //exit(1);
- //#endif
-
}
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_forces_elastic_cuda.cu 2011-11-06 02:02:36 UTC (rev 19152)
@@ -29,7 +29,6 @@
#include <stdio.h>
#include <cuda.h>
#include <cublas.h>
-#include <mpi.h>
#include <sys/time.h>
#include <sys/resource.h>
@@ -40,51 +39,29 @@
// cuda constant arrays
-__constant__ float d_hprime_xx[NGLL2];
-__constant__ float d_hprime_yy[NGLL2];
-__constant__ float d_hprime_zz[NGLL2];
-__constant__ float d_hprimewgll_xx[NGLL2];
-__constant__ float d_hprimewgll_yy[NGLL2];
-__constant__ float d_hprimewgll_zz[NGLL2];
-__constant__ float d_wgllwgll_xy[NGLL2];
-__constant__ float d_wgllwgll_xz[NGLL2];
-__constant__ float d_wgllwgll_yz[NGLL2];
+__constant__ realw d_hprime_xx[NGLL2];
+__constant__ realw d_hprime_yy[NGLL2];
+__constant__ realw d_hprime_zz[NGLL2];
+__constant__ realw d_hprimewgll_xx[NGLL2];
+__constant__ realw d_hprimewgll_yy[NGLL2];
+__constant__ realw d_hprimewgll_zz[NGLL2];
+__constant__ realw d_wgllwgll_xy[NGLL2];
+__constant__ realw d_wgllwgll_xz[NGLL2];
+__constant__ realw d_wgllwgll_yz[NGLL2];
-//void Kernel_2(int nb_blocks_to_compute, Mesh* mp, int d_iphase,
-// int COMPUTE_AND_STORE_STRAIN,int SIMULATION_TYPE,int ATTENUATION);
-//__global__ void Kernel_test(float* d_debug_output,int* d_phase_ispec_inner_elastic,
-// int num_phase_ispec_elastic, int d_iphase, int* d_ibool);
-//__global__ void Kernel_2_impl(int nb_blocks_to_compute,int NGLOB, int* d_ibool,
-// int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic, int d_iphase,
-// float* d_displ, float* d_accel,
-// float* d_xix, float* d_xiy, float* d_xiz,
-// float* d_etax, float* d_etay, float* d_etaz,
-// float* d_gammax, float* d_gammay, float* d_gammaz,
-// float* d_kappav, float* d_muv,
-// //float* d_debug,
-// int COMPUTE_AND_STORE_STRAIN,
-// float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,
-// float* epsilondev_xz,float* epsilondev_yz,float* epsilon_trace_over_3,
-// int SIMULATION_TYPE,
-// int ATTENUATION,int NSPEC,
-// float* one_minus_sum_beta,float* factor_common,
-// float* R_xx,float* R_yy,float* R_xy,float* R_xz,float* R_yz,
-// float* alphaval,float* betaval,float* gammaval);
-
/* ----------------------------------------------------------------------------------------------- */
// prepares a device array with with all inter-element edge-nodes -- this
// is followed by a memcpy and MPI operations
-__global__ void prepare_boundary_accel_on_device(float* d_accel, float* d_send_accel_buffer,
- int num_interfaces_ext_mesh, int max_nibool_interfaces_ext_mesh,
- int* d_nibool_interfaces_ext_mesh,
- int* d_ibool_interfaces_ext_mesh) {
+__global__ void prepare_boundary_accel_on_device(realw* d_accel, realw* d_send_accel_buffer,
+ int num_interfaces_ext_mesh,
+ int max_nibool_interfaces_ext_mesh,
+ int* d_nibool_interfaces_ext_mesh,
+ int* d_ibool_interfaces_ext_mesh) {
int id = threadIdx.x + blockIdx.x*blockDim.x + blockIdx.y*gridDim.x*blockDim.x;
- //int bx = blockIdx.y*gridDim.x+blockIdx.x;
- //int tx = threadIdx.x;
int iinterface=0;
for( iinterface=0; iinterface < num_interfaces_ext_mesh; iinterface++) {
@@ -106,8 +83,8 @@
// (elements on boundary)
extern "C"
void FC_FUNC_(transfer_boun_accel_from_device,
- TRANSFER_BOUN_ACCEL_FROM_DEVICE)(int* size, long* Mesh_pointer_f, float* accel,
- float* send_accel_buffer,
+ TRANSFER_BOUN_ACCEL_FROM_DEVICE)(int* size, long* Mesh_pointer_f, realw* accel,
+ realw* send_accel_buffer,
int* num_interfaces_ext_mesh,
int* max_nibool_interfaces_ext_mesh,
int* nibool_interfaces_ext_mesh,
@@ -119,8 +96,8 @@
if( *num_interfaces_ext_mesh == 0 ) return;
- int blocksize = 256;
- int size_padded = ((int)ceil(((double)*max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
+ int blocksize = BLOCKSIZE_TRANSFER;
+ int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
int num_blocks_x = size_padded/blocksize;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
@@ -133,28 +110,28 @@
//timing for memory xfer
// cudaEvent_t start, stop;
- // float time;
+ // realw time;
// cudaEventCreate(&start);
// cudaEventCreate(&stop);
// cudaEventRecord( start, 0 );
if(*FORWARD_OR_ADJOINT == 1) {
prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel,mp->d_send_accel_buffer,
- *num_interfaces_ext_mesh,
- *max_nibool_interfaces_ext_mesh,
- mp->d_nibool_interfaces_ext_mesh,
- mp->d_ibool_interfaces_ext_mesh);
+ mp->num_interfaces_ext_mesh,
+ mp->max_nibool_interfaces_ext_mesh,
+ mp->d_nibool_interfaces_ext_mesh,
+ mp->d_ibool_interfaces_ext_mesh);
}
else if(*FORWARD_OR_ADJOINT == 3) {
prepare_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel,mp->d_send_accel_buffer,
- *num_interfaces_ext_mesh,
- *max_nibool_interfaces_ext_mesh,
- mp->d_nibool_interfaces_ext_mesh,
- mp->d_ibool_interfaces_ext_mesh);
+ mp->num_interfaces_ext_mesh,
+ mp->max_nibool_interfaces_ext_mesh,
+ mp->d_nibool_interfaces_ext_mesh,
+ mp->d_ibool_interfaces_ext_mesh);
}
cudaMemcpy(send_accel_buffer,mp->d_send_accel_buffer,
- 3* *max_nibool_interfaces_ext_mesh* *num_interfaces_ext_mesh*sizeof(realw),cudaMemcpyDeviceToHost);
+ 3*mp->max_nibool_interfaces_ext_mesh*mp->num_interfaces_ext_mesh*sizeof(realw),cudaMemcpyDeviceToHost);
// finish timing of kernel+memcpy
// cudaEventRecord( stop, 0 );
@@ -170,7 +147,7 @@
/* ----------------------------------------------------------------------------------------------- */
-__global__ void assemble_boundary_accel_on_device(float* d_accel, float* d_send_accel_buffer,
+__global__ void assemble_boundary_accel_on_device(realw* d_accel, realw* d_send_accel_buffer,
int num_interfaces_ext_mesh,
int max_nibool_interfaces_ext_mesh,
int* d_nibool_interfaces_ext_mesh,
@@ -227,10 +204,10 @@
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
cudaMemcpy(mp->d_send_accel_buffer, buffer_recv_vector_ext_mesh,
- 3*(*max_nibool_interfaces_ext_mesh)*(*num_interfaces_ext_mesh)*sizeof(realw), cudaMemcpyHostToDevice);
+ 3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw),cudaMemcpyHostToDevice);
- int blocksize = 256;
- int size_padded = ((int)ceil(((double)*max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
+ int blocksize = BLOCKSIZE_TRANSFER;
+ int size_padded = ((int)ceil(((double)mp->max_nibool_interfaces_ext_mesh)/((double)blocksize)))*blocksize;
int num_blocks_x = size_padded/blocksize;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
@@ -242,23 +219,23 @@
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
// cudaEvent_t start, stop;
- // float time;
+ // realw time;
// cudaEventCreate(&start);
// cudaEventCreate(&stop);
// cudaEventRecord( start, 0 );
if(*FORWARD_OR_ADJOINT == 1) { //assemble forward accel
assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_accel, mp->d_send_accel_buffer,
- *num_interfaces_ext_mesh,
- *max_nibool_interfaces_ext_mesh,
- mp->d_nibool_interfaces_ext_mesh,
- mp->d_ibool_interfaces_ext_mesh);
+ mp->num_interfaces_ext_mesh,
+ mp->max_nibool_interfaces_ext_mesh,
+ mp->d_nibool_interfaces_ext_mesh,
+ mp->d_ibool_interfaces_ext_mesh);
}
else if(*FORWARD_OR_ADJOINT == 3) { //assemble adjoint accel
assemble_boundary_accel_on_device<<<grid,threads>>>(mp->d_b_accel, mp->d_send_accel_buffer,
- *num_interfaces_ext_mesh,
- *max_nibool_interfaces_ext_mesh,
- mp->d_nibool_interfaces_ext_mesh,
- mp->d_ibool_interfaces_ext_mesh);
+ mp->num_interfaces_ext_mesh,
+ mp->max_nibool_interfaces_ext_mesh,
+ mp->d_nibool_interfaces_ext_mesh,
+ mp->d_ibool_interfaces_ext_mesh);
}
// cudaEventRecord( stop, 0 );
@@ -283,7 +260,7 @@
/* ----------------------------------------------------------------------------------------------- */
-//__global__ void Kernel_test(float* d_debug_output,int* d_phase_ispec_inner_elastic,
+//__global__ void Kernel_test(realw* d_debug_output,int* d_phase_ispec_inner_elastic,
// int num_phase_ispec_elastic, int d_iphase, int* d_ibool) {
// int bx = blockIdx.x;
// int tx = threadIdx.x;
@@ -310,11 +287,11 @@
// updates stress
__device__ void compute_element_att_stress(int tx,int working_element,int NSPEC,
- float* R_xx,
- float* R_yy,
- float* R_xy,
- float* R_xz,
- float* R_yz,
+ realw* R_xx,
+ realw* R_yy,
+ realw* R_xy,
+ realw* R_xz,
+ realw* R_yz,
reald* sigma_xx,
reald* sigma_yy,
reald* sigma_zz,
@@ -327,7 +304,7 @@
for(i_sls = 0; i_sls < N_SLS; i_sls++){
// index
- offset_sls = tx + 125*(working_element + NSPEC*i_sls);
+ offset_sls = tx + NGLL3*(working_element + NSPEC*i_sls);
R_xx_val = R_xx[offset_sls]; //(i,j,k,ispec,i_sls)
R_yy_val = R_yy[offset_sls];
@@ -347,12 +324,12 @@
// updates R_memory
__device__ void compute_element_att_memory(int tx,int working_element,int NSPEC,
- float* d_muv,
- float* factor_common,
- float* alphaval,float* betaval,float* gammaval,
- float* R_xx,float* R_yy,float* R_xy,float* R_xz,float* R_yz,
- float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,
- float* epsilondev_xz,float* epsilondev_yz,
+ realw* d_muv,
+ realw* factor_common,
+ realw* alphaval,realw* betaval,realw* gammaval,
+ realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
+ realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+ realw* epsilondev_xz,realw* epsilondev_yz,
reald epsilondev_xx_loc,reald epsilondev_yy_loc,reald epsilondev_xy_loc,
reald epsilondev_xz_loc,reald epsilondev_yz_loc
){
@@ -365,8 +342,8 @@
reald factor_loc,Sn,Snp1;
// indices
- offset_align = tx + 128 * working_element;
- ijk_ispec = tx + 125 * working_element;
+ offset_align = tx + NGLL3_PADDED * working_element;
+ ijk_ispec = tx + NGLL3 * working_element;
mul = d_muv[offset_align];
@@ -374,8 +351,8 @@
for(i_sls = 0; i_sls < N_SLS; i_sls++){
// indices
- offset_common = i_sls + N_SLS*(tx + 125*working_element); // (i_sls,i,j,k,ispec)
- offset_sls = tx + 125*(working_element + NSPEC*i_sls); // (i,j,k,ispec,i_sls)
+ offset_common = i_sls + N_SLS*(tx + NGLL3*working_element); // (i_sls,i,j,k,ispec)
+ offset_sls = tx + NGLL3*(working_element + NSPEC*i_sls); // (i,j,k,ispec,i_sls)
factor_loc = mul * factor_common[offset_common]; //mustore(i,j,k,ispec) * factor_common(i_sls,i,j,k,ispec)
@@ -423,7 +400,7 @@
// double precision temporary variables leads to 10% performance
// decrease in Kernel_2_impl (not very much..)
-//typedef float reald;
+//typedef realw reald;
// doesn't seem to change the performance.
// #define MANUALLY_UNROLLED_LOOPS
@@ -435,23 +412,43 @@
int* d_phase_ispec_inner_elastic, int num_phase_ispec_elastic,
int d_iphase,
int use_mesh_coloring_gpu,
- float* d_displ, float* d_accel,
- float* d_xix, float* d_xiy, float* d_xiz,
- float* d_etax, float* d_etay, float* d_etaz,
- float* d_gammax, float* d_gammay, float* d_gammaz,
- float* d_kappav, float* d_muv,
- //float* d_debug,
+ realw* d_displ, realw* d_accel,
+ realw* d_xix, realw* d_xiy, realw* d_xiz,
+ realw* d_etax, realw* d_etay, realw* d_etaz,
+ realw* d_gammax, realw* d_gammay, realw* d_gammaz,
+ realw* d_kappav, realw* d_muv,
int COMPUTE_AND_STORE_STRAIN,
- float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,
- float* epsilondev_xz,float* epsilondev_yz,
- float* epsilon_trace_over_3,
+ realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+ realw* epsilondev_xz,realw* epsilondev_yz,
+ realw* epsilon_trace_over_3,
int SIMULATION_TYPE,
int ATTENUATION,
int NSPEC,
- float* one_minus_sum_beta,float* factor_common,
- float* R_xx, float* R_yy, float* R_xy, float* R_xz, float* R_yz,
- float* alphaval,float* betaval,float* gammaval
- ){
+ realw* one_minus_sum_beta,realw* factor_common,
+ realw* R_xx, realw* R_yy, realw* R_xy, realw* R_xz, realw* R_yz,
+ realw* alphaval,realw* betaval,realw* gammaval,
+ int ANISOTROPY,
+ realw* d_c11store,
+ realw* d_c12store,
+ realw* d_c13store,
+ realw* d_c14store,
+ realw* d_c15store,
+ realw* d_c16store,
+ realw* d_c22store,
+ realw* d_c23store,
+ realw* d_c24store,
+ realw* d_c25store,
+ realw* d_c26store,
+ realw* d_c33store,
+ realw* d_c34store,
+ realw* d_c35store,
+ realw* d_c36store,
+ realw* d_c44store,
+ realw* d_c45store,
+ realw* d_c46store,
+ realw* d_c55store,
+ realw* d_c56store,
+ realw* d_c66store){
/* int bx = blockIdx.y*blockDim.x+blockIdx.x; //possible bug in original code*/
int bx = blockIdx.y*gridDim.x+blockIdx.x;
@@ -460,8 +457,8 @@
//const int NGLLX = 5;
// const int NGLL2 = 25;
- const int NGLL3 = 125;
- const int NGLL3_ALIGN = 128;
+ //const int NGLL3 = NGLL3;
+ const int NGLL3_ALIGN = NGLL3_PADDED;
int K = (tx/NGLL2);
int J = ((tx-K*NGLL2)/NGLLX);
@@ -479,10 +476,11 @@
reald fac1,fac2,fac3,lambdal,mul,lambdalplus2mul,kappal;
reald sigma_xx,sigma_yy,sigma_zz,sigma_xy,sigma_xz,sigma_yz;
reald epsilondev_xx_loc,epsilondev_yy_loc,epsilondev_xy_loc,epsilondev_xz_loc,epsilondev_yz_loc;
+ reald c11,c12,c13,c14,c15,c16,c22,c23,c24,c25,c26,c33,c34,c35,c36,c44,c45,c46,c55,c56,c66;
#ifndef MANUALLY_UNROLLED_LOOPS
int l;
- float hp1,hp2,hp3;
+ realw hp1,hp2,hp3;
#endif
__shared__ reald s_dummyx_loc[NGLL3];
@@ -520,7 +518,7 @@
#endif
// iglob = d_ibool[working_element*NGLL3_ALIGN + tx]-1;
- iglob = d_ibool[working_element*125 + tx]-1;
+ iglob = d_ibool[working_element*NGLL3 + tx]-1;
#ifdef USE_TEXTURES
s_dummyx_loc[tx] = tex1Dfetch(tex_displ, iglob);
@@ -576,13 +574,6 @@
tempy3l += s_dummyy_loc[offset]*hp3;
tempz3l += s_dummyz_loc[offset]*hp3;
- // if(working_element == 169 && tx == 0) {
- // atomicAdd(&d_debug[0],1.0);
- // d_debug[1+3*l] = tempz3l;
- // d_debug[2+3*l] = s_dummyz_loc[offset];
- // d_debug[3+3*l] = hp3;
- // }
-
}
#else
@@ -676,7 +667,7 @@
// computes deviatoric strain attenuation and/or for kernel calculations
if(COMPUTE_AND_STORE_STRAIN) {
- float templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
+ realw templ = 0.33333333333333333333f * (duxdxl + duydyl + duzdzl); // 1./3. = 0.33333
/*
epsilondev_xx[offset] = duxdxl - templ;
epsilondev_yy[offset] = duydyl - templ;
@@ -692,7 +683,7 @@
epsilondev_yz_loc = 0.5f * duzdyl_plus_duydzl;
if(SIMULATION_TYPE == 3) {
- epsilon_trace_over_3[tx + working_element*125] = templ;
+ epsilon_trace_over_3[tx + working_element*NGLL3] = templ;
}
}
@@ -703,22 +694,64 @@
// attenuation
if(ATTENUATION){
// use unrelaxed parameters if attenuation
- mul = mul * one_minus_sum_beta[tx+working_element*125]; // (i,j,k,ispec)
+ mul = mul * one_minus_sum_beta[tx+working_element*NGLL3]; // (i,j,k,ispec)
}
- // isotropic case
- lambdalplus2mul = kappal + 1.33333333333333333333f * mul; // 4./3. = 1.3333333
- lambdal = lambdalplus2mul - 2.0f * mul;
+ // full anisotropic case, stress calculations
+ if(ANISOTROPY){
- // compute the six components of the stress tensor sigma
- sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl;
- sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl;
- sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl;
+ c11 = d_c11store[offset];
+ c12 = d_c12store[offset];
+ c13 = d_c13store[offset];
+ c14 = d_c14store[offset];
+ c15 = d_c15store[offset];
+ c16 = d_c16store[offset];
+ c22 = d_c22store[offset];
+ c23 = d_c23store[offset];
+ c24 = d_c24store[offset];
+ c25 = d_c25store[offset];
+ c26 = d_c26store[offset];
+ c33 = d_c33store[offset];
+ c34 = d_c34store[offset];
+ c35 = d_c35store[offset];
+ c36 = d_c36store[offset];
+ c44 = d_c44store[offset];
+ c45 = d_c45store[offset];
+ c46 = d_c46store[offset];
+ c55 = d_c55store[offset];
+ c56 = d_c56store[offset];
+ c66 = d_c66store[offset];
- sigma_xy = mul*duxdyl_plus_duydxl;
- sigma_xz = mul*duzdxl_plus_duxdzl;
- sigma_yz = mul*duzdyl_plus_duydzl;
+ sigma_xx = c11*duxdxl + c16*duxdyl_plus_duydxl + c12*duydyl +
+ c15*duzdxl_plus_duxdzl + c14*duzdyl_plus_duydzl + c13*duzdzl;
+ sigma_yy = c12*duxdxl + c26*duxdyl_plus_duydxl + c22*duydyl +
+ c25*duzdxl_plus_duxdzl + c24*duzdyl_plus_duydzl + c23*duzdzl;
+ sigma_zz = c13*duxdxl + c36*duxdyl_plus_duydxl + c23*duydyl +
+ c35*duzdxl_plus_duxdzl + c34*duzdyl_plus_duydzl + c33*duzdzl;
+ sigma_xy = c16*duxdxl + c66*duxdyl_plus_duydxl + c26*duydyl +
+ c56*duzdxl_plus_duxdzl + c46*duzdyl_plus_duydzl + c36*duzdzl;
+ sigma_xz = c15*duxdxl + c56*duxdyl_plus_duydxl + c25*duydyl +
+ c55*duzdxl_plus_duxdzl + c45*duzdyl_plus_duydzl + c35*duzdzl;
+ sigma_yz = c14*duxdxl + c46*duxdyl_plus_duydxl + c24*duydyl +
+ c45*duzdxl_plus_duxdzl + c44*duzdyl_plus_duydzl + c34*duzdzl;
+ }else{
+
+ // isotropic case
+
+ lambdalplus2mul = kappal + 1.33333333333333333333f * mul; // 4./3. = 1.3333333
+ lambdal = lambdalplus2mul - 2.0f * mul;
+
+ // compute the six components of the stress tensor sigma
+ sigma_xx = lambdalplus2mul*duxdxl + lambdal*duydyl_plus_duzdzl;
+ sigma_yy = lambdalplus2mul*duydyl + lambdal*duxdxl_plus_duzdzl;
+ sigma_zz = lambdalplus2mul*duzdzl + lambdal*duxdxl_plus_duydyl;
+
+ sigma_xy = mul*duxdyl_plus_duydxl;
+ sigma_xz = mul*duzdxl_plus_duxdzl;
+ sigma_yz = mul*duzdyl_plus_duydzl;
+ }
+
if(ATTENUATION){
// subtract memory variables if attenuation
compute_element_att_stress(tx,working_element,NSPEC,
@@ -784,14 +817,6 @@
tempy3l += s_tempy3[offset]*fac3;
tempz3l += s_tempz3[offset]*fac3;
- //if(working_element == 169)
- // if(l==0)
- // if(I+J+K == 0) {
- // atomicAdd(&d_debug[0],1.0);
- // d_debug[0] = fac3;
- // d_debug[1] = offset;
- // d_debug[2] = s_tempz3[offset];
- // }
}
#else
@@ -876,24 +901,10 @@
d_accel[iglob*3] -= (fac1*tempx1l + fac2*tempx2l + fac3*tempx3l);
d_accel[iglob*3 + 1] -= (fac1*tempy1l + fac2*tempy2l + fac3*tempy3l);
d_accel[iglob*3 + 2] -= (fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
-
+
}else{
- //if(iglob*3+2 == 41153) {
- // int ot = d_debug[5];
- // d_debug[0+1+ot] = d_accel[iglob*3+2];
- // // d_debug[1+1+ot] = fac1*tempz1l;
- // // d_debug[2+1+ot] = fac2*tempz2l;
- // // d_debug[3+1+ot] = fac3*tempz3l;
- // d_debug[1+1+ot] = fac1;
- // d_debug[2+1+ot] = fac2;
- // d_debug[3+1+ot] = fac3;
- // d_debug[4+1+ot] = d_accel[iglob*3+2]-(fac1*tempz1l + fac2*tempz2l + fac3*tempz3l);
- // atomicAdd(&d_debug[0],1.0);
- // d_debug[6+ot] = d_displ[iglob*3+2];
- //}
-
atomicAdd(&d_accel[iglob*3],-(fac1*tempx1l + fac2*tempx2l + fac3*tempx3l));
atomicAdd(&d_accel[iglob*3+1],-(fac1*tempy1l + fac2*tempy2l + fac3*tempy3l));
atomicAdd(&d_accel[iglob*3+2],-(fac1*tempz1l + fac2*tempz2l + fac3*tempz3l));
@@ -914,7 +925,7 @@
// save deviatoric strain for Runge-Kutta scheme
if( COMPUTE_AND_STORE_STRAIN ){
- int ijk_ispec = tx + working_element*125;
+ int ijk_ispec = tx + working_element*NGLL3;
// fortran: epsilondev_xx(:,:,:,ispec) = epsilondev_xx_loc(:,:,:)
epsilondev_xx[ijk_ispec] = epsilondev_xx_loc;
@@ -936,43 +947,65 @@
/* ----------------------------------------------------------------------------------------------- */
void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,
- int COMPUTE_AND_STORE_STRAIN,int SIMULATION_TYPE,int ATTENUATION,
+ int COMPUTE_AND_STORE_STRAIN,int SIMULATION_TYPE,
+ int ATTENUATION,int ANISOTROPY,
int* d_ibool,
- float* d_xix,
- float* d_xiy,
- float* d_xiz,
- float* d_etax,
- float* d_etay,
- float* d_etaz,
- float* d_gammax,
- float* d_gammay,
- float* d_gammaz,
- float* d_kappav,
- float* d_muv,
- float* d_epsilondev_xx,
- float* d_epsilondev_yy,
- float* d_epsilondev_xy,
- float* d_epsilondev_xz,
- float* d_epsilondev_yz,
- float* d_epsilon_trace_over_3,
- float* d_one_minus_sum_beta,
- float* d_factor_common,
- float* d_R_xx,
- float* d_R_yy,
- float* d_R_xy,
- float* d_R_xz,
- float* d_R_yz,
- float* d_b_epsilondev_xx,
- float* d_b_epsilondev_yy,
- float* d_b_epsilondev_xy,
- float* d_b_epsilondev_xz,
- float* d_b_epsilondev_yz,
- float* d_b_epsilon_trace_over_3,
- float* d_b_R_xx,
- float* d_b_R_yy,
- float* d_b_R_xy,
- float* d_b_R_xz,
- float* d_b_R_yz){
+ realw* d_xix,
+ realw* d_xiy,
+ realw* d_xiz,
+ realw* d_etax,
+ realw* d_etay,
+ realw* d_etaz,
+ realw* d_gammax,
+ realw* d_gammay,
+ realw* d_gammaz,
+ realw* d_kappav,
+ realw* d_muv,
+ realw* d_epsilondev_xx,
+ realw* d_epsilondev_yy,
+ realw* d_epsilondev_xy,
+ realw* d_epsilondev_xz,
+ realw* d_epsilondev_yz,
+ realw* d_epsilon_trace_over_3,
+ realw* d_one_minus_sum_beta,
+ realw* d_factor_common,
+ realw* d_R_xx,
+ realw* d_R_yy,
+ realw* d_R_xy,
+ realw* d_R_xz,
+ realw* d_R_yz,
+ realw* d_b_epsilondev_xx,
+ realw* d_b_epsilondev_yy,
+ realw* d_b_epsilondev_xy,
+ realw* d_b_epsilondev_xz,
+ realw* d_b_epsilondev_yz,
+ realw* d_b_epsilon_trace_over_3,
+ realw* d_b_R_xx,
+ realw* d_b_R_yy,
+ realw* d_b_R_xy,
+ realw* d_b_R_xz,
+ realw* d_b_R_yz,
+ realw* d_c11store,
+ realw* d_c12store,
+ realw* d_c13store,
+ realw* d_c14store,
+ realw* d_c15store,
+ realw* d_c16store,
+ realw* d_c22store,
+ realw* d_c23store,
+ realw* d_c24store,
+ realw* d_c25store,
+ realw* d_c26store,
+ realw* d_c33store,
+ realw* d_c34store,
+ realw* d_c35store,
+ realw* d_c36store,
+ realw* d_c44store,
+ realw* d_c45store,
+ realw* d_c46store,
+ realw* d_c55store,
+ realw* d_c56store,
+ realw* d_c66store){
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("before kernel Kernel 2");
@@ -989,29 +1022,17 @@
num_blocks_y = num_blocks_y*2;
}
- //int threads_2 = 128;//BLOCK_SIZE_K2;
- //dim3 grid_2(num_blocks_x,num_blocks_y);
-
- int blocksize = 128;
+ int blocksize = NGLL3_PADDED;
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
- // debugging
- //printf("Starting with grid %dx%d for %d blocks\n",num_blocks_x,num_blocks_y,nb_blocks_to_compute);
- // float* d_debug;
- // float* h_debug;
- // h_debug = (float*)calloc(128,sizeof(float));
- // cudaMalloc((void**)&d_debug,128*sizeof(float));
- // cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
-
// Cuda timing
// cudaEvent_t start, stop;
- // float time;
+ // realw time;
// cudaEventCreate(&start);
// cudaEventCreate(&stop);
// cudaEventRecord( start, 0 );
- //Kernel_2_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,mp->NGLOB_AB, mp->d_ibool,
Kernel_2_impl<<<grid,threads>>>(nb_blocks_to_compute,
mp->NGLOB_AB,
d_ibool,
@@ -1024,7 +1045,6 @@
d_etax, d_etay, d_etaz,
d_gammax, d_gammay, d_gammaz,
d_kappav, d_muv,
- //d_debug,
COMPUTE_AND_STORE_STRAIN,
d_epsilondev_xx,
d_epsilondev_yy,
@@ -1037,26 +1057,33 @@
d_one_minus_sum_beta,
d_factor_common,
d_R_xx,d_R_yy,d_R_xy,d_R_xz,d_R_yz,
- mp->d_alphaval,mp->d_betaval,mp->d_gammaval
+ mp->d_alphaval,mp->d_betaval,mp->d_gammaval,
+ ANISOTROPY,
+ d_c11store,
+ d_c12store,
+ d_c13store,
+ d_c14store,
+ d_c15store,
+ d_c16store,
+ d_c22store,
+ d_c23store,
+ d_c24store,
+ d_c25store,
+ d_c26store,
+ d_c33store,
+ d_c34store,
+ d_c35store,
+ d_c36store,
+ d_c44store,
+ d_c45store,
+ d_c46store,
+ d_c55store,
+ d_c56store,
+ d_c66store
);
- // cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
- // int procid;
- // MPI_Comm_rank(MPI_COMM_WORLD,&procid);
- // if(procid==0) {
- // for(int i=0;i<17;i++) {
- // printf("cudadebug[%d] = %e\n",i,h_debug[i]);
- // }
- // }
- // free(h_debug);
- // cudaFree(d_debug);
- // #ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- // exit_on_cuda_error("Kernel_2_impl");
- // #endif
-
if(SIMULATION_TYPE == 3) {
- //Kernel_2_impl<<< grid_2, threads_2, 0, 0 >>>(nb_blocks_to_compute,mp->NGLOB_AB, mp->d_ibool,
Kernel_2_impl<<< grid,threads>>>(nb_blocks_to_compute,
mp->NGLOB_AB,
d_ibool,
@@ -1069,7 +1096,6 @@
d_etax, d_etay, d_etaz,
d_gammax, d_gammay, d_gammaz,
d_kappav, d_muv,
- //d_debug,
COMPUTE_AND_STORE_STRAIN,
d_b_epsilondev_xx,
d_b_epsilondev_yy,
@@ -1082,7 +1108,29 @@
d_one_minus_sum_beta,
d_factor_common,
d_b_R_xx,d_b_R_yy,d_b_R_xy,d_b_R_xz,d_b_R_yz,
- mp->d_b_alphaval,mp->d_b_betaval,mp->d_b_gammaval
+ mp->d_b_alphaval,mp->d_b_betaval,mp->d_b_gammaval,
+ ANISOTROPY,
+ d_c11store,
+ d_c12store,
+ d_c13store,
+ d_c14store,
+ d_c15store,
+ d_c16store,
+ d_c22store,
+ d_c23store,
+ d_c24store,
+ d_c25store,
+ d_c26store,
+ d_c33store,
+ d_c34store,
+ d_c35store,
+ d_c36store,
+ d_c44store,
+ d_c45store,
+ d_c46store,
+ d_c55store,
+ d_c56store,
+ d_c66store
);
}
@@ -1093,11 +1141,6 @@
// cudaEventDestroy( stop );
// printf("Kernel2 Execution Time: %f ms\n",time);
- // cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
- // for(int i=0;i<10;i++) {
- // printf("debug[%d]=%e\n",i,h_debug[i]);
- // }
-
/* cudaThreadSynchronize(); */
/* LOG("Kernel 2 finished"); */
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -1116,7 +1159,8 @@
int* nspec_inner_elastic,
int* SIMULATION_TYPE,
int* COMPUTE_AND_STORE_STRAIN,
- int* ATTENUATION) {
+ int* ATTENUATION,
+ int* ANISOTROPY) {
TRACE("compute_forces_elastic_cuda");
// EPIK_TRACER("compute_forces_elastic_cuda");
@@ -1135,10 +1179,6 @@
// checks if anything to do
if( num_elements == 0 ) return;
- //int myrank;
- /* MPI_Comm_rank(MPI_COMM_WORLD,&myrank); */
- /* if(myrank==0) { */
-
// mesh coloring
if( mp->use_mesh_coloring_gpu ){
@@ -1167,8 +1207,8 @@
// array offsets
color_offset = (*nspec_outer_elastic) * NGLL3_PADDED;
- color_offset_nonpadded = (*nspec_outer_elastic) * NGLL3_NONPADDED;
- color_offset_nonpadded_att2 = (*nspec_outer_elastic) * NGLL3_NONPADDED * N_SLS;
+ color_offset_nonpadded = (*nspec_outer_elastic) * NGLL3;
+ color_offset_nonpadded_att2 = (*nspec_outer_elastic) * NGLL3 * N_SLS;
}
// loops over colors
@@ -1183,7 +1223,8 @@
//}
Kernel_2(nb_blocks_to_compute,mp,*iphase,
- *COMPUTE_AND_STORE_STRAIN,*SIMULATION_TYPE,*ATTENUATION,
+ *COMPUTE_AND_STORE_STRAIN,*SIMULATION_TYPE,
+ *ATTENUATION,*ANISOTROPY,
mp->d_ibool + color_offset_nonpadded,
mp->d_xix + color_offset,
mp->d_xiy + color_offset,
@@ -1219,14 +1260,35 @@
mp->d_b_R_yy + color_offset_nonpadded,
mp->d_b_R_xy + color_offset_nonpadded,
mp->d_b_R_xz + color_offset_nonpadded,
- mp->d_b_R_yz + color_offset_nonpadded);
+ mp->d_b_R_yz + color_offset_nonpadded,
+ mp->d_c11store + color_offset,
+ mp->d_c12store + color_offset,
+ mp->d_c13store + color_offset,
+ mp->d_c14store + color_offset,
+ mp->d_c15store + color_offset,
+ mp->d_c16store + color_offset,
+ mp->d_c22store + color_offset,
+ mp->d_c23store + color_offset,
+ mp->d_c24store + color_offset,
+ mp->d_c25store + color_offset,
+ mp->d_c26store + color_offset,
+ mp->d_c33store + color_offset,
+ mp->d_c34store + color_offset,
+ mp->d_c35store + color_offset,
+ mp->d_c36store + color_offset,
+ mp->d_c44store + color_offset,
+ mp->d_c45store + color_offset,
+ mp->d_c46store + color_offset,
+ mp->d_c55store + color_offset,
+ mp->d_c56store + color_offset,
+ mp->d_c66store + color_offset);
// for padded and aligned arrays
color_offset += nb_blocks_to_compute * NGLL3_PADDED;
// for no-aligned arrays
- color_offset_nonpadded += nb_blocks_to_compute * NGLL3_NONPADDED;
+ color_offset_nonpadded += nb_blocks_to_compute * NGLL3;
// for factor_common array
- color_offset_nonpadded_att2 += nb_blocks_to_compute * NGLL3_NONPADDED * N_SLS;
+ color_offset_nonpadded_att2 += nb_blocks_to_compute * NGLL3 * N_SLS;
}
}else{
@@ -1234,7 +1296,8 @@
// no mesh coloring: uses atomic updates
Kernel_2(num_elements,mp,*iphase,
- *COMPUTE_AND_STORE_STRAIN,*SIMULATION_TYPE,*ATTENUATION,
+ *COMPUTE_AND_STORE_STRAIN,*SIMULATION_TYPE,
+ *ATTENUATION,*ANISOTROPY,
mp->d_ibool,
mp->d_xix,
mp->d_xiy,
@@ -1270,18 +1333,29 @@
mp->d_b_R_yy,
mp->d_b_R_xy,
mp->d_b_R_xz,
- mp->d_b_R_yz);
+ mp->d_b_R_yz,
+ mp->d_c11store,
+ mp->d_c12store,
+ mp->d_c13store,
+ mp->d_c14store,
+ mp->d_c15store,
+ mp->d_c16store,
+ mp->d_c22store,
+ mp->d_c23store,
+ mp->d_c24store,
+ mp->d_c25store,
+ mp->d_c26store,
+ mp->d_c33store,
+ mp->d_c34store,
+ mp->d_c35store,
+ mp->d_c36store,
+ mp->d_c44store,
+ mp->d_c45store,
+ mp->d_c46store,
+ mp->d_c55store,
+ mp->d_c56store,
+ mp->d_c66store);
}
-
-
-
- //cudaThreadSynchronize();
-
- //#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- /* MPI_Barrier(MPI_COMM_WORLD); */
- //double end_time = get_time();
- //printf("Elapsed time: %e\n",end_time-start_time);
- //#endif
}
@@ -1350,9 +1424,9 @@
void FC_FUNC_(kernel_3_a_cuda,
KERNEL_3_A_CUDA)(long* Mesh_pointer,
int* size_F,
- float* deltatover2_F,
+ realw* deltatover2_F,
int* SIMULATION_TYPE_f,
- float* b_deltatover2_F,
+ realw* b_deltatover2_F,
int* OCEANS) {
TRACE("kernel_3_a_cuda");
@@ -1362,7 +1436,7 @@
realw deltatover2 = *deltatover2_F;
realw b_deltatover2 = *b_deltatover2_F;
- int blocksize=128;
+ int blocksize = BLOCKSIZE_KERNEL3;
int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
int num_blocks_x = size_padded/blocksize;
@@ -1404,9 +1478,9 @@
void FC_FUNC_(kernel_3_b_cuda,
KERNEL_3_B_CUDA)(long* Mesh_pointer,
int* size_F,
- float* deltatover2_F,
+ realw* deltatover2_F,
int* SIMULATION_TYPE_f,
- float* b_deltatover2_F) {
+ realw* b_deltatover2_F) {
TRACE("kernel_3_b_cuda");
Mesh* mp = (Mesh*)(*Mesh_pointer); // get Mesh from fortran integer wrapper
@@ -1415,7 +1489,7 @@
realw deltatover2 = *deltatover2_F;
realw b_deltatover2 = *b_deltatover2_F;
- int blocksize=128;
+ int blocksize = BLOCKSIZE_KERNEL3;
int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
int num_blocks_x = size_padded/blocksize;
@@ -1444,6 +1518,140 @@
/* ----------------------------------------------------------------------------------------------- */
+/* OCEANS load on free surface */
+
+/* ----------------------------------------------------------------------------------------------- */
+
+
+__global__ void elastic_ocean_load_cuda_kernel(realw* accel, // updated in place; 3 consecutive components per global point
+ realw* rmass,
+ realw* rmass_ocean_load,
+ int num_free_surface_faces,
+ int* free_surface_ispec, // element index per face; stored value minus 1 is used below
+ int* free_surface_ijk,
+ realw* free_surface_normal,
+ int* ibool,
+ int* updated_dof_ocean_load) { // per-global-point flag; a point is only processed if its flag is 0 on entry
+ // ocean-load update of accel on the free surface: each block handles one face, each thread one point (igll) of that face
+ int igll = threadIdx.x ; // only threadIdx.x is used as the point index within the face (the kernel assumes a 1D thread block)
+ int iface = blockIdx.x + gridDim.x*blockIdx.y;
+ realw nx,ny,nz;
+ realw force_normal_comp,additional_term;
+
+ // blocks whose flattened index lies beyond the number of faces do nothing
+ if( iface < num_free_surface_faces ){
+
+ int ispec = free_surface_ispec[iface]-1;
+
+ // element-local (i,j,k) of this face point; stored values are shifted by -1 (presumably 1-based Fortran indices)
+ int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1; // (1,igll,iface)
+ int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
+ int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
+ // global point index; NOTE(review): element dimensions are hard-coded as 5,5,5 here - presumably NGLLX, confirm
+ int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1;
+
+ //if(igll == 0 ) printf("igll %d %d %d %d %d\n",igll,i,j,k,iglob);
+
+ // only update this global point once (a point can be reached from several faces)
+
+ // daniel: TODO - there might be better ways to implement a mutex like below,
+ // and find a workaround to not use the temporary update array.
+ // atomicExch: sets the flag to 1 and returns the old value, i.e. 0 indicates that we still have to do this point
+
+ if( atomicExch(&updated_dof_ocean_load[iglob],1) == 0){
+
+ // get the normal stored for this face point
+ nx = free_surface_normal[INDEX3(NDIM,NGLL2,0,igll,iface)]; //(1,igll,iface)
+ ny = free_surface_normal[INDEX3(NDIM,NGLL2,1,igll,iface)];
+ nz = free_surface_normal[INDEX3(NDIM,NGLL2,2,igll,iface)];
+
+ // make updated component of right-hand side: normal component of accel,
+ // divided by rmass[iglob] (original note: rmass() is 1 / M)
+ // NOTE(review): the original note mentioned a Coriolis term "above"; none appears in this kernel - confirm
+ force_normal_comp = ( accel[iglob*3]*nx + accel[iglob*3+1]*ny + accel[iglob*3+2]*nz ) / rmass[iglob];
+
+ additional_term = (rmass_ocean_load[iglob] - rmass[iglob]) * force_normal_comp;
+
+ // the atomicExch guard already admits a single thread per iglob, so atomicAdd is presumably redundant; kept to be sure
+ atomicAdd(&accel[iglob*3], + additional_term * nx);
+ atomicAdd(&accel[iglob*3+1], + additional_term * ny);
+ atomicAdd(&accel[iglob*3+2], + additional_term * nz);
+ }
+ }
+}
+
+/* ----------------------------------------------------------------------------------------------- */
+
+extern "C"
+void FC_FUNC_(elastic_ocean_load_cuda,
+ ELASTIC_OCEAN_LOAD_CUDA)(long* Mesh_pointer_f,
+ int* SIMULATION_TYPE) {
+ // launches the ocean-load kernel on mp->d_accel and, when *SIMULATION_TYPE == 3, a second time on mp->d_b_accel
+ TRACE("elastic_ocean_load_cuda");
+
+ Mesh* mp = (Mesh*)(*Mesh_pointer_f); // the Fortran-side integer holds the address of the Mesh struct
+
+ // nothing to do without free-surface faces
+ if( mp->num_free_surface_faces == 0 ) return;
+
+ // one thread per point of a face: block size is exactly NGLL2
+ int blocksize = NGLL2;
+
+ int num_blocks_x = mp->num_free_surface_faces; // one block per free-surface face
+ int num_blocks_y = 1;
+ while(num_blocks_x > 65535) { // fold into a 2D grid: halve x (rounded up) and double y until x <= 65535
+ num_blocks_x = ceil(num_blocks_x/2.0);
+ num_blocks_y = num_blocks_y*2;
+ }
+ // x*y may exceed the number of faces; the kernel checks iface < num_free_surface_faces
+ dim3 grid(num_blocks_x,num_blocks_y);
+ dim3 threads(blocksize,1,1);
+
+
+ // zeroes the NGLOB_AB flags the kernel tests with atomicExch so that each global point is updated only once
+ print_CUDA_error_if_any(cudaMemset(mp->d_updated_dof_ocean_load,0,
+ sizeof(int)*mp->NGLOB_AB),88501);
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("before kernel elastic_ocean_load_cuda");
+#endif
+
+ elastic_ocean_load_cuda_kernel<<<grid,threads>>>(mp->d_accel,
+ mp->d_rmass,
+ mp->d_rmass_ocean_load,
+ mp->num_free_surface_faces,
+ mp->d_free_surface_ispec,
+ mp->d_free_surface_ijk,
+ mp->d_free_surface_normal,
+ mp->d_ibool,
+ mp->d_updated_dof_ocean_load);
+ // for the backward/reconstructed wavefield: same update applied to d_b_accel
+ if(*SIMULATION_TYPE == 3) {
+ // the first launch left the flags set to 1, so they are zeroed again before the second launch
+ print_CUDA_error_if_any(cudaMemset(mp->d_updated_dof_ocean_load,0,
+ sizeof(int)*mp->NGLOB_AB),88502);
+
+ elastic_ocean_load_cuda_kernel<<<grid,threads>>>(mp->d_b_accel,
+ mp->d_rmass,
+ mp->d_rmass_ocean_load,
+ mp->num_free_surface_faces,
+ mp->d_free_surface_ispec,
+ mp->d_free_surface_ijk,
+ mp->d_free_surface_normal,
+ mp->d_ibool,
+ mp->d_updated_dof_ocean_load);
+
+ }
+
+
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("elastic_ocean_load_cuda");
+#endif
+}
+
+
+/* ----------------------------------------------------------------------------------------------- */
+
/* note:
constant arrays when used in compute_forces_acoustic_cuda.cu routines stay zero,
constant declaration and cudaMemcpyToSymbol would have to be in the same file...
@@ -1467,10 +1675,10 @@
// constant arrays
-void setConst_hprime_xx(float* array,Mesh* mp)
+void setConst_hprime_xx(realw* array,Mesh* mp)
{
- cudaError_t err = cudaMemcpyToSymbol(d_hprime_xx, array, NGLL2*sizeof(float));
+ cudaError_t err = cudaMemcpyToSymbol(d_hprime_xx, array, NGLL2*sizeof(realw));
if (err != cudaSuccess)
{
fprintf(stderr, "Error in setConst_hprime_xx: %s\n", cudaGetErrorString(err));
@@ -1485,10 +1693,10 @@
}
}
-void setConst_hprime_yy(float* array,Mesh* mp)
+void setConst_hprime_yy(realw* array,Mesh* mp)
{
- cudaError_t err = cudaMemcpyToSymbol(d_hprime_yy, array, NGLL2*sizeof(float));
+ cudaError_t err = cudaMemcpyToSymbol(d_hprime_yy, array, NGLL2*sizeof(realw));
if (err != cudaSuccess)
{
fprintf(stderr, "Error in setConst_hprime_yy: %s\n", cudaGetErrorString(err));
@@ -1503,10 +1711,10 @@
}
}
-void setConst_hprime_zz(float* array,Mesh* mp)
+void setConst_hprime_zz(realw* array,Mesh* mp)
{
- cudaError_t err = cudaMemcpyToSymbol(d_hprime_zz, array, NGLL2*sizeof(float));
+ cudaError_t err = cudaMemcpyToSymbol(d_hprime_zz, array, NGLL2*sizeof(realw));
if (err != cudaSuccess)
{
fprintf(stderr, "Error in setConst_hprime_zz: %s\n", cudaGetErrorString(err));
@@ -1522,9 +1730,9 @@
}
-void setConst_hprimewgll_xx(float* array,Mesh* mp)
+void setConst_hprimewgll_xx(realw* array,Mesh* mp)
{
- cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_xx, array, NGLL2*sizeof(float));
+ cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_xx, array, NGLL2*sizeof(realw));
if (err != cudaSuccess)
{
fprintf(stderr, "Error in setConst_hprimewgll_xx: %s\n", cudaGetErrorString(err));
@@ -1538,9 +1746,9 @@
}
}
-void setConst_hprimewgll_yy(float* array,Mesh* mp)
+void setConst_hprimewgll_yy(realw* array,Mesh* mp)
{
- cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_yy, array, NGLL2*sizeof(float));
+ cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_yy, array, NGLL2*sizeof(realw));
if (err != cudaSuccess)
{
fprintf(stderr, "Error in setConst_hprimewgll_yy: %s\n", cudaGetErrorString(err));
@@ -1554,9 +1762,9 @@
}
}
-void setConst_hprimewgll_zz(float* array,Mesh* mp)
+void setConst_hprimewgll_zz(realw* array,Mesh* mp)
{
- cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_zz, array, NGLL2*sizeof(float));
+ cudaError_t err = cudaMemcpyToSymbol(d_hprimewgll_zz, array, NGLL2*sizeof(realw));
if (err != cudaSuccess)
{
fprintf(stderr, "Error in setConst_hprimewgll_zz: %s\n", cudaGetErrorString(err));
@@ -1570,9 +1778,9 @@
}
}
-void setConst_wgllwgll_xy(float* array,Mesh* mp)
+void setConst_wgllwgll_xy(realw* array,Mesh* mp)
{
- cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_xy, array, NGLL2*sizeof(float));
+ cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_xy, array, NGLL2*sizeof(realw));
if (err != cudaSuccess)
{
fprintf(stderr, "Error in setConst_wgllwgll_xy: %s\n", cudaGetErrorString(err));
@@ -1587,9 +1795,9 @@
}
-void setConst_wgllwgll_xz(float* array,Mesh* mp)
+void setConst_wgllwgll_xz(realw* array,Mesh* mp)
{
- cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_xz, array, NGLL2*sizeof(float));
+ cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_xz, array, NGLL2*sizeof(realw));
if (err != cudaSuccess)
{
fprintf(stderr, "Error in setConst_wgllwgll_xz: %s\n", cudaGetErrorString(err));
@@ -1604,9 +1812,9 @@
}
-void setConst_wgllwgll_yz(float* array,Mesh* mp)
+void setConst_wgllwgll_yz(realw* array,Mesh* mp)
{
- cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_yz, array, NGLL2*sizeof(float));
+ cudaError_t err = cudaMemcpyToSymbol(d_wgllwgll_yz, array, NGLL2*sizeof(realw));
if (err != cudaSuccess)
{
fprintf(stderr, "Error in setConst_wgllwgll_yz: %s\n", cudaGetErrorString(err));
@@ -1621,136 +1829,3 @@
}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-/* OCEANS load on free surface */
-
-/* ----------------------------------------------------------------------------------------------- */
-
-
-__global__ void elastic_ocean_load_cuda_kernel(float* accel,
- float* rmass,
- float* rmass_ocean_load,
- int num_free_surface_faces,
- int* free_surface_ispec,
- int* free_surface_ijk,
- float* free_surface_normal,
- int* ibool,
- int* updated_dof_ocean_load) {
- // gets spectral element face id
- int igll = threadIdx.x ; // threadIdx.y*blockDim.x will be always = 0 for thread block (25,1,1)
- int iface = blockIdx.x + gridDim.x*blockIdx.y;
- realw nx,ny,nz;
- realw force_normal_comp,additional_term;
-
- // for all faces on free surface
- if( iface < num_free_surface_faces ){
-
- int ispec = free_surface_ispec[iface]-1;
-
- // gets global point index
- int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1; // (1,igll,iface)
- int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
- int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
-
- int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1;
-
- //if(igll == 0 ) printf("igll %d %d %d %d\n",igll,i,j,k,iglob);
-
- // only update this global point once
-
- // daniel: TODO - there might be better ways to implement a mutex like below,
- // and find a workaround to not use the temporary update array.
- // atomicExch: returns the old value, i.e. 0 indicates that we still have to do this point
-
- if( atomicExch(&updated_dof_ocean_load[iglob],1) == 0){
-
- // get normal
- nx = free_surface_normal[INDEX3(NDIM,NGLL2,0,igll,iface)]; //(1,igll,iface)
- ny = free_surface_normal[INDEX3(NDIM,NGLL2,1,igll,iface)];
- nz = free_surface_normal[INDEX3(NDIM,NGLL2,2,igll,iface)];
-
- // make updated component of right-hand side
- // we divide by rmass() which is 1 / M
- // we use the total force which includes the Coriolis term above
- force_normal_comp = ( accel[iglob*3]*nx + accel[iglob*3+1]*ny + accel[iglob*3+2]*nz ) / rmass[iglob];
-
- additional_term = (rmass_ocean_load[iglob] - rmass[iglob]) * force_normal_comp;
-
- // probably wouldn't need atomicAdd anymore, but just to be sure...
- atomicAdd(&accel[iglob*3], + additional_term * nx);
- atomicAdd(&accel[iglob*3+1], + additional_term * ny);
- atomicAdd(&accel[iglob*3+2], + additional_term * nz);
- }
- }
-}
-
-/* ----------------------------------------------------------------------------------------------- */
-
-extern "C"
-void FC_FUNC_(elastic_ocean_load_cuda,
- ELASTIC_OCEAN_LOAD_CUDA)(long* Mesh_pointer_f,
- int* SIMULATION_TYPE) {
-
-TRACE("elastic_ocean_load_cuda");
-
- Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
-
- // checks if anything to do
- if( mp->num_free_surface_faces == 0 ) return;
-
- // block sizes: exact blocksize to match NGLLSQUARE
- int blocksize = 25;
-
- int num_blocks_x = mp->num_free_surface_faces;
- int num_blocks_y = 1;
- while(num_blocks_x > 65535) {
- num_blocks_x = ceil(num_blocks_x/2.0);
- num_blocks_y = num_blocks_y*2;
- }
-
- dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(blocksize,1,1);
-
-
- // initializes temporary array to zero
- print_CUDA_error_if_any(cudaMemset(mp->d_updated_dof_ocean_load,0,
- sizeof(int)*mp->NGLOB_AB),88501);
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("before kernel elastic_ocean_load_cuda");
-#endif
-
- elastic_ocean_load_cuda_kernel<<<grid,threads>>>(mp->d_accel,
- mp->d_rmass,
- mp->d_rmass_ocean_load,
- mp->num_free_surface_faces,
- mp->d_free_surface_ispec,
- mp->d_free_surface_ijk,
- mp->d_free_surface_normal,
- mp->d_ibool,
- mp->d_updated_dof_ocean_load);
- // for backward/reconstructed potentials
- if(*SIMULATION_TYPE == 3) {
- // re-initializes array
- print_CUDA_error_if_any(cudaMemset(mp->d_updated_dof_ocean_load,0,
- sizeof(int)*mp->NGLOB_AB),88502);
-
- elastic_ocean_load_cuda_kernel<<<grid,threads>>>(mp->d_b_accel,
- mp->d_rmass,
- mp->d_rmass_ocean_load,
- mp->num_free_surface_faces,
- mp->d_free_surface_ispec,
- mp->d_free_surface_ijk,
- mp->d_free_surface_normal,
- mp->d_ibool,
- mp->d_updated_dof_ocean_load);
-
- }
-
-
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("elastic_ocean_load_cuda");
-#endif
-}
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_kernels_cuda.cu 2011-11-06 02:02:36 UTC (rev 19152)
@@ -29,7 +29,7 @@
#include <stdio.h>
#include <cuda.h>
#include <cublas.h>
-#include <mpi.h>
+
#include <sys/types.h>
#include <unistd.h>
#include <sys/time.h>
@@ -47,26 +47,25 @@
__global__ void compute_kernels_cudakernel(int* ispec_is_elastic,
int* ibool,
- float* accel,
- float* b_displ,
- float* epsilondev_xx,
- float* epsilondev_yy,
- float* epsilondev_xy,
- float* epsilondev_xz,
- float* epsilondev_yz,
- float* b_epsilondev_xx,
- float* b_epsilondev_yy,
- float* b_epsilondev_xy,
- float* b_epsilondev_xz,
- float* b_epsilondev_yz,
- float* rho_kl,
- float deltat,
- float* mu_kl,
- float* kappa_kl,
- float* epsilon_trace_over_3,
- float* b_epsilon_trace_over_3,
- int NSPEC_AB //,float* d_debug
- ) {
+ realw* accel,
+ realw* b_displ,
+ realw* epsilondev_xx,
+ realw* epsilondev_yy,
+ realw* epsilondev_xy,
+ realw* epsilondev_xz,
+ realw* epsilondev_yz,
+ realw* b_epsilondev_xx,
+ realw* b_epsilondev_yy,
+ realw* b_epsilondev_xy,
+ realw* b_epsilondev_xz,
+ realw* b_epsilondev_yz,
+ realw* rho_kl,
+ realw deltat,
+ realw* mu_kl,
+ realw* kappa_kl,
+ realw* epsilon_trace_over_3,
+ realw* b_epsilon_trace_over_3,
+ int NSPEC_AB) {
int ispec = blockIdx.x + blockIdx.y*gridDim.x;
@@ -77,20 +76,9 @@
if( ispec_is_elastic[ispec] ) {
int ijk = threadIdx.x;
- int ijk_ispec = ijk + 125*ispec;
+ int ijk_ispec = ijk + NGLL3*ispec;
int iglob = ibool[ijk_ispec] - 1 ;
- // debug
-// if(ijk_ispec == 9480531) {
-// d_debug[0] = rho_kl[ijk_ispec];
-// d_debug[1] = accel[3*iglob];
-// d_debug[2] = b_displ[3*iglob];
-// d_debug[3] = deltat * (accel[3*iglob]*b_displ[3*iglob]+
-// accel[3*iglob+1]*b_displ[3*iglob+1]+
-// accel[3*iglob+2]*b_displ[3*iglob+2]);
-// }
-
-
// isotropic kernels:
// density kernel
rho_kl[ijk_ispec] += deltat * (accel[3*iglob]*b_displ[3*iglob]+
@@ -98,18 +86,9 @@
accel[3*iglob+2]*b_displ[3*iglob+2]);
- // debug
- // if(rho_kl[ijk_ispec] < 1.9983e+18) {
- // atomicAdd(&d_debug[3],1.0);
- // d_debug[4] = ijk_ispec;
- // d_debug[0] = rho_kl[ijk_ispec];
- // d_debug[1] = accel[3*iglob];
- // d_debug[2] = b_displ[3*iglob];
- // }
-
// shear modulus kernel
- mu_kl[ijk_ispec] += deltat * (epsilondev_xx[ijk_ispec]*b_epsilondev_xx[ijk_ispec]+ // 1*b1
- epsilondev_yy[ijk_ispec]*b_epsilondev_yy[ijk_ispec]+ // 2*b2
+ mu_kl[ijk_ispec] += deltat * (epsilondev_xx[ijk_ispec]*b_epsilondev_xx[ijk_ispec]+
+ epsilondev_yy[ijk_ispec]*b_epsilondev_yy[ijk_ispec]+
(epsilondev_xx[ijk_ispec]+epsilondev_yy[ijk_ispec])*
(b_epsilondev_xx[ijk_ispec]+b_epsilondev_yy[ijk_ispec])+
2*(epsilondev_xy[ijk_ispec]*b_epsilondev_xy[ijk_ispec]+
@@ -129,13 +108,13 @@
extern "C"
void FC_FUNC_(compute_kernels_elastic_cuda,
COMPUTE_KERNELS_ELASTIC_CUDA)(long* Mesh_pointer,
- float* deltat_f) {
+ realw* deltat_f) {
TRACE("compute_kernels_elastic_cuda");
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
- int blocksize = 125; // NGLLX*NGLLY*NGLLZ
- float deltat = *deltat_f;
+ int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ
+ realw deltat = *deltat_f;
int num_blocks_x = mp->NSPEC_AB;
int num_blocks_y = 1;
@@ -147,12 +126,6 @@
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
- //float* d_debug;
- //float* h_debug;
- //h_debug = (float*)calloc(128,sizeof(float));
- //cudaMalloc((void**)&d_debug,128*sizeof(float));
- //cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
-
compute_kernels_cudakernel<<<grid,threads>>>(mp->d_ispec_is_elastic,mp->d_ibool,
mp->d_accel, mp->d_b_displ,
mp->d_epsilondev_xx,
@@ -171,31 +144,8 @@
mp->d_kappa_kl,
mp->d_epsilon_trace_over_3,
mp->d_b_epsilon_trace_over_3,
- mp->NSPEC_AB //,d_debug
- );
- /*
- cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
- cudaFree(d_debug);
- */
- // for(int i=0;i<5;i++) {
- // printf("d_debug[%d]=%e\n",i,h_debug[i]);
- // }
- /*
- free(h_debug);
- */
- // float* h_rho = (float*)malloc(sizeof(float)*mp->NSPEC_AB*125);
- // float maxval = 0;
- // cudaMemcpy(h_rho,mp->d_rho_kl,sizeof(float)*mp->NSPEC_AB*125,cudaMemcpyDeviceToHost);
- // int number_big_values = 0;
- // for(int i=0;i<mp->NSPEC_AB*125;i++) {
- // maxval = MAX(maxval,fabsf(h_rho[i]));
- // if(fabsf(h_rho[i]) > 1e10) {
- // number_big_values++;
- // }
- // }
+ mp->NSPEC_AB);
- // printf("maval rho = %e, number>1e10 = %d vs. %d\n",maxval,number_big_values,mp->NSPEC_AB*125);
-
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("compute_kernels_elastic_cuda");
#endif
@@ -209,46 +159,34 @@
/* ----------------------------------------------------------------------------------------------- */
-__global__ void compute_kernels_strength_noise_cuda_kernel(float* displ,
+__global__ void compute_kernels_strength_noise_cuda_kernel(realw* displ,
int* free_surface_ispec,
int* free_surface_ijk,
int* ibool,
- float* noise_surface_movie,
- float* normal_x_noise,
- float* normal_y_noise,
- float* normal_z_noise,
- float* Sigma_kl,
- float deltat,
- int num_free_surface_faces //,float* d_debug
- ) {
+ realw* noise_surface_movie,
+ realw* normal_x_noise,
+ realw* normal_y_noise,
+ realw* normal_z_noise,
+ realw* Sigma_kl,
+ realw deltat,
+ int num_free_surface_faces) {
int iface = blockIdx.x + blockIdx.y*gridDim.x;
if(iface < num_free_surface_faces) {
int ispec = free_surface_ispec[iface]-1;
int igll = threadIdx.x;
- int ipoin = igll + 25*iface;
+ int ipoin = igll + NGLL2*iface;
int i = free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)] - 1 ;
int j = free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)] - 1;
int k = free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)] - 1;
int iglob = ibool[INDEX4(5,5,5,i,j,k,ispec)] - 1 ;
- float eta = ( noise_surface_movie[INDEX3(NDIM,NGLL2,0,igll,iface)]*normal_x_noise[ipoin]+
+ realw eta = ( noise_surface_movie[INDEX3(NDIM,NGLL2,0,igll,iface)]*normal_x_noise[ipoin]+
noise_surface_movie[INDEX3(NDIM,NGLL2,1,igll,iface)]*normal_y_noise[ipoin]+
noise_surface_movie[INDEX3(NDIM,NGLL2,2,igll,iface)]*normal_z_noise[ipoin]);
- // if(ijk_ispec == 78496) {
- // d_debug[0] = Sigma_kl[ijk_ispec];
- // d_debug[1] = eta;
- // d_debug[2] = normal_x_noise[ipoin];
- // d_debug[3] = normal_y_noise[ipoin];
- // d_debug[4] = normal_z_noise[ipoin];
- // d_debug[5] = displ[3*iglob+2];
- // d_debug[6] = deltat*eta*normal_z_noise[ipoin]*displ[2+3*iglob];
- // d_debug[7] = 0.008*1.000000e-24*normal_z_noise[ipoin]*3.740546e-13;
- // }
-
Sigma_kl[INDEX4(5,5,5,i,j,k,ispec)] += deltat*eta*(normal_x_noise[ipoin]*displ[3*iglob]+
normal_y_noise[ipoin]*displ[1+3*iglob]+
normal_z_noise[ipoin]*displ[2+3*iglob]);
@@ -261,15 +199,15 @@
extern "C"
void FC_FUNC_(compute_kernels_strgth_noise_cu,
COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer,
- float* h_noise_surface_movie,
- float* deltat) {
+ realw* h_noise_surface_movie,
+ realw* deltat) {
TRACE("compute_kernels_strgth_noise_cu");
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie,
- 3*25*(mp->num_free_surface_faces)*sizeof(float),cudaMemcpyHostToDevice);
+ 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice);
int num_blocks_x = mp->num_free_surface_faces;
@@ -280,13 +218,8 @@
}
dim3 grid(num_blocks_x,num_blocks_y);
- dim3 threads(25,1,1);
+ dim3 threads(NGLL2,1,1);
- // float* h_debug = (float*)calloc(128,sizeof(float));
- //float* d_debug;
- // cudaMalloc((void**)&d_debug,128*sizeof(float));
- // cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
-
compute_kernels_strength_noise_cuda_kernel<<<grid,threads>>>(mp->d_displ,
mp->d_free_surface_ispec,
mp->d_free_surface_ijk,
@@ -296,14 +229,8 @@
mp->d_normal_y_noise,
mp->d_normal_z_noise,
mp->d_Sigma_kl,*deltat,
- mp->num_free_surface_faces //,d_debug
- );
+ mp->num_free_surface_faces);
- // cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
- // for(int i=0;i<8;i++) {
- // printf("debug[%d]= %e\n",i,h_debug[i]);
- // }
-
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("compute_kernels_strength_noise_cuda_kernel");
#endif
@@ -320,30 +247,30 @@
__device__ void compute_gradient_kernel(int ijk,
int ispec,
- float* scalar_field,
- float* vector_field_element,
- float* hprime_xx,
- float* hprime_yy,
- float* hprime_zz,
- float* d_xix,
- float* d_xiy,
- float* d_xiz,
- float* d_etax,
- float* d_etay,
- float* d_etaz,
- float* d_gammax,
- float* d_gammay,
- float* d_gammaz,
- float rhol) {
+ realw* scalar_field,
+ realw* vector_field_element,
+ realw* hprime_xx,
+ realw* hprime_yy,
+ realw* hprime_zz,
+ realw* d_xix,
+ realw* d_xiy,
+ realw* d_xiz,
+ realw* d_etax,
+ realw* d_etay,
+ realw* d_etaz,
+ realw* d_gammax,
+ realw* d_gammay,
+ realw* d_gammaz,
+ realw rhol) {
- float temp1l,temp2l,temp3l;
- float hp1,hp2,hp3;
- float xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl;
- float rho_invl;
+ realw temp1l,temp2l,temp3l;
+ realw hp1,hp2,hp3;
+ realw xixl,xiyl,xizl,etaxl,etayl,etazl,gammaxl,gammayl,gammazl;
+ realw rho_invl;
int l,offset,offset1,offset2,offset3;
//const int NGLLX = 5;
- const int NGLL3_ALIGN = 128;
+ const int NGLL3_ALIGN = NGLL3_PADDED;
int K = (ijk/NGLL2);
int J = ((ijk-K*NGLL2)/NGLLX);
@@ -400,26 +327,26 @@
__global__ void compute_kernels_acoustic_kernel(int* ispec_is_acoustic,
int* ibool,
- float* rhostore,
- float* kappastore,
- float* hprime_xx,
- float* hprime_yy,
- float* hprime_zz,
- float* d_xix,
- float* d_xiy,
- float* d_xiz,
- float* d_etax,
- float* d_etay,
- float* d_etaz,
- float* d_gammax,
- float* d_gammay,
- float* d_gammaz,
- float* potential_dot_dot_acoustic,
- float* b_potential_acoustic,
- float* b_potential_dot_dot_acoustic,
- float* rho_ac_kl,
- float* kappa_ac_kl,
- float deltat,
+ realw* rhostore,
+ realw* kappastore,
+ realw* hprime_xx,
+ realw* hprime_yy,
+ realw* hprime_zz,
+ realw* d_xix,
+ realw* d_xiy,
+ realw* d_xiz,
+ realw* d_etax,
+ realw* d_etay,
+ realw* d_etaz,
+ realw* d_gammax,
+ realw* d_gammay,
+ realw* d_gammaz,
+ realw* potential_dot_dot_acoustic,
+ realw* b_potential_acoustic,
+ realw* b_potential_dot_dot_acoustic,
+ realw* rho_ac_kl,
+ realw* kappa_ac_kl,
+ realw deltat,
int NSPEC_AB) {
int ispec = blockIdx.x + blockIdx.y*gridDim.x;
@@ -433,17 +360,17 @@
int ijk = threadIdx.x;
// local and global indices
- int ijk_ispec = ijk + 125*ispec;
- int ijk_ispec_padded = ijk + 128*ispec;
+ int ijk_ispec = ijk + NGLL3*ispec;
+ int ijk_ispec_padded = ijk + NGLL3_PADDED*ispec;
int iglob = ibool[ijk_ispec] - 1;
- float accel_elm[3];
- float b_displ_elm[3];
- float rhol,kappal;
+ realw accel_elm[3];
+ realw b_displ_elm[3];
+ realw rhol,kappal;
// shared memory between all threads within this block
- __shared__ float scalar_field_displ[125];
- __shared__ float scalar_field_accel[125];
+ __shared__ realw scalar_field_displ[NGLL3];
+ __shared__ realw scalar_field_accel[NGLL3];
// copy field values
scalar_field_displ[ijk] = b_potential_acoustic[iglob];
@@ -485,14 +412,14 @@
void FC_FUNC_(compute_kernels_acoustic_cuda,
COMPUTE_KERNELS_ACOUSTIC_CUDA)(
long* Mesh_pointer,
- float* deltat_f) {
+ realw* deltat_f) {
TRACE("compute_kernels_acoustic_cuda");
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
- int blocksize = 125; // NGLLX*NGLLY*NGLLZ
- float deltat = *deltat_f;
+ int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ
+ realw deltat = *deltat_f;
int num_blocks_x = mp->NSPEC_AB;
int num_blocks_y = 1;
@@ -541,10 +468,10 @@
__global__ void compute_kernels_hess_el_cudakernel(int* ispec_is_elastic,
int* ibool,
- float* accel,
- float* b_accel,
- float* hess_kl,
- float deltat,
+ realw* accel,
+ realw* b_accel,
+ realw* hess_kl,
+ realw deltat,
int NSPEC_AB) {
int ispec = blockIdx.x + blockIdx.y*gridDim.x;
@@ -556,7 +483,7 @@
if( ispec_is_elastic[ispec] ) {
int ijk = threadIdx.x;
- int ijk_ispec = ijk + 125*ispec;
+ int ijk_ispec = ijk + NGLL3*ispec;
int iglob = ibool[ijk_ispec] - 1 ;
// approximate hessian
@@ -571,23 +498,23 @@
__global__ void compute_kernels_hess_ac_cudakernel(int* ispec_is_acoustic,
int* ibool,
- float* potential_dot_dot_acoustic,
- float* b_potential_dot_dot_acoustic,
- float* rhostore,
- float* hprime_xx,
- float* hprime_yy,
- float* hprime_zz,
- float* d_xix,
- float* d_xiy,
- float* d_xiz,
- float* d_etax,
- float* d_etay,
- float* d_etaz,
- float* d_gammax,
- float* d_gammay,
- float* d_gammaz,
- float* hess_kl,
- float deltat,
+ realw* potential_dot_dot_acoustic,
+ realw* b_potential_dot_dot_acoustic,
+ realw* rhostore,
+ realw* hprime_xx,
+ realw* hprime_yy,
+ realw* hprime_zz,
+ realw* d_xix,
+ realw* d_xiy,
+ realw* d_xiz,
+ realw* d_etax,
+ realw* d_etay,
+ realw* d_etaz,
+ realw* d_gammax,
+ realw* d_gammay,
+ realw* d_gammaz,
+ realw* hess_kl,
+ realw deltat,
int NSPEC_AB) {
int ispec = blockIdx.x + blockIdx.y*gridDim.x;
@@ -600,18 +527,18 @@
// local and global indices
int ijk = threadIdx.x;
- int ijk_ispec = ijk + 125*ispec;
+ int ijk_ispec = ijk + NGLL3*ispec;
int iglob = ibool[ijk_ispec] - 1 ;
- int ijk_ispec_padded = ijk + 128*ispec;
+ int ijk_ispec_padded = ijk + NGLL3_PADDED*ispec;
- float accel_elm[3];
- float b_accel_elm[3];
- float rhol;
+ realw accel_elm[3];
+ realw b_accel_elm[3];
+ realw rhol;
// shared memory between all threads within this block
- __shared__ float scalar_field_accel[125];
- __shared__ float scalar_field_b_accel[125];
+ __shared__ realw scalar_field_accel[NGLL3];
+ __shared__ realw scalar_field_b_accel[NGLL3];
// copy field values
scalar_field_accel[ijk] = potential_dot_dot_acoustic[iglob];
@@ -649,15 +576,15 @@
extern "C"
void FC_FUNC_(compute_kernels_hess_cuda,
COMPUTE_KERNELS_HESS_CUDA)(long* Mesh_pointer,
- float* deltat_f,
+ realw* deltat_f,
int* ELASTIC_SIMULATION,
int* ACOUSTIC_SIMULATION) {
TRACE("compute_kernels_hess_cuda");
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
- int blocksize = 125; // NGLLX*NGLLY*NGLLZ
- float deltat = *deltat_f;
+ int blocksize = NGLL3; // NGLLX*NGLLY*NGLLZ
+ realw deltat = *deltat_f;
int num_blocks_x = mp->NSPEC_AB;
int num_blocks_y = 1;
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_acoustic_cuda.cu 2011-11-06 02:02:36 UTC (rev 19152)
@@ -29,7 +29,6 @@
#include <stdio.h>
#include <cuda.h>
#include <cublas.h>
-#include <mpi.h>
#include <sys/time.h>
#include <sys/resource.h>
@@ -40,22 +39,22 @@
/* ----------------------------------------------------------------------------------------------- */
-__global__ void compute_stacey_acoustic_kernel(float* potential_dot_acoustic,
- float* potential_dot_dot_acoustic,
+__global__ void compute_stacey_acoustic_kernel(realw* potential_dot_acoustic,
+ realw* potential_dot_dot_acoustic,
int* abs_boundary_ispec,
int* abs_boundary_ijk,
realw* abs_boundary_jacobian2Dw,
int* ibool,
- float* rhostore,
- float* kappastore,
+ realw* rhostore,
+ realw* kappastore,
int* ispec_is_inner,
int* ispec_is_acoustic,
int phase_is_inner,
int SIMULATION_TYPE, int SAVE_FORWARD,
int num_abs_boundary_faces,
- float* b_potential_dot_acoustic,
- float* b_potential_dot_dot_acoustic,
- float* b_absorb_potential
+ realw* b_potential_dot_acoustic,
+ realw* b_potential_dot_dot_acoustic,
+ realw* b_absorb_potential
) {
int igll = threadIdx.x;
@@ -116,7 +115,7 @@
int* phase_is_innerf,
int* SIMULATION_TYPEf,
int* SAVE_FORWARDf,
- float* h_b_absorb_potential) {
+ realw* h_b_absorb_potential) {
TRACE("compute_stacey_acoustic_cuda");
//double start_time = get_time();
@@ -131,7 +130,7 @@
// way 2: Elapsed time: 4.379034e-03
// > NGLLSQUARE==NGLL2==25, no further check inside kernel
- int blocksize = 25;
+ int blocksize = NGLL2;
int num_blocks_x = mp->d_num_abs_boundary_faces;
int num_blocks_y = 1;
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/compute_stacey_elastic_cuda.cu 2011-11-06 02:02:36 UTC (rev 19152)
@@ -29,7 +29,6 @@
#include <stdio.h>
#include <cuda.h>
#include <cublas.h>
-#include <mpi.h>
#include <sys/time.h>
#include <sys/resource.h>
@@ -56,8 +55,7 @@
int SAVE_FORWARD,
int num_abs_boundary_faces,
realw* b_accel,
- realw* b_absorb_field //,float* debug_val,int* debug_val_int
- ) {
+ realw* b_absorb_field) {
int igll = threadIdx.x; // tx
int iface = blockIdx.x + gridDim.x*blockIdx.y; // bx
@@ -136,7 +134,7 @@
int* phase_is_innerf,
int* SIMULATION_TYPEf,
int* SAVE_FORWARDf,
- float* h_b_absorb_field) {
+ realw* h_b_absorb_field) {
TRACE("compute_stacey_elastic_cuda");
@@ -155,7 +153,7 @@
// way 2: seems slightly faster
// > NGLLSQUARE==NGLL2==25, no further check inside kernel
- int blocksize = 25;
+ int blocksize = NGLL2;
int num_blocks_x = mp->d_num_abs_boundary_faces;
int num_blocks_y = 1;
@@ -167,12 +165,7 @@
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
- //float* d_debug_val;
- //int* d_debug_val_int;
-
if(SIMULATION_TYPE == 3 && mp->d_num_abs_boundary_faces > 0) {
- // int val = NSTEP-it+1;
- // read_abs_(&fid,(char*)b_absorb_field,&b_reclen_field,&val);
// The read is done in fortran
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_absorb_field,h_b_absorb_field,
mp->d_b_reclen_field,cudaMemcpyHostToDevice),7700);
@@ -197,8 +190,7 @@
SIMULATION_TYPE,SAVE_FORWARD,
mp->d_num_abs_boundary_faces,
mp->d_b_accel,
- mp->d_b_absorb_field //,d_debug_val,d_debug_val_int
- );
+ mp->d_b_absorb_field);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("compute_stacey_elastic_kernel");
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/it_update_displacement_cuda.cu 2011-11-06 02:02:36 UTC (rev 19152)
@@ -29,7 +29,6 @@
#include <stdio.h>
#include <cuda.h>
#include <cublas.h>
-#include <mpi.h>
#include "config.h"
#include "mesh_constants_cuda.h"
@@ -70,13 +69,13 @@
void FC_FUNC_(it_update_displacement_cuda,
IT_UPDATE_DISPLACMENT_CUDA)(long* Mesh_pointer_f,
int* size_F,
- float* deltat_F,
- float* deltatsqover2_F,
- float* deltatover2_F,
+ realw* deltat_F,
+ realw* deltatsqover2_F,
+ realw* deltatover2_F,
int* SIMULATION_TYPE,
- float* b_deltat_F,
- float* b_deltatsqover2_F,
- float* b_deltatover2_F) {
+ realw* b_deltat_F,
+ realw* b_deltatsqover2_F,
+ realw* b_deltatover2_F) {
TRACE("it_update_displacement_cuda");
@@ -93,7 +92,7 @@
realw b_deltatover2 = *b_deltatover2_F;
//cublasStatus status;
- int blocksize = 128;
+ int blocksize = BLOCKSIZE_KERNEL1;
int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
int num_blocks_x = size_padded/blocksize;
@@ -175,13 +174,13 @@
void FC_FUNC_(it_update_displacement_ac_cuda,
it_update_displacement_ac_cuda)(long* Mesh_pointer_f,
int* size_F,
- float* deltat_F,
- float* deltatsqover2_F,
- float* deltatover2_F,
+ realw* deltat_F,
+ realw* deltatsqover2_F,
+ realw* deltatover2_F,
int* SIMULATION_TYPE,
- float* b_deltat_F,
- float* b_deltatsqover2_F,
- float* b_deltatover2_F) {
+ realw* b_deltat_F,
+ realw* b_deltatsqover2_F,
+ realw* b_deltatover2_F) {
TRACE("it_update_displacement_ac_cuda");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
@@ -195,7 +194,7 @@
realw b_deltatover2 = *b_deltatover2_F;
//cublasStatus status;
- int blocksize = 128;
+ int blocksize = BLOCKSIZE_KERNEL1;
int size_padded = ((int)ceil(((double)size)/((double)blocksize)))*blocksize;
int num_blocks_x = size_padded/blocksize;
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/mesh_constants_cuda.h 2011-11-06 02:02:36 UTC (rev 19152)
@@ -44,6 +44,7 @@
#ifndef GPU_MESH_
#define GPU_MESH_
+
#include <sys/types.h>
#include <unistd.h>
@@ -74,22 +75,6 @@
// error checking after cuda function calls
#define ENABLE_VERY_SLOW_ERROR_CHECKING
-
-/* ----------------------------------------------------------------------------------------------- */
-
-// indexing
-
-#define INDEX2(xsize,x,y) x + (y)*xsize
-#define INDEX3(xsize,ysize,x,y,z) x + xsize*(y + ysize*z)
-#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*(z + zsize*i))
-#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + xsize*(y + ysize*(z + zsize*(i + isize*j)))
-#define INDEX6(xsize,ysize,zsize,isize,jsize,x,y,z,i,j,k) x + xsize*(y + ysize*(z + zsize*(i + isize*(j + jsize*k))))
-
-#define INDEX4_PADDED(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*z) + (i)*128
-
-
-/* ----------------------------------------------------------------------------------------------- */
-
#define MAX(x,y) (((x) < (y)) ? (y) : (x))
double get_time();
@@ -108,14 +93,22 @@
/* ----------------------------------------------------------------------------------------------- */
+// dimensions
#define NDIM 3
+
+// Gauss-Lobatto-Legendre
#define NGLLX 5
#define NGLL2 25
-#define N_SLS 3
+#define NGLL3 125 // no padding: requires same size as in fortran for NGLLX * NGLLY * NGLLZ
-#define NGLL3_NONPADDED 125
+// padding: 128 == 2**7 might improve on older graphics cards w/ coalesced memory accesses:
#define NGLL3_PADDED 128
+// no padding: 125 == 5*5*5 to avoid allocation of extra memory
+//#define NGLL3_PADDED 125
+// number of standard linear solids
+#define N_SLS 3
+
//typedef float real; // type of variables passed into function
typedef float realw; // type of "working" variables
@@ -127,8 +120,33 @@
// leads up to ~ 5% performance increase
//#define USE_MESH_COLORING_GPU
+// cuda kernel block size for updating displacements/potential (newmark time scheme)
+#define BLOCKSIZE_KERNEL1 128
+#define BLOCKSIZE_KERNEL3 128
+#define BLOCKSIZE_TRANSFER 256
+
/* ----------------------------------------------------------------------------------------------- */
+// indexing
+
+#define INDEX2(xsize,x,y) x + (y)*xsize
+
+#define INDEX3(xsize,ysize,x,y,z) x + xsize*(y + ysize*z)
+//#define INDEX3(xsize,ysize,x,y,z) x + (y)*xsize + (z)*xsize*ysize
+
+#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*(z + zsize*i))
+//#define INDEX4(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize
+
+#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + xsize*(y + ysize*(z + zsize*(i + isize*(j))))
+//#define INDEX5(xsize,ysize,zsize,isize,x,y,z,i,j) x + (y)*xsize + (z)*xsize*ysize + (i)*xsize*ysize*zsize + (j)*xsize*ysize*zsize*isize
+
+#define INDEX6(xsize,ysize,zsize,isize,jsize,x,y,z,i,j,k) x + xsize*(y + ysize*(z + zsize*(i + isize*(j + jsize*k))))
+
+#define INDEX4_PADDED(xsize,ysize,zsize,x,y,z,i) x + xsize*(y + ysize*z) + (i)*NGLL3_PADDED
+//#define INDEX4_PADDED(xsize,ysize,zsize,x,y,z,i) x + (y)*xsize + (z)*xsize*ysize + (i)*NGLL3_PADDED
+
+/* ----------------------------------------------------------------------------------------------- */
+
// mesh pointer wrapper structure
/* ----------------------------------------------------------------------------------------------- */
@@ -140,12 +158,12 @@
int NGLOB_AB;
// interpolators
- float* d_xix; float* d_xiy; float* d_xiz;
- float* d_etax; float* d_etay; float* d_etaz;
- float* d_gammax; float* d_gammay; float* d_gammaz;
+ realw* d_xix; realw* d_xiy; realw* d_xiz;
+ realw* d_etax; realw* d_etay; realw* d_etaz;
+ realw* d_gammax; realw* d_gammay; realw* d_gammaz;
// model parameters
- float* d_kappav; float* d_muv;
+ realw* d_kappav; realw* d_muv;
// global indexing
int* d_ibool;
@@ -157,18 +175,22 @@
int use_mesh_coloring_gpu;
// pointers to constant memory arrays
- float* d_hprime_xx; float* d_hprime_yy; float* d_hprime_zz;
- float* d_hprimewgll_xx; float* d_hprimewgll_yy; float* d_hprimewgll_zz;
- float* d_wgllwgll_xy; float* d_wgllwgll_xz; float* d_wgllwgll_yz;
+ realw* d_hprime_xx; realw* d_hprime_yy; realw* d_hprime_zz;
+ realw* d_hprimewgll_xx; realw* d_hprimewgll_yy; realw* d_hprimewgll_zz;
+ realw* d_wgllwgll_xy; realw* d_wgllwgll_xz; realw* d_wgllwgll_yz;
+ // mpi buffers
+ int num_interfaces_ext_mesh;
+ int max_nibool_interfaces_ext_mesh;
+
// ------------------------------------------------------------------ //
// elastic wavefield parameters
// ------------------------------------------------------------------ //
// displacement, velocity, acceleration
- float* d_displ; float* d_veloc; float* d_accel;
+ realw* d_displ; realw* d_veloc; realw* d_accel;
// backward/reconstructed elastic wavefield
- float* d_b_displ; float* d_b_veloc; float* d_b_accel;
+ realw* d_b_displ; realw* d_b_veloc; realw* d_b_accel;
// elastic elements
int* d_ispec_is_elastic;
@@ -182,8 +204,10 @@
int num_colors_outer_elastic,num_colors_inner_elastic;
int nspec_elastic;
- float* d_rmass;
- float* d_send_accel_buffer;
+ realw* d_rmass;
+
+ // mpi buffer
+ realw* d_send_accel_buffer;
// interfaces
int* d_nibool_interfaces_ext_mesh;
@@ -193,18 +217,18 @@
int d_num_abs_boundary_faces;
int* d_abs_boundary_ispec;
int* d_abs_boundary_ijk;
- float* d_abs_boundary_normal;
- float* d_abs_boundary_jacobian2Dw;
+ realw* d_abs_boundary_normal;
+ realw* d_abs_boundary_jacobian2Dw;
- float* d_b_absorb_field;
+ realw* d_b_absorb_field;
int d_b_reclen_field;
- float* d_rho_vp;
- float* d_rho_vs;
+ realw* d_rho_vp;
+ realw* d_rho_vs;
// sources
int nsources_local;
- float* d_sourcearrays;
+ realw* d_sourcearrays;
double* d_stf_pre_compute;
int* d_islice_selected_source;
int* d_ispec_selected_source;
@@ -214,12 +238,13 @@
int* d_ispec_selected_rec;
int* d_islice_selected_rec;
int nrec_local;
- float* d_station_seismo_field;
- float* h_station_seismo_field;
+ realw* d_station_seismo_field;
+ realw* h_station_seismo_field;
+ // adjoint receivers/sources
int nadj_rec_local;
- float* d_adj_sourcearrays;
- float* h_adj_sourcearrays_slice;
+ realw* d_adj_sourcearrays;
+ realw* h_adj_sourcearrays_slice;
int* d_pre_computed_irec;
// surface elements (to save for noise tomography and acoustic simulations)
@@ -228,80 +253,103 @@
int num_free_surface_faces;
// surface movie elements to save for noise tomography
- float* d_noise_surface_movie;
+ realw* d_noise_surface_movie;
// attenuation
- float* d_R_xx;
- float* d_R_yy;
- float* d_R_xy;
- float* d_R_xz;
- float* d_R_yz;
+ realw* d_R_xx;
+ realw* d_R_yy;
+ realw* d_R_xy;
+ realw* d_R_xz;
+ realw* d_R_yz;
- float* d_one_minus_sum_beta;
- float* d_factor_common;
+ realw* d_one_minus_sum_beta;
+ realw* d_factor_common;
- float* d_alphaval;
- float* d_betaval;
- float* d_gammaval;
+ realw* d_alphaval;
+ realw* d_betaval;
+ realw* d_gammaval;
// attenuation & kernel
- float* d_epsilondev_xx;
- float* d_epsilondev_yy;
- float* d_epsilondev_xy;
- float* d_epsilondev_xz;
- float* d_epsilondev_yz;
- float* d_epsilon_trace_over_3;
+ realw* d_epsilondev_xx;
+ realw* d_epsilondev_yy;
+ realw* d_epsilondev_xy;
+ realw* d_epsilondev_xz;
+ realw* d_epsilondev_yz;
+ realw* d_epsilon_trace_over_3;
+ // anisotropy
+ realw* d_c11store;
+ realw* d_c12store;
+ realw* d_c13store;
+ realw* d_c14store;
+ realw* d_c15store;
+ realw* d_c16store;
+ realw* d_c22store;
+ realw* d_c23store;
+ realw* d_c24store;
+ realw* d_c25store;
+ realw* d_c26store;
+ realw* d_c33store;
+ realw* d_c34store;
+ realw* d_c35store;
+ realw* d_c36store;
+ realw* d_c44store;
+ realw* d_c45store;
+ realw* d_c46store;
+ realw* d_c55store;
+ realw* d_c56store;
+ realw* d_c66store;
+
// noise
- float* d_normal_x_noise;
- float* d_normal_y_noise;
- float* d_normal_z_noise;
- float* d_mask_noise;
- float* d_free_surface_jacobian2Dw;
+ realw* d_normal_x_noise;
+ realw* d_normal_y_noise;
+ realw* d_normal_z_noise;
+ realw* d_mask_noise;
+ realw* d_free_surface_jacobian2Dw;
- float* d_noise_sourcearray;
+ realw* d_noise_sourcearray;
// attenuation & kernel backward fields
- float* d_b_R_xx;
- float* d_b_R_yy;
- float* d_b_R_xy;
- float* d_b_R_xz;
- float* d_b_R_yz;
+ realw* d_b_R_xx;
+ realw* d_b_R_yy;
+ realw* d_b_R_xy;
+ realw* d_b_R_xz;
+ realw* d_b_R_yz;
- float* d_b_epsilondev_xx;
- float* d_b_epsilondev_yy;
- float* d_b_epsilondev_xy;
- float* d_b_epsilondev_xz;
- float* d_b_epsilondev_yz;
- float* d_b_epsilon_trace_over_3;
+ realw* d_b_epsilondev_xx;
+ realw* d_b_epsilondev_yy;
+ realw* d_b_epsilondev_xy;
+ realw* d_b_epsilondev_xz;
+ realw* d_b_epsilondev_yz;
+ realw* d_b_epsilon_trace_over_3;
- float* d_b_alphaval;
- float* d_b_betaval;
- float* d_b_gammaval;
+ realw* d_b_alphaval;
+ realw* d_b_betaval;
+ realw* d_b_gammaval;
// sensitivity kernels
- float* d_rho_kl;
- float* d_mu_kl;
- float* d_kappa_kl;
+ realw* d_rho_kl;
+ realw* d_mu_kl;
+ realw* d_kappa_kl;
// noise sensitivity kernel
- float* d_Sigma_kl;
+ realw* d_Sigma_kl;
// approximative hessian for preconditioning kernels
- float* d_hess_el_kl;
+ realw* d_hess_el_kl;
// oceans
- float* d_rmass_ocean_load;
- float* d_free_surface_normal;
+ realw* d_rmass_ocean_load;
+ realw* d_free_surface_normal;
int* d_updated_dof_ocean_load;
// ------------------------------------------------------------------ //
// acoustic wavefield
// ------------------------------------------------------------------ //
// potential and first and second time derivative
- float* d_potential_acoustic; float* d_potential_dot_acoustic; float* d_potential_dot_dot_acoustic;
+ realw* d_potential_acoustic; realw* d_potential_dot_acoustic; realw* d_potential_dot_dot_acoustic;
// backward/reconstructed wavefield
- float* d_b_potential_acoustic; float* d_b_potential_dot_acoustic; float* d_b_potential_dot_dot_acoustic;
+ realw* d_b_potential_acoustic; realw* d_b_potential_dot_acoustic; realw* d_b_potential_dot_dot_acoustic;
// acoustic domain parameters
int* d_ispec_is_acoustic;
@@ -314,34 +362,33 @@
int num_colors_outer_acoustic,num_colors_inner_acoustic;
int nspec_acoustic;
- float* d_rhostore;
- float* d_kappastore;
- float* d_rmass_acoustic;
+ realw* d_rhostore;
+ realw* d_kappastore;
+ realw* d_rmass_acoustic;
+
+ // mpi buffer
+ realw* d_send_potential_dot_dot_buffer;
- float* d_send_potential_dot_dot_buffer;
-
- float* d_b_absorb_potential;
+ realw* d_b_absorb_potential;
int d_b_reclen_potential;
// for writing seismograms
- float* d_station_seismo_potential;
- float* h_station_seismo_potential;
+ realw* d_station_seismo_potential;
+ realw* h_station_seismo_potential;
// sensitivity kernels
- float* d_rho_ac_kl;
- float* d_kappa_ac_kl;
+ realw* d_rho_ac_kl;
+ realw* d_kappa_ac_kl;
// approximative hessian for preconditioning kernels
- float* d_hess_ac_kl;
+ realw* d_hess_ac_kl;
// coupling acoustic-elastic
int* d_coupling_ac_el_ispec;
int* d_coupling_ac_el_ijk;
- float* d_coupling_ac_el_normal;
- float* d_coupling_ac_el_jacobian2Dw;
+ realw* d_coupling_ac_el_normal;
+ realw* d_coupling_ac_el_jacobian2Dw;
-
-
} Mesh;
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/noise_tomography_cuda.cu 2011-11-06 02:02:36 UTC (rev 19152)
@@ -63,7 +63,7 @@
/* ----------------------------------------------------------------------------------------------- */
extern "C"
-void FC_FUNC_(fortranprintf,FORTRANPRINTF)(float* val) {
+void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val) {
TRACE("fortranprintf");
int procid;
@@ -86,15 +86,15 @@
// randomize displ for testing
extern "C"
-void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,float* h_displ) {
+void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ) {
TRACE("make_displ_rand");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); // get Mesh from fortran integer wrapper
- // float* displ_rnd = (float*)malloc(mp->NGLOB_AB*3*sizeof(float));
+ // realw* displ_rnd = (realw*)malloc(mp->NGLOB_AB*3*sizeof(realw));
for(int i=0;i<mp->NGLOB_AB*3;i++) {
h_displ[i] = rand();
}
- cudaMemcpy(mp->d_displ,h_displ,mp->NGLOB_AB*3*sizeof(float),cudaMemcpyHostToDevice);
+ cudaMemcpy(mp->d_displ,h_displ,mp->NGLOB_AB*3*sizeof(realw),cudaMemcpyHostToDevice);
}
/* ----------------------------------------------------------------------------------------------- */
@@ -142,7 +142,7 @@
num_blocks_y = num_blocks_y*2;
}
dim3 grid(num_blocks_x,num_blocks_y,1);
- dim3 threads(25,1,1);
+ dim3 threads(NGLL2,1,1);
transfer_surface_to_host_kernel<<<grid,threads>>>(mp->d_free_surface_ispec,
mp->d_free_surface_ijk,
@@ -152,7 +152,7 @@
mp->d_noise_surface_movie);
cudaMemcpy(h_noise_surface_movie,mp->d_noise_surface_movie,
- 3*25*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyDeviceToHost);
+ 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyDeviceToHost);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("transfer_surface_to_host");
@@ -170,8 +170,7 @@
realw* normal_y_noise,
realw* normal_z_noise,
realw* mask_noise,
- realw* free_surface_jacobian2Dw //,float* d_debug
- ) {
+ realw* free_surface_jacobian2Dw) {
int iface = blockIdx.x + gridDim.x*blockIdx.y; // surface element id
@@ -181,7 +180,7 @@
int igll = threadIdx.x;
- int ipoin = 25*iface + igll;
+ int ipoin = NGLL2*iface + igll;
int i=free_surface_ijk[INDEX3(NDIM,NGLL2,0,igll,iface)]-1;
int j=free_surface_ijk[INDEX3(NDIM,NGLL2,1,igll,iface)]-1;
int k=free_surface_ijk[INDEX3(NDIM,NGLL2,2,igll,iface)]-1;
@@ -208,9 +207,9 @@
// 0x203000c8 is out of bounds
// non atomic version for speed testing -- atomic updates are needed for correctness
- // accel[3*iglob] += eta*mask_noise[ipoin] * normal_x * wgllwgll_xy[tx] * free_surface_jacobian2Dw[tx + 25*ispec2D];
- // accel[3*iglob+1] += eta*mask_noise[ipoin] * normal_y * wgllwgll_xy[tx] * free_surface_jacobian2Dw[tx + 25*ispec2D];
- // accel[3*iglob+2] += eta*mask_noise[ipoin] * normal_z * wgllwgll_xy[tx] * free_surface_jacobian2Dw[tx + 25*ispec2D];
+ // accel[3*iglob] += eta*mask_noise[ipoin] * normal_x * wgllwgll_xy[tx] * free_surface_jacobian2Dw[tx + NGLL2*ispec2D];
+ // accel[3*iglob+1] += eta*mask_noise[ipoin] * normal_y * wgllwgll_xy[tx] * free_surface_jacobian2Dw[tx + NGLL2*ispec2D];
+ // accel[3*iglob+2] += eta*mask_noise[ipoin] * normal_z * wgllwgll_xy[tx] * free_surface_jacobian2Dw[tx + NGLL2*ispec2D];
// Fortran version in SVN -- note deletion of wgllwgll_xy?
// accel(1,iglob) = accel(1,iglob) + eta * mask_noise(ipoin) * normal_x_noise(ipoin) &
@@ -220,13 +219,13 @@
// accel(3,iglob) = accel(3,iglob) + eta * mask_noise(ipoin) * normal_z_noise(ipoin) &
// * free_surface_jacobian2Dw(igll,iface) ! wgllwgll_xy(i,j) * jacobian2D_top(i,j,iface)
- // atomicAdd(&accel[iglob*3] ,eta*mask_noise[ipoin]*normal_x*wgllwgll_xy[tx]*free_surface_jacobian2Dw[igll+25*iface]);
- // atomicAdd(&accel[iglob*3+1],eta*mask_noise[ipoin]*normal_y*wgllwgll_xy[tx]*free_surface_jacobian2Dw[igll+25*iface]);
- // atomicAdd(&accel[iglob*3+2],eta*mask_noise[ipoin]*normal_z*wgllwgll_xy[tx]*free_surface_jacobian2Dw[igll+25*iface]);
+ // atomicAdd(&accel[iglob*3] ,eta*mask_noise[ipoin]*normal_x*wgllwgll_xy[tx]*free_surface_jacobian2Dw[igll+NGLL2*iface]);
+ // atomicAdd(&accel[iglob*3+1],eta*mask_noise[ipoin]*normal_y*wgllwgll_xy[tx]*free_surface_jacobian2Dw[igll+NGLL2*iface]);
+ // atomicAdd(&accel[iglob*3+2],eta*mask_noise[ipoin]*normal_z*wgllwgll_xy[tx]*free_surface_jacobian2Dw[igll+NGLL2*iface]);
- atomicAdd(&accel[iglob*3] ,eta*mask_noise[ipoin]*normal_x*free_surface_jacobian2Dw[igll+25*iface]);
- atomicAdd(&accel[iglob*3+1],eta*mask_noise[ipoin]*normal_y*free_surface_jacobian2Dw[igll+25*iface]);
- atomicAdd(&accel[iglob*3+2],eta*mask_noise[ipoin]*normal_z*free_surface_jacobian2Dw[igll+25*iface]);
+ atomicAdd(&accel[iglob*3] ,eta*mask_noise[ipoin]*normal_x*free_surface_jacobian2Dw[igll+NGLL2*iface]);
+ atomicAdd(&accel[iglob*3+1],eta*mask_noise[ipoin]*normal_y*free_surface_jacobian2Dw[igll+NGLL2*iface]);
+ atomicAdd(&accel[iglob*3+2],eta*mask_noise[ipoin]*normal_z*free_surface_jacobian2Dw[igll+NGLL2*iface]);
}
}
@@ -245,13 +244,8 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
int NOISE_TOMOGRAPHY = *NOISE_TOMOGRAPHYf;
- //float* d_noise_surface_movie;
- //cudaMalloc((void**)&d_noise_surface_movie,3*25*num_free_surface_faces*sizeof(float));
- //cudaMemcpy(d_noise_surface_movie, h_noise_surface_movie,
- // 3*25*num_free_surface_faces*sizeof(realw),cudaMemcpyHostToDevice);
-
cudaMemcpy(mp->d_noise_surface_movie,h_noise_surface_movie,
- 3*25*(mp->num_free_surface_faces)*sizeof(float),cudaMemcpyHostToDevice);
+ 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice);
int num_blocks_x = mp->num_free_surface_faces;
int num_blocks_y = 1;
@@ -260,13 +254,8 @@
num_blocks_y = num_blocks_y*2;
}
dim3 grid(num_blocks_x,num_blocks_y,1);
- dim3 threads(25,1,1);
+ dim3 threads(NGLL2,1,1);
- // float* h_debug = (float*)calloc(128,sizeof(float));
- //float* d_debug;
- // cudaMalloc((void**)&d_debug,128*sizeof(float));
- // cudaMemcpy(d_debug,h_debug,128*sizeof(float),cudaMemcpyHostToDevice);
-
if(NOISE_TOMOGRAPHY == 2) { // add surface source to forward field
noise_read_add_surface_movie_cuda_kernel<<<grid,threads>>>(mp->d_accel,
mp->d_ibool,
@@ -278,8 +267,7 @@
mp->d_normal_y_noise,
mp->d_normal_z_noise,
mp->d_mask_noise,
- mp->d_free_surface_jacobian2Dw //,d_debug
- );
+ mp->d_free_surface_jacobian2Dw);
}
else if(NOISE_TOMOGRAPHY == 3) { // add surface source to adjoint (backward) field
noise_read_add_surface_movie_cuda_kernel<<<grid,threads>>>(mp->d_b_accel,
@@ -292,16 +280,9 @@
mp->d_normal_y_noise,
mp->d_normal_z_noise,
mp->d_mask_noise,
- mp->d_free_surface_jacobian2Dw //,d_debug
- );
+ mp->d_free_surface_jacobian2Dw);
}
- // cudaMemcpy(h_debug,d_debug,128*sizeof(float),cudaMemcpyDeviceToHost);
- // for(int i=0;i<8;i++) {
- // printf("debug[%d]= %e\n",i,h_debug[i]);
- // }
- // MPI_Abort(MPI_COMM_WORLD,1);
- //cudaFree(d_noise_surface_movie);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("noise_read_add_surface_movie_cuda_kernel");
#endif
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_constants_cuda.h 2011-11-06 02:02:36 UTC (rev 19152)
@@ -29,22 +29,24 @@
#ifndef CUDA_HEADER_H
#define CUDA_HEADER_H
+typedef float realw; // type of "working" variables
+
/* ----------------------------------------------------------------------------------------------- */
// setters for these const arrays (very ugly hack, but will have to do)
// elastic
-void setConst_hprime_xx(float* array,Mesh* mp);
-void setConst_hprime_yy(float* array,Mesh* mp);
-void setConst_hprime_zz(float* array,Mesh* mp);
+void setConst_hprime_xx(realw* array,Mesh* mp);
+void setConst_hprime_yy(realw* array,Mesh* mp);
+void setConst_hprime_zz(realw* array,Mesh* mp);
-void setConst_hprimewgll_xx(float* array,Mesh* mp);
-void setConst_hprimewgll_yy(float* array,Mesh* mp);
-void setConst_hprimewgll_zz(float* array,Mesh* mp);
+void setConst_hprimewgll_xx(realw* array,Mesh* mp);
+void setConst_hprimewgll_yy(realw* array,Mesh* mp);
+void setConst_hprimewgll_zz(realw* array,Mesh* mp);
-void setConst_wgllwgll_xy(float* array,Mesh* mp);
-void setConst_wgllwgll_xz(float* array, Mesh* mp);
-void setConst_wgllwgll_yz(float* array, Mesh* mp);
+void setConst_wgllwgll_xy(realw* array,Mesh* mp);
+void setConst_wgllwgll_xz(realw* array, Mesh* mp);
+void setConst_wgllwgll_yz(realw* array, Mesh* mp);
/* ----------------------------------------------------------------------------------------------- */
@@ -52,21 +54,21 @@
#ifdef USE_TEXTURES
// declaration of textures
- texture<float, 1, cudaReadModeElementType> tex_displ;
- texture<float, 1, cudaReadModeElementType> tex_accel;
+ texture<realw, 1, cudaReadModeElementType> tex_displ;
+ texture<realw, 1, cudaReadModeElementType> tex_accel;
- texture<float, 1, cudaReadModeElementType> tex_potential_acoustic;
- texture<float, 1, cudaReadModeElementType> tex_potential_dot_dot_acoustic;
+ texture<realw, 1, cudaReadModeElementType> tex_potential_acoustic;
+ texture<realw, 1, cudaReadModeElementType> tex_potential_dot_dot_acoustic;
// for binding the textures
- void bindTexturesDispl(float* d_displ)
+ void bindTexturesDispl(realw* d_displ)
{
cudaError_t err;
- cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<float>();
+ cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
- err = cudaBindTexture(NULL,tex_displ, d_displ, channelDescFloat, NDIM*NGLOB*sizeof(float));
+ err = cudaBindTexture(NULL,tex_displ, d_displ, channelDescFloat, NDIM*NGLOB*sizeof(realw));
if (err != cudaSuccess)
{
fprintf(stderr, "Error in bindTexturesDispl for displ: %s\n", cudaGetErrorString(err));
@@ -74,13 +76,13 @@
}
}
- void bindTexturesAccel(float* d_accel)
+ void bindTexturesAccel(realw* d_accel)
{
cudaError_t err;
- cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<float>();
+ cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
- err = cudaBindTexture(NULL,tex_accel, d_accel, channelDescFloat, NDIM*NGLOB*sizeof(float));
+ err = cudaBindTexture(NULL,tex_accel, d_accel, channelDescFloat, NDIM*NGLOB*sizeof(realw));
if (err != cudaSuccess)
{
fprintf(stderr, "Error in bindTexturesAccel for accel: %s\n", cudaGetErrorString(err));
@@ -88,14 +90,14 @@
}
}
- void bindTexturesPotential(float* d_potential_acoustic)
+ void bindTexturesPotential(realw* d_potential_acoustic)
{
cudaError_t err;
- cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<float>();
+ cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
err = cudaBindTexture(NULL,tex_potential_acoustic, d_potential_acoustic,
- channelDescFloat, NGLOB*sizeof(float));
+ channelDescFloat, NGLOB*sizeof(realw));
if (err != cudaSuccess)
{
fprintf(stderr, "Error in bindTexturesPotential for potential_acoustic: %s\n", cudaGetErrorString(err));
@@ -103,14 +105,14 @@
}
}
- void bindTexturesPotential_dot_dot(float* d_potential_dot_dot_acoustic)
+ void bindTexturesPotential_dot_dot(realw* d_potential_dot_dot_acoustic)
{
cudaError_t err;
- cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<float>();
+ cudaChannelFormatDesc channelDescFloat = cudaCreateChannelDesc<realw>();
err = cudaBindTexture(NULL,tex_potential_dot_dot_acoustic, d_potential_dot_dot_acoustic,
- channelDescFloat, NGLOB*sizeof(float));
+ channelDescFloat, NGLOB*sizeof(realw));
if (err != cudaSuccess)
{
fprintf(stderr, "Error in bindTexturesPotential_dot_dot for potential_dot_dot_acoustic: %s\n", cudaGetErrorString(err));
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/prepare_mesh_constants_cuda.cu 2011-11-06 02:02:36 UTC (rev 19152)
@@ -205,7 +205,7 @@
extern "C"
void FC_FUNC_(get_free_device_memory,
- get_FREE_DEVICE_MEMORY)(float* free, float* used, float* total ) {
+ get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ) {
TRACE("get_free_device_memory");
double free_db,used_db,total_db;
@@ -213,9 +213,9 @@
get_free_memory(&free_db,&used_db,&total_db);
// converts to MB
- *free = (float) free_db/1024.0/1024.0;
- *used = (float) used_db/1024.0/1024.0;
- *total = (float) total_db/1024.0/1024.0;
+ *free = (realw) free_db/1024.0/1024.0;
+ *used = (realw) used_db/1024.0/1024.0;
+ *total = (realw) total_db/1024.0/1024.0;
return;
}
@@ -424,27 +424,27 @@
PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
int* h_NGLLX,
int* NSPEC_AB, int* NGLOB_AB,
- float* h_xix, float* h_xiy, float* h_xiz,
- float* h_etax, float* h_etay, float* h_etaz,
- float* h_gammax, float* h_gammay, float* h_gammaz,
- float* h_kappav, float* h_muv,
+ realw* h_xix, realw* h_xiy, realw* h_xiz,
+ realw* h_etax, realw* h_etay, realw* h_etaz,
+ realw* h_gammax, realw* h_gammay, realw* h_gammaz,
+ realw* h_kappav, realw* h_muv,
int* h_ibool,
int* num_interfaces_ext_mesh,
int* max_nibool_interfaces_ext_mesh,
int* h_nibool_interfaces_ext_mesh,
int* h_ibool_interfaces_ext_mesh,
- float* h_hprime_xx,float* h_hprime_yy,float* h_hprime_zz,
- float* h_hprimewgll_xx,float* h_hprimewgll_yy,float* h_hprimewgll_zz,
- float* h_wgllwgll_xy,float* h_wgllwgll_xz,float* h_wgllwgll_yz,
+ realw* h_hprime_xx,realw* h_hprime_yy,realw* h_hprime_zz,
+ realw* h_hprimewgll_xx,realw* h_hprimewgll_yy,realw* h_hprimewgll_zz,
+ realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz,
int* ABSORBING_CONDITIONS,
int* h_abs_boundary_ispec, int* h_abs_boundary_ijk,
- float* h_abs_boundary_normal,
- float* h_abs_boundary_jacobian2Dw,
+ realw* h_abs_boundary_normal,
+ realw* h_abs_boundary_jacobian2Dw,
int* h_num_abs_boundary_faces,
int* h_ispec_is_inner,
int* NSOURCES,
int* nsources_local_f,
- float* h_sourcearrays,
+ realw* h_sourcearrays,
int* h_islice_selected_source,
int* h_ispec_selected_source,
int* h_number_receiver_global,
@@ -454,11 +454,10 @@
int* SIMULATION_TYPE,
int* USE_MESH_COLORING_GPU_f,
int* nspec_acoustic,int* nspec_elastic,
- int* ncuda_devices) {
+ int* myrank_f,int* ncuda_devices) {
TRACE("prepare_constants_device");
- int procid;
int device_count = 0;
// cuda initialization (needs -lcuda library)
@@ -473,12 +472,12 @@
*ncuda_devices = device_count;
// Gets rank number of MPI process
- MPI_Comm_rank(MPI_COMM_WORLD, &procid);
+ int myrank = *myrank_f;
// Sets the active device
if(device_count > 1) {
// generalized for more GPUs per node
- cudaSetDevice((procid)%device_count);
+ cudaSetDevice( myrank%device_count );
exit_on_cuda_error("cudaSetDevice");
}
@@ -508,65 +507,67 @@
setConst_wgllwgll_yz(h_wgllwgll_yz,mp);
/* Assuming NGLLX=5. Padded is then 128 (5^3+3) */
- int size_padded = 128 * (mp->NSPEC_AB);
- int size = 125 * (mp->NSPEC_AB);
+ int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
+ //int size_nonpadded = NGLL3 * (mp->NSPEC_AB);
// mesh
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xix, size_padded*sizeof(float)),1001);
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiy, size_padded*sizeof(float)),1002);
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiz, size_padded*sizeof(float)),1003);
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etax, size_padded*sizeof(float)),1004);
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etay, size_padded*sizeof(float)),1005);
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etaz, size_padded*sizeof(float)),1006);
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammax, size_padded*sizeof(float)),1007);
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammay, size_padded*sizeof(float)),1008);
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammaz, size_padded*sizeof(float)),1009);
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_kappav, size_padded*sizeof(float)),1010);
- print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_muv, size_padded*sizeof(float)),1011);
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xix, size_padded*sizeof(realw)),1001);
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiy, size_padded*sizeof(realw)),1002);
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_xiz, size_padded*sizeof(realw)),1003);
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etax, size_padded*sizeof(realw)),1004);
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etay, size_padded*sizeof(realw)),1005);
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_etaz, size_padded*sizeof(realw)),1006);
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammax, size_padded*sizeof(realw)),1007);
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammay, size_padded*sizeof(realw)),1008);
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_gammaz, size_padded*sizeof(realw)),1009);
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_kappav, size_padded*sizeof(realw)),1010);
+ print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_muv, size_padded*sizeof(realw)),1011);
// transfer constant element data with padding
for(int i=0;i < mp->NSPEC_AB;i++) {
- print_CUDA_error_if_any(cudaMemcpy(mp->d_xix + i*128, &h_xix[i*125],
- 125*sizeof(float),cudaMemcpyHostToDevice),1501);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_xiy+i*128, &h_xiy[i*125],
- 125*sizeof(float),cudaMemcpyHostToDevice),1502);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_xiz+i*128, &h_xiz[i*125],
- 125*sizeof(float),cudaMemcpyHostToDevice),1503);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_etax+i*128, &h_etax[i*125],
- 125*sizeof(float),cudaMemcpyHostToDevice),1504);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_etay+i*128, &h_etay[i*125],
- 125*sizeof(float),cudaMemcpyHostToDevice),1505);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_etaz+i*128, &h_etaz[i*125],
- 125*sizeof(float),cudaMemcpyHostToDevice),1506);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_gammax+i*128,&h_gammax[i*125],
- 125*sizeof(float),cudaMemcpyHostToDevice),1507);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_gammay+i*128,&h_gammay[i*125],
- 125*sizeof(float),cudaMemcpyHostToDevice),1508);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaz+i*128,&h_gammaz[i*125],
- 125*sizeof(float),cudaMemcpyHostToDevice),1509);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_kappav+i*128,&h_kappav[i*125],
- 125*sizeof(float),cudaMemcpyHostToDevice),1510);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_muv+i*128, &h_muv[i*125],
- 125*sizeof(float),cudaMemcpyHostToDevice),1511);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_xix + i*NGLL3_PADDED, &h_xix[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1501);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_xiy+i*NGLL3_PADDED, &h_xiy[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1502);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_xiz+i*NGLL3_PADDED, &h_xiz[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1503);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_etax+i*NGLL3_PADDED, &h_etax[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1504);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_etay+i*NGLL3_PADDED, &h_etay[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1505);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_etaz+i*NGLL3_PADDED, &h_etaz[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1506);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_gammax+i*NGLL3_PADDED,&h_gammax[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1507);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_gammay+i*NGLL3_PADDED,&h_gammay[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1508);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaz+i*NGLL3_PADDED,&h_gammaz[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1509);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_kappav+i*NGLL3_PADDED,&h_kappav[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1510);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_muv+i*NGLL3_PADDED, &h_muv[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),1511);
}
// global indexing
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool,size_padded*sizeof(int)),1021);
print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool, h_ibool,
- size*sizeof(int),cudaMemcpyHostToDevice),1022);
+ NGLL3*(mp->NSPEC_AB)*sizeof(int),cudaMemcpyHostToDevice),1022);
// prepare interprocess-edge exchange information
- if( *num_interfaces_ext_mesh > 0 ){
+ mp->num_interfaces_ext_mesh = *num_interfaces_ext_mesh;
+ mp->max_nibool_interfaces_ext_mesh = *max_nibool_interfaces_ext_mesh;
+ if( mp->num_interfaces_ext_mesh > 0 ){
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_nibool_interfaces_ext_mesh,
- (*num_interfaces_ext_mesh)*sizeof(int)),1201);
+ (mp->num_interfaces_ext_mesh)*sizeof(int)),1201);
print_CUDA_error_if_any(cudaMemcpy(mp->d_nibool_interfaces_ext_mesh,h_nibool_interfaces_ext_mesh,
- (*num_interfaces_ext_mesh)*sizeof(int),cudaMemcpyHostToDevice),1202);
+ (mp->num_interfaces_ext_mesh)*sizeof(int),cudaMemcpyHostToDevice),1202);
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_ibool_interfaces_ext_mesh,
- (*num_interfaces_ext_mesh)*(*max_nibool_interfaces_ext_mesh)*sizeof(int)),1203);
+ (mp->num_interfaces_ext_mesh)*(mp->max_nibool_interfaces_ext_mesh)*sizeof(int)),1203);
print_CUDA_error_if_any(cudaMemcpy(mp->d_ibool_interfaces_ext_mesh,h_ibool_interfaces_ext_mesh,
- (*num_interfaces_ext_mesh)*(*max_nibool_interfaces_ext_mesh)*sizeof(int),
+ (mp->num_interfaces_ext_mesh)*(mp->max_nibool_interfaces_ext_mesh)*sizeof(int),
cudaMemcpyHostToDevice),1204);
}
@@ -592,21 +593,21 @@
print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_ijk),
- 3*25*(mp->d_num_abs_boundary_faces)*sizeof(int)),1103);
+ 3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(int)),1103);
print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_ijk, h_abs_boundary_ijk,
- 3*25*(mp->d_num_abs_boundary_faces)*sizeof(int),
+ 3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(int),
cudaMemcpyHostToDevice),1104);
print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_normal),
- 3*25*(mp->d_num_abs_boundary_faces)*sizeof(float)),1105);
+ 3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw)),1105);
print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_normal, h_abs_boundary_normal,
- 3*25*(mp->d_num_abs_boundary_faces)*sizeof(float),
+ 3*NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw),
cudaMemcpyHostToDevice),1106);
print_CUDA_error_if_any(cudaMalloc((void**) &(mp->d_abs_boundary_jacobian2Dw),
- 25*(mp->d_num_abs_boundary_faces)*sizeof(float)),1107);
+ NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw)),1107);
print_CUDA_error_if_any(cudaMemcpy(mp->d_abs_boundary_jacobian2Dw, h_abs_boundary_jacobian2Dw,
- 25*(mp->d_num_abs_boundary_faces)*sizeof(float),
+ NGLL2*(mp->d_num_abs_boundary_faces)*sizeof(realw),
cudaMemcpyHostToDevice),1108);
}
@@ -615,9 +616,9 @@
if (*SIMULATION_TYPE == 1 || *SIMULATION_TYPE == 3){
// not needed in case of pure adjoint simulations (SIMULATION_TYPE == 2)
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_sourcearrays,
- sizeof(float)* *NSOURCES*3*125),1301);
+ sizeof(realw)* *NSOURCES*3*NGLL3),1301);
print_CUDA_error_if_any(cudaMemcpy(mp->d_sourcearrays, h_sourcearrays,
- sizeof(float)* *NSOURCES*3*125,cudaMemcpyHostToDevice),1302);
+ sizeof(realw)* *NSOURCES*3*NGLL3,cudaMemcpyHostToDevice),1302);
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_stf_pre_compute,
*NSOURCES*sizeof(double)),1303);
@@ -700,7 +701,8 @@
mp->nadj_rec_local = *nadj_rec_local;
if( mp->nadj_rec_local > 0 ){
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_adj_sourcearrays,
- (mp->nadj_rec_local)*3*125*sizeof(float)),7003);
+ (mp->nadj_rec_local)*3*NGLL3*sizeof(realw)),7003);
+
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_pre_computed_irec,
(mp->nadj_rec_local)*sizeof(int)),7004);
@@ -725,7 +727,7 @@
free(h_pre_computed_irec);
// temporary array to prepare extracted source array values
- mp->h_adj_sourcearrays_slice = (float*) malloc( (mp->nadj_rec_local)*3*125*sizeof(float) );
+ mp->h_adj_sourcearrays_slice = (realw*) malloc( (mp->nadj_rec_local)*3*NGLL3*sizeof(realw) );
if( mp->h_adj_sourcearrays_slice == NULL ) exit_on_error("h_adj_sourcearrays_slice not allocated\n");
}
@@ -744,9 +746,9 @@
extern "C"
void FC_FUNC_(prepare_fields_acoustic_device,
PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f,
- float* rmass_acoustic,
- float* rhostore,
- float* kappastore,
+ realw* rmass_acoustic,
+ realw* rhostore,
+ realw* kappastore,
int* num_phase_ispec_acoustic,
int* phase_ispec_inner_acoustic,
int* ispec_is_acoustic,
@@ -756,13 +758,13 @@
int* free_surface_ijk,
int* ABSORBING_CONDITIONS,
int* b_reclen_potential,
- float* b_absorb_potential,
+ realw* b_absorb_potential,
int* ELASTIC_SIMULATION,
int* num_coupling_ac_el_faces,
int* coupling_ac_el_ispec,
int* coupling_ac_el_ijk,
- float* coupling_ac_el_normal,
- float* coupling_ac_el_jacobian2Dw,
+ realw* coupling_ac_el_normal,
+ realw* coupling_ac_el_jacobian2Dw,
int* num_colors_outer_acoustic,
int* num_colors_inner_acoustic,
int* num_elem_colors_acoustic) {
@@ -771,32 +773,36 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f);
/* Assuming NGLLX==5. Padded is then 128 (5^3+3) */
- int size_padded = 128 * mp->NSPEC_AB;
- int size_nonpadded = 125 * mp->NSPEC_AB;
- int size = mp->NGLOB_AB;
+ int size_padded = NGLL3_PADDED * mp->NSPEC_AB;
+ int size_nonpadded = NGLL3 * mp->NSPEC_AB;
+ int size_glob = mp->NGLOB_AB;
// allocates arrays on device (GPU)
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_acoustic),sizeof(float)*size),9001);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_acoustic),sizeof(float)*size),9002);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_dot_acoustic),sizeof(float)*size),9003);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_potential_dot_dot_buffer),sizeof(float)*size),9004);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_acoustic),sizeof(float)*size),9005);
- // padded array
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rhostore),size_padded*sizeof(float)),9006);
- // non-padded array
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappastore),size_nonpadded*sizeof(float)),9007);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_acoustic),sizeof(realw)*size_glob),9001);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_acoustic),sizeof(realw)*size_glob),9002);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_potential_dot_dot_acoustic),sizeof(realw)*size_glob),9003);
- // transfer element data
+ // mpi buffer
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_potential_dot_dot_buffer),
+ (mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),9004);
+
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_acoustic),sizeof(realw)*size_glob),9005);
print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_acoustic,rmass_acoustic,
- sizeof(float)*size,cudaMemcpyHostToDevice),9100);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_kappastore,kappastore,
- size_nonpadded*sizeof(float),cudaMemcpyHostToDevice),9105);
+ sizeof(realw)*size_glob,cudaMemcpyHostToDevice),9100);
+
+ // padded array
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rhostore),size_padded*sizeof(realw)),9006);
// transfer constant element data with padding
for(int i=0; i < mp->NSPEC_AB; i++) {
- print_CUDA_error_if_any(cudaMemcpy(mp->d_rhostore+i*128, &rhostore[i*125],
- 125*sizeof(float),cudaMemcpyHostToDevice),9106);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_rhostore+i*NGLL3_PADDED, &rhostore[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),9106);
}
+ // non-padded array
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappastore),size_nonpadded*sizeof(realw)),9007);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_kappastore,kappastore,
+ NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),9105);
+
// phase elements
mp->num_phase_ispec_acoustic = *num_phase_ispec_acoustic;
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_phase_ispec_inner_acoustic),
@@ -820,9 +826,9 @@
mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),9203);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk),
- 3*25*mp->num_free_surface_faces*sizeof(int)),9202);
+ 3*NGLL2*mp->num_free_surface_faces*sizeof(int)),9202);
print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
- 3*25*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),9204);
+ 3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),9204);
}
}
@@ -838,8 +844,9 @@
// for seismograms
if( mp->nrec_local > 0 ){
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_potential),
- mp->nrec_local*125*sizeof(float)),9107);
- mp->h_station_seismo_potential = (float*) malloc( mp->nrec_local*125*sizeof(float) );
+ mp->nrec_local*NGLL3*sizeof(realw)),9107);
+
+ mp->h_station_seismo_potential = (realw*) malloc( mp->nrec_local*NGLL3*sizeof(realw) );
if( mp->h_station_seismo_potential == NULL) exit_on_error("error allocating h_station_seismo_potential");
}
@@ -852,19 +859,19 @@
(*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),9602);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_ijk),
- 3*25*(*num_coupling_ac_el_faces)*sizeof(int)),9603);
+ 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int)),9603);
print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_ijk,coupling_ac_el_ijk,
- 3*25*(*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),9604);
+ 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(int),cudaMemcpyHostToDevice),9604);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_normal),
- 3*25*(*num_coupling_ac_el_faces)*sizeof(float)),9605);
+ 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),9605);
print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_normal,coupling_ac_el_normal,
- 3*25*(*num_coupling_ac_el_faces)*sizeof(float),cudaMemcpyHostToDevice),9606);
+ 3*NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),9606);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_coupling_ac_el_jacobian2Dw),
- 25*(*num_coupling_ac_el_faces)*sizeof(float)),9607);
+ NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw)),9607);
print_CUDA_error_if_any(cudaMemcpy(mp->d_coupling_ac_el_jacobian2Dw,coupling_ac_el_jacobian2Dw,
- 25*(*num_coupling_ac_el_faces)*sizeof(float),cudaMemcpyHostToDevice),9608);
+ NGLL2*(*num_coupling_ac_el_faces)*sizeof(realw),cudaMemcpyHostToDevice),9608);
}
@@ -893,32 +900,32 @@
Mesh* mp = (Mesh*)(*Mesh_pointer_f);
- int size = mp->NGLOB_AB;
+ int size_glob = mp->NGLOB_AB;
// kernel simulations
if( *SIMULATION_TYPE != 3 ) return;
// allocates backward/reconstructed arrays on device (GPU)
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_acoustic),sizeof(float)*size),9014);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_acoustic),sizeof(float)*size),9015);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_dot_acoustic),sizeof(float)*size),9016);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_acoustic),sizeof(realw)*size_glob),9014);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_acoustic),sizeof(realw)*size_glob),9015);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_potential_dot_dot_acoustic),sizeof(realw)*size_glob),9016);
// allocates kernels
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_ac_kl),125*mp->NSPEC_AB*sizeof(float)),9017);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_ac_kl),125*mp->NSPEC_AB*sizeof(float)),9018);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),9017);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),9018);
// initializes kernel values to zero
print_CUDA_error_if_any(cudaMemset(mp->d_rho_ac_kl,0,
- 125*mp->NSPEC_AB*sizeof(float)),9019);
+ NGLL3*mp->NSPEC_AB*sizeof(realw)),9019);
print_CUDA_error_if_any(cudaMemset(mp->d_kappa_ac_kl,0,
- 125*mp->NSPEC_AB*sizeof(float)),9020);
+ NGLL3*mp->NSPEC_AB*sizeof(realw)),9020);
// preconditioner
if( *APPROXIMATE_HESS_KL ){
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_ac_kl),125*mp->NSPEC_AB*sizeof(float)),9030);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_ac_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),9030);
// initializes with zeros
print_CUDA_error_if_any(cudaMemset(mp->d_hess_ac_kl,0,
- 125*mp->NSPEC_AB*sizeof(float)),9031);
+ NGLL3*mp->NSPEC_AB*sizeof(realw)),9031);
}
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -937,54 +944,78 @@
void FC_FUNC_(prepare_fields_elastic_device,
PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer_f,
int* size,
- float* rmass,
- float* rho_vp,
- float* rho_vs,
+ realw* rmass,
+ realw* rho_vp,
+ realw* rho_vs,
int* num_phase_ispec_elastic,
int* phase_ispec_inner_elastic,
int* ispec_is_elastic,
int* ABSORBING_CONDITIONS,
- float* h_b_absorb_field,
+ realw* h_b_absorb_field,
int* h_b_reclen_field,
int* SIMULATION_TYPE,int* SAVE_FORWARD,
int* COMPUTE_AND_STORE_STRAIN,
- float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,
- float* epsilondev_xz,float* epsilondev_yz,
+ realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+ realw* epsilondev_xz,realw* epsilondev_yz,
int* ATTENUATION,
int* R_size,
- float* R_xx,float* R_yy,float* R_xy,float* R_xz,float* R_yz,
- float* one_minus_sum_beta,float* factor_common,
- float* alphaval,float* betaval,float* gammaval,
+ realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
+ realw* one_minus_sum_beta,realw* factor_common,
+ realw* alphaval,realw* betaval,realw* gammaval,
int* OCEANS,
- float* rmass_ocean_load,
+ realw* rmass_ocean_load,
int* NOISE_TOMOGRAPHY,
- float* free_surface_normal,
+ realw* free_surface_normal,
int* free_surface_ispec,
int* free_surface_ijk,
int* num_free_surface_faces,
int* ACOUSTIC_SIMULATION,
int* num_colors_outer_elastic,
int* num_colors_inner_elastic,
- int* num_elem_colors_elastic){
+ int* num_elem_colors_elastic,
+ int* ANISOTROPY,
+ realw *c11store,
+ realw *c12store,
+ realw *c13store,
+ realw *c14store,
+ realw *c15store,
+ realw *c16store,
+ realw *c22store,
+ realw *c23store,
+ realw *c24store,
+ realw *c25store,
+ realw *c26store,
+ realw *c33store,
+ realw *c34store,
+ realw *c35store,
+ realw *c36store,
+ realw *c44store,
+ realw *c45store,
+ realw *c46store,
+ realw *c55store,
+ realw *c56store,
+ realw *c66store){
TRACE("prepare_fields_elastic_device");
Mesh* mp = (Mesh*)(*Mesh_pointer_f);
/* Assuming NGLLX==5. Padded is then 128 (5^3+3) */
- //int size_padded = 128 * mp->NSPEC_AB;
- int size_nonpadded = 125 * mp->NSPEC_AB;
+ int size_padded = NGLL3_PADDED * (mp->NSPEC_AB);
+ int size_nonpadded = NGLL3 * (mp->NSPEC_AB);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ),sizeof(float)*(*size)),8001);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(float)*(*size)),8002);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel),sizeof(float)*(*size)),8003);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_displ),sizeof(realw)*(*size)),8001);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_veloc),sizeof(realw)*(*size)),8002);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_accel),sizeof(realw)*(*size)),8003);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),sizeof(float)*(*size)),8004);
+ // mpi buffer
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_send_accel_buffer),
+ 3*(mp->max_nibool_interfaces_ext_mesh)*(mp->num_interfaces_ext_mesh)*sizeof(realw)),8004);
// mass matrix
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass),sizeof(float)*mp->NGLOB_AB),8005);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass),sizeof(realw)*mp->NGLOB_AB),8005);
// transfer element data
print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass,rmass,
- sizeof(float)*mp->NGLOB_AB,cudaMemcpyHostToDevice),8010);
+ sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),8010);
// element indices
@@ -1008,22 +1039,23 @@
// for seismograms
if( mp->nrec_local > 0 ){
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_station_seismo_field),
- 3*125*(mp->nrec_local)*sizeof(float)),8015);
- mp->h_station_seismo_field = (float*) malloc( 3*125*(mp->nrec_local)*sizeof(float) );
+ 3*NGLL3*(mp->nrec_local)*sizeof(realw)),8015);
+
+ mp->h_station_seismo_field = (realw*) malloc( 3*NGLL3*(mp->nrec_local)*sizeof(realw) );
if( mp->h_station_seismo_field == NULL) exit_on_error("h_station_seismo_field not allocated \n");
}
// absorbing conditions
if( *ABSORBING_CONDITIONS && mp->d_num_abs_boundary_faces > 0){
// non-padded arrays
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vp),size_nonpadded*sizeof(float)),8006);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vs),size_nonpadded*sizeof(float)),8007);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vp),size_nonpadded*sizeof(realw)),8006);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_vs),size_nonpadded*sizeof(realw)),8007);
// rho_vp, rho_vs non-padded; they are needed for stacey boundary condition
print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vp, rho_vp,
- size_nonpadded*sizeof(float),cudaMemcpyHostToDevice),8013);
+ NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),8013);
print_CUDA_error_if_any(cudaMemcpy(mp->d_rho_vs, rho_vs,
- size_nonpadded*sizeof(float),cudaMemcpyHostToDevice),8014);
+ NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),8014);
// absorb_field array used for file i/o
if(*SIMULATION_TYPE == 3 || ( *SIMULATION_TYPE == 1 && *SAVE_FORWARD )){
@@ -1038,27 +1070,27 @@
// strains used for attenuation and kernel simulations
if( *COMPUTE_AND_STORE_STRAIN ){
// strains
- int epsilondev_size = 125*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
+ int epsilondev_size = NGLL3*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xx,
- epsilondev_size*sizeof(float)),8301);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xx,epsilondev_xx,epsilondev_size*sizeof(float),
+ epsilondev_size*sizeof(realw)),8301);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xx,epsilondev_xx,epsilondev_size*sizeof(realw),
cudaMemcpyHostToDevice),8302);
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yy,
- epsilondev_size*sizeof(float)),8302);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yy,epsilondev_yy,epsilondev_size*sizeof(float),
+ epsilondev_size*sizeof(realw)),8302);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yy,epsilondev_yy,epsilondev_size*sizeof(realw),
cudaMemcpyHostToDevice),8303);
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xy,
- epsilondev_size*sizeof(float)),8304);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xy,epsilondev_xy,epsilondev_size*sizeof(float),
+ epsilondev_size*sizeof(realw)),8304);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xy,epsilondev_xy,epsilondev_size*sizeof(realw),
cudaMemcpyHostToDevice),8305);
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_xz,
- epsilondev_size*sizeof(float)),8306);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xz,epsilondev_xz,epsilondev_size*sizeof(float),
+ epsilondev_size*sizeof(realw)),8306);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_xz,epsilondev_xz,epsilondev_size*sizeof(realw),
cudaMemcpyHostToDevice),8307);
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_epsilondev_yz,
- epsilondev_size*sizeof(float)),8308);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yz,epsilondev_yz,epsilondev_size*sizeof(float),
+ epsilondev_size*sizeof(realw)),8308);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilondev_yz,epsilondev_yz,epsilondev_size*sizeof(realw),
cudaMemcpyHostToDevice),8309);
}
@@ -1067,74 +1099,167 @@
if( *ATTENUATION ){
// memory arrays
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xx),
- (*R_size)*sizeof(float)),8401);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xx,R_xx,(*R_size)*sizeof(float),
+ (*R_size)*sizeof(realw)),8401);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xx,R_xx,(*R_size)*sizeof(realw),
cudaMemcpyHostToDevice),8402);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yy),
- (*R_size)*sizeof(float)),8403);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yy,R_yy,(*R_size)*sizeof(float),
+ (*R_size)*sizeof(realw)),8403);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yy,R_yy,(*R_size)*sizeof(realw),
cudaMemcpyHostToDevice),8404);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xy),
- (*R_size)*sizeof(float)),8405);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xy,R_xy,(*R_size)*sizeof(float),
+ (*R_size)*sizeof(realw)),8405);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xy,R_xy,(*R_size)*sizeof(realw),
cudaMemcpyHostToDevice),8406);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_xz),
- (*R_size)*sizeof(float)),8407);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xz,R_xz,(*R_size)*sizeof(float),
+ (*R_size)*sizeof(realw)),8407);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_R_xz,R_xz,(*R_size)*sizeof(realw),
cudaMemcpyHostToDevice),8408);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_R_yz),
- (*R_size)*sizeof(float)),8409);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yz,R_yz,(*R_size)*sizeof(float),
+ (*R_size)*sizeof(realw)),8409);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_R_yz,R_yz,(*R_size)*sizeof(realw),
cudaMemcpyHostToDevice),8410);
// attenuation factors
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_one_minus_sum_beta),
- 125*mp->NSPEC_AB*sizeof(float)),8430);
+ NGLL3*mp->NSPEC_AB*sizeof(realw)),8430);
print_CUDA_error_if_any(cudaMemcpy(mp->d_one_minus_sum_beta ,one_minus_sum_beta,
- 125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),8431);
+ NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),8431);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_factor_common),
- N_SLS*125*mp->NSPEC_AB*sizeof(float)),8432);
+ N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw)),8432);
print_CUDA_error_if_any(cudaMemcpy(mp->d_factor_common ,factor_common,
- N_SLS*125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),8433);
+ N_SLS*NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),8433);
// alpha,beta,gamma factors
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_alphaval),
- N_SLS*sizeof(float)),8434);
+ N_SLS*sizeof(realw)),8434);
print_CUDA_error_if_any(cudaMemcpy(mp->d_alphaval ,alphaval,
- N_SLS*sizeof(float),cudaMemcpyHostToDevice),8435);
+ N_SLS*sizeof(realw),cudaMemcpyHostToDevice),8435);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_betaval),
- N_SLS*sizeof(float)),8436);
+ N_SLS*sizeof(realw)),8436);
print_CUDA_error_if_any(cudaMemcpy(mp->d_betaval ,betaval,
- N_SLS*sizeof(float),cudaMemcpyHostToDevice),8437);
+ N_SLS*sizeof(realw),cudaMemcpyHostToDevice),8437);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_gammaval),
- N_SLS*sizeof(float)),8438);
+ N_SLS*sizeof(realw)),8438);
print_CUDA_error_if_any(cudaMemcpy(mp->d_gammaval ,gammaval,
- N_SLS*sizeof(float),cudaMemcpyHostToDevice),8439);
+ N_SLS*sizeof(realw),cudaMemcpyHostToDevice),8439);
}
+ // anisotropy
+ if( *ANISOTROPY ){
+ // allocates memory on GPU
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c11store),
+ size_padded*sizeof(realw)),8700);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c12store),
+ size_padded*sizeof(realw)),8701);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c13store),
+ size_padded*sizeof(realw)),8702);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c14store),
+ size_padded*sizeof(realw)),8703);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c15store),
+ size_padded*sizeof(realw)),8704);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c16store),
+ size_padded*sizeof(realw)),8705);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c22store),
+ size_padded*sizeof(realw)),8706);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c23store),
+ size_padded*sizeof(realw)),8707);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c24store),
+ size_padded*sizeof(realw)),8708);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c25store),
+ size_padded*sizeof(realw)),8709);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c26store),
+ size_padded*sizeof(realw)),8710);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c33store),
+ size_padded*sizeof(realw)),8711);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c34store),
+ size_padded*sizeof(realw)),8712);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c35store),
+ size_padded*sizeof(realw)),8713);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c36store),
+ size_padded*sizeof(realw)),8714);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c44store),
+ size_padded*sizeof(realw)),8715);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c45store),
+ size_padded*sizeof(realw)),8716);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c46store),
+ size_padded*sizeof(realw)),8717);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c55store),
+ size_padded*sizeof(realw)),8718);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c56store),
+ size_padded*sizeof(realw)),8719);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_c66store),
+ size_padded*sizeof(realw)),8720);
+ // transfer constant element data with padding
+ for(int i=0;i < mp->NSPEC_AB;i++) {
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_c11store + i*NGLL3_PADDED, &c11store[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8800);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_c12store + i*NGLL3_PADDED, &c12store[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8801);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_c13store + i*NGLL3_PADDED, &c13store[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8802);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_c14store + i*NGLL3_PADDED, &c14store[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8803);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_c15store + i*NGLL3_PADDED, &c15store[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8804);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_c16store + i*NGLL3_PADDED, &c16store[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8805);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_c22store + i*NGLL3_PADDED, &c22store[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8806);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_c23store + i*NGLL3_PADDED, &c23store[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8807);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_c24store + i*NGLL3_PADDED, &c24store[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8808);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_c25store + i*NGLL3_PADDED, &c25store[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8809);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_c26store + i*NGLL3_PADDED, &c26store[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8810);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_c33store + i*NGLL3_PADDED, &c33store[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8811);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_c34store + i*NGLL3_PADDED, &c34store[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8812);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_c35store + i*NGLL3_PADDED, &c35store[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8813);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_c36store + i*NGLL3_PADDED, &c36store[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8814);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_c44store + i*NGLL3_PADDED, &c44store[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8815);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_c45store + i*NGLL3_PADDED, &c45store[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8816);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_c46store + i*NGLL3_PADDED, &c46store[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8817);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_c55store + i*NGLL3_PADDED, &c55store[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8818);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_c56store + i*NGLL3_PADDED, &c56store[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8819);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_c66store + i*NGLL3_PADDED, &c66store[i*NGLL3],
+ NGLL3*sizeof(realw),cudaMemcpyHostToDevice),8820);
+ }
+ }
+
+ // ocean load approximation
if( *OCEANS ){
// oceans needs a free surface
mp->num_free_surface_faces = *num_free_surface_faces;
if( mp->num_free_surface_faces > 0 ){
// mass matrix
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rmass_ocean_load),
- sizeof(float)*mp->NGLOB_AB),8501);
+ sizeof(realw)*mp->NGLOB_AB),8501);
print_CUDA_error_if_any(cudaMemcpy(mp->d_rmass_ocean_load,rmass_ocean_load,
- sizeof(float)*mp->NGLOB_AB,cudaMemcpyHostToDevice),8502);
+ sizeof(realw)*mp->NGLOB_AB,cudaMemcpyHostToDevice),8502);
// surface normal
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_normal),
- 3*25*(mp->num_free_surface_faces)*sizeof(float)),8503);
+ 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw)),8503);
print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_normal,free_surface_normal,
- 3*25*(mp->num_free_surface_faces)*sizeof(float),cudaMemcpyHostToDevice),8504);
+ 3*NGLL2*(mp->num_free_surface_faces)*sizeof(realw),cudaMemcpyHostToDevice),8504);
// temporary global array: used to synchronize updates on global accel array
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_updated_dof_ocean_load),
@@ -1142,13 +1267,13 @@
if( *NOISE_TOMOGRAPHY == 0 && *ACOUSTIC_SIMULATION == 0 ){
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ispec),
- mp->num_free_surface_faces*sizeof(int)),9201);
+ mp->num_free_surface_faces*sizeof(int)),8601);
print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ispec,free_surface_ispec,
- mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),9203);
+ mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),8603);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_free_surface_ijk),
- 3*25*mp->num_free_surface_faces*sizeof(int)),9202);
+ 3*NGLL2*mp->num_free_surface_faces*sizeof(int)),8602);
print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
- 3*25*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),9204);
+ 3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),8604);
}
}
}
@@ -1173,14 +1298,14 @@
int* size,
int* SIMULATION_TYPE,
int* COMPUTE_AND_STORE_STRAIN,
- float* epsilon_trace_over_3,
- float* b_epsilondev_xx,float* b_epsilondev_yy,float* b_epsilondev_xy,
- float* b_epsilondev_xz,float* b_epsilondev_yz,
- float* b_epsilon_trace_over_3,
+ realw* epsilon_trace_over_3,
+ realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
+ realw* b_epsilondev_xz,realw* b_epsilondev_yz,
+ realw* b_epsilon_trace_over_3,
int* ATTENUATION,
int* R_size,
- float* b_R_xx,float* b_R_yy,float* b_R_xy,float* b_R_xz,float* b_R_yz,
- float* b_alphaval,float* b_betaval,float* b_gammaval,
+ realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
+ realw* b_alphaval,realw* b_betaval,realw* b_gammaval,
int* APPROXIMATE_HESS_KL){
TRACE("prepare_fields_elastic_adj_dev");
@@ -1192,111 +1317,111 @@
// kernel simulations
// allocates backward/reconstructed arrays on device (GPU)
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ),sizeof(float)*(*size)),8201);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc),sizeof(float)*(*size)),8202);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel),sizeof(float)*(*size)),8203);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_displ),sizeof(realw)*(*size)),8201);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_veloc),sizeof(realw)*(*size)),8202);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_accel),sizeof(realw)*(*size)),8203);
// allocates kernels
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_kl),125*mp->NSPEC_AB*sizeof(float)),8204);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_mu_kl),125*mp->NSPEC_AB*sizeof(float)),8205);
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_kl),125*mp->NSPEC_AB*sizeof(float)),8206);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_rho_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),8204);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_mu_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),8205);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_kappa_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),8206);
// initializes kernel values to zero
print_CUDA_error_if_any(cudaMemset(mp->d_rho_kl,0,
- 125*mp->NSPEC_AB*sizeof(float)),8207);
+ NGLL3*mp->NSPEC_AB*sizeof(realw)),8207);
print_CUDA_error_if_any(cudaMemset(mp->d_mu_kl,0,
- 125*mp->NSPEC_AB*sizeof(float)),8208);
+ NGLL3*mp->NSPEC_AB*sizeof(realw)),8208);
print_CUDA_error_if_any(cudaMemset(mp->d_kappa_kl,0,
- 125*mp->NSPEC_AB*sizeof(float)),8209);
+ NGLL3*mp->NSPEC_AB*sizeof(realw)),8209);
// strains used for attenuation and kernel simulations
if( *COMPUTE_AND_STORE_STRAIN ){
// strains
- int epsilondev_size = 125*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
+ int epsilondev_size = NGLL3*mp->NSPEC_AB; // note: non-aligned; if align, check memcpy below and indexing
// solid pressure
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_epsilon_trace_over_3),
- 125*mp->NSPEC_AB*sizeof(float)),8310);
+ NGLL3*mp->NSPEC_AB*sizeof(realw)),8310);
print_CUDA_error_if_any(cudaMemcpy(mp->d_epsilon_trace_over_3,epsilon_trace_over_3,
- 125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),8311);
+ NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),8311);
// backward solid pressure
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilon_trace_over_3),
- 125*mp->NSPEC_AB*sizeof(float)),8312);
+ NGLL3*mp->NSPEC_AB*sizeof(realw)),8312);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilon_trace_over_3 ,b_epsilon_trace_over_3,
- 125*mp->NSPEC_AB*sizeof(float),cudaMemcpyHostToDevice),8313);
+ NGLL3*mp->NSPEC_AB*sizeof(realw),cudaMemcpyHostToDevice),8313);
// prepares backward strains
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xx),
- epsilondev_size*sizeof(float)),8321);
+ epsilondev_size*sizeof(realw)),8321);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yy),
- epsilondev_size*sizeof(float)),8322);
+ epsilondev_size*sizeof(realw)),8322);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xy),
- epsilondev_size*sizeof(float)),8323);
+ epsilondev_size*sizeof(realw)),8323);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_xz),
- epsilondev_size*sizeof(float)),8324);
+ epsilondev_size*sizeof(realw)),8324);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_epsilondev_yz),
- epsilondev_size*sizeof(float)),8325);
+ epsilondev_size*sizeof(realw)),8325);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx,
- epsilondev_size*sizeof(float),cudaMemcpyHostToDevice),8326);
+ epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),8326);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy,
- epsilondev_size*sizeof(float),cudaMemcpyHostToDevice),8327);
+ epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),8327);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy,
- epsilondev_size*sizeof(float),cudaMemcpyHostToDevice),8328);
+ epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),8328);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz,
- epsilondev_size*sizeof(float),cudaMemcpyHostToDevice),8329);
+ epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),8329);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz,
- epsilondev_size*sizeof(float),cudaMemcpyHostToDevice),8330);
+ epsilondev_size*sizeof(realw),cudaMemcpyHostToDevice),8330);
}
// attenuation memory variables
if( *ATTENUATION ){
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xx),
- (*R_size)*sizeof(float)),8421);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xx,b_R_xx,(*R_size)*sizeof(float),
+ (*R_size)*sizeof(realw)),8421);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xx,b_R_xx,(*R_size)*sizeof(realw),
cudaMemcpyHostToDevice),8422);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yy),
- (*R_size)*sizeof(float)),8423);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yy,b_R_yy,(*R_size)*sizeof(float),
+ (*R_size)*sizeof(realw)),8423);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yy,b_R_yy,(*R_size)*sizeof(realw),
cudaMemcpyHostToDevice),8424);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xy),
- (*R_size)*sizeof(float)),8425);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xy,b_R_xy,(*R_size)*sizeof(float),
+ (*R_size)*sizeof(realw)),8425);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xy,b_R_xy,(*R_size)*sizeof(realw),
cudaMemcpyHostToDevice),8426);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_xz),
- (*R_size)*sizeof(float)),8427);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xz,b_R_xz,(*R_size)*sizeof(float),
+ (*R_size)*sizeof(realw)),8427);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_xz,b_R_xz,(*R_size)*sizeof(realw),
cudaMemcpyHostToDevice),8428);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_R_yz),
- (*R_size)*sizeof(float)),8429);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yz,b_R_yz,(*R_size)*sizeof(float),
+ (*R_size)*sizeof(realw)),8429);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_b_R_yz,b_R_yz,(*R_size)*sizeof(realw),
cudaMemcpyHostToDevice),8420);
// alpha,beta,gamma factors for backward fields
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_alphaval),
- N_SLS*sizeof(float)),8434);
+ N_SLS*sizeof(realw)),8434);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_alphaval ,b_alphaval,
- N_SLS*sizeof(float),cudaMemcpyHostToDevice),8435);
+ N_SLS*sizeof(realw),cudaMemcpyHostToDevice),8435);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_betaval),
- N_SLS*sizeof(float)),8436);
+ N_SLS*sizeof(realw)),8436);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_betaval ,b_betaval,
- N_SLS*sizeof(float),cudaMemcpyHostToDevice),8437);
+ N_SLS*sizeof(realw),cudaMemcpyHostToDevice),8437);
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_b_gammaval),
- N_SLS*sizeof(float)),8438);
+ N_SLS*sizeof(realw)),8438);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_gammaval ,b_gammaval,
- N_SLS*sizeof(float),cudaMemcpyHostToDevice),8439);
+ N_SLS*sizeof(realw),cudaMemcpyHostToDevice),8439);
}
if( *APPROXIMATE_HESS_KL ){
- print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_el_kl),125*mp->NSPEC_AB*sizeof(float)),8450);
+ print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_hess_el_kl),NGLL3*mp->NSPEC_AB*sizeof(realw)),8450);
// initializes with zeros
print_CUDA_error_if_any(cudaMemset(mp->d_hess_el_kl,0,
- 125*mp->NSPEC_AB*sizeof(float)),8451);
+ NGLL3*mp->NSPEC_AB*sizeof(realw)),8451);
}
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -1322,12 +1447,12 @@
int* SIMULATION_TYPE,
int* NOISE_TOMOGRAPHY,
int* NSTEP,
- float* noise_sourcearray,
- float* normal_x_noise,
- float* normal_y_noise,
- float* normal_z_noise,
- float* mask_noise,
- float* free_surface_jacobian2Dw) {
+ realw* noise_sourcearray,
+ realw* normal_x_noise,
+ realw* normal_y_noise,
+ realw* normal_z_noise,
+ realw* mask_noise,
+ realw* free_surface_jacobian2Dw) {
TRACE("prepare_fields_noise_device");
@@ -1342,56 +1467,56 @@
mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4002);
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_free_surface_ijk,
- 3*25*mp->num_free_surface_faces*sizeof(int)),4003);
+ 3*NGLL2*mp->num_free_surface_faces*sizeof(int)),4003);
print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_ijk,free_surface_ijk,
- 3*25*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4004);
+ 3*NGLL2*mp->num_free_surface_faces*sizeof(int),cudaMemcpyHostToDevice),4004);
// alloc storage for the surface buffer to be copied
print_CUDA_error_if_any(cudaMalloc((void**) &mp->d_noise_surface_movie,
- 3*25*mp->num_free_surface_faces*sizeof(float)),4005);
+ 3*NGLL2*mp->num_free_surface_faces*sizeof(realw)),4005);
// prepares noise source array
if( *NOISE_TOMOGRAPHY == 1 ){
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_noise_sourcearray,
- 3*125*(*NSTEP)*sizeof(float)),4101);
+ 3*NGLL3*(*NSTEP)*sizeof(realw)),4101);
print_CUDA_error_if_any(cudaMemcpy(mp->d_noise_sourcearray, noise_sourcearray,
- 3*125*(*NSTEP)*sizeof(float),cudaMemcpyHostToDevice),4102);
+ 3*NGLL3*(*NSTEP)*sizeof(realw),cudaMemcpyHostToDevice),4102);
}
// prepares noise directions
if( *NOISE_TOMOGRAPHY > 1 ){
- int nface_size = 25*(*num_free_surface_faces);
+ int nface_size = NGLL2*(*num_free_surface_faces);
// allocates memory on GPU
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_x_noise,
- nface_size*sizeof(float)),4301);
+ nface_size*sizeof(realw)),4301);
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_y_noise,
- nface_size*sizeof(float)),4302);
+ nface_size*sizeof(realw)),4302);
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_normal_z_noise,
- nface_size*sizeof(float)),4303);
+ nface_size*sizeof(realw)),4303);
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_mask_noise,
- nface_size*sizeof(float)),4304);
+ nface_size*sizeof(realw)),4304);
print_CUDA_error_if_any(cudaMalloc((void**)&mp->d_free_surface_jacobian2Dw,
- nface_size*sizeof(float)),4305);
+ nface_size*sizeof(realw)),4305);
// transfers data onto GPU
print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_x_noise, normal_x_noise,
- nface_size*sizeof(float),cudaMemcpyHostToDevice),4306);
+ nface_size*sizeof(realw),cudaMemcpyHostToDevice),4306);
print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_y_noise, normal_y_noise,
- nface_size*sizeof(float),cudaMemcpyHostToDevice),4307);
+ nface_size*sizeof(realw),cudaMemcpyHostToDevice),4307);
print_CUDA_error_if_any(cudaMemcpy(mp->d_normal_z_noise, normal_z_noise,
- nface_size*sizeof(float),cudaMemcpyHostToDevice),4308);
+ nface_size*sizeof(realw),cudaMemcpyHostToDevice),4308);
print_CUDA_error_if_any(cudaMemcpy(mp->d_mask_noise, mask_noise,
- nface_size*sizeof(float),cudaMemcpyHostToDevice),4309);
+ nface_size*sizeof(realw),cudaMemcpyHostToDevice),4309);
print_CUDA_error_if_any(cudaMemcpy(mp->d_free_surface_jacobian2Dw, free_surface_jacobian2Dw,
- nface_size*sizeof(float),cudaMemcpyHostToDevice),4310);
+ nface_size*sizeof(realw),cudaMemcpyHostToDevice),4310);
}
// prepares noise strength kernel
if( *NOISE_TOMOGRAPHY == 3 ){
print_CUDA_error_if_any(cudaMalloc((void**)&(mp->d_Sigma_kl),
- 125*(mp->NSPEC_AB)*sizeof(float)),4401);
+ NGLL3*(mp->NSPEC_AB)*sizeof(realw)),4401);
// initializes kernel values to zero
print_CUDA_error_if_any(cudaMemset(mp->d_Sigma_kl,0,
- 125*mp->NSPEC_AB*sizeof(float)),4403);
+ NGLL3*mp->NSPEC_AB*sizeof(realw)),4403);
}
@@ -1419,6 +1544,7 @@
int* NOISE_TOMOGRAPHY,
int* COMPUTE_AND_STORE_STRAIN,
int* ATTENUATION,
+ int* ANISOTROPY,
int* OCEANS,
int* APPROXIMATE_HESS_KL) {
@@ -1579,6 +1705,30 @@
}
}
+ if( *ANISOTROPY ){
+ cudaFree(mp->d_c11store);
+ cudaFree(mp->d_c12store);
+ cudaFree(mp->d_c13store);
+ cudaFree(mp->d_c14store);
+ cudaFree(mp->d_c15store);
+ cudaFree(mp->d_c16store);
+ cudaFree(mp->d_c22store);
+ cudaFree(mp->d_c23store);
+ cudaFree(mp->d_c24store);
+ cudaFree(mp->d_c25store);
+ cudaFree(mp->d_c26store);
+ cudaFree(mp->d_c33store);
+ cudaFree(mp->d_c34store);
+ cudaFree(mp->d_c35store);
+ cudaFree(mp->d_c36store);
+ cudaFree(mp->d_c44store);
+ cudaFree(mp->d_c45store);
+ cudaFree(mp->d_c46store);
+ cudaFree(mp->d_c55store);
+ cudaFree(mp->d_c56store);
+ cudaFree(mp->d_c66store);
+ }
+
if( *OCEANS ){
if( mp->num_free_surface_faces > 0 ){
cudaFree(mp->d_rmass_ocean_load);
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/specfem3D_gpu_cuda_method_stubs.c 2011-11-06 02:02:36 UTC (rev 19152)
@@ -1,67 +1,64 @@
#include "config.h"
#include <stdio.h>
typedef float realw; /* real type used by the stub signatures below; must alias a builtin type, a self-referential typedef does not compile */
/* from check_fields_cuda.cu */
void FC_FUNC_(check_max_norm_displ_gpu,
- CHECK_MAX_NORM_DISPL_GPU)(int* size, float* displ,long* Mesh_pointer_f,int* announceID){}
+ CHECK_MAX_NORM_DISPL_GPU)(int* size, realw* displ,long* Mesh_pointer_f,int* announceID){}
void FC_FUNC_(check_max_norm_vector,
- CHECK_MAX_NORM_VECTOR)(int* size, float* vector1, int* announceID){}
+ CHECK_MAX_NORM_VECTOR)(int* size, realw* vector1, int* announceID){}
void FC_FUNC_(check_max_norm_displ,
- CHECK_MAX_NORM_DISPL)(int* size, float* displ, int* announceID){}
+ CHECK_MAX_NORM_DISPL)(int* size, realw* displ, int* announceID){}
void FC_FUNC_(check_max_norm_b_displ_gpu,
- CHECK_MAX_NORM_B_DISPL_GPU)(int* size, float* b_displ,long* Mesh_pointer_f,int* announceID){}
+ CHECK_MAX_NORM_B_DISPL_GPU)(int* size, realw* b_displ,long* Mesh_pointer_f,int* announceID){}
void FC_FUNC_(check_max_norm_b_accel_gpu,
- CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, float* b_accel,long* Mesh_pointer_f,int* announceID){}
+ CHECK_MAX_NORM_B_ACCEL_GPU)(int* size, realw* b_accel,long* Mesh_pointer_f,int* announceID){}
void FC_FUNC_(check_max_norm_b_veloc_gpu,
- CHECK_MAX_NORM_B_VELOC_GPU)(int* size, float* b_veloc,long* Mesh_pointer_f,int* announceID){}
+ CHECK_MAX_NORM_B_VELOC_GPU)(int* size, realw* b_veloc,long* Mesh_pointer_f,int* announceID){}
void FC_FUNC_(check_max_norm_b_displ,
- CHECK_MAX_NORM_B_DISPL)(int* size, float* b_displ,int* announceID){}
+ CHECK_MAX_NORM_B_DISPL)(int* size, realw* b_displ,int* announceID){}
void FC_FUNC_(check_max_norm_b_accel,
- CHECK_MAX_NORM_B_ACCEL)(int* size, float* b_accel,int* announceID){}
+ CHECK_MAX_NORM_B_ACCEL)(int* size, realw* b_accel,int* announceID){}
void FC_FUNC_(check_error_vectors,
- CHECK_ERROR_VECTORS)(int* sizef, float* vector1,float* vector2){}
+ CHECK_ERROR_VECTORS)(int* sizef, realw* vector1,realw* vector2){}
void FC_FUNC_(get_max_accel,
GET_MAX_ACCEL)(int* itf,int* sizef,long* Mesh_pointer){}
void FC_FUNC_(get_norm_acoustic_from_device,
- GET_NORM_ACOUSTIC_FROM_DEVICE)(float* norm,
+ GET_NORM_ACOUSTIC_FROM_DEVICE)(realw* norm,
long* Mesh_pointer_f,
int* SIMULATION_TYPE){}
void FC_FUNC_(get_norm_elastic_from_device,
- GET_NORM_ELASTIC_FROM_DEVICE)(float* norm,
+ GET_NORM_ELASTIC_FROM_DEVICE)(realw* norm,
long* Mesh_pointer_f,
int* SIMULATION_TYPE){}
-/* from file compute_add_sources_cuda.cu */
+/* from file compute_add_sources_elastic_cuda.cu */
void FC_FUNC_(compute_add_sources_el_cuda,
COMPUTE_ADD_SOURCES_EL_CUDA)(long* Mesh_pointer_f,
- int* NSPEC_ABf, int* NGLOB_ABf,
- int* phase_is_innerf,int* NSOURCESf,
- int* itf, float* dtf, float* t0f,
- int* SIMULATION_TYPEf,int* NSTEPf,
- int* NOISE_TOMOGRAPHYf,
- int* USE_FORCE_POINT_SOURCEf,
- double* h_stf_pre_compute, int* myrankf){}
+ int* phase_is_innerf,
+ int* NSOURCESf,
+ double* h_stf_pre_compute,
+ int* myrankf){}
void FC_FUNC_(compute_add_sources_el_s3_cuda,
COMPUTE_ADD_SOURCES_EL_S3_CUDA)(long* Mesh_pointer,
- int* USE_FORCE_POINT_SOURCE,
double* h_stf_pre_compute,
int* NSOURCES,
- int* phase_is_inner,int* myrank){}
+ int* phase_is_inner,
+ int* myrank){}
void FC_FUNC_(add_source_master_rec_noise_cu,
ADD_SOURCE_MASTER_REC_NOISE_CU)(long* Mesh_pointer_f,
@@ -72,7 +69,7 @@
void FC_FUNC_(add_sources_el_sim_type_2_or_3,
ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
- float* h_adj_sourcearrays,
+ realw* h_adj_sourcearrays,
int* phase_is_inner,
int* h_ispec_is_inner,
int* h_ispec_is_elastic,
@@ -84,12 +81,13 @@
int* nadj_rec_local,
int* NTSTEP_BETWEEN_READ_ADJSRC){}
+/* from file compute_add_sources_acoustic_cuda.cu */
+
void FC_FUNC_(compute_add_sources_ac_cuda,
COMPUTE_ADD_SOURCES_AC_CUDA)(long* Mesh_pointer_f,
int* phase_is_innerf,
int* NSOURCESf,
int* SIMULATION_TYPEf,
- int* USE_FORCE_POINT_SOURCEf,
double* h_stf_pre_compute,
int* myrankf){}
@@ -98,13 +96,12 @@
int* phase_is_innerf,
int* NSOURCESf,
int* SIMULATION_TYPEf,
- int* USE_FORCE_POINT_SOURCEf,
double* h_stf_pre_compute,
int* myrankf){}
void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer,
- float* h_adj_sourcearrays,
+ realw* h_adj_sourcearrays,
int* phase_is_inner,
int* h_ispec_is_inner,
int* h_ispec_is_acoustic,
@@ -138,8 +135,8 @@
TRANSFER_BOUN_POT_FROM_DEVICE)(
int* size,
long* Mesh_pointer_f,
- float* potential_dot_dot_acoustic,
- float* send_potential_dot_dot_buffer,
+ realw* potential_dot_dot_acoustic,
+ realw* send_potential_dot_dot_buffer,
int* num_interfaces_ext_mesh,
int* max_nibool_interfaces_ext_mesh,
int* nibool_interfaces_ext_mesh,
@@ -172,9 +169,9 @@
void FC_FUNC_(kernel_3_b_acoustic_cuda,KERNEL_3_ACOUSTIC_CUDA)(
long* Mesh_pointer,
int* size_F,
- float* deltatover2_F,
+ realw* deltatover2_F,
int* SIMULATION_TYPE,
- float* b_deltatover2_F){}
+ realw* b_deltatover2_F){}
void FC_FUNC_(acoustic_enforce_free_surf_cuda,
ACOUSTIC_ENFORCE_FREE_SURF_CUDA)(long* Mesh_pointer_f,
@@ -184,8 +181,8 @@
/* from compute_forces_elastic_cuda.cu */
void FC_FUNC_(transfer_boun_accel_from_device,
- TRANSFER_BOUN_ACCEL_FROM_DEVICE)(int* size, long* Mesh_pointer_f, float* accel,
- float* send_accel_buffer,
+ TRANSFER_BOUN_ACCEL_FROM_DEVICE)(int* size, long* Mesh_pointer_f, realw* accel,
+ realw* send_accel_buffer,
int* num_interfaces_ext_mesh,
int* max_nibool_interfaces_ext_mesh,
int* nibool_interfaces_ext_mesh,
@@ -209,22 +206,23 @@
int* nspec_inner_elastic,
int* SIMULATION_TYPE,
int* COMPUTE_AND_STORE_STRAIN,
- int* ATTENUATION){}
+ int* ATTENUATION,
+ int* ANISOTROPY){}
void FC_FUNC_(kernel_3_a_cuda,
KERNEL_3_A_CUDA)(long* Mesh_pointer,
int* size_F,
- float* deltatover2_F,
+ realw* deltatover2_F,
int* SIMULATION_TYPE_f,
- float* b_deltatover2_F,
+ realw* b_deltatover2_F,
int* OCEANS){}
void FC_FUNC_(kernel_3_b_cuda,
KERNEL_3_B_CUDA)(long* Mesh_pointer,
int* size_F,
- float* deltatover2_F,
+ realw* deltatover2_F,
int* SIMULATION_TYPE_f,
- float* b_deltatover2_F){}
+ realw* b_deltatover2_F){}
void FC_FUNC_(elastic_ocean_load_cuda,
ELASTIC_OCEAN_LOAD_CUDA)(long* Mesh_pointer_f,
@@ -234,21 +232,21 @@
void FC_FUNC_(compute_kernels_elastic_cuda,
COMPUTE_KERNELS_ELASTIC_CUDA)(long* Mesh_pointer,
- float* deltat_f){}
+ realw* deltat_f){}
void FC_FUNC_(compute_kernels_strgth_noise_cu,
COMPUTE_KERNELS_STRGTH_NOISE_CU)(long* Mesh_pointer,
- float* h_noise_surface_movie,
- float* deltat){}
+ realw* h_noise_surface_movie,
+ realw* deltat){}
void FC_FUNC_(compute_kernels_acoustic_cuda,
COMPUTE_KERNELS_ACOUSTIC_CUDA)(
long* Mesh_pointer,
- float* deltat_f){}
+ realw* deltat_f){}
void FC_FUNC_(compute_kernels_hess_cuda,
COMPUTE_KERNELS_HESS_CUDA)(long* Mesh_pointer,
- float* deltat_f) {}
+ realw* deltat_f) {}
/* from file compute_stacey_acoustic_cuda.cu */
void FC_FUNC_(compute_stacey_acoustic_cuda,
@@ -257,7 +255,7 @@
int* phase_is_innerf,
int* SIMULATION_TYPEf,
int* SAVE_FORWARDf,
- float* h_b_absorb_potential){}
+ realw* h_b_absorb_potential){}
/* from file compute_stacey_elastic_cuda.cu */
@@ -267,42 +265,42 @@
int* phase_is_innerf,
int* SIMULATION_TYPEf,
int* SAVE_FORWARDf,
- float* h_b_absorb_field){}
+ realw* h_b_absorb_field){}
/* from file it_update_displacement_cuda.cu */
void FC_FUNC_(it_update_displacement_cuda,
it_update_displacement_cuda)(long* Mesh_pointer_f,
int* size_F,
- float* deltat_F,
- float* deltatsqover2_F,
- float* deltatover2_F,
+ realw* deltat_F,
+ realw* deltatsqover2_F,
+ realw* deltatover2_F,
int* SIMULATION_TYPE,
- float* b_deltat_F,
- float* b_deltatsqover2_F,
- float* b_deltatover2_F){}
+ realw* b_deltat_F,
+ realw* b_deltatsqover2_F,
+ realw* b_deltatover2_F){}
void FC_FUNC_(it_update_displacement_ac_cuda,
IT_UPDATE_DISPLACEMENT_AC_CUDA)(long* Mesh_pointer_f,
int* size_F,
- float* deltat_F,
- float* deltatsqover2_F,
- float* deltatover2_F,
+ realw* deltat_F,
+ realw* deltatsqover2_F,
+ realw* deltatover2_F,
int* SIMULATION_TYPE,
- float* b_deltat_F,
- float* b_deltatsqover2_F,
- float* b_deltatover2_F){}
+ realw* b_deltat_F,
+ realw* b_deltatsqover2_F,
+ realw* b_deltatover2_F){}
/* from file noise_tomography_cuda.cu */
void FC_FUNC_(fortranflush,FORTRANFLUSH)(int* rank){}
void FC_FUNC_(fortranprint,FORTRANPRINT)(int* id){}
-void FC_FUNC_(fortranprintf,FORTRANPRINTF)(float* val){}
+void FC_FUNC_(fortranprintf,FORTRANPRINTF)(realw* val){}
void FC_FUNC_(fortranprintd,FORTRANPRINTD)(double* val){}
-void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,float* h_displ){}
+void FC_FUNC_(make_displ_rand,MAKE_DISPL_RAND)(long* Mesh_pointer_f,realw* h_displ){}
void FC_FUNC_(transfer_surface_to_host,
TRANSFER_SURFACE_TO_HOST)(long* Mesh_pointer_f,
@@ -324,31 +322,31 @@
SHOW_FREE_DEVICE_MEMORY)(){}
void FC_FUNC_(get_free_device_memory,
- get_FREE_DEVICE_MEMORY)(float* free, float* used, float* total ){}
+ get_FREE_DEVICE_MEMORY)(realw* free, realw* used, realw* total ){}
void FC_FUNC_(prepare_constants_device,
PREPARE_CONSTANTS_DEVICE)(long* Mesh_pointer,
int* h_NGLLX,
int* NSPEC_AB, int* NGLOB_AB,
- float* h_xix, float* h_xiy, float* h_xiz,
- float* h_etax, float* h_etay, float* h_etaz,
- float* h_gammax, float* h_gammay, float* h_gammaz,
- float* h_kappav, float* h_muv,
+ realw* h_xix, realw* h_xiy, realw* h_xiz,
+ realw* h_etax, realw* h_etay, realw* h_etaz,
+ realw* h_gammax, realw* h_gammay, realw* h_gammaz,
+ realw* h_kappav, realw* h_muv,
int* h_ibool,
int* num_interfaces_ext_mesh, int* max_nibool_interfaces_ext_mesh,
int* h_nibool_interfaces_ext_mesh, int* h_ibool_interfaces_ext_mesh,
- float* h_hprime_xx,float* h_hprime_yy,float* h_hprime_zz,
- float* h_hprimewgll_xx,float* h_hprimewgll_yy,float* h_hprimewgll_zz,
- float* h_wgllwgll_xy,float* h_wgllwgll_xz,float* h_wgllwgll_yz,
+ realw* h_hprime_xx,realw* h_hprime_yy,realw* h_hprime_zz,
+ realw* h_hprimewgll_xx,realw* h_hprimewgll_yy,realw* h_hprimewgll_zz,
+ realw* h_wgllwgll_xy,realw* h_wgllwgll_xz,realw* h_wgllwgll_yz,
int* ABSORBING_CONDITIONS,
int* h_abs_boundary_ispec, int* h_abs_boundary_ijk,
- float* h_abs_boundary_normal,
- float* h_abs_boundary_jacobian2Dw,
+ realw* h_abs_boundary_normal,
+ realw* h_abs_boundary_jacobian2Dw,
int* h_num_abs_boundary_faces,
int* h_ispec_is_inner,
int* NSOURCES,
int* nsources_local,
- float* h_sourcearrays,
+ realw* h_sourcearrays,
int* h_islice_selected_source,
int* h_ispec_selected_source,
int* h_number_receiver_global,
@@ -358,7 +356,7 @@
int* SIMULATION_TYPE,
int* USE_MESH_COLORING_GPU,
int* nspec_acoustic,int* nspec_elastic,
- int* ncuda_devices)
+ int* myrank_f, int* ncuda_devices)
{
fprintf(stderr,"ERROR: GPU_MODE enabled without GPU/CUDA Support. To enable GPU support, reconfigure with --with-cuda flag.\n");
exit(1);
@@ -375,9 +373,9 @@
void FC_FUNC_(prepare_fields_acoustic_device,
PREPARE_FIELDS_ACOUSTIC_DEVICE)(long* Mesh_pointer_f,
- float* rmass_acoustic,
- float* rhostore,
- float* kappastore,
+ realw* rmass_acoustic,
+ realw* rhostore,
+ realw* kappastore,
int* num_phase_ispec_acoustic,
int* phase_ispec_inner_acoustic,
int* ispec_is_acoustic,
@@ -387,13 +385,13 @@
int* free_surface_ijk,
int* ABSORBING_CONDITIONS,
int* b_reclen_potential,
- float* b_absorb_potential,
+ realw* b_absorb_potential,
int* ELASTIC_SIMULATION,
int* num_coupling_ac_el_faces,
int* coupling_ac_el_ispec,
int* coupling_ac_el_ijk,
- float* coupling_ac_el_normal,
- float* coupling_ac_el_jacobian2Dw,
+ realw* coupling_ac_el_normal,
+ realw* coupling_ac_el_jacobian2Dw,
int* num_colors_outer_acoustic,
int* num_colors_inner_acoustic,
int* num_elem_colors_acoustic){}
@@ -405,49 +403,71 @@
void FC_FUNC_(prepare_fields_elastic_device,
PREPARE_FIELDS_ELASTIC_DEVICE)(long* Mesh_pointer_f,
int* size,
- float* rmass,
- float* rho_vp,
- float* rho_vs,
+ realw* rmass,
+ realw* rho_vp,
+ realw* rho_vs,
int* num_phase_ispec_elastic,
int* phase_ispec_inner_elastic,
int* ispec_is_elastic,
int* ABSORBING_CONDITIONS,
- float* h_b_absorb_field,
+ realw* h_b_absorb_field,
int* h_b_reclen_field,
int* SIMULATION_TYPE,int* SAVE_FORWARD,
int* COMPUTE_AND_STORE_STRAIN,
- float* epsilondev_xx,float* epsilondev_yy,float* epsilondev_xy,
- float* epsilondev_xz,float* epsilondev_yz,
+ realw* epsilondev_xx,realw* epsilondev_yy,realw* epsilondev_xy,
+ realw* epsilondev_xz,realw* epsilondev_yz,
int* ATTENUATION,
int* R_size,
- float* R_xx,float* R_yy,float* R_xy,float* R_xz,float* R_yz,
- float* one_minus_sum_beta,float* factor_common,
- float* alphaval,float* betaval,float* gammaval,
+ realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
+ realw* one_minus_sum_beta,realw* factor_common,
+ realw* alphaval,realw* betaval,realw* gammaval,
int* OCEANS,
- float* rmass_ocean_load,
+ realw* rmass_ocean_load,
int* NOISE_TOMOGRAPHY,
- float* free_surface_normal,
+ realw* free_surface_normal,
int* free_surface_ispec,
int* free_surface_ijk,
int* num_free_surface_faces,
int* ACOUSTIC_SIMULATION,
int* num_colors_outer_elastic,
int* num_colors_inner_elastic,
- int* num_elem_colors_elastic){}
+ int* num_elem_colors_elastic,
+ int* ANISOTROY,
+ realw *c11store,
+ realw *c12store,
+ realw *c13store,
+ realw *c14store,
+ realw *c15store,
+ realw *c16store,
+ realw *c22store,
+ realw *c23store,
+ realw *c24store,
+ realw *c25store,
+ realw *c26store,
+ realw *c33store,
+ realw *c34store,
+ realw *c35store,
+ realw *c36store,
+ realw *c44store,
+ realw *c45store,
+ realw *c46store,
+ realw *c55store,
+ realw *c56store,
+ realw *c66store){}
void FC_FUNC_(prepare_fields_elastic_adj_dev,
PREPARE_FIELDS_ELASTIC_ADJ_DEV)(long* Mesh_pointer_f,
int* size,
int* SIMULATION_TYPE,
int* COMPUTE_AND_STORE_STRAIN,
- float* epsilon_trace_over_3,
- float* b_epsilondev_xx,float* b_epsilondev_yy,float* b_epsilondev_xy,
- float* b_epsilondev_xz,float* b_epsilondev_yz,
- float* b_epsilon_trace_over_3,
+ realw* epsilon_trace_over_3,
+ realw* b_epsilondev_xx,realw* b_epsilondev_yy,realw* b_epsilondev_xy,
+ realw* b_epsilondev_xz,realw* b_epsilondev_yz,
+ realw* b_epsilon_trace_over_3,
int* ATTENUATION,
int* R_size,
- float* b_R_xx,float* b_R_yy,float* b_R_xy,float* b_R_xz,float* b_R_yz,
- float* b_alphaval,float* b_betaval,float* b_gammaval,
+ realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
+ realw* b_alphaval,realw* b_betaval,realw* b_gammaval,
int* APPROXIMATE_HESS_KL){}
@@ -460,12 +480,12 @@
int* SIMULATION_TYPE,
int* NOISE_TOMOGRAPHY,
int* NSTEP,
- float* noise_sourcearray,
- float* normal_x_noise,
- float* normal_y_noise,
- float* normal_z_noise,
- float* mask_noise,
- float* free_surface_jacobian2Dw){}
+ realw* noise_sourcearray,
+ realw* normal_x_noise,
+ realw* normal_y_noise,
+ realw* normal_z_noise,
+ realw* mask_noise,
+ realw* free_surface_jacobian2Dw){}
void FC_FUNC_(prepare_cleanup_device,
PREPARE_CLEANUP_DEVICE)(long* Mesh_pointer_f,
@@ -477,178 +497,179 @@
int* NOISE_TOMOGRAPHY,
int* COMPUTE_AND_STORE_STRAIN,
int* ATTENUATION,
+ int* ANISOTROPY,
int* OCEANS,
int* APPROXIMATE_HESS_KL){}
/* from file transfer_fields_cuda.cu */
void FC_FUNC_(transfer_fields_el_to_device,
- TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, float* displ, float* veloc, float* accel,long* Mesh_pointer_f){}
+ TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f){}
void FC_FUNC_(transfer_fields_el_from_device,
- TRANSFER_FIELDS_EL_FROM_DEVICE)(int* size, float* displ, float* veloc, float* accel,long* Mesh_pointer_f){}
+ TRANSFER_FIELDS_EL_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f){}
void FC_FUNC_(transfer_b_fields_to_device,
- TRANSFER_B_FIELDS_TO_DEVICE)(int* size, float* b_displ, float* b_veloc, float* b_accel,
+ TRANSFER_B_FIELDS_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
long* Mesh_pointer_f){}
void FC_FUNC_(transfer_b_fields_from_device,
- TRANSFER_B_FIELDS_FROM_DEVICE)(int* size, float* b_displ, float* b_veloc, float* b_accel,long* Mesh_pointer_f){}
+ TRANSFER_B_FIELDS_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,long* Mesh_pointer_f){}
void FC_FUNC_(transfer_accel_to_device,
- TRNASFER_ACCEL_TO_DEVICE)(int* size, float* accel,long* Mesh_pointer_f){}
+ TRNASFER_ACCEL_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f){}
void FC_FUNC_(transfer_accel_from_device,
- TRANSFER_ACCEL_FROM_DEVICE)(int* size, float* accel,long* Mesh_pointer_f){}
+ TRANSFER_ACCEL_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f){}
void FC_FUNC_(transfer_b_accel_from_device,
- TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, float* b_accel,long* Mesh_pointer_f){}
+ TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f){}
void FC_FUNC_(transfer_sigma_from_device,
- TRANSFER_SIGMA_FROM_DEVICE)(int* size, float* sigma_kl,long* Mesh_pointer_f){}
+ TRANSFER_SIGMA_FROM_DEVICE)(int* size, realw* sigma_kl,long* Mesh_pointer_f){}
void FC_FUNC_(transfer_b_displ_from_device,
- TRANSFER_B_DISPL_FROM_DEVICE)(int* size, float* displ,long* Mesh_pointer_f){}
+ TRANSFER_B_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f){}
void FC_FUNC_(transfer_displ_from_device,
- TRANSFER_DISPL_FROM_DEVICE)(int* size, float* displ,long* Mesh_pointer_f){}
+ TRANSFER_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f){}
/*
void FC_FUNC_(transfer_compute_kernel_answers_from_device,
TRANSFER_COMPUTE_KERNEL_ANSWERS_FROM_DEVICE)(long* Mesh_pointer,
- float* rho_kl,int* size_rho,
- float* mu_kl, int* size_mu,
- float* kappa_kl, int* size_kappa){}
+ realw* rho_kl,int* size_rho,
+ realw* mu_kl, int* size_mu,
+ realw* kappa_kl, int* size_kappa){}
*/
/*
void FC_FUNC_(transfer_compute_kernel_fields_from_device,
TRANSFER_COMPUTE_KERNEL_FIELDS_FROM_DEVICE)(long* Mesh_pointer,
- float* accel, int* size_accel,
- float* b_displ, int* size_b_displ,
- float* epsilondev_xx,
- float* epsilondev_yy,
- float* epsilondev_xy,
- float* epsilondev_xz,
- float* epsilondev_yz,
+ realw* accel, int* size_accel,
+ realw* b_displ, int* size_b_displ,
+ realw* epsilondev_xx,
+ realw* epsilondev_yy,
+ realw* epsilondev_xy,
+ realw* epsilondev_xz,
+ realw* epsilondev_yz,
int* size_epsilondev,
- float* b_epsilondev_xx,
- float* b_epsilondev_yy,
- float* b_epsilondev_xy,
- float* b_epsilondev_xz,
- float* b_epsilondev_yz,
+ realw* b_epsilondev_xx,
+ realw* b_epsilondev_yy,
+ realw* b_epsilondev_xy,
+ realw* b_epsilondev_xz,
+ realw* b_epsilondev_yz,
int* size_b_epsilondev,
- float* rho_kl,int* size_rho,
- float* mu_kl, int* size_mu,
- float* kappa_kl, int* size_kappa,
- float* epsilon_trace_over_3,
- float* b_epsilon_trace_over_3,
+ realw* rho_kl,int* size_rho,
+ realw* mu_kl, int* size_mu,
+ realw* kappa_kl, int* size_kappa,
+ realw* epsilon_trace_over_3,
+ realw* b_epsilon_trace_over_3,
int* size_epsilon_trace_over_3) {}
*/
void FC_FUNC_(transfer_b_fields_att_to_device,
TRANSFER_B_FIELDS_ATT_TO_DEVICE)(long* Mesh_pointer,
- float* b_R_xx,float* b_R_yy,float* b_R_xy,float* b_R_xz,float* b_R_yz,
+ realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
int* size_R,
- float* b_epsilondev_xx,
- float* b_epsilondev_yy,
- float* b_epsilondev_xy,
- float* b_epsilondev_xz,
- float* b_epsilondev_yz,
+ realw* b_epsilondev_xx,
+ realw* b_epsilondev_yy,
+ realw* b_epsilondev_xy,
+ realw* b_epsilondev_xz,
+ realw* b_epsilondev_yz,
int* size_epsilondev){}
void FC_FUNC_(transfer_fields_att_from_device,
TRANSFER_FIELDS_ATT_FROM_DEVICE)(long* Mesh_pointer,
- float* R_xx,float* R_yy,float* R_xy,float* R_xz,float* R_yz,
+ realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
int* size_R,
- float* epsilondev_xx,
- float* epsilondev_yy,
- float* epsilondev_xy,
- float* epsilondev_xz,
- float* epsilondev_yz,
+ realw* epsilondev_xx,
+ realw* epsilondev_yy,
+ realw* epsilondev_xy,
+ realw* epsilondev_xz,
+ realw* epsilondev_yz,
int* size_epsilondev){}
void FC_FUNC_(transfer_kernels_el_to_host,
TRANSFER_KERNELS_EL_TO_HOST)(long* Mesh_pointer,
- float* h_rho_kl,
- float* h_mu_kl,
- float* h_kappa_kl,
+ realw* h_rho_kl,
+ realw* h_mu_kl,
+ realw* h_kappa_kl,
int* NSPEC_AB){}
void FC_FUNC_(transfer_kernels_noise_to_host,
TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer,
- float* h_Sigma_kl,
+ realw* h_Sigma_kl,
int* NSPEC_AB){}
void FC_FUNC_(transfer_fields_ac_to_device,
TRANSFER_FIELDS_AC_TO_DEVICE)(
int* size,
- float* potential_acoustic,
- float* potential_dot_acoustic,
- float* potential_dot_dot_acoustic,
+ realw* potential_acoustic,
+ realw* potential_dot_acoustic,
+ realw* potential_dot_dot_acoustic,
long* Mesh_pointer_f){}
void FC_FUNC_(transfer_b_fields_ac_to_device,
TRANSFER_B_FIELDS_AC_TO_DEVICE)(
int* size,
- float* b_potential_acoustic,
- float* b_potential_dot_acoustic,
- float* b_potential_dot_dot_acoustic,
+ realw* b_potential_acoustic,
+ realw* b_potential_dot_acoustic,
+ realw* b_potential_dot_dot_acoustic,
long* Mesh_pointer_f){}
void FC_FUNC_(transfer_fields_ac_from_device,TRANSFER_FIELDS_AC_FROM_DEVICE)(
int* size,
- float* potential_acoustic,
- float* potential_dot_acoustic,
- float* potential_dot_dot_acoustic,
+ realw* potential_acoustic,
+ realw* potential_dot_acoustic,
+ realw* potential_dot_dot_acoustic,
long* Mesh_pointer_f){}
void FC_FUNC_(transfer_b_fields_ac_from_device,
TRANSFER_B_FIELDS_AC_FROM_DEVICE)(
int* size,
- float* b_potential_acoustic,
- float* b_potential_dot_acoustic,
- float* b_potential_dot_dot_acoustic,
+ realw* b_potential_acoustic,
+ realw* b_potential_dot_acoustic,
+ realw* b_potential_dot_dot_acoustic,
long* Mesh_pointer_f){}
void FC_FUNC_(transfer_dot_dot_from_device,
- TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, float* potential_dot_dot_acoustic,long* Mesh_pointer_f){}
+ TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, realw* potential_dot_dot_acoustic,long* Mesh_pointer_f){}
void FC_FUNC_(transfer_b_dot_dot_from_device,
- TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, float* b_potential_dot_dot_acoustic,long* Mesh_pointer_f){}
+ TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, realw* b_potential_dot_dot_acoustic,long* Mesh_pointer_f){}
void FC_FUNC_(transfer_kernels_ac_to_host,
TRANSFER_KERNELS_AC_TO_HOST)(long* Mesh_pointer,
- float* h_rho_ac_kl,
- float* h_kappa_ac_kl,
+ realw* h_rho_ac_kl,
+ realw* h_kappa_ac_kl,
int* NSPEC_AB){}
void FC_FUNC_(transfer_kernels_hess_el_tohost,
TRANSFER_KERNELS_HESS_TO_HOST)(long* Mesh_pointer,
- float* h_hess_kl,
+ realw* h_hess_kl,
int* NSPEC_AB) {}
void FC_FUNC_(transfer_kernels_hess_ac_tohost,
TRANSFER_KERNELS_HESS_TO_HOST)(long* Mesh_pointer,
- float* h_hess_ac_kl,
+ realw* h_hess_ac_kl,
int* NSPEC_AB) {}
/* from file write_seismograms_cuda.cu */
void FC_FUNC_(transfer_station_el_from_device,
- TRANSFER_STATION_EL_FROM_DEVICE)(float* displ,float* veloc,float* accel,
- float* b_displ, float* b_veloc, float* b_accel,
+ TRANSFER_STATION_EL_FROM_DEVICE)(realw* displ,realw* veloc,realw* accel,
+ realw* b_displ, realw* b_veloc, realw* b_accel,
long* Mesh_pointer_f,int* number_receiver_global,
int* ispec_selected_rec,int* ispec_selected_source,
int* ibool,int* SIMULATION_TYPEf){}
void FC_FUNC_(transfer_station_ac_from_device,
TRANSFER_STATION_AC_FROM_DEVICE)(
- float* potential_acoustic,
- float* potential_dot_acoustic,
- float* potential_dot_dot_acoustic,
- float* b_potential_acoustic,
- float* b_potential_dot_acoustic,
- float* b_potential_dot_dot_acoustic,
+ realw* potential_acoustic,
+ realw* potential_dot_acoustic,
+ realw* potential_dot_dot_acoustic,
+ realw* b_potential_acoustic,
+ realw* b_potential_dot_acoustic,
+ realw* b_potential_dot_dot_acoustic,
long* Mesh_pointer_f,
int* number_receiver_global,
int* ispec_selected_rec,
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/transfer_fields_cuda.cu 2011-11-06 02:02:36 UTC (rev 19152)
@@ -29,7 +29,6 @@
#include <stdio.h>
#include <cuda.h>
#include <cublas.h>
-#include <mpi.h>
#include <sys/time.h>
#include <sys/resource.h>
@@ -54,15 +53,15 @@
extern "C"
void FC_FUNC_(transfer_fields_el_to_device,
- TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, float* displ, float* veloc, float* accel,long* Mesh_pointer_f) {
+ TRANSFER_FIELDS_EL_TO_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {
TRACE("transfer_fields_el_to_device_");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- print_CUDA_error_if_any(cudaMemcpy(mp->d_displ,displ,sizeof(float)*(*size),cudaMemcpyHostToDevice),40003);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_veloc,veloc,sizeof(float)*(*size),cudaMemcpyHostToDevice),40004);
- print_CUDA_error_if_any(cudaMemcpy(mp->d_accel,accel,sizeof(float)*(*size),cudaMemcpyHostToDevice),40005);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_displ,displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40003);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_veloc,veloc,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40004);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_accel,accel,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40005);
}
@@ -70,38 +69,31 @@
extern "C"
void FC_FUNC_(transfer_fields_el_from_device,
- TRANSFER_FIELDS_EL_FROM_DEVICE)(int* size, float* displ, float* veloc, float* accel,long* Mesh_pointer_f) {
+ TRANSFER_FIELDS_EL_FROM_DEVICE)(int* size, realw* displ, realw* veloc, realw* accel,long* Mesh_pointer_f) {
TRACE("transfer_fields_el_from_device_");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40006);
- print_CUDA_error_if_any(cudaMemcpy(veloc,mp->d_veloc,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40007);
- print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40008);
+ print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40006);
+ print_CUDA_error_if_any(cudaMemcpy(veloc,mp->d_veloc,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40007);
+ print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40008);
- // printf("Transfered Fields From Device\n");
- // int procid;
- // MPI_Comm_rank(MPI_COMM_WORLD,&procid);
- // printf("Quick check of answer for p:%d in transfer_fields_el_from_device\n",procid);
- // for(int i=0;i<5;i++) {
- // printf("accel[%d]=%2.20e\n",i,accel[i]);
- // }
}
/* ----------------------------------------------------------------------------------------------- */
extern "C"
void FC_FUNC_(transfer_b_fields_to_device,
- TRANSFER_B_FIELDS_TO_DEVICE)(int* size, float* b_displ, float* b_veloc, float* b_accel,
+ TRANSFER_B_FIELDS_TO_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,
long* Mesh_pointer_f) {
TRACE("transfer_b_fields_to_device_");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- cudaMemcpy(mp->d_b_displ,b_displ,sizeof(float)*(*size),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_veloc,b_veloc,sizeof(float)*(*size),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_accel,b_accel,sizeof(float)*(*size),cudaMemcpyHostToDevice);
+ cudaMemcpy(mp->d_b_displ,b_displ,sizeof(realw)*(*size),cudaMemcpyHostToDevice);
+ cudaMemcpy(mp->d_b_veloc,b_veloc,sizeof(realw)*(*size),cudaMemcpyHostToDevice);
+ cudaMemcpy(mp->d_b_accel,b_accel,sizeof(realw)*(*size),cudaMemcpyHostToDevice);
}
@@ -109,15 +101,15 @@
extern "C"
void FC_FUNC_(transfer_b_fields_from_device,
- TRANSFER_B_FIELDS_FROM_DEVICE)(int* size, float* b_displ, float* b_veloc, float* b_accel,long* Mesh_pointer_f) {
+ TRANSFER_B_FIELDS_FROM_DEVICE)(int* size, realw* b_displ, realw* b_veloc, realw* b_accel,long* Mesh_pointer_f) {
TRACE("transfer_b_fields_from_device_");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- cudaMemcpy(b_displ,mp->d_b_displ,sizeof(float)*(*size),cudaMemcpyDeviceToHost);
- cudaMemcpy(b_veloc,mp->d_b_veloc,sizeof(float)*(*size),cudaMemcpyDeviceToHost);
- cudaMemcpy(b_accel,mp->d_b_accel,sizeof(float)*(*size),cudaMemcpyDeviceToHost);
+ cudaMemcpy(b_displ,mp->d_b_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost);
+ cudaMemcpy(b_veloc,mp->d_b_veloc,sizeof(realw)*(*size),cudaMemcpyDeviceToHost);
+ cudaMemcpy(b_accel,mp->d_b_accel,sizeof(realw)*(*size),cudaMemcpyDeviceToHost);
}
@@ -126,13 +118,13 @@
extern "C"
void FC_FUNC_(transfer_accel_to_device,
- TRNASFER_ACCEL_TO_DEVICE)(int* size, float* accel,long* Mesh_pointer_f) {
+ TRNASFER_ACCEL_TO_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {
TRACE("transfer_accel_to_device");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- print_CUDA_error_if_any(cudaMemcpy(mp->d_accel,accel,sizeof(float)*(*size),cudaMemcpyHostToDevice),40016);
+ print_CUDA_error_if_any(cudaMemcpy(mp->d_accel,accel,sizeof(realw)*(*size),cudaMemcpyHostToDevice),40016);
}
@@ -140,13 +132,13 @@
extern "C"
void FC_FUNC_(transfer_accel_from_device,
- TRANSFER_ACCEL_FROM_DEVICE)(int* size, float* accel,long* Mesh_pointer_f) {
+ TRANSFER_ACCEL_FROM_DEVICE)(int* size, realw* accel,long* Mesh_pointer_f) {
TRACE("transfer_accel_from_device");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40026);
+ print_CUDA_error_if_any(cudaMemcpy(accel,mp->d_accel,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40026);
}
@@ -154,13 +146,13 @@
extern "C"
void FC_FUNC_(transfer_b_accel_from_device,
- TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, float* b_accel,long* Mesh_pointer_f) {
+ TRNASFER_B_ACCEL_FROM_DEVICE)(int* size, realw* b_accel,long* Mesh_pointer_f) {
TRACE("transfer_b_accel_from_device");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- print_CUDA_error_if_any(cudaMemcpy(b_accel,mp->d_b_accel,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40036);
+ print_CUDA_error_if_any(cudaMemcpy(b_accel,mp->d_b_accel,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40036);
}
@@ -168,13 +160,13 @@
extern "C"
void FC_FUNC_(transfer_sigma_from_device,
- TRANSFER_SIGMA_FROM_DEVICE)(int* size, float* sigma_kl,long* Mesh_pointer_f) {
+ TRANSFER_SIGMA_FROM_DEVICE)(int* size, realw* sigma_kl,long* Mesh_pointer_f) {
TRACE("transfer_sigma_from_device");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- print_CUDA_error_if_any(cudaMemcpy(sigma_kl,mp->d_Sigma_kl,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40046);
+ print_CUDA_error_if_any(cudaMemcpy(sigma_kl,mp->d_Sigma_kl,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40046);
}
@@ -182,13 +174,13 @@
extern "C"
void FC_FUNC_(transfer_b_displ_from_device,
- TRANSFER_B_DISPL_FROM_DEVICE)(int* size, float* displ,long* Mesh_pointer_f) {
+ TRANSFER_B_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {
TRACE("transfer_b_displ_from_device");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40056);
+ print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40056);
}
@@ -196,13 +188,13 @@
extern "C"
void FC_FUNC_(transfer_displ_from_device,
- TRANSFER_DISPL_FROM_DEVICE)(int* size, float* displ,long* Mesh_pointer_f) {
+ TRANSFER_DISPL_FROM_DEVICE)(int* size, realw* displ,long* Mesh_pointer_f) {
TRACE("transfer_displ_from_device");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
- print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(float)*(*size),cudaMemcpyDeviceToHost),40066);
+ print_CUDA_error_if_any(cudaMemcpy(displ,mp->d_displ,sizeof(realw)*(*size),cudaMemcpyDeviceToHost),40066);
}
@@ -211,15 +203,15 @@
extern "C"
void FC_FUNC_(transfer_compute_kernel_answers_from_device,
TRANSFER_COMPUTE_KERNEL_ANSWERS_FROM_DEVICE)(long* Mesh_pointer,
- float* rho_kl,int* size_rho,
- float* mu_kl, int* size_mu,
- float* kappa_kl, int* size_kappa) {
+ realw* rho_kl,int* size_rho,
+ realw* mu_kl, int* size_mu,
+ realw* kappa_kl, int* size_kappa) {
TRACE("transfer_compute_kernel_answers_from_device");
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
- cudaMemcpy(rho_kl,mp->d_rho_kl,*size_rho*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(mu_kl,mp->d_mu_kl,*size_mu*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(kappa_kl,mp->d_kappa_kl,*size_kappa*sizeof(float),cudaMemcpyDeviceToHost);
+ cudaMemcpy(rho_kl,mp->d_rho_kl,*size_rho*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(mu_kl,mp->d_mu_kl,*size_mu*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(kappa_kl,mp->d_kappa_kl,*size_kappa*sizeof(realw),cudaMemcpyDeviceToHost);
}
*/
@@ -229,47 +221,47 @@
extern "C"
void FC_FUNC_(transfer_compute_kernel_fields_from_device,
TRANSFER_COMPUTE_KERNEL_FIELDS_FROM_DEVICE)(long* Mesh_pointer,
- float* accel, int* size_accel,
- float* b_displ, int* size_b_displ,
- float* epsilondev_xx,
- float* epsilondev_yy,
- float* epsilondev_xy,
- float* epsilondev_xz,
- float* epsilondev_yz,
+ realw* accel, int* size_accel,
+ realw* b_displ, int* size_b_displ,
+ realw* epsilondev_xx,
+ realw* epsilondev_yy,
+ realw* epsilondev_xy,
+ realw* epsilondev_xz,
+ realw* epsilondev_yz,
int* size_epsilondev,
- float* b_epsilondev_xx,
- float* b_epsilondev_yy,
- float* b_epsilondev_xy,
- float* b_epsilondev_xz,
- float* b_epsilondev_yz,
+ realw* b_epsilondev_xx,
+ realw* b_epsilondev_yy,
+ realw* b_epsilondev_xy,
+ realw* b_epsilondev_xz,
+ realw* b_epsilondev_yz,
int* size_b_epsilondev,
- float* rho_kl,int* size_rho,
- float* mu_kl, int* size_mu,
- float* kappa_kl, int* size_kappa,
- float* epsilon_trace_over_3,
- float* b_epsilon_trace_over_3,
+ realw* rho_kl,int* size_rho,
+ realw* mu_kl, int* size_mu,
+ realw* kappa_kl, int* size_kappa,
+ realw* epsilon_trace_over_3,
+ realw* b_epsilon_trace_over_3,
int* size_epsilon_trace_over_3) {
TRACE("transfer_compute_kernel_fields_from_device");
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
- cudaMemcpy(accel,mp->d_accel,*size_accel*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(b_displ,mp->d_b_displ,*size_b_displ*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(b_epsilondev_xx,mp->d_b_epsilondev_xx,*size_b_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(b_epsilondev_yy,mp->d_b_epsilondev_yy,*size_b_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(b_epsilondev_xy,mp->d_b_epsilondev_xy,*size_b_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(b_epsilondev_xz,mp->d_b_epsilondev_xz,*size_b_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(b_epsilondev_yz,mp->d_b_epsilondev_yz,*size_b_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(rho_kl,mp->d_rho_kl,*size_rho*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(mu_kl,mp->d_mu_kl,*size_mu*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(kappa_kl,mp->d_kappa_kl,*size_kappa*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilon_trace_over_3,mp->d_epsilon_trace_over_3,*size_epsilon_trace_over_3*sizeof(float),
+ cudaMemcpy(accel,mp->d_accel,*size_accel*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(b_displ,mp->d_b_displ,*size_b_displ*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(b_epsilondev_xx,mp->d_b_epsilondev_xx,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(b_epsilondev_yy,mp->d_b_epsilondev_yy,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(b_epsilondev_xy,mp->d_b_epsilondev_xy,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(b_epsilondev_xz,mp->d_b_epsilondev_xz,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(b_epsilondev_yz,mp->d_b_epsilondev_yz,*size_b_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(rho_kl,mp->d_rho_kl,*size_rho*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(mu_kl,mp->d_mu_kl,*size_mu*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(kappa_kl,mp->d_kappa_kl,*size_kappa*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(epsilon_trace_over_3,mp->d_epsilon_trace_over_3,*size_epsilon_trace_over_3*sizeof(realw),
cudaMemcpyDeviceToHost);
- cudaMemcpy(b_epsilon_trace_over_3,mp->d_b_epsilon_trace_over_3,*size_epsilon_trace_over_3*sizeof(float),
+ cudaMemcpy(b_epsilon_trace_over_3,mp->d_b_epsilon_trace_over_3,*size_epsilon_trace_over_3*sizeof(realw),
cudaMemcpyDeviceToHost);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -285,29 +277,29 @@
extern "C"
void FC_FUNC_(transfer_b_fields_att_to_device,
TRANSFER_B_FIELDS_ATT_TO_DEVICE)(long* Mesh_pointer,
- float* b_R_xx,float* b_R_yy,float* b_R_xy,float* b_R_xz,float* b_R_yz,
+ realw* b_R_xx,realw* b_R_yy,realw* b_R_xy,realw* b_R_xz,realw* b_R_yz,
int* size_R,
- float* b_epsilondev_xx,
- float* b_epsilondev_yy,
- float* b_epsilondev_xy,
- float* b_epsilondev_xz,
- float* b_epsilondev_yz,
+ realw* b_epsilondev_xx,
+ realw* b_epsilondev_yy,
+ realw* b_epsilondev_xy,
+ realw* b_epsilondev_xz,
+ realw* b_epsilondev_yz,
int* size_epsilondev) {
TRACE("transfer_b_fields_att_to_device");
//get mesh pointer out of fortran integer container
Mesh* mp = (Mesh*)(*Mesh_pointer);
- cudaMemcpy(mp->d_b_R_xx,b_R_xx,*size_R*sizeof(float),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_R_yy,b_R_yy,*size_R*sizeof(float),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_R_xy,b_R_xy,*size_R*sizeof(float),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_R_xz,b_R_xz,*size_R*sizeof(float),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_R_yz,b_R_yz,*size_R*sizeof(float),cudaMemcpyHostToDevice);
+ cudaMemcpy(mp->d_b_R_xx,b_R_xx,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
+ cudaMemcpy(mp->d_b_R_yy,b_R_yy,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
+ cudaMemcpy(mp->d_b_R_xy,b_R_xy,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
+ cudaMemcpy(mp->d_b_R_xz,b_R_xz,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
+ cudaMemcpy(mp->d_b_R_yz,b_R_yz,*size_R*sizeof(realw),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx,*size_epsilondev*sizeof(float),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy,*size_epsilondev*sizeof(float),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy,*size_epsilondev*sizeof(float),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz,*size_epsilondev*sizeof(float),cudaMemcpyHostToDevice);
- cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz,*size_epsilondev*sizeof(float),cudaMemcpyHostToDevice);
+ cudaMemcpy(mp->d_b_epsilondev_xx,b_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
+ cudaMemcpy(mp->d_b_epsilondev_yy,b_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
+ cudaMemcpy(mp->d_b_epsilondev_xy,b_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
+ cudaMemcpy(mp->d_b_epsilondev_xz,b_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
+ cudaMemcpy(mp->d_b_epsilondev_yz,b_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyHostToDevice);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -322,29 +314,29 @@
extern "C"
void FC_FUNC_(transfer_fields_att_from_device,
TRANSFER_FIELDS_ATT_FROM_DEVICE)(long* Mesh_pointer,
- float* R_xx,float* R_yy,float* R_xy,float* R_xz,float* R_yz,
+ realw* R_xx,realw* R_yy,realw* R_xy,realw* R_xz,realw* R_yz,
int* size_R,
- float* epsilondev_xx,
- float* epsilondev_yy,
- float* epsilondev_xy,
- float* epsilondev_xz,
- float* epsilondev_yz,
+ realw* epsilondev_xx,
+ realw* epsilondev_yy,
+ realw* epsilondev_xy,
+ realw* epsilondev_xz,
+ realw* epsilondev_yz,
int* size_epsilondev) {
TRACE("transfer_fields_att_from_device");
//get mesh pointer out of fortran integer container
Mesh* mp = (Mesh*)(*Mesh_pointer);
- cudaMemcpy(R_xx,mp->d_R_xx,*size_R*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(R_yy,mp->d_R_yy,*size_R*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(R_xy,mp->d_R_xy,*size_R*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(R_xz,mp->d_R_xz,*size_R*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(R_yz,mp->d_R_yz,*size_R*sizeof(float),cudaMemcpyDeviceToHost);
+ cudaMemcpy(R_xx,mp->d_R_xx,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(R_yy,mp->d_R_yy,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(R_xy,mp->d_R_xy,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(R_xz,mp->d_R_xz,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(R_yz,mp->d_R_yz,*size_R*sizeof(realw),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
- cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz,*size_epsilondev*sizeof(float),cudaMemcpyDeviceToHost);
+ cudaMemcpy(epsilondev_xx,mp->d_epsilondev_xx,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(epsilondev_yy,mp->d_epsilondev_yy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(epsilondev_xy,mp->d_epsilondev_xy,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(epsilondev_xz,mp->d_epsilondev_xz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
+ cudaMemcpy(epsilondev_yz,mp->d_epsilondev_yz,*size_epsilondev*sizeof(realw),cudaMemcpyDeviceToHost);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -358,19 +350,19 @@
extern "C"
void FC_FUNC_(transfer_kernels_el_to_host,
TRANSFER_KERNELS_EL_TO_HOST)(long* Mesh_pointer,
- float* h_rho_kl,
- float* h_mu_kl,
- float* h_kappa_kl,
+ realw* h_rho_kl,
+ realw* h_mu_kl,
+ realw* h_kappa_kl,
int* NSPEC_AB) {
TRACE("transfer_kernels_el_to_host");
//get mesh pointer out of fortran integer container
Mesh* mp = (Mesh*)(*Mesh_pointer);
- print_CUDA_error_if_any(cudaMemcpy(h_rho_kl,mp->d_rho_kl,*NSPEC_AB*125*sizeof(float),
+ print_CUDA_error_if_any(cudaMemcpy(h_rho_kl,mp->d_rho_kl,*NSPEC_AB*NGLL3*sizeof(realw),
cudaMemcpyDeviceToHost),40101);
- print_CUDA_error_if_any(cudaMemcpy(h_mu_kl,mp->d_mu_kl,*NSPEC_AB*125*sizeof(float),
+ print_CUDA_error_if_any(cudaMemcpy(h_mu_kl,mp->d_mu_kl,*NSPEC_AB*NGLL3*sizeof(realw),
cudaMemcpyDeviceToHost),40102);
- print_CUDA_error_if_any(cudaMemcpy(h_kappa_kl,mp->d_kappa_kl,*NSPEC_AB*125*sizeof(float),
+ print_CUDA_error_if_any(cudaMemcpy(h_kappa_kl,mp->d_kappa_kl,*NSPEC_AB*NGLL3*sizeof(realw),
cudaMemcpyDeviceToHost),40103);
}
@@ -384,13 +376,13 @@
extern "C"
void FC_FUNC_(transfer_kernels_noise_to_host,
TRANSFER_KERNELS_NOISE_TO_HOST)(long* Mesh_pointer,
- float* h_Sigma_kl,
+ realw* h_Sigma_kl,
int* NSPEC_AB) {
TRACE("transfer_kernels_noise_to_host");
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
- print_CUDA_error_if_any(cudaMemcpy(h_Sigma_kl,mp->d_Sigma_kl,125*(*NSPEC_AB)*sizeof(float),
+ print_CUDA_error_if_any(cudaMemcpy(h_Sigma_kl,mp->d_Sigma_kl,NGLL3*(*NSPEC_AB)*sizeof(realw),
cudaMemcpyDeviceToHost),40201);
}
@@ -406,20 +398,20 @@
void FC_FUNC_(transfer_fields_ac_to_device,
TRANSFER_FIELDS_AC_TO_DEVICE)(
int* size,
- float* potential_acoustic,
- float* potential_dot_acoustic,
- float* potential_dot_dot_acoustic,
+ realw* potential_acoustic,
+ realw* potential_dot_acoustic,
+ realw* potential_dot_dot_acoustic,
long* Mesh_pointer_f) {
TRACE("transfer_fields_ac_to_device");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
print_CUDA_error_if_any(cudaMemcpy(mp->d_potential_acoustic,potential_acoustic,
- sizeof(float)*(*size),cudaMemcpyHostToDevice),50110);
+ sizeof(realw)*(*size),cudaMemcpyHostToDevice),50110);
print_CUDA_error_if_any(cudaMemcpy(mp->d_potential_dot_acoustic,potential_dot_acoustic,
- sizeof(float)*(*size),cudaMemcpyHostToDevice),50120);
+ sizeof(realw)*(*size),cudaMemcpyHostToDevice),50120);
print_CUDA_error_if_any(cudaMemcpy(mp->d_potential_dot_dot_acoustic,potential_dot_dot_acoustic,
- sizeof(float)*(*size),cudaMemcpyHostToDevice),50130);
+ sizeof(realw)*(*size),cudaMemcpyHostToDevice),50130);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("after transfer_fields_ac_to_device");
@@ -432,20 +424,20 @@
void FC_FUNC_(transfer_b_fields_ac_to_device,
TRANSFER_B_FIELDS_AC_TO_DEVICE)(
int* size,
- float* b_potential_acoustic,
- float* b_potential_dot_acoustic,
- float* b_potential_dot_dot_acoustic,
+ realw* b_potential_acoustic,
+ realw* b_potential_dot_acoustic,
+ realw* b_potential_dot_dot_acoustic,
long* Mesh_pointer_f) {
TRACE("transfer_b_fields_ac_to_device");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_potential_acoustic,b_potential_acoustic,
- sizeof(float)*(*size),cudaMemcpyHostToDevice),51110);
+ sizeof(realw)*(*size),cudaMemcpyHostToDevice),51110);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_potential_dot_acoustic,b_potential_dot_acoustic,
- sizeof(float)*(*size),cudaMemcpyHostToDevice),51120);
+ sizeof(realw)*(*size),cudaMemcpyHostToDevice),51120);
print_CUDA_error_if_any(cudaMemcpy(mp->d_b_potential_dot_dot_acoustic,b_potential_dot_dot_acoustic,
- sizeof(float)*(*size),cudaMemcpyHostToDevice),51130);
+ sizeof(realw)*(*size),cudaMemcpyHostToDevice),51130);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("after transfer_b_fields_ac_to_device");
@@ -458,20 +450,20 @@
extern "C"
void FC_FUNC_(transfer_fields_ac_from_device,TRANSFER_FIELDS_AC_FROM_DEVICE)(
int* size,
- float* potential_acoustic,
- float* potential_dot_acoustic,
- float* potential_dot_dot_acoustic,
+ realw* potential_acoustic,
+ realw* potential_dot_acoustic,
+ realw* potential_dot_dot_acoustic,
long* Mesh_pointer_f) {
TRACE("transfer_fields_ac_from_device");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
print_CUDA_error_if_any(cudaMemcpy(potential_acoustic,mp->d_potential_acoustic,
- sizeof(float)*(*size),cudaMemcpyDeviceToHost),52111);
+ sizeof(realw)*(*size),cudaMemcpyDeviceToHost),52111);
print_CUDA_error_if_any(cudaMemcpy(potential_dot_acoustic,mp->d_potential_dot_acoustic,
- sizeof(float)*(*size),cudaMemcpyDeviceToHost),52121);
+ sizeof(realw)*(*size),cudaMemcpyDeviceToHost),52121);
print_CUDA_error_if_any(cudaMemcpy(potential_dot_dot_acoustic,mp->d_potential_dot_dot_acoustic,
- sizeof(float)*(*size),cudaMemcpyDeviceToHost),52131);
+ sizeof(realw)*(*size),cudaMemcpyDeviceToHost),52131);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("after transfer_fields_ac_from_device");
@@ -484,20 +476,20 @@
void FC_FUNC_(transfer_b_fields_ac_from_device,
TRANSFER_B_FIELDS_AC_FROM_DEVICE)(
int* size,
- float* b_potential_acoustic,
- float* b_potential_dot_acoustic,
- float* b_potential_dot_dot_acoustic,
+ realw* b_potential_acoustic,
+ realw* b_potential_dot_acoustic,
+ realw* b_potential_dot_dot_acoustic,
long* Mesh_pointer_f) {
TRACE("transfer_b_fields_ac_from_device");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
print_CUDA_error_if_any(cudaMemcpy(b_potential_acoustic,mp->d_b_potential_acoustic,
- sizeof(float)*(*size),cudaMemcpyDeviceToHost),53111);
+ sizeof(realw)*(*size),cudaMemcpyDeviceToHost),53111);
print_CUDA_error_if_any(cudaMemcpy(b_potential_dot_acoustic,mp->d_b_potential_dot_acoustic,
- sizeof(float)*(*size),cudaMemcpyDeviceToHost),53121);
+ sizeof(realw)*(*size),cudaMemcpyDeviceToHost),53121);
print_CUDA_error_if_any(cudaMemcpy(b_potential_dot_dot_acoustic,mp->d_b_potential_dot_dot_acoustic,
- sizeof(float)*(*size),cudaMemcpyDeviceToHost),53131);
+ sizeof(realw)*(*size),cudaMemcpyDeviceToHost),53131);
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
exit_on_cuda_error("after transfer_b_fields_ac_from_device");
@@ -508,14 +500,14 @@
extern "C"
void FC_FUNC_(transfer_dot_dot_from_device,
- TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, float* potential_dot_dot_acoustic,long* Mesh_pointer_f) {
+ TRNASFER_DOT_DOT_FROM_DEVICE)(int* size, realw* potential_dot_dot_acoustic,long* Mesh_pointer_f) {
TRACE("transfer_dot_dot_from_device");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
print_CUDA_error_if_any(cudaMemcpy(potential_dot_dot_acoustic,mp->d_potential_dot_dot_acoustic,
- sizeof(float)*(*size),cudaMemcpyDeviceToHost),50041);
+ sizeof(realw)*(*size),cudaMemcpyDeviceToHost),50041);
}
@@ -523,14 +515,14 @@
extern "C"
void FC_FUNC_(transfer_b_dot_dot_from_device,
- TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, float* b_potential_dot_dot_acoustic,long* Mesh_pointer_f) {
+ TRNASFER_B_DOT_DOT_FROM_DEVICE)(int* size, realw* b_potential_dot_dot_acoustic,long* Mesh_pointer_f) {
TRACE("transfer_b_dot_dot_from_device");
Mesh* mp = (Mesh*)(*Mesh_pointer_f); //get mesh pointer out of fortran integer container
print_CUDA_error_if_any(cudaMemcpy(b_potential_dot_dot_acoustic,mp->d_b_potential_dot_dot_acoustic,
- sizeof(float)*(*size),cudaMemcpyDeviceToHost),50042);
+ sizeof(realw)*(*size),cudaMemcpyDeviceToHost),50042);
}
@@ -540,20 +532,20 @@
extern "C"
void FC_FUNC_(transfer_kernels_ac_to_host,
TRANSFER_KERNELS_AC_TO_HOST)(long* Mesh_pointer,
- float* h_rho_ac_kl,
- float* h_kappa_ac_kl,
+ realw* h_rho_ac_kl,
+ realw* h_kappa_ac_kl,
int* NSPEC_AB) {
TRACE("transfer_kernels_ac_to_host");
//get mesh pointer out of fortran integer container
Mesh* mp = (Mesh*)(*Mesh_pointer);
- int size = *NSPEC_AB*125;
+ int size = *NSPEC_AB*NGLL3;
// copies kernel values over to CPU host
- print_CUDA_error_if_any(cudaMemcpy(h_rho_ac_kl,mp->d_rho_ac_kl,size*sizeof(float),
+ print_CUDA_error_if_any(cudaMemcpy(h_rho_ac_kl,mp->d_rho_ac_kl,size*sizeof(realw),
cudaMemcpyDeviceToHost),54101);
- print_CUDA_error_if_any(cudaMemcpy(h_kappa_ac_kl,mp->d_kappa_ac_kl,size*sizeof(float),
+ print_CUDA_error_if_any(cudaMemcpy(h_kappa_ac_kl,mp->d_kappa_ac_kl,size*sizeof(realw),
cudaMemcpyDeviceToHost),54102);
}
@@ -566,13 +558,13 @@
extern "C"
void FC_FUNC_(transfer_kernels_hess_el_tohost,
TRANSFER_KERNELS_HESS_EL_TOHOST)(long* Mesh_pointer,
- float* h_hess_kl,
+ realw* h_hess_kl,
int* NSPEC_AB) {
TRACE("transfer_kernels_hess_el_tohost");
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
- print_CUDA_error_if_any(cudaMemcpy(h_hess_kl,mp->d_hess_el_kl,125*(*NSPEC_AB)*sizeof(float),
+ print_CUDA_error_if_any(cudaMemcpy(h_hess_kl,mp->d_hess_el_kl,NGLL3*(*NSPEC_AB)*sizeof(realw),
cudaMemcpyDeviceToHost),70201);
}
@@ -581,13 +573,13 @@
extern "C"
void FC_FUNC_(transfer_kernels_hess_ac_tohost,
TRANSFER_KERNELS_HESS_AC_TOHOST)(long* Mesh_pointer,
- float* h_hess_ac_kl,
+ realw* h_hess_ac_kl,
int* NSPEC_AB) {
TRACE("transfer_kernels_hess_ac_tohost");
Mesh* mp = (Mesh*)(*Mesh_pointer); //get mesh pointer out of fortran integer container
- print_CUDA_error_if_any(cudaMemcpy(h_hess_ac_kl,mp->d_hess_ac_kl,125*(*NSPEC_AB)*sizeof(float),
+ print_CUDA_error_if_any(cudaMemcpy(h_hess_ac_kl,mp->d_hess_ac_kl,NGLL3*(*NSPEC_AB)*sizeof(realw),
cudaMemcpyDeviceToHost),70202);
}
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/cuda/write_seismograms_cuda.cu 2011-11-06 02:02:36 UTC (rev 19152)
@@ -46,36 +46,25 @@
__global__ void transfer_stations_fields_from_device_kernel(int* number_receiver_global,
int* ispec_selected_rec,
int* ibool,
- float* station_seismo_field,
- float* desired_field,
- int nrec_local //,int* debug_index
- ) {
+ realw* station_seismo_field,
+ realw* desired_field,
+ int nrec_local) {
int blockID = blockIdx.x + blockIdx.y*gridDim.x;
if(blockID<nrec_local) {
- //int nodeID = threadIdx.x + blockID*blockDim.x;
int irec = number_receiver_global[blockID]-1;
- int ispec = ispec_selected_rec[irec]-1; // ispec==0 before -1???
- // if(threadIdx.x==1 && blockID < 125) {
- // // debug_index[threadIdx.x] = threadIdx.x + 125*ispec;
- // debug_index[blockID] = ispec;
- // debug_index[blockID + 4] = irec;
- // debug_index[blockID + 8] = ispec_selected_rec[0];
- // debug_index[blockID + 9] = ispec_selected_rec[1];
- // debug_index[blockID +10] = ispec_selected_rec[2];
- // debug_index[blockID +11] = ispec_selected_rec[3];
- // debug_index[blockID +12] = ispec_selected_rec[4];
- // }
- int iglob = ibool[threadIdx.x + 125*ispec]-1;
- station_seismo_field[3*125*blockID + 3*threadIdx.x+0] = desired_field[3*iglob];
- station_seismo_field[3*125*blockID + 3*threadIdx.x+1] = desired_field[3*iglob+1];
- station_seismo_field[3*125*blockID + 3*threadIdx.x+2] = desired_field[3*iglob+2];
+ int ispec = ispec_selected_rec[irec]-1;
+ int iglob = ibool[threadIdx.x + NGLL3*ispec]-1;
+
+ station_seismo_field[3*NGLL3*blockID + 3*threadIdx.x+0] = desired_field[3*iglob];
+ station_seismo_field[3*NGLL3*blockID + 3*threadIdx.x+1] = desired_field[3*iglob+1];
+ station_seismo_field[3*NGLL3*blockID + 3*threadIdx.x+2] = desired_field[3*iglob+2];
}
}
/* ----------------------------------------------------------------------------------------------- */
-void transfer_field_from_device(Mesh* mp, float* d_field,float* h_field,
+void transfer_field_from_device(Mesh* mp, realw* d_field,realw* h_field,
int* number_receiver_global,
int* d_ispec_selected,
int* h_ispec_selected,
@@ -86,11 +75,9 @@
// checks if anything to do
if( mp->nrec_local == 0 ) return;
- int blocksize = 125;
+ int blocksize = NGLL3;
int num_blocks_x = mp->nrec_local;
int num_blocks_y = 1;
- int myrank;
- MPI_Comm_rank(MPI_COMM_WORLD,&myrank);
while(num_blocks_x > 65535) {
num_blocks_x = ceil(num_blocks_x/2.0);
num_blocks_y = num_blocks_y*2;
@@ -99,59 +86,41 @@
dim3 grid(num_blocks_x,num_blocks_y);
dim3 threads(blocksize,1,1);
- //int* d_debug_index;
- //int* h_debug_index;
- //cudaMalloc((void**)&d_debug_index,125*sizeof(int));
- //h_debug_index = (int*)calloc(125,sizeof(int));
- //cudaMemcpy(d_debug_index,h_debug_index,125*sizeof(int),cudaMemcpyHostToDevice);
-
-
// prepare field transfer array on device
transfer_stations_fields_from_device_kernel<<<grid,threads>>>(mp->d_number_receiver_global,
d_ispec_selected,
mp->d_ibool,
mp->d_station_seismo_field,
d_field,
- mp->nrec_local //,d_debug_index
- );
+ mp->nrec_local);
- //cudaMemcpy(h_debug_index,d_debug_index,125*sizeof(int),cudaMemcpyDeviceToHost);
-
- // pause_for_debug(1);
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("transfer_stations_fields_from_device_kernel");
-#endif
-
cudaMemcpy(mp->h_station_seismo_field,mp->d_station_seismo_field,
- (3*125)*(mp->nrec_local)*sizeof(float),cudaMemcpyDeviceToHost);
+ (3*NGLL3)*(mp->nrec_local)*sizeof(realw),cudaMemcpyDeviceToHost);
-#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
- exit_on_cuda_error("transfer_stations_fields_from_device_kernel_memcpy");
-#endif
-
- // pause_for_debug(1);
int irec_local;
-
for(irec_local=0;irec_local<mp->nrec_local;irec_local++) {
int irec = number_receiver_global[irec_local] - 1;
int ispec = h_ispec_selected[irec] - 1;
- for(int i=0;i<125;i++) {
- int iglob = ibool[i+125*ispec] - 1;
- h_field[0+3*iglob] = mp->h_station_seismo_field[0+3*i+irec_local*125*3];
- h_field[1+3*iglob] = mp->h_station_seismo_field[1+3*i+irec_local*125*3];
- h_field[2+3*iglob] = mp->h_station_seismo_field[2+3*i+irec_local*125*3];
+ for(int i=0;i<NGLL3;i++) {
+ int iglob = ibool[i+NGLL3*ispec] - 1;
+ h_field[0+3*iglob] = mp->h_station_seismo_field[0+3*i+irec_local*NGLL3*3];
+ h_field[1+3*iglob] = mp->h_station_seismo_field[1+3*i+irec_local*NGLL3*3];
+ h_field[2+3*iglob] = mp->h_station_seismo_field[2+3*i+irec_local*NGLL3*3];
}
}
+#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
+ exit_on_cuda_error("transfer_field_from_device");
+#endif
}
/* ----------------------------------------------------------------------------------------------- */
extern "C"
void FC_FUNC_(transfer_station_el_from_device,
- TRANSFER_STATION_EL_FROM_DEVICE)(float* displ,float* veloc,float* accel,
- float* b_displ, float* b_veloc, float* b_accel,
+ TRANSFER_STATION_EL_FROM_DEVICE)(realw* displ,realw* veloc,realw* accel,
+ realw* b_displ, realw* b_veloc, realw* b_accel,
long* Mesh_pointer_f,int* number_receiver_global,
int* ispec_selected_rec,int* ispec_selected_source,
int* ibool,int* SIMULATION_TYPEf) {
@@ -199,15 +168,15 @@
__global__ void transfer_stations_fields_acoustic_from_device_kernel(int* number_receiver_global,
int* ispec_selected_rec,
int* ibool,
- float* station_seismo_potential,
- float* desired_potential) {
+ realw* station_seismo_potential,
+ realw* desired_potential) {
int blockID = blockIdx.x + blockIdx.y*gridDim.x;
int nodeID = threadIdx.x + blockID*blockDim.x;
int irec = number_receiver_global[blockID]-1;
int ispec = ispec_selected_rec[irec]-1;
- int iglob = ibool[threadIdx.x + 125*ispec]-1;
+ int iglob = ibool[threadIdx.x + NGLL3*ispec]-1;
//if(threadIdx.x == 0 ) printf("node acoustic: %i %i %i %i %i %e \n",blockID,nodeID,irec,ispec,iglob,desired_potential[iglob]);
@@ -217,8 +186,8 @@
/* ----------------------------------------------------------------------------------------------- */
void transfer_field_acoustic_from_device(Mesh* mp,
- float* d_potential,
- float* h_potential,
+ realw* d_potential,
+ realw* h_potential,
int* number_receiver_global,
int* d_ispec_selected,
int* h_ispec_selected,
@@ -232,7 +201,7 @@
if( mp->nrec_local == 0 ) return;
// sets up kernel dimensions
- int blocksize = 125;
+ int blocksize = NGLL3;
int num_blocks_x = mp->nrec_local;
int num_blocks_y = 1;
while(num_blocks_x > 65535) {
@@ -252,7 +221,7 @@
print_CUDA_error_if_any(cudaMemcpy(mp->h_station_seismo_potential,mp->d_station_seismo_potential,
- mp->nrec_local*125*sizeof(float),cudaMemcpyDeviceToHost),500);
+ mp->nrec_local*NGLL3*sizeof(realw),cudaMemcpyDeviceToHost),500);
//printf("copy local receivers: %i \n",mp->nrec_local);
@@ -262,14 +231,14 @@
// copy element values
// note: iglob may vary and can be irregularly accessing the h_potential array
- for(j=0; j < 125; j++){
- iglob = ibool[j+125*ispec]-1;
- h_potential[iglob] = mp->h_station_seismo_potential[j+irec_local*125];
+ for(j=0; j < NGLL3; j++){
+ iglob = ibool[j+NGLL3*ispec]-1;
+ h_potential[iglob] = mp->h_station_seismo_potential[j+irec_local*NGLL3];
}
// copy each station element's points to working array
// note: this works if iglob values would be all aligned...
- //memcpy(&(h_potential[iglob]),&(mp->h_station_seismo_potential[irec_local*125]),125*sizeof(float));
+ //memcpy(&(h_potential[iglob]),&(mp->h_station_seismo_potential[irec_local*NGLL3]),NGLL3*sizeof(realw));
}
#ifdef ENABLE_VERY_SLOW_ERROR_CHECKING
@@ -282,12 +251,12 @@
extern "C"
void FC_FUNC_(transfer_station_ac_from_device,
TRANSFER_STATION_AC_FROM_DEVICE)(
- float* potential_acoustic,
- float* potential_dot_acoustic,
- float* potential_dot_dot_acoustic,
- float* b_potential_acoustic,
- float* b_potential_dot_acoustic,
- float* b_potential_dot_dot_acoustic,
+ realw* potential_acoustic,
+ realw* potential_dot_acoustic,
+ realw* potential_dot_dot_acoustic,
+ realw* b_potential_acoustic,
+ realw* b_potential_dot_acoustic,
+ realw* b_potential_dot_dot_acoustic,
long* Mesh_pointer_f,
int* number_receiver_global,
int* ispec_selected_rec,
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/generate_databases.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/generate_databases.f90 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/generate_databases.f90 2011-11-06 02:02:36 UTC (rev 19152)
@@ -559,7 +559,7 @@
integer :: num_xmin,num_xmax,num_ymin,num_ymax,num_top,num_bottom,num
integer :: num_moho
integer :: j
- character(len=128) :: line
+ !character(len=128) :: line
! read databases about external mesh simulation
! global node coordinates
@@ -823,10 +823,10 @@
! no moho informations given
nspec2D_moho_ext = 0
boundary_number = 7
- else
- ! tries to read in number of moho elements
- read(line,*,iostat=ier) boundary_number ,nspec2D_moho_ext
- if( ier /= 0 ) call exit_mpi(myrank,'error reading moho mesh in database')
+ !else
+ ! ! tries to read in number of moho elements
+ ! read(line,*,iostat=ier) boundary_number ,nspec2D_moho_ext
+ ! if( ier /= 0 ) call exit_mpi(myrank,'error reading moho mesh in database')
endif
if(boundary_number /= 7) stop "Error : invalid database file"
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/model_aniso.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/model_aniso.f90 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/generate_databases/model_aniso.f90 2011-11-06 02:02:36 UTC (rev 19152)
@@ -309,5 +309,28 @@
c56 = - d46
c66 = d66
+! unused: fills values with the isotropic model
+! c11 = rho*vpv*vpv
+! c12 = rho*(vpv*vpv-2.*vsv*vsv)
+! c13 = c12
+! c14 = 0.d0
+! c15 = 0.d0
+! c16 = 0.d0
+! c22 = c11
+! c23 = c12
+! c24 = 0.d0
+! c25 = 0.d0
+! c26 = 0.d0
+! c33 = c11
+! c34 = 0.d0
+! c35 = 0.d0
+! c36 = 0.d0
+! c44 = rho*vsv*vsv
+! c45 = 0.d0
+! c46 = 0.d0
+! c55 = c44
+! c56 = 0.d0
+! c66 = c44
+
end subroutine model_aniso
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/meshfem3D/save_databases.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/meshfem3D/save_databases.f90 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/meshfem3D/save_databases.f90 2011-11-06 02:02:36 UTC (rev 19152)
@@ -72,7 +72,7 @@
! second dimension : #rho #vp #vs #Q_flag #anisotropy_flag #domain_id
double precision , dimension(NMATERIALS,6) :: material_properties
double precision , dimension(6) :: matpropl
- integer i,ispec,iglob
+ integer :: i,ispec,iglob,ier
! name of the database files
character(len=256) prname
@@ -82,67 +82,69 @@
logical, dimension(8) :: interfaces
integer, dimension(8) :: nspec_interface
+ integer, parameter :: IIN_database = 15
- !open(unit=15,file=prname(1:len_trim(prname))//'Database',status='unknown',action='write',form='formatted')
- open(unit=15,file=prname(1:len_trim(prname))//'Database', &
- status='unknown',action='write',form='unformatted')
+ open(unit=IIN_database,file=prname(1:len_trim(prname))//'Database', &
+ status='unknown',action='write',form='unformatted',iostat=ier)
+ if( ier /= 0 ) stop 'error opening Database file'
- write(15) nglob
+ write(IIN_database) nglob
do iglob=1,nglob
- write(15) iglob,nodes_coords(iglob,1),nodes_coords(iglob,2),nodes_coords(iglob,3)
+ write(IIN_database) iglob,nodes_coords(iglob,1),nodes_coords(iglob,2),nodes_coords(iglob,3)
end do
! Materials properties
- write(15) NMATERIALS, 0
+ write(IIN_database) NMATERIALS, 0
do idoubl = 1,NMATERIALS
- !write(15,*) material_properties(idoubl,:)
- matpropl(:) = material_properties(idoubl,:)
- write(15) matpropl
+ !write(IIN_database,*) material_properties(idoubl,:)
+ matpropl(:) = 0.d0
+ matpropl(1:6) = material_properties(idoubl,1:6)
+ ! pad dummy zeros to fill up 16 entries (poroelastic medium not allowed)
+ write(IIN_database) matpropl
end do
- write(15) nspec
+ write(IIN_database) nspec
do ispec=1,nspec
- !write(15,'(11i14)') ispec,true_material_num(ispec),1,ibool(1,1,1,ispec),ibool(2,1,1,ispec),&
+ !write(IIN_database,'(11i14)') ispec,true_material_num(ispec),1,ibool(1,1,1,ispec),ibool(2,1,1,ispec),&
! ibool(2,2,1,ispec),ibool(1,2,1,ispec),ibool(1,1,2,ispec),&
! ibool(2,1,2,ispec),ibool(2,2,2,ispec),ibool(1,2,2,ispec)
- write(15) ispec,true_material_num(ispec),1,ibool(1,1,1,ispec),ibool(2,1,1,ispec),&
+ write(IIN_database) ispec,true_material_num(ispec),1,ibool(1,1,1,ispec),ibool(2,1,1,ispec),&
ibool(2,2,1,ispec),ibool(1,2,1,ispec),ibool(1,1,2,ispec),&
ibool(2,1,2,ispec),ibool(2,2,2,ispec),ibool(1,2,2,ispec)
-
end do
! Boundaries
- write(15) 1,nspec2D_xmin
- write(15) 2,nspec2D_xmax
- write(15) 3,nspec2D_ymin
- write(15) 4,nspec2D_ymax
- write(15) 5,NSPEC2D_BOTTOM
- write(15) 6,NSPEC2D_TOP
+ write(IIN_database) 1,nspec2D_xmin
+ write(IIN_database) 2,nspec2D_xmax
+ write(IIN_database) 3,nspec2D_ymin
+ write(IIN_database) 4,nspec2D_ymax
+ write(IIN_database) 5,NSPEC2D_BOTTOM
+ write(IIN_database) 6,NSPEC2D_TOP
do i=1,nspec2D_xmin
- write(15) ibelm_xmin(i),ibool(1,1,1,ibelm_xmin(i)),ibool(1,NGLLY,1,ibelm_xmin(i)),&
+ write(IIN_database) ibelm_xmin(i),ibool(1,1,1,ibelm_xmin(i)),ibool(1,NGLLY,1,ibelm_xmin(i)),&
ibool(1,1,NGLLZ,ibelm_xmin(i)),ibool(1,NGLLY,NGLLZ,ibelm_xmin(i))
end do
do i=1,nspec2D_xmax
- write(15) ibelm_xmax(i),ibool(NGLLX,1,1,ibelm_xmax(i)),ibool(NGLLX,NGLLY,1,ibelm_xmax(i)), &
+ write(IIN_database) ibelm_xmax(i),ibool(NGLLX,1,1,ibelm_xmax(i)),ibool(NGLLX,NGLLY,1,ibelm_xmax(i)), &
ibool(NGLLX,1,NGLLZ,ibelm_xmax(i)),ibool(NGLLX,NGLLY,NGLLZ,ibelm_xmax(i))
end do
do i=1,nspec2D_ymin
- write(15) ibelm_ymin(i),ibool(1,1,1,ibelm_ymin(i)),ibool(NGLLX,1,1,ibelm_ymin(i)),&
+ write(IIN_database) ibelm_ymin(i),ibool(1,1,1,ibelm_ymin(i)),ibool(NGLLX,1,1,ibelm_ymin(i)),&
ibool(1,1,NGLLZ,ibelm_ymin(i)),ibool(NGLLX,1,NGLLZ,ibelm_ymin(i))
end do
do i=1,nspec2D_ymax
- write(15) ibelm_ymax(i),ibool(NGLLX,NGLLY,1,ibelm_ymax(i)),ibool(1,NGLLY,1,ibelm_ymax(i)), &
+ write(IIN_database) ibelm_ymax(i),ibool(NGLLX,NGLLY,1,ibelm_ymax(i)),ibool(1,NGLLY,1,ibelm_ymax(i)), &
ibool(NGLLX,NGLLY,NGLLZ,ibelm_ymax(i)),ibool(1,NGLLY,NGLLZ,ibelm_ymax(i))
end do
do i=1,NSPEC2D_BOTTOM
- write(15) ibelm_bottom(i),ibool(1,1,1,ibelm_bottom(i)),ibool(NGLLX,1,1,ibelm_bottom(i)), &
+ write(IIN_database) ibelm_bottom(i),ibool(1,1,1,ibelm_bottom(i)),ibool(NGLLX,1,1,ibelm_bottom(i)), &
ibool(NGLLX,NGLLY,1,ibelm_bottom(i)),ibool(1,NGLLY,1,ibelm_bottom(i))
end do
do i=1,NSPEC2D_TOP
- write(15) ibelm_top(i),ibool(1,1,NGLLZ,ibelm_top(i)),ibool(NGLLX,1,NGLLZ,ibelm_top(i)), &
+ write(IIN_database) ibelm_top(i),ibool(1,1,NGLLZ,ibelm_top(i)),ibool(NGLLX,1,NGLLZ,ibelm_top(i)), &
ibool(NGLLX,NGLLY,NGLLZ,ibelm_top(i)),ibool(1,NGLLY,NGLLZ,ibelm_top(i))
end do
@@ -200,86 +202,82 @@
nspec_interfaces_max = maxval(nspec_interface)
- write(15) nb_interfaces,nspec_interfaces_max
+ write(IIN_database) nb_interfaces,nspec_interfaces_max
if(interfaces(W)) then
- write(15) addressing(iproc_xi-1,iproc_eta),nspec_interface(W)
+ write(IIN_database) addressing(iproc_xi-1,iproc_eta),nspec_interface(W)
do ispec = 1,nspec
- if(iMPIcut_xi(1,ispec)) write(15) ispec,4,ibool(1,1,1,ispec),ibool(1,2,1,ispec), &
+ if(iMPIcut_xi(1,ispec)) write(IIN_database) ispec,4,ibool(1,1,1,ispec),ibool(1,2,1,ispec), &
ibool(1,1,2,ispec),ibool(1,2,2,ispec)
end do
end if
if(interfaces(E)) then
- write(15) addressing(iproc_xi+1,iproc_eta),nspec_interface(E)
+ write(IIN_database) addressing(iproc_xi+1,iproc_eta),nspec_interface(E)
do ispec = 1,nspec
- if(iMPIcut_xi(2,ispec)) write(15) ispec,4,ibool(2,1,1,ispec),ibool(2,2,1,ispec), &
+ if(iMPIcut_xi(2,ispec)) write(IIN_database) ispec,4,ibool(2,1,1,ispec),ibool(2,2,1,ispec), &
ibool(2,1,2,ispec),ibool(2,2,2,ispec)
end do
end if
if(interfaces(S)) then
- write(15) addressing(iproc_xi,iproc_eta-1),nspec_interface(S)
+ write(IIN_database) addressing(iproc_xi,iproc_eta-1),nspec_interface(S)
do ispec = 1,nspec
- if(iMPIcut_eta(1,ispec)) write(15) ispec,4,ibool(1,1,1,ispec),ibool(2,1,1,ispec), &
+ if(iMPIcut_eta(1,ispec)) write(IIN_database) ispec,4,ibool(1,1,1,ispec),ibool(2,1,1,ispec), &
ibool(1,1,2,ispec),ibool(2,1,2,ispec)
end do
end if
if(interfaces(N)) then
- write(15) addressing(iproc_xi,iproc_eta+1),nspec_interface(N)
+ write(IIN_database) addressing(iproc_xi,iproc_eta+1),nspec_interface(N)
do ispec = 1,nspec
- if(iMPIcut_eta(2,ispec)) write(15) ispec,4,ibool(2,2,1,ispec),ibool(1,2,1,ispec), &
+ if(iMPIcut_eta(2,ispec)) write(IIN_database) ispec,4,ibool(2,2,1,ispec),ibool(1,2,1,ispec), &
ibool(2,2,2,ispec),ibool(1,2,2,ispec)
end do
end if
if(interfaces(NW)) then
- write(15) addressing(iproc_xi-1,iproc_eta+1),nspec_interface(NW)
+ write(IIN_database) addressing(iproc_xi-1,iproc_eta+1),nspec_interface(NW)
do ispec = 1,nspec
if((iMPIcut_xi(1,ispec) .eqv. .true.) .and. (iMPIcut_eta(2,ispec) .eqv. .true.)) then
- write(15) ispec,2,ibool(1,2,1,ispec),ibool(1,2,2,ispec),-1,-1
+ write(IIN_database) ispec,2,ibool(1,2,1,ispec),ibool(1,2,2,ispec),-1,-1
end if
end do
end if
if(interfaces(NE)) then
- write(15) addressing(iproc_xi+1,iproc_eta+1),nspec_interface(NE)
+ write(IIN_database) addressing(iproc_xi+1,iproc_eta+1),nspec_interface(NE)
do ispec = 1,nspec
if((iMPIcut_xi(2,ispec) .eqv. .true.) .and. (iMPIcut_eta(2,ispec) .eqv. .true.)) then
- write(15) ispec,2,ibool(2,2,1,ispec),ibool(2,2,2,ispec),-1,-1
+ write(IIN_database) ispec,2,ibool(2,2,1,ispec),ibool(2,2,2,ispec),-1,-1
end if
end do
end if
if(interfaces(SE)) then
- write(15) addressing(iproc_xi+1,iproc_eta-1),nspec_interface(SE)
+ write(IIN_database) addressing(iproc_xi+1,iproc_eta-1),nspec_interface(SE)
do ispec = 1,nspec
if((iMPIcut_xi(2,ispec) .eqv. .true.) .and. (iMPIcut_eta(1,ispec) .eqv. .true.)) then
- write(15) ispec,2,ibool(2,1,1,ispec),ibool(2,1,2,ispec),-1,-1
+ write(IIN_database) ispec,2,ibool(2,1,1,ispec),ibool(2,1,2,ispec),-1,-1
end if
end do
end if
if(interfaces(SW)) then
- write(15) addressing(iproc_xi-1,iproc_eta-1),nspec_interface(SW)
+ write(IIN_database) addressing(iproc_xi-1,iproc_eta-1),nspec_interface(SW)
do ispec = 1,nspec
if((iMPIcut_xi(1,ispec) .eqv. .true.) .and. (iMPIcut_eta(1,ispec) .eqv. .true.)) then
- write(15) ispec,2,ibool(1,1,1,ispec),ibool(1,1,2,ispec),-1,-1
+ write(IIN_database) ispec,2,ibool(1,1,1,ispec),ibool(1,1,2,ispec),-1,-1
end if
end do
end if
else
- write(15) 0,0
+ write(IIN_database) 0,0
end if
- close(15)
+ close(IIN_database)
-
end subroutine save_databases
-
-
-
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/Makefile.in
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/Makefile.in 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/Makefile.in 2011-11-06 02:02:36 UTC (rev 19152)
@@ -133,7 +133,8 @@
CUDA_OBJECTS = \
$O/check_fields_cuda.cuda.o \
- $O/compute_add_sources_cuda.cuda.o \
+ $O/compute_add_sources_acoustic_cuda.cuda.o \
+ $O/compute_add_sources_elastic_cuda.cuda.o \
$O/compute_coupling_cuda.cuda.o \
$O/compute_forces_acoustic_cuda.cuda.o \
$O/compute_forces_elastic_cuda.cuda.o \
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_acoustic.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_acoustic.f90 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_acoustic.f90 2011-11-06 02:02:36 UTC (rev 19152)
@@ -133,8 +133,8 @@
! write(*,*) "fortran dt = ", dt
! change dt -> DT
call compute_add_sources_ac_cuda(Mesh_pointer, phase_is_inner, &
- NSOURCES, SIMULATION_TYPE, &
- USE_FORCE_POINT_SOURCE, stf_pre_compute, myrank)
+ NSOURCES, SIMULATION_TYPE, &
+ stf_pre_compute, myrank)
endif
else ! .NOT. GPU_MODE
@@ -411,8 +411,8 @@
! only implements SIMTYPE=3
call compute_add_sources_ac_s3_cuda(Mesh_pointer, phase_is_inner, &
- NSOURCES, SIMULATION_TYPE, &
- USE_FORCE_POINT_SOURCE, stf_pre_compute, myrank)
+ NSOURCES, SIMULATION_TYPE, &
+ stf_pre_compute, myrank)
endif
else ! .NOT. GPU_MODE
@@ -512,5 +512,4 @@
if( myrank == 0 ) write(IOSTF,*) time_source,stf_used_total_all
endif
-
-end subroutine compute_add_sources_acoustic
+ end subroutine compute_add_sources_acoustic
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_elastic.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_elastic.f90 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_add_sources_elastic.f90 2011-11-06 02:02:36 UTC (rev 19152)
@@ -133,11 +133,7 @@
! write(*,*) "fortran dt = ", dt
! change dt -> DT
call compute_add_sources_el_cuda(Mesh_pointer, &
- !NSPEC_AB, NGLOB_AB,
phase_is_inner,NSOURCES, &
- !it, DT, t0, &
- !SIMULATION_TYPE, NSTEP, NOISE_TOMOGRAPHY,&
- !USE_FORCE_POINT_SOURCE, &
stf_pre_compute, myrank)
else ! .NOT. GPU_MODE
@@ -363,7 +359,7 @@
endif
enddo ! nrec
else ! GPU_MODE == .true.
- call add_sources_el_sim_type_2_or_3(Mesh_pointer,adj_sourcearrays,phase_is_inner, &
+ call add_sources_el_sim_type_2_or_3(Mesh_pointer,adj_sourcearrays,phase_is_inner, &
ispec_is_inner,ispec_is_elastic, &
ispec_selected_rec,myrank,nrec, &
NTSTEP_BETWEEN_READ_ADJSRC - mod(it-1,NTSTEP_BETWEEN_READ_ADJSRC), &
@@ -387,8 +383,8 @@
dble(NSTEP-it)*DT-t0-tshift_cmt(isource),hdur_gaussian(isource))
enddo
- call compute_add_sources_el_s3_cuda(Mesh_pointer, USE_FORCE_POINT_SOURCE,&
- stf_pre_compute, NSOURCES,phase_is_inner,myrank)
+ call compute_add_sources_el_s3_cuda(Mesh_pointer,stf_pre_compute, &
+ NSOURCES,phase_is_inner,myrank)
else ! .NOT. GPU_MODE
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/compute_forces_elastic.f90 2011-11-06 02:02:36 UTC (rev 19152)
@@ -124,7 +124,8 @@
call compute_forces_elastic_cuda(Mesh_pointer, iphase, &
nspec_outer_elastic, &
nspec_inner_elastic, &
- SIMULATION_TYPE,COMPUTE_AND_STORE_STRAIN,ATTENUATION)
+ SIMULATION_TYPE, &
+ COMPUTE_AND_STORE_STRAIN,ATTENUATION,ANISOTROPY)
endif ! GPU_MODE
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/initialize_simulation.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/initialize_simulation.f90 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/initialize_simulation.f90 2011-11-06 02:02:36 UTC (rev 19152)
@@ -217,8 +217,6 @@
if( N_SLS /= 3 ) &
stop 'GPU mode does not support N_SLS /= 3 yet'
endif
- if( ANISOTROPY ) &
- stop 'GPU mode does not support ANISOTROPY yet'
endif
! absorbing surfaces
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/iterate_time.f90 2011-11-06 02:02:36 UTC (rev 19152)
@@ -184,6 +184,11 @@
endif
endif
endif
+ ! check stability of the code, exit if unstable
+ ! negative values can occur with some compilers when the unstable value is greater
+ ! than the greatest possible floating-point number of the machine
+ if(Usolidnorm > STABILITY_THRESHOLD .or. Usolidnorm < 0) &
+ call exit_MPI(myrank,'forward simulation became unstable and blew up')
! compute the maximum of the maxima for all the slices using an MPI reduction
call max_all_cr(Usolidnorm,Usolidnorm_all)
@@ -207,6 +212,13 @@
endif
endif
endif
+ ! check stability of the code, exit if unstable
+ ! negative values can occur with some compilers when the unstable value is greater
+ ! than the greatest possible floating-point number of the machine
+ if(b_Usolidnorm > STABILITY_THRESHOLD .or. b_Usolidnorm < 0) &
+ call exit_MPI(myrank,'backward simulation became unstable and blew up')
+
+ ! compute max of all slices
call max_all_cr(b_Usolidnorm,b_Usolidnorm_all)
endif
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/prepare_timerun.f90 2011-11-06 02:02:36 UTC (rev 19152)
@@ -780,7 +780,7 @@
nrec, nrec_local, &
SIMULATION_TYPE, &
USE_MESH_COLORING_GPU,nspec_acoustic,nspec_elastic, &
- ncuda_devices)
+ myrank,ncuda_devices)
call min_all_i(ncuda_devices,ncuda_devices_min)
call max_all_i(ncuda_devices,ncuda_devices_max)
@@ -828,7 +828,12 @@
num_free_surface_faces, &
ACOUSTIC_SIMULATION, &
num_colors_outer_elastic,num_colors_inner_elastic, &
- num_elem_colors_elastic)
+ num_elem_colors_elastic, &
+ ANISOTROPY, &
+ c11store,c12store,c13store,c14store,c15store,c16store, &
+ c22store,c23store,c24store,c25store,c26store, &
+ c33store,c34store,c35store,c36store, &
+ c44store,c45store,c46store,c55store,c56store,c66store)
if( SIMULATION_TYPE == 3 ) &
call prepare_fields_elastic_adj_dev(Mesh_pointer, NDIM*NGLOB_AB, &
Modified: seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/read_mesh_databases.f90
===================================================================
--- seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/read_mesh_databases.f90 2011-11-05 01:28:57 UTC (rev 19151)
+++ seismo/3D/SPECFEM3D/branches/SPECFEM3D_SUNFLOWER/src/specfem3D/read_mesh_databases.f90 2011-11-06 02:02:36 UTC (rev 19152)
@@ -407,17 +407,17 @@
kappastore,mustore,rho_vp,rho_vs, &
DT,model_speed_max,min_resolved_period )
else if( ACOUSTIC_SIMULATION ) then
- allocate(rho_vp(NGLLX,NGLLY,NGLLZ,NSPEC_AB),stat=ier)
- if( ier /= 0 ) stop 'error allocating array rho_vp'
- allocate(rho_vs(NGLLX,NGLLY,NGLLZ,NSPEC_AB),stat=ier)
- if( ier /= 0 ) stop 'error allocating array rho_vs'
- rho_vp = sqrt( kappastore / rhostore ) * rhostore
- rho_vs = 0.0_CUSTOM_REAL
- call check_mesh_resolution(myrank,NSPEC_AB,NGLOB_AB, &
+ allocate(rho_vp(NGLLX,NGLLY,NGLLZ,NSPEC_AB),stat=ier)
+ if( ier /= 0 ) stop 'error allocating array rho_vp'
+ allocate(rho_vs(NGLLX,NGLLY,NGLLZ,NSPEC_AB),stat=ier)
+ if( ier /= 0 ) stop 'error allocating array rho_vs'
+ rho_vp = sqrt( kappastore / rhostore ) * rhostore
+ rho_vs = 0.0_CUSTOM_REAL
+ call check_mesh_resolution(myrank,NSPEC_AB,NGLOB_AB, &
ibool,xstore,ystore,zstore, &
kappastore,mustore,rho_vp,rho_vs, &
DT,model_speed_max,min_resolved_period )
- deallocate(rho_vp,rho_vs)
+ deallocate(rho_vp,rho_vs)
endif
! reads adjoint parameters
More information about the CIG-COMMITS mailing list